In [2]:
import os
import sys
import re
import csv
import pandas as pd
import numpy as np
import json

# Show every column and every row when a frame is displayed in this notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None disables cell-width truncation. The former sentinel -1 was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

from demyst.analytics import Analytics
from demyst.analytics.report import *

# Shared client object; every analytics.* call in this notebook goes through it.
analytics = Analytics()

# Inputs

Load in consumer records

In [3]:
# Read phone and post_code as plain objects (strings) instead of letting pandas
# infer a numeric type for them.
CONSUMER_RECORDS_URL = 'https://s3.amazonaws.com/demyst-apis/demos/100.csv'
text_columns = {'phone': object, 'post_code': object}
inputs = pd.read_csv(CONSUMER_RECORDS_URL, dtype=text_columns)
inputs.head()

Unnamed: 0,client_id,first_name,last_name,street_address,city,state,post_code,country,phone,email_address
0,5004,Carlton,Penaloza,652 N. Marengo Ave. #202,Pasadena,CA,91101,us,6262246101,cpena82@yahoo.com
1,5005,Gilberto,De La Cruz,1303 W 168th St. Apt.9,Gardena,CA,90247,us,3105941950,dispatcher72@hotmail.com
2,5006,Richard,Garcia,14666 Hiawatha St,Mission Hills,CA,91345,us,8184478257,richard818sf@gmail.com
3,5007,Michelle,Garcia,PO Box 1055,Fulton,TX,78358,us,12104451205,michellegarcia1943@icloud.com
4,5008,ROGER,LEE,4101 Elm Hill Dr,plano,tx,75024,us,(214) 733-9860,leechen0427@gmail.com


# Clean The Input Data

First, validate the input file


In [3]:
# Check the raw input frame; the saved output lists each column's detected type and error rate.
analytics.validate(inputs)

Column,Type,Error %
first_name,FirstName,0.0
last_name,LastName,0.0
city,City,0.0
state,State,0.0
post_code,PostCode,0.0
country,Country,0.0
phone,Phone,0.0
email_address,EmailAddress,1.9

Column,Suggestions
client_id,This column should be of type string
street_address,No suggestions found for this column


# Rename Street Column

In [4]:
# Rename via reassignment rather than inplace=True: the cell stays re-runnable
# and no frame object is mutated behind the reader's back.
inputs = inputs.rename(columns={'street_address': 'street'})

# Validate Email Address

In [5]:
# Basic syntactic e-mail check: local part, "@", a domain label, then a dotted suffix.
pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")


def _valid_email_or_nan(value):
    """Return ``value`` if it is a string matching ``pattern``; otherwise NaN."""
    # Blank CSV cells are read as float NaN; calling pattern.match on a
    # non-string raises TypeError, so check the type first.
    if isinstance(value, str) and pattern.match(value):
        return value
    return np.nan


# Invalid or missing addresses become NaN here; the following dropna() cell removes them.
inputs['email_address'] = inputs['email_address'].apply(_valid_email_or_nan)

# Drop Null Values 

In [6]:
# Remove every row that has a null in any column (the defaults, spelled out).
inputs = inputs.dropna(axis='index', how='any')

In [7]:
# Re-run validation on the cleaned frame; the saved output reports 0.0 error for every typed column.
analytics.validate(inputs)

Column,Type,Error %
first_name,FirstName,0.0
last_name,LastName,0.0
street,Street,0.0
city,City,0.0
state,State,0.0
post_code,PostCode,0.0
country,Country,0.0
phone,Phone,0.0
email_address,EmailAddress,0.0

Column,Suggestions
client_id,This column should be of type string


# Search For Data Products

The headers of your dataframe are sent to the Demyst Platform and matching Data Products are returned

In [8]:
# The saved output shows this call prompting for a username and password before listing results.
analytics.search(inputs)

Please enter your username: hsingh@demystdata.com
Please enter your password: ········


Unnamed: 0,last_name,city,street,state,post_code,first_name,country
Option 1,☒,☒,☒,☒,☒,☒,☒

Unnamed: 0,last_name,phone,email_address,city,street,state,ip4,post_code,first_name,country
Option 1,☒,,,☒,☒,☒,,☒,☒,☒
Option 2,☒,☒,,☒,☒,☒,,☒,☒,☒
Option 3,☒,,☒,,,,,,☒,
Option 4,,☒,,☒,☒,☒,☐,☒,,☒
Option 5,,,,☒,☒,☒,☐,☒,,☒
Option 6,,☒,,,,,☐,,,

Unnamed: 0,phone,city,street,state,post_code
Option 1,☒,☒,☒,☒,☒

Unnamed: 0,phone
Option 1,☒

Unnamed: 0,last_name,email_address,city,street,state,first_name
Option 1,☒,☒,☒,☒,☒,☒

Unnamed: 0,phone,country
Option 1,☒,☒

Unnamed: 0,street,country
Option 1,☒,☒

Unnamed: 0,last_name,first_name
Option 1,☒,☒

Unnamed: 0,phone,city,street,state,post_code
Option 1,☒,☒,☒,☒,☒

Unnamed: 0,city,street,post_code,state
Option 1,☒,☒,☒,☒

Unnamed: 0,last_name,city,street,state,post_code,first_name
Option 1,☒,☒,☒,☒,☒,☒

Unnamed: 0,last_name,city,street,state,post_code,first_name
Option 1,☒,☒,☒,☒,☒,☒

Unnamed: 0,phone,country
Option 1,☒,☒

Unnamed: 0,phone,country
Option 1,☒,☒

Unnamed: 0,phone,country
Option 1,☒,☒

Unnamed: 0,phone,country
Option 1,☒,☒

Unnamed: 0,ip,email_address
Option 1,☐,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,phone,email_address,city,street,state,post_code,country
Option 1,☒,☒,☒,☒,☒,☒,☒

Unnamed: 0,last_name,business_name,first_name
Option 1,☒,☐,☒

Unnamed: 0,city,street,post_code,state
Option 1,☒,☒,☒,☒

Unnamed: 0,city,street,post_code,state
Option 1,☒,☒,☒,☒

Unnamed: 0,last_name,phone,email_address,city,state,model,post_code,first_name,street_line_1
Option 1,☒,☒,☒,☒,☒,☐,☒,☒,☐

Unnamed: 0,last_name,phone,email_address,city,state,post_code,first_name,street_line_1
Option 1,☒,☒,☒,☒,☒,☒,☒,☐

Unnamed: 0,last_name,phone,email_address,city,state,post_code,first_name,street_line_1
Option 1,☒,☒,☒,☒,☒,☒,☒,☐

Unnamed: 0,latitude,last_name,city,street,state,post_code,longitude,first_name,country
Option 1,☐,☒,☒,☒,☒,☒,☐,☒,☒

Unnamed: 0,city,business_name,post_code,state
Option 1,☒,☐,☒,☒

Unnamed: 0,latitude,city,post_code,longitude,city_id,country
Option 1,☐,☒,☒,☐,☐,☒

Unnamed: 0,latitude,city,post_code,longitude,city_id,country
Option 1,☐,☒,☒,☐,☐,☒

Unnamed: 0,latitude,city,post_code,longitude,city_id,country
Option 1,☐,☒,☒,☐,☐,☒

Unnamed: 0,latitude,city,number_of_hours,date_time,longitude,city_id,country
Option 1,☐,☒,☐,☐,☐,☐,☒

Unnamed: 0,city,street,state,post_code,freeform,country
Option 1,☒,☒,☒,☒,☐,☒

Unnamed: 0,city,business_name,state
Option 1,☒,☐,☒

Unnamed: 0,city,business_name,state
Option 1,☒,☐,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code,block_id
Option 1,☒,☒,☐

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code,block_id
Option 1,☒,☒,☐

Unnamed: 0,street,post_code,block_id
Option 1,☒,☒,☐

Unnamed: 0,street,post_code,block_id
Option 1,☒,☒,☐

Unnamed: 0,street,post_code,block_id
Option 1,☒,☒,☐

Unnamed: 0,last_name,first_name
Option 1,☒,☒

Unnamed: 0,street,post_code,block_id
Option 1,☒,☒,☐

Unnamed: 0,last_name,email_address,city,street,state,first_name
Option 1,☒,☒,☒,☒,☒,☒

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code,block_id
Option 1,☒,☒,☐

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,latitude,city,street,state,post_code,longitude
Option 1,☐,☒,☒,☒,☒,☐

Unnamed: 0,latitude,city,street,state,post_code,longitude
Option 1,☐,☒,☒,☒,☒,☐

Unnamed: 0,street,post_code
Option 1,☒,☒

Unnamed: 0,last_name,phone,email_address,city,street,state,post_code,first_name
Option 1,☒,☒,☒,☒,☒,☒,☒,☒

Unnamed: 0,last_name,phone,email_address,post_code,first_name
Option 1,☒,☒,☒,☒,☒

Unnamed: 0,last_name,phone,email_address,city,street,state,post_code,first_name
Option 1,☒,☒,☒,☒,☒,☒,☒,☒

Unnamed: 0,city,street,state,post_code,country
Option 1,☒,☒,☒,☒,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,last_name,business_name,first_name,domain
Option 1,☒,☐,☒,☐

Unnamed: 0,latitude,phone,business_name,radius,partner_id,longitude,country
Option 1,☐,☒,☐,☐,☐,☐,☒

Unnamed: 0,city,street,post_code,state
Option 1,☒,☒,☒,☒

Unnamed: 0,city,business_name,min_conf,street,state_region
Option 1,☒,☐,☐,☒,☐

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,phone,email_address,city,business_name,url,street,state,duns_number,registration_number,country
Option 1,☒,☒,☒,☐,☐,☒,☒,☐,☐,☒

Unnamed: 0,city,category
Option 1,☒,☐

Unnamed: 0,last_name,first_name
Option 1,☒,☒

Unnamed: 0,last_name,court_case_number,first_name
Option 1,☒,☐,☒

Unnamed: 0,last_name,first_name
Option 1,☒,☒

Unnamed: 0,last_name,first_name
Option 1,☒,☒

Unnamed: 0,last_name,first_name
Option 1,☒,☒

Unnamed: 0,phone,email_address,city,street,state,post_code,country
Option 1,☒,☒,☒,☒,☒,☒,☒

Unnamed: 0,phone,email_address,city,street,state,post_code,country
Option 1,☒,☒,☒,☒,☒,☒,☒

Unnamed: 0,post_code
Option 1,☒

Unnamed: 0,last_name,acn,company_name,abn,first_name
Option 1,☒,☐,☐,☐,☒

Unnamed: 0,last_name,date_of_birth,city,inp_state,street,post_code,first_name,country
Option 1,☒,☐,☒,☐,☒,☒,☒,☒

Unnamed: 0,country,number
Option 1,☒,☐

Unnamed: 0,country,number
Option 1,☒,☐

Unnamed: 0,name,country
Option 1,☐,☒

Unnamed: 0,name,country
Option 1,☐,☒

Unnamed: 0,country
Option 1,☒

Unnamed: 0,last_name,date_of_birth,city,inp_state,street,post_code,first_name
Option 1,☒,☐,☒,☐,☒,☒,☒

Unnamed: 0,required_input_example_two,email_address
Option 1,☐,☒

Unnamed: 0,required_input_example_two,email_address
Option 1,☐,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,required_input_example_two,email_address
Option 1,☐,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,required_input_example_two,email_address
Option 1,☐,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,country
Option 1,☒

Unnamed: 0,city,business_name,street,state,post_code,country
Option 1,☒,☐,☒,☒,☒,☒

Unnamed: 0,business_name,country
Option 1,☐,☒

Unnamed: 0,company_code,country
Option 1,☐,☒

Unnamed: 0,application_id,verification_type,product_type,date_of_upload,country
Option 1,☐,☐,☐,☐,☒

Unnamed: 0,business_name,country
Option 1,☐,☒

Unnamed: 0,company_code,country
Option 1,☐,☒

Unnamed: 0,application_id,verification_type,product_type,date_of_upload,country
Option 1,☐,☐,☐,☐,☒

Unnamed: 0,application_id,verification_type,product_type,date_of_upload,country
Option 1,☐,☐,☐,☐,☒

Unnamed: 0,application_id,verification_type,product_type,date_of_upload,country
Option 1,☐,☐,☐,☐,☒

Unnamed: 0,application_id,verification_type,product_type,date_of_upload,country
Option 1,☐,☐,☐,☐,☒

Unnamed: 0,application_id,verification_type,product_type,date_of_upload,country
Option 1,☐,☐,☐,☐,☒

Unnamed: 0,required_input_example_two,email_address
Option 1,☐,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,email_address
Option 1,☒

Unnamed: 0,application_id,verification_type,product_type,date_of_upload,country
Option 1,☐,☐,☐,☐,☒

Unnamed: 0,country,application_id
Option 1,☒,☐

Unnamed: 0,business_name,country
Option 1,☐,☒

Unnamed: 0,company_code,country
Option 1,☐,☒


# Enrich Input File

In [10]:
# Connectors to run, and the slice of cleaned rows to send (first five only).
connector_names = ['white_pages_pro_find_person', 'rap_leaf', 'info_connect_individual']
sample_rows = inputs[:5]
results = analytics.enrich_and_download(connector_names, sample_rows)

Starting enrichment...
Uploading data...


This enrichment will use 11.8 credits of the 999931514 credits your organization currently has.


Enrich Job ID: 4300


IntProgress(value=1, max=2)

Label(value='Checking status...')

# Display Results


In [14]:
# Transposed view: one column per enriched record, one row per attribute.
results.T

Unnamed: 0,0,1
inputs.city,Pasadena,Gardena
inputs.country,us,us
inputs.email_address,cpena82@yahoo.com,dispatcher72@hotmail.com
inputs.first_name,Carlton,Gilberto
inputs.last_name,Penaloza,De La Cruz
inputs.phone,6262246101,3105941950
inputs.post_code,91101,90247
inputs.state,CA,CA
inputs.street,652 N. Marengo Ave. #202,1303 W 168th St. Apt.9
white_pages_pro_find_person.row_id,0,1


# Construct Attributes Data Quality DataFrame
Get the attributes returned from every connector as a dataframe

In [15]:
# `report` is not imported by name; presumably it comes from the star import of
# demyst.analytics.report -- TODO confirm.
attributes = report(inputs, results)

In [16]:
# Display the per-attribute quality table (match_rate, fill_rate, nunique per connector attribute).
attributes

Unnamed: 0,connector,attribute,max_nested,match_rate,fill_rate,nunique,column_0
0,info_connect_individual,client_id,1,1.0,1.0,2,info_connect_individual.client_id
1,info_connect_individual,error,1,1.0,0.0,1,info_connect_individual.error
2,info_connect_individual,is_hit,1,1.0,1.0,1,info_connect_individual.is_hit
3,info_connect_individual,results[*].address.number,1,1.0,1.0,2,info_connect_individual.results[0].address.number
4,info_connect_individual,results[*].address.post_directional,1,1.0,0.0,1,info_connect_individual.results[0].address.post_directional
5,info_connect_individual,results[*].address.pre_directional,1,1.0,0.5,2,info_connect_individual.results[0].address.pre_directional
6,info_connect_individual,results[*].address.street_name,1,1.0,1.0,2,info_connect_individual.results[0].address.street_name
7,info_connect_individual,results[*].address.suffix,1,1.0,0.5,2,info_connect_individual.results[0].address.suffix
8,info_connect_individual,results[*].address.unit_number,1,1.0,0.5,2,info_connect_individual.results[0].address.unit_number
9,info_connect_individual,results[*].address.unit_type,1,1.0,0.5,2,info_connect_individual.results[0].address.unit_type


# Select attributes
Select the attributes we want to keep

In [17]:
# Keep attributes that matched on at least 80% of rows, are at least half
# populated, and take two or more distinct values.
meets_thresholds = (
    (attributes['match_rate'] >= .8)
    & (attributes['fill_rate'] >= .5)
    & (attributes['nunique'] >= 2)
)
selected = attributes[meets_thresholds]
selected

Unnamed: 0,connector,attribute,max_nested,match_rate,fill_rate,nunique,column_0
0,info_connect_individual,client_id,1,1.0,1.0,2,info_connect_individual.client_id
3,info_connect_individual,results[*].address.number,1,1.0,1.0,2,info_connect_individual.results[0].address.number
5,info_connect_individual,results[*].address.pre_directional,1,1.0,0.5,2,info_connect_individual.results[0].address.pre_directional
6,info_connect_individual,results[*].address.street_name,1,1.0,1.0,2,info_connect_individual.results[0].address.street_name
7,info_connect_individual,results[*].address.suffix,1,1.0,0.5,2,info_connect_individual.results[0].address.suffix
8,info_connect_individual,results[*].address.unit_number,1,1.0,0.5,2,info_connect_individual.results[0].address.unit_number
9,info_connect_individual,results[*].address.unit_type,1,1.0,0.5,2,info_connect_individual.results[0].address.unit_type
13,info_connect_individual,results[*].city,1,1.0,1.0,2,info_connect_individual.results[0].city
15,info_connect_individual,results[*].first_name,1,1.0,1.0,2,info_connect_individual.results[0].first_name
16,info_connect_individual,results[*].full_postcode,1,1.0,1.0,2,info_connect_individual.results[0].full_postcode


# Final Data
Show the final dataset of chosen attributes (first level nested only)

In [18]:
# Input columns appear in `results` under an "inputs." prefix; client_id is left out.
kept_input_names = (name for name in inputs.columns if name != "client_id")
inputs_columns = ["inputs." + name for name in kept_input_names]
# column_0 holds the fully qualified results column for each selected attribute.
selected_columns = selected['column_0'].tolist()
final_columns = inputs_columns + selected_columns
results[final_columns]

Unnamed: 0,inputs.first_name,inputs.last_name,inputs.street,inputs.city,inputs.state,inputs.post_code,inputs.country,inputs.phone,inputs.email_address,info_connect_individual.client_id,info_connect_individual.results[0].address.number,info_connect_individual.results[0].address.pre_directional,info_connect_individual.results[0].address.street_name,info_connect_individual.results[0].address.suffix,info_connect_individual.results[0].address.unit_number,info_connect_individual.results[0].address.unit_type,info_connect_individual.results[0].city,info_connect_individual.results[0].first_name,info_connect_individual.results[0].full_postcode,info_connect_individual.results[0].gender,info_connect_individual.results[0].last_name,info_connect_individual.results[0].location.latitude,info_connect_individual.results[0].location.longitude,info_connect_individual.results[0].post_code,info_connect_individual.results[0].street,info_connect_individual.row_id,inputs.city.1,inputs.email_address.1,inputs.first_name.1,inputs.last_name.1,inputs.phone.1,inputs.post_code.1,inputs.street.1,rap_leaf.age_range_high,rap_leaf.age_range_low,rap_leaf.client_id,rap_leaf.education,rap_leaf.has_children,rap_leaf.home_market_value_range_high,rap_leaf.home_market_value_range_low,rap_leaf.home_owner_status,rap_leaf.household_income_range_high,rap_leaf.household_income_range_low,rap_leaf.occupation,rap_leaf.post_code,rap_leaf.row_id,white_pages_pro_find_person.client_id,white_pages_pro_find_person.row_id
0,Carlton,Penaloza,652 N. Marengo Ave. #202,Pasadena,CA,91101,us,6262246101,cpena82@yahoo.com,5004,1605,,Calle Ciervos,,,,San Dimas,Zueleyma,91773-4119,,Penaloza,34.083565,-117.840487,91773,1605 Calle Ciervos,0,Pasadena,cpena82@yahoo.com,Carlton,Penaloza,6262246101,91101,652 N. Marengo Ave. #202,,65,5004,Completed High School,False,1000000.0,500000.0,Own,100000,75000,White Collar Worker,91773,0,5004,0
1,Gilberto,De La Cruz,1303 W 168th St. Apt.9,Gardena,CA,90247,us,3105941950,dispatcher72@hotmail.com,5005,1328,W,Gardena,Blvd,4.0,APT,Gardena,Gilberto,90247-4840,Male,De La Cruz,33.880945,-118.29853,90247,1328 W Gardena Blvd Apt 4,1,Gardena,dispatcher72@hotmail.com,Gilberto,De La Cruz,3105941950,90247,1303 W 168th St. Apt.9,54.0,45,5005,,,,,Rent,25000,15000,,90250,1,5005,1


# Age Ranges Reported by Each Source

In [20]:
# Age-range fields from each of the three connectors, side by side.
age_columns = [
    'white_pages_pro_find_person.people[0].age_range',
    'rap_leaf.age_range_low',
    'rap_leaf.age_range_high',
    'info_connect_individual.results[0].age_range.low',
    'info_connect_individual.results[0].age_range.high',
]
age_variations = results[age_columns]
age_variations

Unnamed: 0,white_pages_pro_find_person.people[0].age_range,rap_leaf.age_range_low,rap_leaf.age_range_high,info_connect_individual.results[0].age_range.low,info_connect_individual.results[0].age_range.high
0,,65,,,
1,,45,54.0,,


# Data Functions 

Creating Data Functions:

    demyst-data-function create DATA_FUNCTION_NAME

Deploy Data Functions:

    ./data-function deploy

# Hosted Data Function with Feature Engineering
```python
from demyst.df.df2 import df2
import pandas as pd
import json
import itertools


def get_avg_age(infoconnect_age, rapleaf_age, whitepage_age):
    """Average the usable age values reported by the three connectors.

    Args:
        infoconnect_age: Age bound from info_connect_individual (number, numeric
            string, or a falsy value when absent).
        rapleaf_age: Age bound from rap_leaf, same conventions.
        whitepage_age: Age bound from white_pages_pro_find_person, same conventions.

    Returns:
        The mean of the values that could be read as whole numbers, or None when
        no value is usable or the usable values sum to zero.
    """
    usable_ages = []
    for age in (infoconnect_age, rapleaf_age, whitepage_age):
        # Falsy values (None, "", 0) are treated as "not reported".
        if not age:
            continue
        try:
            # Going through float() accepts strings such as "54.0"; int() then
            # truncates, and raises for NaN/inf so those are skipped as well.
            usable_ages.append(int(float(age)))
        except (TypeError, ValueError, OverflowError):
            # Unparseable value (e.g. "65+"): ignore it rather than abort the average.
            continue

    total = sum(usable_ages)
    if not usable_ages or total == 0:
        return None
    return total / len(usable_ages)


@df2
def data_function(df):
    """Derive averaged age bounds and pairwise age features from three connectors.

    Reads the low/high age bound reported by ``info_connect_individual``,
    ``rap_leaf`` and ``white_pages_pro_find_person``, averages them with
    ``get_avg_age``, and one-hot encodes the raw bounds together with the
    pairwise products of the encoded columns.

    Args:
        df: Object supplied through the ``@df2`` decorator; only
            ``df.connectors.fetch`` and ``df.connectors.get`` are used here.

    Returns:
        dict with ``derived_age_high``, ``derived_age_low`` and
        ``feature_engineering`` (the encoded frame converted with ``to_dict``).
    """
    infoconnect_age_high = infoconnect_age_low = None
    whitepage_age_high = whitepage_age_low = None

    # Feature engineering
    # The return value is not used; the call is kept because the lookups below
    # presumably depend on it having run -- TODO confirm against the df2 docs.
    df.connectors.fetch(['info_connect_individual', 'rap_leaf', 'white_pages_pro_find_person'])

    # Every lookup passes None as its last argument (presumably the fallback for
    # a missing field), so each result is checked before it is indexed.
    infoconnect = df.connectors.get('info_connect_individual', 'results', None)
    if infoconnect:
        # Assumes each result is a dict, as the original string-key indexing implies.
        infoconnect_age_range = infoconnect[0].get('age_range') or {}
        infoconnect_age_high = infoconnect_age_range.get('high')
        infoconnect_age_low = infoconnect_age_range.get('low')

    rap_leaf = df.connectors.get('rap_leaf', '', None) or {}
    rapleaf_age_high = rap_leaf.get('age_range_high', None)
    rapleaf_age_low = rap_leaf.get('age_range_low', None)

    white_page = df.connectors.get('white_pages_pro_find_person', 'people', None)
    whitepage_age_range = white_page[0].get('age_range') if white_page else None
    if whitepage_age_range:
        # Expected form is "low-high"; any other shape leaves both bounds unset
        # instead of failing on tuple unpacking.
        bounds = whitepage_age_range.split('-')
        if len(bounds) == 2:
            whitepage_age_low, whitepage_age_high = bounds

    derived_age_high = get_avg_age(infoconnect_age_high, rapleaf_age_high, whitepage_age_high)
    derived_age_low = get_avg_age(infoconnect_age_low, rapleaf_age_low, whitepage_age_low)

    # One row per connector, one-hot encoded on both bound columns.
    dataframe = pd.DataFrame({'age_high': [infoconnect_age_high, rapleaf_age_high, whitepage_age_high],
                              'age_low': [infoconnect_age_low, rapleaf_age_low, whitepage_age_low]})
    dataframe = pd.get_dummies(dataframe, columns=dataframe.columns)

    # pairwise interaction of attributes
    # combinations() snapshots the column list up front, so adding columns
    # inside the loop does not change which pairs are visited.
    for pair in itertools.combinations(dataframe.columns, 2):
        dataframe[pair[0]+'_'+pair[1]] = dataframe[pair[0]]*dataframe[pair[1]]

    return {
        'derived_age_high': derived_age_high,
        'derived_age_low': derived_age_low,
        'feature_engineering': dataframe.to_dict()
    }

```

In [21]:
# Call the data function identified by this name on the first three input rows.
df_result = analytics.data_function_append("data-fn--256-attribute-demo", inputs[:3])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [22]:
# Transposed view: one column per input row, one row per field.
df_result.T

Unnamed: 0,0,1,2
client_id,5004,5005,5006
first_name,Carlton,Gilberto,Richard
last_name,Penaloza,De La Cruz,Garcia
street,652 N. Marengo Ave. #202,1303 W 168th St. Apt.9,14666 Hiawatha St
city,Pasadena,Gardena,Mission Hills
state,CA,CA,CA
post_code,91101,90247,91345
country,us,us,us
phone,6262246101,3105941950,8184478257
email_address,cpena82@yahoo.com,dispatcher72@hotmail.com,richard818sf@gmail.com
