**Loading the data and creating a benchmark model**

In [None]:
# Raw GitHub URL of the adult.csv dataset loaded by this notebook
file_url = (
    "https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter17/Datasets/adult.csv"
)

In [None]:
# Read the CSV into a DataFrame; cells holding " ?" are parsed as missing (NaN)
import pandas as pd

read_options = {"sep": ",", "na_values": " ?"}
adultData = pd.read_csv(file_url, **read_options)

# Preview the first rows of the loaded frame
adultData.head()

In [None]:
# Keep only the rows that have no missing value in any column
# (dropna() defaults to dropping rows that contain at least one NaN)
adultData = adultData.dropna()

# Rows and columns remaining after the drop
adultData.shape

In [None]:
# Detach the 'label' column from adultData and keep it as the target series
target_column = 'label'
Y = adultData.pop(target_column)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
split_options = {'test_size': 0.3, 'random_state': 123}
X_train, X_test, y_train, y_test = train_test_split(adultData, Y, **split_options)

In [None]:
# Per-type preprocessing: one-hot encode categorical columns, standardise numeric ones
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# handle_unknown='ignore' keeps transform from failing on categories unseen during fit
onehot_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
categorical_transformer = Pipeline(steps=[onehot_step])

scaler_step = ('scaler', StandardScaler())
numeric_transformer = Pipeline(steps=[scaler_step])



In [None]:
# Group column names by dtype: int64/float64 columns are numeric, object columns categorical
numeric_dtypes = ['int64', 'float64']
numeric_features = adultData.select_dtypes(include=numeric_dtypes).columns

categorical_dtypes = ['object']
categorical_features = adultData.select_dtypes(include=categorical_dtypes).columns



In [None]:
# Route each group of columns through its own transformer
from sklearn.compose import ColumnTransformer

column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Full benchmark model: preprocessing followed by a logistic regression classifier
from sklearn.linear_model import LogisticRegression

pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=123)),
]
estimator = Pipeline(steps=pipeline_steps)


In [None]:
# Train the pipeline on the training split, then report its score on the test split
estimator.fit(X_train, y_train)
test_score = estimator.score(X_test, y_test)
print("model score: %.2f" % test_score)

In [None]:
# Predict a label for every row of the test features using the fitted pipeline
pred = estimator.predict(X_test)

In [None]:
# Generating classification report for the benchmark model
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). The true labels must come
# first; with the arguments reversed the precision and recall columns are swapped.
print(classification_report(y_test, pred))

**Establishing entities and relationships**

In [None]:
# Build a string ID for every row of the parent entity: 'record' + the row's index label
record_labels = adultData.index.to_series().astype(str)
adultData['parentID'] = 'record' + record_labels

In [None]:
# Give each workclass category a numeric ID (1-7) in the new 'workId' column.
# The leading space in every label is part of the value being compared.
workclass_labels = [
    ' Federal-gov',
    ' Local-gov',
    ' Private',
    ' Self-emp-inc',
    ' Self-emp-not-inc',
    ' State-gov',
    ' Without-pay',
]
for work_id, workclass_label in enumerate(workclass_labels, start=1):
    adultData.loc[adultData.workclass == workclass_label, 'workId'] = work_id

In [None]:
# Give each occupation category a numeric ID (1-14) in the new 'occuId' column.
# The leading space in every label is part of the value being compared.
occupation_labels = [
    ' Adm-clerical',
    ' Armed-Forces',
    ' Craft-repair',
    ' Exec-managerial',
    ' Farming-fishing',
    ' Handlers-cleaners',
    ' Machine-op-inspct',
    ' Other-service',
    ' Priv-house-serv',
    ' Prof-specialty',
    ' Protective-serv',
    ' Sales',
    ' Tech-support',
    ' Transport-moving',
]
for occupation_id, occupation_label in enumerate(occupation_labels, start=1):
    adultData.loc[adultData.occupation == occupation_label, 'occuId'] = occupation_id

In [None]:
# Importing necessary libraries
import featuretools as ft
import numpy as np

In [None]:
# Create the featuretools EntitySet (id 'Adult'); entities are attached to it in the following cells
adultentities = ft.EntitySet(id = 'Adult')

In [None]:
# Register adultData as the 'Parent Data' entity, using the 'parentID' column as its index
adultentities.entity_from_dataframe(
    entity_id='Parent Data',
    dataframe=adultData,
    index='parentID',
)

In [None]:
# Split three new entities out of the parent entity. Each one is keyed by an ID
# column and takes its descriptive column along as an additional variable.
parent_entity = 'Parent Data'

# education, keyed by 'education-num'
adultentities.normalize_entity(
    base_entity_id=parent_entity,
    new_entity_id='education',
    index='education-num',
    additional_variables=['education'],
)

# Workclass, keyed by 'workId'
adultentities.normalize_entity(
    base_entity_id=parent_entity,
    new_entity_id='Workclass',
    index='workId',
    additional_variables=['workclass'],
)

# Occupation, keyed by 'occuId'
adultentities.normalize_entity(
    base_entity_id=parent_entity,
    new_entity_id='Occupation',
    index='occuId',
    additional_variables=['occupation'],
)


**Feature Engineering**

In [None]:
# Names of the aggregation primitives handed to deep feature synthesis
aggPrimitives = ['std', 'min', 'max', 'mean', 'last', 'count']

# Names of the transformation primitives handed to deep feature synthesis
tranPrimitives = ['percentile', 'subtract', 'divide']

In [None]:
# Run deep feature synthesis against the parent entity with the chosen primitives
dfs_options = dict(
    entityset=adultentities,
    target_entity='Parent Data',
    agg_primitives=aggPrimitives,
    trans_primitives=tranPrimitives,
    max_depth=2,
    verbose=1,
    n_jobs=1,
)
feature_set, feature_names = ft.dfs(**dfs_options)

In [None]:
# Order the rows to follow adultData['parentID'], then turn the index back into a regular column
feature_set = feature_set.reindex(index=adultData['parentID']).reset_index()

In [None]:
# Displaying the dimensions (rows, columns) of the feature set
feature_set.shape

**Cleaning NaN and infinite values**

In [None]:
# Keep every column whose name does not mention one of the ID fields
id_pattern = 'parentID|education-num|workId|occuId'
is_id_column = feature_set.columns.str.contains(id_pattern)
non_id_columns = feature_set.columns[~is_id_column]
X = feature_set[non_id_columns]


In [None]:
# Convert positive and negative infinity to NaN so the next cleaning step can handle them
infinite_values = [np.inf, -np.inf]
X = X.replace(infinite_values, np.nan)



In [None]:
# Remove every column that contains at least one NaN
X = X.dropna(axis='columns', how='any')

# Rows and columns remaining after the drop
X.shape

**Modelling phase**

In [None]:
# Split the engineered features and the target into train and test sets
from sklearn.model_selection import train_test_split

# Same 30% hold-out and seed as the benchmark split
split_options = {'test_size': 0.3, 'random_state': 123}
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [None]:
# Build the preprocessing pipeline for the engineered feature matrix X
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode categorical columns; categories unseen during fit are ignored
onehot_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
categorical_transformer = Pipeline(steps=[onehot_step])

# Standardise numeric columns
scaler_step = ('scaler', StandardScaler())
numeric_transformer = Pipeline(steps=[scaler_step])

# Group X's column names by dtype
numeric_dtypes = ['int64', 'float64']
numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_dtypes = ['object']
categorical_features = X.select_dtypes(include=categorical_dtypes).columns

# Route each group of columns through its own transformer
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Assemble preprocessing + logistic regression, train it, and report the test-set score
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=123)),
]
estimator = Pipeline(steps=pipeline_steps)
estimator.fit(X_train, y_train)

test_score = estimator.score(X_test, y_test)
print("model score: %.2f" % test_score)

In [None]:
# Predict a label for every row of the test features using the fitted pipeline
pred = estimator.predict(X_test)

In [None]:
# Generating the classification report for the model trained on engineered features
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). The true labels must come
# first; with the arguments reversed the precision and recall columns are swapped.
print(classification_report(y_test, pred))