# Applying Model using Pipeline
---

In [5]:
## EDA Standard Libary

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats as ss

In [6]:
#ML Library

#ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#ML TrainTest Split
from sklearn.model_selection import train_test_split
#ML Report
from sklearn.metrics import  accuracy_score, classification_report

In [7]:
from warnings import filterwarnings
filterwarnings('ignore')

# Load Data adult.csv

In [8]:
#Load data

data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [10]:
#Construct deep Info on columns & values:

datainfo = []
for i in data.columns:
    datainfo.append([i, data[i].dtypes,
                      data[i].isna().sum(),
                      round((((data[i].isna().sum())/(len(data)))*100),2), 
                    data[i].nunique(), 
                    data[i].sample(3).values])
pd.DataFrame(datainfo, columns = ['dataFeatures', 'dataType', 'null', 'nullPct', 'unique','uniqueSample'])


Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,age,int64,0,0.0,73,"[73, 22, 73]"
1,workclass,object,0,0.0,9,"[Private, Private, Private]"
2,fnlwgt,int64,0,0.0,21648,"[140988, 344425, 35448]"
3,education,object,0,0.0,16,"[HS-grad, Bachelors, Some-college]"
4,education.num,int64,0,0.0,16,"[12, 10, 9]"
5,marital.status,object,0,0.0,7,"[Married-civ-spouse, Married-civ-spouse, Divor..."
6,occupation,object,0,0.0,15,"[Exec-managerial, Craft-repair, Transport-moving]"
7,relationship,object,0,0.0,6,"[Husband, Unmarried, Own-child]"
8,race,object,0,0.0,5,"[White, White, White]"
9,sex,object,0,0.0,2,"[Male, Male, Female]"


In [11]:
#Replace ? with NaN
data.replace('?', np.nan, inplace=True)

In [12]:
#Construct deep Info on columns & values:

datainfo = []
for i in data.columns:
    datainfo.append([i, data[i].dtypes,
                      data[i].isna().sum(),
                      round((((data[i].isna().sum())/(len(data)))*100),2), 
                    data[i].nunique(), 
                    data[i].sample(3).values])
pd.DataFrame(datainfo, columns = ['dataFeatures', 'dataType', 'null', 'nullPct', 'unique','uniqueSample'])


Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,age,int64,0,0.0,73,"[70, 37, 27]"
1,workclass,object,1836,5.64,8,"[Self-emp-not-inc, State-gov, Private]"
2,fnlwgt,int64,0,0.0,21648,"[94041, 46868, 36012]"
3,education,object,0,0.0,16,"[HS-grad, Assoc-voc, Some-college]"
4,education.num,int64,0,0.0,16,"[8, 13, 13]"
5,marital.status,object,0,0.0,7,"[Married-civ-spouse, Separated, Married-civ-sp..."
6,occupation,object,1843,5.66,14,"[Sales, Prof-specialty, Sales]"
7,relationship,object,0,0.0,6,"[Husband, Own-child, Husband]"
8,race,object,0,0.0,5,"[White, White, White]"
9,sex,object,0,0.0,2,"[Female, Male, Female]"


## Schema for Feature Columns

**Categorical**

- workclass, occupation, native-country --> binary
- categort.no --> as is

**Numerical**

- 

In [13]:
#Scaling & Encoding Library
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

#Pipeline Library
from sklearn.pipeline import Pipeline

#Null Imputer
from sklearn.impute import SimpleImputer

In [14]:
binary_encode = Pipeline([
    ('Missing Value', SimpleImputer(strategy='constant', fill_value='Other')),
    ('BE', ce.BinaryEncoder())
])

In [15]:
#Create transformer schema
transformer2 = ColumnTransformer([
    ('OHE', OneHotEncoder(drop='first'), ['relationship', 'race', 'sex']),
    ('Binary Encoder', binary_encode, ['workclass', 'occupation', 'native.country', 'marital.status'])], remainder='passthrough'
)

In [16]:
#Create transformer schema
transformer = ColumnTransformer([
    ('OHE', OneHotEncoder(drop='first'), ['relationship', 'race', 'sex']),
    ('Binary Encoder', binary_encode, ['workclass', 'occupation', 'native.country', 'marital.status']),
    ('Scaler', RobustScaler(), ['age', 'capital.gain', 'capital.loss', 'hours.per.week'])], remainder='passthrough'
)

In [17]:
transformer

## Train, Test Split

In [18]:
#Train, test split
y = data['income']
x = data.drop('income', axis=1)

In [19]:
#Convert target to binary
y = np.where(y == '<=50K', 0, 1)

In [20]:
#Drop unused column
x.drop(['fnlwgt', 'education'], axis=1, inplace=True)

In [21]:
#Splitting Test & Train
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.2, random_state=2023, stratify=y)

In [22]:
xtrain.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
597,24,Self-emp-not-inc,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,1902,40,United-States
28702,30,State-gov,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States
26557,39,Private,10,Separated,Prof-specialty,Unmarried,White,Female,0,0,40,United-States
13855,21,Private,9,Never-married,Other-service,Not-in-family,White,Male,0,0,35,United-States
5879,47,Private,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,70,United-States


## Create Pipeline

In [23]:
#DT Pipeline

dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
)

In [24]:
#Create Pipeline
pipe = Pipeline([
    ('Preprocessing', transformer),
    ('DT Classification', dt)
])

pipe.fit(xtrain, ytrain)

In [25]:
#Predict
ypred = pipe.predict(xtest)

In [26]:
accuracy_score(ytest, ypred)

0.8418547520343928

In [27]:
pipe_without_prep = Pipeline([
    ('Transfomer 2', transformer2),
        ('DT', dt)])
pipe_without_prep.fit(xtrain, ytrain)

In [28]:
ypred2 = pipe_without_prep.predict(xtest)

In [29]:
accuracy_score(ytest, ypred2)

0.8418547520343928

---

# Cross Validation

In [30]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [35]:
#Cross Validation with KFold
kfold = StratifiedKFold(n_splits=5)

#Cross Validation with Pipeline
pipe_cv = cross_val_score(pipe, xtrain, ytrain, cv=kfold, scoring='f1')
pipe_cv

array([0.58952618, 0.57599226, 0.63563705, 0.61251792, 0.63239308])

In [36]:
pipe_cv.mean()

0.6092132978155625

## Model Benchmarking

In [58]:
#Model Benchmarking
#---------------------------------------------
#Decision Tree
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
)

#KNN
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='auto'
)

#SVC
svc = SVC(
    C=1.0,
    kernel='rbf',
    degree=3,
    gamma='scale'
)

#Logistic Regression
logreg = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    C=1.0,
    max_iter=1000
)

#Create Pipeline
model_list = [dt, knn, svc, logreg]

model_scores = []
model_f1 = []
model_std = []

#Loop through all models
for i in model_list:
    model_pipe = Pipeline([
        ('Preprocessing', transformer),
        ('Model', i)
    ])
    model_cv = cross_val_score(model_pipe, xtrain, ytrain, cv=kfold, scoring='f1')
    model_scores.append(model_cv.mean())
    model_std.append(model_cv.std())
    model_f1.append(model_cv)




In [44]:
model_f1

[array([0.58952618, 0.57599226, 0.63563705, 0.61310378, 0.63239308]),
 array([0.6797066 , 0.65045342, 0.68061142, 0.67772708, 0.67412008]),
 array([0.40348837, 0.38533178, 0.4336938 , 0.38557067, 0.40278584]),
 array([0.64853006, 0.62218215, 0.64582403, 0.63640449, 0.62808434])]

In [53]:
df_model_cv = pd.DataFrame({'Model' : ['Decision Tree', 'KNN', 'SVC', 'Logistic Regression'],
                           'F1 Score Mean': model_scores,
                           'F1 Score STD':model_std})
df_model_cv.sort_values('F1 Score Mean', ascending=False)

Unnamed: 0,Model,F1 Score Mean,F1 Score STD
1,KNN,0.672524,0.011258
3,Logistic Regression,0.636205,0.01007
0,Decision Tree,0.60933,0.023417
2,SVC,0.402174,0.017635


- KNN Classifier scores as the best by F1 score means
- The deviation of F1 score is the 2nd least for KNN Classifier
- If there's a high std score, then the model is not stable and may not be a good model -> evaluate the list of scores

In [94]:
#Apply Best Model

#KNN
knn = KNeighborsClassifier()

pipe_knn = Pipeline([('Tranform', transformer),
                    ('Model', knn)])

In [96]:
pipe_knn.fit(xtrain, ytrain)

In [97]:
pipe_knn.score(xtest, ytest)

0.8506064793489944

## Hyperparameter Tuning

In [91]:
#Libraries
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [114]:
#setup the parameter grid for KNN

space = {'Model__n_neighbors' : np.arange(1, 21),
        'Model__weights' : ['uniform', 'distance'],
        'Model__algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']}

#Initiate GridSearchCV
knn_grid = GridSearchCV(pipe_knn, param_grid=space, cv=kfold, scoring='f1', n_jobs=1)  #n_jobs=-1 means using all processor

#Initiate RandomizedSearchCV
# knn_random = RandomizedSearchCV(knn, space, cv=kfold, scoring='f1', n_jobs=-1, n_iter=20)


In [115]:
knn_grid

In [116]:
#Fit GridSearchCV
knn_grid.fit(xtrain, ytrain)