# Applying Model using Pipeline
---

In [39]:
## EDA Standard Libary

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats as ss

In [40]:
#ML Library

#ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#ML TrainTest Split
from sklearn.model_selection import train_test_split
#ML Report
from sklearn.metrics import  accuracy_score, classification_report

In [41]:
from warnings import filterwarnings
filterwarnings('ignore')

# Load Data adult.csv

In [42]:
#Load data

data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [44]:
#Construct deep Info on columns & values:

datainfo = []
for i in data.columns:
    datainfo.append([i, data[i].dtypes,
                      data[i].isna().sum(),
                      round((((data[i].isna().sum())/(len(data)))*100),2), 
                    data[i].nunique(), 
                    data[i].sample(3).values])
pd.DataFrame(datainfo, columns = ['dataFeatures', 'dataType', 'null', 'nullPct', 'unique','uniqueSample'])


Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,age,int64,0,0.0,73,"[35, 33, 28]"
1,workclass,object,0,0.0,9,"[Private, ?, Local-gov]"
2,fnlwgt,int64,0,0.0,21648,"[431426, 82283, 200973]"
3,education,object,0,0.0,16,"[Bachelors, HS-grad, Some-college]"
4,education.num,int64,0,0.0,16,"[13, 15, 10]"
5,marital.status,object,0,0.0,7,"[Never-married, Married-civ-spouse, Married-ci..."
6,occupation,object,0,0.0,15,"[Other-service, ?, Sales]"
7,relationship,object,0,0.0,6,"[Husband, Not-in-family, Unmarried]"
8,race,object,0,0.0,5,"[White, White, White]"
9,sex,object,0,0.0,2,"[Male, Male, Female]"


In [45]:
#Replace ? with NaN
data.replace('?', np.nan, inplace=True)

In [46]:
#Construct deep Info on columns & values:

datainfo = []
for i in data.columns:
    datainfo.append([i, data[i].dtypes,
                      data[i].isna().sum(),
                      round((((data[i].isna().sum())/(len(data)))*100),2), 
                    data[i].nunique(), 
                    data[i].sample(3).values])
pd.DataFrame(datainfo, columns = ['dataFeatures', 'dataType', 'null', 'nullPct', 'unique','uniqueSample'])


Unnamed: 0,dataFeatures,dataType,null,nullPct,unique,uniqueSample
0,age,int64,0,0.0,73,"[43, 37, 23]"
1,workclass,object,1836,5.64,8,"[Private, Local-gov, Private]"
2,fnlwgt,int64,0,0.0,21648,"[163729, 41281, 180246]"
3,education,object,0,0.0,16,"[HS-grad, HS-grad, Some-college]"
4,education.num,int64,0,0.0,16,"[10, 13, 13]"
5,marital.status,object,0,0.0,7,"[Married-civ-spouse, Married-civ-spouse, Divor..."
6,occupation,object,1843,5.66,14,"[Prof-specialty, Transport-moving, Adm-clerical]"
7,relationship,object,0,0.0,6,"[Husband, Husband, Husband]"
8,race,object,0,0.0,5,"[White, White, White]"
9,sex,object,0,0.0,2,"[Female, Male, Female]"


## Schema for Feature Columns

**Categorical**

- workclass, occupation, native-country --> binary
- categort.no --> as is

**Numerical**

- 

In [47]:
#Scaling & Encoding Library
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

#Pipeline Library
from sklearn.pipeline import Pipeline

#Null Imputer
from sklearn.impute import SimpleImputer

In [85]:
binary_encode = Pipeline([
    ('Missing Value', SimpleImputer(strategy='constant', fill_value='Other')),
    ('BE', ce.BinaryEncoder())
])

In [86]:
#Create transformer schema
transformer2 = ColumnTransformer([
    ('OHE', OneHotEncoder(drop='first'), ['relationship', 'race', 'sex']),
    ('Binary Encoder', binary_encode, ['workclass', 'occupation', 'native.country', 'marital.status'])], remainder='passthrough'
)

In [87]:
#Create transformer schema
transformer = ColumnTransformer([
    ('OHE', OneHotEncoder(drop='first'), ['relationship', 'race', 'sex']),
    ('Binary Encoder', binary_encode, ['workclass', 'occupation', 'native.country', 'marital.status']),
    ('Scaler', RobustScaler(), ['age', 'capital.gain', 'capital.loss', 'hours.per.week'])], remainder='passthrough'
)

In [72]:
transformer

## Train, Test Split

In [50]:
#Train, test split
y = data['income']
x = data.drop('income', axis=1)

In [51]:
#Convert target to binary
y = np.where(y == '<=50K', 0, 1)

In [52]:
#Drop unused column
x.drop(['fnlwgt', 'education'], axis=1, inplace=True)

In [53]:
#Splitting Test & Train
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.2, random_state=2023, stratify=y)

In [61]:
xtrain.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
597,24,Self-emp-not-inc,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,1902,40,United-States
28702,30,State-gov,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States
26557,39,Private,10,Separated,Prof-specialty,Unmarried,White,Female,0,0,40,United-States
13855,21,Private,9,Never-married,Other-service,Not-in-family,White,Male,0,0,35,United-States
5879,47,Private,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,70,United-States


## Create Pipeline

In [79]:
#DT Pipeline

dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
)

In [78]:
#Create Pipeline
pipe = Pipeline([
    ('Preprocessing', transformer),
    ('DT Classification', dt)
])

pipe.fit(xtrain, ytrain)

In [81]:
#Predict
ypred = pipe.predict(xtest)

In [82]:
accuracy_score(ytest, ypred)

0.8418547520343928

In [88]:
pipe_without_prep = Pipeline([
    ('Transfomer 2', transformer2),
        ('DT', dt)])
pipe_without_prep.fit(xtrain, ytrain)

In [89]:
ypred2 = pipe_without_prep.predict(xtest)

In [90]:
accuracy_score(ytest, ypred2)

0.8418547520343928