In [65]:
# Basic Operation 
import numpy as np
import pandas as pd

# ML Models
from sklearn.ensemble import RandomForestClassifier

#Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

#Evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Model
import pickle

#warnings
# NOTE(review): called with only the action argument, this installs a blanket
# 'ignore' filter, so every warning raised after this cell runs is hidden for
# the rest of the session -- including any raised by the libraries imported
# above. Consider narrowing it (category=/module=) once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

In [66]:
# Load the raw dataset; 'adult.csv' is resolved relative to the notebook's
# working directory.
df_adult = pd.read_csv(filepath_or_buffer='adult.csv')

In [67]:
# Preview the first five rows to check that the file parsed as expected.
df_adult.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [68]:
# Drop the textual 'education' column. In the preview above it moves together
# with the numeric 'education.num' column (HS-grad -> 9, Some-college -> 10,
# 7th-8th -> 4), so it looks redundant -- presumably the same information in
# label form; confirm on the full data.
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: executing it a second time does not raise KeyError
# when the column is already gone.
df_adult = df_adult.drop(columns='education', errors='ignore')

In [69]:
# Preprocessing
# '?' shows up as a placeholder in the preview above (e.g. workclass,
# occupation); convert it to NaN so the imputer below can fill it.
# NOTE(review): this replacement is applied to the DataFrame, outside the
# pipeline, so raw data given to the fitted model later presumably needs the
# same step first -- confirm with the consumer of the saved model.
df_adult.replace(to_replace='?', value=np.nan, inplace=True)

# Column groups handled by each encoder.
onehot_columns = ['relationship', 'race', 'sex']
binary_columns = ['workclass', 'marital.status', 'occupation', 'native.country']

# Missing categories are filled with the literal label 'NC', then binary-encoded.
binary_encoder_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NC')),
    ('binary encoder', ce.BinaryEncoder()),
])

# Columns that belong to neither group are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('one hot encoder', OneHotEncoder(drop='first'), onehot_columns),
        ('binary encoder', binary_encoder_pipeline, binary_columns),
    ],
    remainder='passthrough',
)

# Features: every column except the target.
X = df_adult.drop('income', axis=1)
# Target: 1 where income is '>50K', 0 otherwise.
y = np.where(df_adult['income'].eq('>50K'), 1, 0)

In [70]:
# Model Selection
# Seed the forest so the cross-validation scores and the selected
# hyperparameters are the same on every Restart & Run All.
RANDOM_STATE = 42
model = RandomForestClassifier(random_state=RANDOM_STATE)

# Preprocessing and classifier are tuned as one pipeline, so the transformer
# is fitted together with the classifier on each training split.
estimator = Pipeline([
    ('preprocess', transformer),
    ('clf', model),
])

# The 'clf__' prefix routes each parameter to the 'clf' step of the pipeline.
hyperparam_space = {
    'clf__max_depth': [3, 4, 5, 6, 7],
    'clf__n_estimators': [100, 200, 300],
}

# 5 stratified folds, taken in row order (no shuffling).
skfold = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(
    estimator,                    # pipeline to tune
    param_grid=hyperparam_space,  # hyperparameter space (5 x 3 = 15 candidates)
    cv=skfold,                    # cross-validation splitter
    scoring='f1',                 # metric used to rank candidates
    verbose=1,                    # show progress
    n_jobs=-1,                    # use all cores
)

# Left as the last expression so the fitted search object is displayed.
grid_search.fit(X, y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('one '
                                                                         'hot '
                                                                         'encoder',
                                                                         OneHotEncoder(drop='first'),
                                                                         ['relationship',
                                                                          'race',
                                                                          'sex']),
                                                                        ('binary '
                                                                         'encoder',
                

In [71]:
# Model Pickling
# grid_search is built with GridSearchCV's default refit setting, so
# best_estimator_ has already been refitted on the full X, y with the best
# hyperparameters; fitting it again would only repeat that work.
final_model = grid_search.best_estimator_  # Final Model
filename = 'Model Final Adult.sav'
# Use a context manager so the file handle is flushed and closed
# deterministically, even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)