# Data

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df_adult = pd.read_csv('adult.csv')

In [3]:
df_adult.replace('?',np.nan,inplace = True)

PREPROCESS

In [4]:
adult_train = df_adult[:20000]
adult_test = df_adult[20000:]
adult_test.drop(columns = ['income'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [5]:
adult_train.to_csv('adult_train.csv', index = False)
adult_test.to_csv('adult_test.csv', index = False)

# Additional Liblary

In [6]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 4.7MB/s eta 0:00:011
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


# Model Training

In [7]:
# Basic Operations
import pandas as pd
import numpy as np

# ML Models
from sklearn.linear_model import LogisticRegression

# Feature Engineering
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

# Evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Model
import pickle
import joblib

# data
adult_train = pd.read_csv('adult_train.csv')

# preprocess
binary_encoder_pipeline = Pipeline([
                                    ('imputer',SimpleImputer(strategy = 'constant',fill_value = 'NC')),
                                    ('binary encoder',ce.BinaryEncoder())
])

transformer = ColumnTransformer([
                                 ('one hot encoder',OneHotEncoder(drop = 'first'),['relationship','race','sex']),
                                 ('binary encoder',binary_encoder_pipeline,['workclass','marital.status','occupation','native.country'])
                                ], remainder = 'passthrough')
# Data Splitting
X = df_adult.drop(columns = ['fnlwgt','income','education'])
y = np.where(df_adult['income']=='>50K',1,0)

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    stratify = y, 
    random_state = 2020)

# Model Selection
model = LogisticRegression(solver = 'liblinear', random_state = 2020)

estimator = Pipeline([
                      ('preprocess',transformer),
                      ('clf',model)
])

hyperparam_space = {
    'clf__C':[100, 10, 1, 0.1, 0.01, 0.001],
    'clf__solver':['liblinear','newton-cg']
}

skfold = StratifiedKFold(n_splits = 5)

grid_search = GridSearchCV(
    estimator, # model to tune
    param_grid = hyperparam_space, # hyperparameter space
    cv = skfold, # evaluation method
    scoring = 'f1', # metrics
    n_jobs = -1 # use all cores
)
grid_search.fit(X_train_val, y_train_val)

  import pandas.util.testing as tm
  elif pd.api.types.is_categorical(cols):


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('one '
                                                                         'hot '
                                                                         'encoder',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop='first',
                       

# Prediction : Pickle

In [8]:
# Model Pickling
grid_search.best_estimator_.fit(X,y) # FINAL MODEL
filename = 'Model Final.sav'
pickle.dump(grid_search.best_estimator_,open(filename,'wb'))

# load dataset
df_adult_testing = pd.read_csv('adult_test.csv')
df_adult_testing= df_adult_testing.drop(columns = ['fnlwgt','education'])

# load model
filename = 'Model Final.sav'
loaded_model = pickle.load(open(filename,'rb'))

  elif pd.api.types.is_categorical(cols):


> ## Predict Test Dataset

In [9]:
loaded_model.predict(df_adult_testing)

array([0, 1, 0, ..., 0, 0, 0])

In [10]:
loaded_model.predict(df_adult_testing[0:1])

array([0])

> ## Predict an Observation

In [11]:
df_predict = pd.DataFrame({
    'age':[43],
    'workclass':['Self-emp-not-inc'],
    'education.num':[13],
    'marital.status':['Separated'],
    'occupation':['Craft-repair'],
    'relationship':['Unmarried'],
    'race':['White'],
    'sex':['Male'],
    'capital.gain':[0],
    'capital.loss':[0],
    'hours.per.week':[35],
    'native.country':['United-States']
})

print('predict class :',loaded_model.predict(df_predict))
print('predict proba :',loaded_model.predict_proba(df_predict))

predict class : [0]
predict proba : [[0.87146373 0.12853627]]


# Prediction Joblib

In [12]:
# saving model
joblib.dump(grid_search.best_estimator_, "model joblib")

# loading model
model_joblib = joblib.load("model joblib")

> ## Predict an Observation

In [1]:
df_predict = pd.DataFrame({
    'age':[43],
    'workclass':['Self-emp-not-inc'],
    'education.num':[13],
    'marital.status':['Separated'],
    'occupation':['Craft-repair'],
    'relationship':['Unmarried'],
    'race':['White'],
    'sex':['Male'],
    'capital.gain':[0],
    'capital.loss':[0],
    'hours.per.week':[35],
    'native.country':['United-States']
})

print('predict class :',loaded_model.predict(df_predict))
print('predict proba :',loaded_model.predict_proba(df_predict))

NameError: name 'pd' is not defined