In [1]:
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used
# below - consider narrowing the filter to specific categories.
warnings.simplefilter(action='ignore')

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB, ComplementNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Project helpers (data_separator, comb_income, log_transformer, term_transformer,
# ToDenseTransformer, ClfSwitcher, ...) arrive through this wildcard import.
from supporting_funcs import *

In [3]:
# Location of the raw CSV; kept in `fname` because a later cell reads it again.
fname = './data/data.csv'
df = pd.read_csv(filepath_or_buffer=fname)

# Take the target out first, then strip it and the row identifier from the features.
y = df.loc[:, 'Loan_Status']
df = df.drop(columns=['Loan_Status', 'Loan_ID'])

In [4]:
# Stratified 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    train_size=0.7,
    stratify=y,
    random_state=4,
)

In [5]:
# Engineering
# Columns handed to log_transformer in the numeric ColumnTransformer.
log_feats = ['LoanAmount']
# Columns handed to comb_income in the numeric ColumnTransformer.
comb_col = ['ApplicantIncome', 'CoapplicantIncome']
# Column handed to term_transformer in the numeric ColumnTransformer.
term_col = 'Loan_Amount_Term'
# Data separation
# Numeric column routed through the 'untouched' transformer.
num_feats_untouch = ['Credit_History']
cat_feats = df.columns[df.dtypes == 'object'].tolist()
# Keep the frame's own column order: going through a set (as before) made the
# order of this list vary between kernel sessions. Reusing `comb_col` also keeps
# the excluded income columns defined in one place.
num_feats = [col for col in df.columns[df.dtypes != 'object'] if col not in comb_col]
# NOTE(review): presumably the name of the column produced by comb_income - confirm.
num_feats.append('CombinedIncome')

In [6]:
# CATEGORICAL OPERATION
# Fill gaps in each categorical column with that column's most frequent value.
cat_imputer = ColumnTransformer([
    ('simple_impute', SimpleImputer(strategy='most_frequent'), cat_feats)
])
# Categorical branch: select columns -> impute -> one-hot encode -> ToDenseTransformer.
cat_pipe = Pipeline([
    ('select_cat', FunctionTransformer(func=data_separator, kw_args={'cols': cat_feats})),
    ('cat_simple_imputer', cat_imputer),
    # The explicit `sparse=True` keyword is dropped on purpose: it was deprecated in
    # scikit-learn 1.2 and removed in 1.4 (renamed `sparse_output`). Sparse output is
    # the default under both spellings, so omitting it behaves the same everywhere.
    ('ohe', OneHotEncoder(drop='first')),
    # Project helper; presumably converts the encoder's sparse matrix to a dense array.
    ('to_dense', ToDenseTransformer()),
])
# NUMERIC OPERATIONS
# Each entry applies one project helper to its own column subset; columns not
# listed here are dropped (ColumnTransformer's default `remainder`).
num_transformations = ColumnTransformer([
    ('untouched', FunctionTransformer(func=data_separator, kw_args={'cols': num_feats_untouch}), num_feats_untouch),
    ('combine_income', FunctionTransformer(func=comb_income, kw_args={'cols': comb_col}), comb_col),
    ('to_log', FunctionTransformer(func=log_transformer, kw_args={'cols': log_feats}), log_feats),
    ('term', FunctionTransformer(func=term_transformer, kw_args={'col': term_col}), [term_col]),
])
# Numeric branch: engineer the features first, then impute what is still missing
# with the 3 nearest neighbours.
num_pipe = Pipeline([
    ('num_transformations', num_transformations),
    ('knn', KNNImputer(n_neighbors=3))
])
# COMBINING NUM & CAT
# Numeric output columns come first, categorical ones after.
combine_features = FeatureUnion([
    ('num_pipe', num_pipe),
    ('cat_pipe', cat_pipe),
])
# TARGET LABELING
# Disabled: the target is passed to the classifier as-is.
# transforming_y = Pipeline([
#     ('target_labeling', LabelEncoder()),
# ])

In [7]:
# Full model: the combined feature engineering followed by a swappable classifier slot.
pipeline = Pipeline([
    ('processed_features', combine_features),
    # defaults to LogisticRegression()
    ('classifier', ClfSwitcher()),
])

In [8]:
# pipeline.fit(X_train, y_train)

In [9]:
# pipeline.score(X_test, y_test)

In [10]:
# Grid 1: no overrides, so the pipeline is evaluated once exactly as constructed,
# i.e. with whatever estimator ClfSwitcher() holds by default.
default_grid = {}

# Grid 2: swap in Bernoulli naive Bayes and sweep its `alpha` setting.
bernoulli_nb_grid = {
    'classifier__estimator': [BernoulliNB()],
    'classifier__estimator__alpha': [0, 1, 2, 3, 4],
}

# Tried earlier and since disabled: LogisticRegression with C=0.1, penalty='l1',
# solver='liblinear', random_state=4, n_jobs=-1.
params = [default_grid, bernoulli_nb_grid]

In [11]:
# 3-fold cross-validated search over `params`, using every available core.
grid_search = GridSearchCV(
    pipeline,
    params,
    cv=3,
    n_jobs=-1,
    verbose=4,
)

In [12]:
# Run the search on the training split only; the test split is not used until scoring.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits




GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('processed_features',
                                        FeatureUnion(transformer_list=[('num_pipe',
                                                                        Pipeline(steps=[('num_transformations',
                                                                                         ColumnTransformer(transformers=[('untouched',
                                                                                                                          FunctionTransformer(func=<function data_separator at 0x14883cdc0>,
                                                                                                                                              kw_args={'cols': ['Credit_History']}),
                                                                                                                          ['Credit_History']),
                                                                           

In [13]:
# Held-out score. No `scoring` was passed to GridSearchCV, so this is the
# estimator's own `score`.
grid_search.score(X_test, y_test)

0.8324324324324325

In [14]:
# grid_search.best_params_
# grid_search.estimator.get_params().keys()

In [15]:
# Confusion matrix on the held-out split (rows: true labels, columns: predicted labels).
metrics.confusion_matrix(y_test, grid_search.predict(X_test))

array([[ 29,  29],
       [  2, 125]])

### Saving the Model

In [26]:
# Directory and file name the fitted search object is pickled to.
dirn = './data/'
model_fname = f'{dirn}bernNB.pkl'
# Record whether the directory is already there so the notice below is only
# printed for a real directory. makedirs(exist_ok=True) also creates missing
# parents, and still raises if `dirn` exists but is a regular file instead of
# reporting it as "already exists".
dir_already_present = os.path.isdir(dirn)
os.makedirs(dirn, exist_ok=True)
if dir_already_present:
    print(f'{dirn} already exists...')

./data/ already exists...


In [27]:
# Persist the whole fitted GridSearchCV object, not just the best estimator.
# NOTE(review): the pipeline references helpers from supporting_funcs, so that
# module presumably has to be importable wherever this file is loaded - confirm.
with open(model_fname, 'wb') as model_file:
    pickle.dump(grid_search, model_file)
    print('saving...')

saving...


#### Alternative: manual preprocessing without the pipeline

In [18]:
# Start again from the raw CSV so this section works on untransformed frames.
df = pd.read_csv(filepath_or_buffer=fname)
y = df.loc[:, 'Loan_Status']
df = df.drop(columns=['Loan_Status', 'Loan_ID'])
# Same stratified 70/30 split and seed as the pipeline experiment above.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    train_size=0.7,
    stratify=y,
    random_state=4,
)

In [19]:
def pipe(df_):
    """Preprocess one feature frame by hand, without the sklearn pipeline.

    Steps, in order: one-hot encode the non-numeric columns (dropping the
    first level of each), fill missing values with a 3-nearest-neighbour
    imputer, log-transform ``LoanAmount``, replace the two income columns
    with the log of their sum (``Combined_Income``), and divide
    ``Loan_Amount_Term`` by 12.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Raw features. Must contain ``LoanAmount``, ``ApplicantIncome``,
        ``CoapplicantIncome`` and ``Loan_Amount_Term``.

    Returns
    -------
    pandas.DataFrame
        A new all-numeric frame with the same row index as ``df_``. The
        input frame itself is not modified.

    Notes
    -----
    The dummy columns and the imputer are derived from ``df_`` alone on
    every call, so two different frames are encoded and imputed
    independently of each other.
    """
    encoded = pd.get_dummies(df_, drop_first=True)
    imputer = KNNImputer(n_neighbors=3)
    # The imputer returns a bare array. Re-attach the original row labels as well
    # as the column names; otherwise the result gets a fresh 0..n-1 index and no
    # longer lines up by label with the target series from the same split.
    out = pd.DataFrame(
        imputer.fit_transform(encoded),
        columns=encoded.columns.tolist(),
        index=encoded.index,
    )
    out['LoanAmount'] = np.log(out['LoanAmount'])
    # Add the combined column before dropping its sources so it lands last,
    # matching the column order produced previously.
    out['Combined_Income'] = np.log(out['ApplicantIncome'] + out['CoapplicantIncome'])
    out = out.drop(['ApplicantIncome', 'CoapplicantIncome'], axis=1)
    # NOTE(review): presumably converts a term in months to years - confirm units.
    out['Loan_Amount_Term'] = out['Loan_Amount_Term'] / 12
    return out

In [20]:
# Each split goes through pipe() on its own, so the dummy columns and the KNN
# imputer are derived separately for train and test.
# NOTE(review): this cell rebinds its own inputs; running it a second time feeds
# the already transformed frames back into pipe(), which then fails because the
# income columns it expects were dropped on the first pass.
X_train = pipe(X_train)
X_test = pipe(X_test)

In [21]:
# L1-penalised logistic regression on the liblinear solver, seeded for repeatability.
lg = LogisticRegression(
    C=0.1,
    penalty="l1",
    solver="liblinear",
    fit_intercept=True,
    warm_start=False,
    random_state=4,
)

In [22]:
# Fit on the manually preprocessed training frame.
lg.fit(X_train, y_train)

LogisticRegression(C=0.1, penalty='l1', random_state=4, solver='liblinear')

In [23]:
# Mean accuracy on the manually preprocessed test frame.
lg.score(X_test, y_test)

0.8486486486486486

In [24]:
# Confusion matrix for the manual model (rows: true labels, columns: predicted labels).
metrics.confusion_matrix(y_test, lg.predict(X_test))

array([[ 30,  28],
       [  0, 127]])