In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from supporting_funcs import *
import warnings
import pickle
import os

In [3]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
fname = './data/data.csv'
df = pd.read_csv(fname)

y = df['Loan_Status']
df = df.drop(['Loan_Status', 'Loan_ID'], axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=4, stratify=y, train_size=0.7)

In [6]:
# Engineering
log_feats = ['LoanAmount']
comb_col = ['ApplicantIncome', 'CoapplicantIncome']
term_col = 'Loan_Amount_Term'
# Data separation
num_feats_untouch = ['Credit_History']
cat_feats = df.columns[df.dtypes == 'object'].tolist()
num_feats = list(set(df.columns[df.dtypes != 'object'].tolist()) - {'ApplicantIncome', 'CoapplicantIncome'})
num_feats.append('CombinedIncome')

In [7]:
knn = KNNImputer(n_neighbors=5)

In [8]:
imputing_untouched = Pipeline([
    ('untouched', FunctionTransformer(func=data_separator, kw_args={'cols': num_feats_untouch})),
    ('knn', knn)
])
imputing_comb = Pipeline([
    ('combine_income', FunctionTransformer(func=comb_income, kw_args={'cols': comb_col})),
    # ('simple_imputer', SimpleImputer(strategy='median')),
    ('knn', knn)
])
imputing_log = Pipeline([
    ('to_log', FunctionTransformer(func=log_transformer, kw_args={'cols': log_feats})),
    ('knn', knn)
])
imputing_term = Pipeline([
    ('term', FunctionTransformer(func=term_transformer, kw_args={'col': term_col})),
    ('knn', knn)
])
imputing_cat = Pipeline([
    ('cat_feats', FunctionTransformer(func=data_separator, kw_args={'cols':cat_feats})),
    ('ohe', OneHotEncoder(sparse=True, drop='first')),
    ('to_dense', ToDenseTransformer()),
    ('knn', knn)
])

In [9]:
preprocessor = ColumnTransformer(transformers=[
    ('log_transform', imputing_log, log_feats),
    ('term_transform', imputing_term, [term_col]),
    ('num_data_untouch', imputing_untouched, num_feats_untouch),
    ('combine_income', imputing_comb, comb_col),
    ('cat_ohe', imputing_cat, cat_feats),
])

In [10]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('log_reg', LogisticRegression())
])

In [11]:
pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('log_transform',
                                                  Pipeline(steps=[('to_log',
                                                                   FunctionTransformer(func=<function log_transformer at 0x13a99b040>,
                                                                                       kw_args={'cols': ['LoanAmount']})),
                                                                  ('knn',
                                                                   KNNImputer())]),
                                                  ['LoanAmount']),
                                                 ('term_transform',
                                                  Pipeline(steps=[('term',
                                                                   FunctionTransformer(func=<function term_transformer at 0x1036df8b0>...
                                                           

In [12]:
pipeline.score(X_test, y_test)

0.8378378378378378

In [13]:
params = [
    {
        'log_reg__random_state': [4],
        'log_reg__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'log_reg__C': [100, 10, 1.0, 0.1, 0.01],
        'log_reg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'log_reg__n_jobs': [-1],

    }
]

In [14]:
grid_search = GridSearchCV(estimator=pipeline, param_grid=params, n_jobs=-1, cv=3, verbose=4)

In [15]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('log_transform',
                                                                         Pipeline(steps=[('to_log',
                                                                                          FunctionTransformer(func=<function log_transformer at 0x13a99b040>,
                                                                                                              kw_args={'cols': ['LoanAmount']})),
                                                                                         ('knn',
                                                                                          KNNImputer())]),
                                                                         ['LoanAmount']),
                                                                        ('term_transform',
                                              

In [16]:
grid_search.best_params_

{'log_reg__C': 0.1,
 'log_reg__n_jobs': -1,
 'log_reg__penalty': 'l1',
 'log_reg__random_state': 4,
 'log_reg__solver': 'liblinear'}

In [17]:
grid_search.score(X_test, y_test)

0.8432432432432433

In [18]:
metrics.confusion_matrix(y_test, grid_search.predict(X_test))

array([[ 29,  29],
       [  0, 127]])

### Saving the Model

In [19]:
dirn = '../data/'
model_fname = f'{dirn}lg.pkl'
try:
    os.mkdir(dirn)
except FileExistsError as err:
    print(f'{dirn} already exists...')

In [20]:
with open(model_fname, 'wb') as f:
    pickle.dump(grid_search, f)

#### Alternative to doing pipeline

In [31]:
df = pd.read_csv(fname)
y = df['Loan_Status']
df = df.drop(['Loan_Status', 'Loan_ID'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=4, stratify=y, train_size=0.7)

In [34]:
def pipe(df_):
    df_ = pd.get_dummies(df_, drop_first=True)
    imputed = KNNImputer(n_neighbors=3)
    dft_impu = imputed.fit_transform(df_)
    df_ = pd.DataFrame(dft_impu, columns=df_.columns.tolist())
    df_['LoanAmount'] = np.log(df_['LoanAmount'])
    df_['Combined_Income'] = df_['ApplicantIncome'] + df_['CoapplicantIncome']
    df_ = df_.drop(['ApplicantIncome', 'CoapplicantIncome'], axis=1)
    df_['Combined_Income'] = np.log(df_['Combined_Income'])
    df_['Loan_Amount_Term'] = df_['Loan_Amount_Term'] / 12
    return df_

In [35]:
X_train = pipe(X_train)
X_test = pipe(X_test)

In [51]:
lg = LogisticRegression(penalty="l1", C=0.1, fit_intercept=True, random_state=4, solver="liblinear", warm_start=False)

In [52]:
lg.fit(X_train, y_train)

LogisticRegression(C=0.1, penalty='l1', random_state=4, solver='liblinear')

In [53]:
lg.score(X_test, y_test)

0.8486486486486486

In [54]:
metrics.confusion_matrix(y_test, lg.predict(X_test))

array([[ 30,  28],
       [  0, 127]])