In [381]:
import numpy as np
import pandas as pd

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay, confusion_matrix, recall_score, \
    accuracy_score, precision_score, f1_score, plot_confusion_matrix, classification_report, roc_auc_score,\
    plot_roc_curve, plot_precision_recall_curve

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

pd.set_option('display.max_columns', None)

In [317]:
df = pd.read_csv("./data/logistic_model_features.csv")

In [318]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year,funder,installer,basin,region,region_code,district_code,lga,ward,scheme_management,scheme_name,permit,extraction_type,management,payment,water_quality,quantity,source,source_class,waterpoint_type,status_group
0,0,69572,6000.0,1390,34.938093,-9.856322,109.0,1999.0,Roman,Roman,Lake Nyasa,Iringa,11,5,other,other,VWC,Roman,False,gravity,vwc,pay annually,soft,enough,spring,groundwater,communal standpipe,functional
1,1,8776,,1399,34.698766,-2.147466,280.0,2010.0,other,other,dif_other,Mara,20,2,Serengeti,Natta,Other,,True,gravity,wug,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,functional


Doing final preparation of data, flattening and converting 

In [326]:
df['permit'] = df['permit'].astype(str)
df['region_code'] = df['region_code'].astype(str)
df['district_code'] = df['district_code'].astype(str)

In [327]:
# Re-filling zeros in amount_tsh

df['amount_tsh'].fillna(0, inplace=True)

In [328]:
ohe = OneHotEncoder(sparse=False)
target = pd.DataFrame(ohe.fit_transform(df[['status_group']]), index=df.index, columns=[
    'Functional', "Needs_Repair", "Non-Functional"])

In [329]:
y_flat = np.argmax(np.array(target), axis=1)

y_flat
dbl_check = pd.Series(y_flat, name='Target')

### After several model iterations, I am going to investigate the data a bit further, and perhaps clean some more columns.

In [362]:
original_features = pd.read_csv('./Data/4910797b-ee55-40a7-8668-10efd5c1b960.csv')

In [361]:
df['latitude'].describe()

count    5.940000e+04
mean    -5.706033e+00
std      2.946019e+00
min     -1.164944e+01
25%     -8.540621e+00
50%     -5.021597e+00
75%     -3.326156e+00
max     -2.000000e-08
Name: latitude, dtype: float64

# Assigned values to X and y

In [330]:
X = df.drop(['status_group', 'Unnamed: 0', 'id'], axis=1)
y = dbl_check

In [363]:
X.describe()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,population,construction_year
count,59400.0,59400.0,57588.0,59400.0,38019.0,38691.0
mean,317.650385,668.297239,35.149669,-5.706033,281.087167,1996.814686
std,2997.574558,693.11635,2.607428,2.946019,564.68766,12.472045
min,0.0,-90.0,29.607122,-11.64944,1.0,1960.0
25%,0.0,0.0,33.2851,-8.540621,40.0,1987.0
50%,0.0,369.0,35.005943,-5.021597,150.0,2000.0
75%,20.0,1319.25,37.233712,-3.326156,324.0,2008.0
max,350000.0,2770.0,40.345193,-2e-08,30500.0,2013.0


In [332]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Class creation

In [234]:
models = []

In [235]:
class Model():
    model_list = []
    model_df = pd.DataFrame([[0, 0, 0]], columns=["Name", "train_score", "test_score"])
    
    def __init__(self, name, model):
        self.name = name
        self.params = model.get_params
        self.train_score = model.score(X_train, y_train)
        self.test_score = model.score(X_test, y_test)
    
        Model.model_list.append(self)
        attributes = np.array([self.name, self.train_score, self.test_score])
        self.attributes = attributes
        
        self.attributes_df = pd.DataFrame([attributes], columns=['Name', 'train_score', 'test_score'])
        Model.model_df = pd.concat([Model.model_df, self.attributes_df], axis=0)

    @classmethod
    def get_model_list(cls):
        return cls.model_list

# Functions

In [236]:
def catalogue(model, X_train, X_test, y_train, y_test):
    model = Model(model)
    global models
    return model

In [237]:
def record(model, X_train, X_test, y_train, y_test):
    stats = np.array(model.get_params, model.score(X_train, y_train), model.score(X_test, y_test))
    df = pd.DataFrame(stats, columns=["parameters", "train_score", 'test_score'])
    return df

# Pipeline construction

In [364]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44550 entries, 35240 to 49783
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         44550 non-null  float64
 1   gps_height         44550 non-null  int64  
 2   longitude          43195 non-null  float64
 3   latitude           44550 non-null  float64
 4   population         28541 non-null  float64
 5   construction_year  29050 non-null  float64
 6   funder             41815 non-null  object 
 7   installer          41804 non-null  object 
 8   basin              44550 non-null  object 
 9   region             44550 non-null  object 
 10  region_code        44550 non-null  object 
 11  district_code      44550 non-null  object 
 12  lga                44550 non-null  object 
 13  ward               44550 non-null  object 
 14  scheme_management  41626 non-null  object 
 15  scheme_name        23413 non-null  object 
 16  permit            

In [412]:
subpipe_numerics = Pipeline(steps=[
    ('mean_impute', SimpleImputer(add_indicator=True, strategy='mean')),
    ('ss', StandardScaler()),
    ('poly', PolynomialFeatures())
])

sub_pipe_cat = Pipeline(steps=[
    
    ('cat_impute', SimpleImputer(strategy='constant', fill_value="Missing", add_indicator=True)),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=True))  # sparse=False in failed version
], verbose=True)  # verbose = False on failed version

In [413]:
CT = ColumnTransformer(transformers=[
    ('subpipe_numerics', subpipe_numerics, selector(dtype_include=np.number)),
    ('subpipe_cat', sub_pipe_cat, selector(dtype_include=object))
], remainder='passthrough', n_jobs= -1)

In [414]:
poly_model2 = Pipeline(steps=[
    ('CT', CT),
    ('poly2', LogisticRegression(max_iter=200, n_jobs= -1, multi_class="multinomial", 
    solver='newton-cg', fit_intercept=False, tol=.01, C=0.1))
], verbose=True)

### Reference for hyperparameters

In [None]:
('poly2',
                 LogisticRegression(C=0.1, fit_intercept=False, max_iter=200,
                                    multi_class='multinomial', n_jobs=-1,
                                    solver='newton-cg', tol=0.01))],
         verbose=True)

### Optional Smoting

In [415]:
poly_smote = ImPipeline(steps=[
    ('ct', CT),
    ('sm', SMOTE(random_state=42, sampling_strategy='auto')),
    ('poly_smote', LogisticRegression(n_jobs= -1, C=0.1, max_iter=200, tol=.01, 
        multi_class="multinomial", solver='newton-cg', fit_intercept=False))
])

# ACTUAL PIPELINE FITTING! 

In [416]:
poly_smote.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('subpipe_numerics',
                                                  Pipeline(steps=[('mean_impute',
                                                                   SimpleImputer(add_indicator=True)),
                                                                  ('ss',
                                                                   StandardScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff59af32dc0>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_i...
                                               

# Instantiation of models

In [417]:
poly_smote.score(X_train, y_train), poly_smote.score(X_test, y_test)

(0.684040404040404, 0.677979797979798)

In [308]:
log_model1.train_score, log_model1.train_score
# no real difference, so i will try some other shit

(0.7699214365881033, 0.7699214365881033)

In [419]:
poly_smote = Model("poly_smote", poly_smote)

In [249]:
model_1.train_score, model_1.test_score

(0.6697418630751965, 0.665050505050505)

In [287]:
log_model1 = Model("Baseline_log", log_model)

In [420]:
Model.model_df

Unnamed: 0,Name,train_score,test_score
0,0,0.0,0.0
0,baseline,0.6697418630751965,0.665050505050505
0,Smote_grid1,0.6697418630751965,0.665050505050505
0,Baseline_log,0.7699214365881033,0.7650505050505051
0,Baseline_log,0.7699214365881033,0.7650505050505051
0,Baseline_log_grid,0.7668462401795735,0.7647811447811448
0,poly1,0.7724354657687991,0.7688215488215489
0,Poly_grid_1,0.7724354657687991,0.7688215488215489
0,poly2,0.7701683501683502,0.7661279461279461
0,Poly_grid_3,0.7724354657687991,0.7688215488215489


In [390]:
for i in poly_model2.get_params().keys():
    print(i)

memory
steps
verbose
CT
poly2
CT__n_jobs
CT__remainder
CT__sparse_threshold
CT__transformer_weights
CT__transformers
CT__verbose
CT__subpipe_numerics
CT__subpipe_cat
CT__subpipe_numerics__memory
CT__subpipe_numerics__steps
CT__subpipe_numerics__verbose
CT__subpipe_numerics__mean_impute
CT__subpipe_numerics__ss
CT__subpipe_numerics__poly
CT__subpipe_numerics__mean_impute__add_indicator
CT__subpipe_numerics__mean_impute__copy
CT__subpipe_numerics__mean_impute__fill_value
CT__subpipe_numerics__mean_impute__missing_values
CT__subpipe_numerics__mean_impute__strategy
CT__subpipe_numerics__mean_impute__verbose
CT__subpipe_numerics__ss__copy
CT__subpipe_numerics__ss__with_mean
CT__subpipe_numerics__ss__with_std
CT__subpipe_numerics__poly__degree
CT__subpipe_numerics__poly__include_bias
CT__subpipe_numerics__poly__interaction_only
CT__subpipe_numerics__poly__order
CT__subpipe_cat__memory
CT__subpipe_cat__steps
CT__subpipe_cat__verbose
CT__subpipe_cat__cat_impute
CT__subpipe_cat__ohe
CT__subpipe

In [397]:
# poly_model2.get_params

In [398]:
params = {
    'poly2__solver' : ['saga', 'newton-cg'],  # 'newton-cholesky' not supported in logistic regression
    'poly2__max_iter': [200, 250, 300, 400],
    'poly2__C' : [0.01, .1, .001],
    'poly2__tol' : [.01, .1, .001]
}

In [399]:
gs = GridSearchCV(
    estimator= poly_model2,     # Subset pipeline
    param_grid=params,
    cv=5,
    verbose=2,
    n_jobs=-1)

In [400]:
%%time
gs.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 54.7min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 140.3min finished


[Pipeline] ................ (step 1 of 2) Processing CT, total=   1.2s
[Pipeline] ............. (step 2 of 2) Processing poly2, total= 5.6min
CPU times: user 35.7 s, sys: 2.66 s, total: 38.3 s
Wall time: 2h 25min 58s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('subpipe_numerics',
                                                                         Pipeline(steps=[('mean_impute',
                                                                                          SimpleImputer(add_indicator=True)),
                                                                                         ('ss',
                                                                                          StandardScaler()),
                                                                                         ('poly',
                                                                                          PolynomialFeatures(degree=3))]),
                                 

In [347]:
gs.best_estimator_

Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_numerics',
                                                  Pipeline(steps=[('mean_impute',
                                                                   SimpleImputer(add_indicator=True)),
                                                                  ('ss',
                                                                   StandardScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff51d5826d0>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                      

In [401]:
gs.best_params_

{'poly2__C': 0.1,
 'poly2__max_iter': 200,
 'poly2__solver': 'newton-cg',
 'poly2__tol': 0.01}

In [402]:
poly_3_grid = gs.best_estimator_
poly_3_grid.fit(X_train, y_train)

[Pipeline] ................ (step 1 of 2) Processing CT, total=   2.4s
[Pipeline] ............. (step 2 of 2) Processing poly2, total= 5.6min


Pipeline(steps=[('CT',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('subpipe_numerics',
                                                  Pipeline(steps=[('mean_impute',
                                                                   SimpleImputer(add_indicator=True)),
                                                                  ('ss',
                                                                   StandardScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures(degree=3))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff59b497ca0>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=...
                                               

In [404]:
# poly_3_grid

In [405]:
poly_3_grid.score(X_train, y_train), poly_3_grid.score(X_test, y_test)

(0.7701907968574635, 0.7661279461279461)

In [406]:
poly3_grid = Model("Poly_grid_3", poly_grid)

In [407]:
print(classification_report(y_test, poly_3_grid.predict(X_test)))

              precision    recall  f1-score   support

           0       0.75      0.89      0.82      8065
           1       0.55      0.13      0.22      1079
           2       0.80      0.70      0.75      5706

    accuracy                           0.77     14850
   macro avg       0.70      0.58      0.59     14850
weighted avg       0.76      0.77      0.75     14850



In [408]:
Model.model_df

Unnamed: 0,Name,train_score,test_score
0,0,0.0,0.0
0,baseline,0.6697418630751965,0.665050505050505
0,Smote_grid1,0.6697418630751965,0.665050505050505
0,Baseline_log,0.7699214365881033,0.7650505050505051
0,Baseline_log,0.7699214365881033,0.7650505050505051
0,Baseline_log_grid,0.7668462401795735,0.7647811447811448
0,poly1,0.7724354657687991,0.7688215488215489
0,Poly_grid_1,0.7724354657687991,0.7688215488215489
0,poly2,0.7701683501683502,0.7661279461279461
0,Poly_grid_3,0.7724354657687991,0.7688215488215489


In [421]:
for i in Model.model_list:
    print(i.name)

baseline
Smote_grid1
Baseline_log
Baseline_log
Baseline_log_grid
poly1
Poly_grid_1
poly2
Poly_grid_3
poly_smote


# Observations:
- Changing sampling strategy to "auto" made no difference in SMOTE hyperparameters.
- Baseline log grid slightly worse than the uncalibrated logistic regression
- For polynomial features, the grid search did not improve the default settings at all, still not overfit
- After a 2 hour grid search, the best model only equals the best score thus far, still without overfitting, but without any improvement either.
- Polynomial degree 2 + smote improved upon the baseline smote, but still performed worse than other models
# I will revisit the model features in the morning

In [None]:


class ModelsList():
    def init(self, x,y):
        xtrain, xtest, ytrain, ytest = train_test_split(x,y)
        self.xtrain = xtrain
        self.xtest = xtest
        self.ytrain = ytrain
        self.ytest = ytest
        self.df = pd.DataFrame({'Model' : pd.Series(dtype='str'), 
                    'train_score' : pd.Series(dtype='float64'), 
                    'test_score': pd.Series(dtype='float64'),
                    'params': pd.Series(dtype='O')})

    def update(self, estimator, name, fit = True, params = None):
        if fit:
            estimator.fit(self.xtrain, self.ytrain)

        trainscore = estimator.score(self.xtrain, self.ytrain)
        testscore = estimator.score(self.xtest, self.ytest)
        model_to_add = [name, trainscore, testscore, params]
        self.df.loc[len(self.df.index)] = model_to_add

In [None]:
'amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
       'construction_year', 'funder', 'installer', 'basin', 'region',
       'region_code', 'district_code', 'lga', 'ward', 'scheme_management',
       'scheme_name', 'permit', 'extraction_type', 'management', 'payment',
       'water_quality', 'quantity', 'source', 'source_class',
       'waterpoint_type'],
these are my features atm
i think that the hard part is really done though tbh, i just want to write this function to keep track of modeling, and then have a slightly tighter modeling notebook for less scrolling
octo — Today at 12:43 PM
oh interesting
['amount_tsh', 'date_recorded', 'funder', 'installer', 'basin',
       'subvillage', 'region', 'lga', 'ward', 'population', 'public_meeting',
       'recorded_by', 'scheme_management', 'permit', 'construction_year',
       'extraction_type', 'payment', 'water_quality', 'quantity', 'source',
       'source_class', 'waterpoint_type', 'waterpoint_type_group',
       'days_since_recorded', 'status_group']