In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.exceptions import NotFittedError

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay, confusion_matrix, recall_score, \
    accuracy_score, precision_score, f1_score, plot_confusion_matrix, classification_report, log_loss

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from matplotlib import pyplot as plt
import seaborn as sns
# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
# Creating the class that will store my data

class Model():
    """Evaluate a classifier and record its metrics in a class-level registry.

    Creating an instance cross-validates ``model`` on the training data, scores
    it on the test data, appends the instance to ``Model.model_list`` and
    appends one row of metrics to ``Model.model_df``.

    Parameters
    ----------
    name : str
        Label stored in the ``name`` column of ``Model.model_df``.
    model : estimator
        Object exposing ``get_params``, ``fit``, ``predict``, ``predict_proba``
        and ``score``. If it has not been fitted yet it is fitted on
        ``X_train``/``y_train`` before the test metrics are computed.
    X_train, y_train
        Data passed to ``cross_validate`` (and to ``fit`` when needed).
    X_test, y_test
        Data used for the ``test_*`` metrics.
    """

    # Both containers live on the class, so every instance shares them.
    # Re-running the class definition starts them again from empty.
    model_list = []
    model_df = pd.DataFrame(columns=['name','train_accuracy','train_prec','train_recall','train_f1','train_logloss',\
                                     'test_accuracy','test_prec','test_recall','test_f1','test_logloss'])

    def __init__(self, name, model, X_train, X_test, y_train, y_test):
        self.name = name
        self.model = model
        # NOTE: this is the bound ``get_params`` method, not a dict of
        # parameters; call ``self.params()`` to obtain the values.
        self.params = model.get_params
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        # Cross-validate on the training data. The 'train_*' metrics below are
        # means over the validation folds (the 'test_*' keys that
        # cross_validate returns), not scores on data the model was fitted on.
        self.train_results = cross_validate(self.model, self.X_train, self.y_train, scoring=[
            'precision_macro', 'accuracy', 'recall_macro', 'f1_macro', 'neg_log_loss'], n_jobs=4, verbose=1)
        self.train_acc = np.mean(self.train_results['test_accuracy'])
        self.train_prec = np.mean(self.train_results['test_precision_macro'])
        self.train_rec = np.mean(self.train_results['test_recall_macro'])
        self.train_f1 = np.mean(self.train_results['test_f1_macro'])
        # The scorer returns the negated log loss; flip the sign back.
        self.train_logloss = -np.mean(self.train_results['test_neg_log_loss'])

        # cross_validate fits clones, so ``self.model`` itself is only fitted
        # if the caller fitted it beforehand. Fit it here when it is not,
        # instead of failing on the first prediction.
        try:
            self.y_pred_proba = self.model.predict_proba(self.X_test)  # class probabilities
        except NotFittedError:
            self.model.fit(self.X_train, self.y_train)
            self.y_pred_proba = self.model.predict_proba(self.X_test)
        self.y_pred = self.model.predict(self.X_test)  # hard class labels

        # Test metrics
        self.test_score = self.model.score(self.X_test, self.y_test)
        self.test_recall = recall_score(self.y_test, self.y_pred, average='macro', zero_division=0)
        self.test_prec = precision_score(self.y_test, self.y_pred, average='macro', zero_division=0)
        self.test_log_loss = log_loss(self.y_test, self.y_pred_proba)
        self.test_f1 = f1_score(self.y_test, self.y_pred, average='macro', zero_division=0)

        # Add model object to the class data container for access within the notebook
        Model.model_list.append(self)

        # Dictionary containing all of the metrics to add to the dataframe;
        # the keys match the columns of ``Model.model_df``.
        self.attributes = {'name':self.name, 'train_accuracy':self.train_acc, "train_prec": self.train_prec,
                           "train_recall": self.train_rec, "train_f1": self.train_f1,
                           "train_logloss": self.train_logloss,
                           'test_accuracy':self.test_score, "test_prec": self.test_prec,
                           "test_recall": self.test_recall, "test_f1": self.test_f1,
                           "test_logloss": self.test_log_loss}

        # Add the metrics to the class dataframe as a new last row
        Model.model_df.loc[len(Model.model_df)] = self.attributes

    def __repr__(self):
        """Return the model's name followed by the wrapped estimator."""
        return f"Model name: {self.name} ({self.model})"

    @classmethod
    def get_model_list(cls):
        """Return the shared list of every ``Model`` instance created so far."""
        return cls.model_list

# Data Imports

In [3]:
# Load the raw feature CSV, report its (rows, columns) shape and preview the top 5 rows
original_features_df = pd.read_csv(
    './Data/4910797b-ee55-40a7-8668-10efd5c1b960.csv'
)
print(original_features_df.shape)
original_features_df.head(5)

(59400, 40)


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
# Load the updated feature table, using its first column as the index.
# The folder can be overridden with the WATER_CSV_DIR environment variable so the
# notebook runs on other machines; without it the original location is used.
water_csv_dir = Path(os.environ.get('WATER_CSV_DIR', '/Users/samalainabayeva/Desktop/Water Project CSVs'))
df = pd.read_csv(water_csv_dir / 'updated_features.csv', index_col=0)

In [5]:
# Draw a reproducible 5,000-row subset of df for the (slow) grid searches
sample_data = df.sample(n=5000, random_state=42)

In [6]:
# Separate the label column from the predictors, then hold out a test set
# (library-default split proportions, fixed seed for repeatability)
y = df['Target']
X = df.drop(columns="Target")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Pipelines for iterating over

In [7]:
# Create Pipelines for numeric and categorical data

# Numeric columns: fill gaps with the column mean (adding missing-value
# indicator columns), then standardise.
subpipe_numerics = Pipeline(steps=[
    ('mean_impute', SimpleImputer(add_indicator=True, strategy='mean')),
    ('ss', StandardScaler())
], verbose=True)

# Categorical columns: fill gaps with the most frequent value (adding
# missing-value indicator columns), then one-hot encode. Categories unseen
# during fit are ignored at transform time rather than raising.
# The encoder's output is sparse by default, so the flag is not passed
# explicitly: the `sparse` keyword was renamed to `sparse_output` in later
# scikit-learn releases, and relying on the default works with both spellings.
sub_pipe_cat = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
], verbose=True)

In [8]:
# Route each column to the matching sub-pipeline by dtype: numeric columns go
# through subpipe_numerics, object columns through sub_pipe_cat, and any other
# column is passed through untouched.
CT = ColumnTransformer(
    [
        ('subpipe_numerics', subpipe_numerics, selector(dtype_include=np.number)),
        ('subpipe_cat', sub_pipe_cat, selector(dtype_include=object)),
    ],
    remainder='passthrough',
    n_jobs=4,
    verbose=True,
)

# Dummy Model

In [9]:
# Baseline pipeline: preprocess with CT, then always predict the most frequent class.
dummy_model = Pipeline(
    [
        ('CT', CT),
        ('dummy', DummyClassifier(strategy="most_frequent")),
    ],
    verbose=True,
)

# Baseline Logistic Model

In [None]:
# Logistic regression with library-default hyper-parameters on top of the CT
# preprocessing. The grid search defined later uses this pipeline as its estimator.
baseline_logistic = Pipeline(
    [
        ('CT', CT),
        ('baseline_log', LogisticRegression(verbose=1, n_jobs=4)),
    ],
    verbose=True,
)

# Grid Search

In [10]:
# Establishing a starting dictionary of parameters and optional values.
# Keys are prefixed with the pipeline step name ("baseline_log") so that
# GridSearchCV forwards them to the LogisticRegression step.
parameters = {
    # Only "l2" is searched: none of the solvers listed below (newton-cg, sag,
    # lbfgs) supports an "l1" penalty, so every l1 candidate would fail to fit
    # and half of the grid would be spent on fits that cannot produce a score.
    "baseline_log__penalty": ["l2"],
    "baseline_log__tol": [.0001, .001, .01],
    "baseline_log__C": [1, .1, .01],
    "baseline_log__fit_intercept": [True, False],
    "baseline_log__solver": ['newton-cg', 'sag', 'lbfgs'],
    "baseline_log__max_iter": [100, 250, 500]
}

In [11]:
# Exhaustive 5-fold search over `parameters` for the baseline logistic pipeline.
# Requires the cell defining `baseline_logistic` to have been run first.
grid_search = GridSearchCV(baseline_logistic, parameters, cv=5, n_jobs=4, verbose=2)

NameError: name 'baseline_logistic' is not defined

# Smote Model

In [12]:
# Logistic regression trained on SMOTE-resampled data. The imblearn Pipeline is
# used here because the plain sklearn Pipeline imported above is not the one
# aliased for resampling steps.
smote_pipeline = ImPipeline(steps=[
    ("CT", CT),
    # 'auto' is both the default and equivalent to 'not majority'.
    # random_state is fixed so the synthetic samples, and therefore the
    # reported scores, are the same on every run.
    # NOTE(review): newer imbalanced-learn releases may no longer accept
    # `n_jobs` on SMOTE; confirm against the installed version.
    ("smote", SMOTE(n_jobs=4, sampling_strategy='auto', random_state=42)),

    # using the same hyper-parameters as baseline/best model
    ('baseline_log', LogisticRegression(verbose=1, n_jobs=4)),
], verbose=True)

In [15]:
# A reference for the previous models: reload the score tables saved from earlier runs.
# The folder can be overridden with the WATER_CSV_DIR environment variable so the
# notebook runs on other machines; without it the original location is used.
water_csv_dir = Path(os.environ.get('WATER_CSV_DIR', '/Users/samalainabayeva/Desktop/Water Project CSVs'))
previous_models_df = pd.read_csv(water_csv_dir / 'abridged_table.csv')
second_previous = pd.read_csv(water_csv_dir / '2nd_add_table.csv')

In [16]:
# Display the results table loaded from 2nd_add_table.csv
second_previous

Unnamed: 0.1,Unnamed: 0,Name,train_score,train_log_loss,test_score,test_log_loss,name,test_accuracy,test_f1,test_logloss,test_prec,test_recall,train_accuracy,train_f1,train_logloss,train_prec,train_recall
0,0,dummy,0.542334,15.807208,0.54532,15.704096,,,,,,,,,,,
1,1,logistic_1,0.771762,0.577305,0.763636,0.598705,,,,,,,,,,,
2,2,grid_logistic_1,0.766981,0.586082,0.76303,0.596011,,,,,,,,,,,
3,3,smote_1,0.683659,0.720839,0.669091,0.73358,,,,,,,,,,,
4,4,lr_C.01_mi200_solv_n.cg,0.753378,0.61442,0.750976,0.642552,,,,,,,,,,,
5,5,c.01_mi200_svr_ncg_tol_1,0.677621,0.747923,0.668215,0.773572,,,,,,,,,,,
6,6,poly1_lr_C.01_mi200_solv_n.cg,0.753378,0.61442,0.750976,0.642552,,,,,,,,,,,
7,7,poly2_lr_C.01_mi1000_solv_n.cg,0.753378,0.61442,0.750976,0.642552,,,,,,,,,,,
8,8,saga_l1_mi1000,0.727452,0.663176,0.728013,0.699459,,,,,,,,,,,
9,9,lbfgs,0.753378,0.614422,0.750909,0.642552,,,,,,,,,,,


In [14]:
# Display the results table loaded from abridged_table.csv
previous_models_df

Unnamed: 0.1,Unnamed: 0,Name,train_score,test_score
0,0,0,0.0,0.0
1,0,baseline,0.669742,0.665051
2,0,Smote_grid1,0.669742,0.665051
3,0,Baseline_log,0.769921,0.765051
4,0,Baseline_log,0.769921,0.765051
5,0,Baseline_log_grid,0.766846,0.764781
6,0,poly1,0.772435,0.768822
7,0,Poly_grid_1,0.772435,0.768822
8,0,poly2,0.770168,0.766128
9,0,Poly_grid_3,0.772435,0.768822


# Observations:
- Setting the SMOTE `sampling_strategy` to "auto" made no difference to the results ("auto" is the default).
- The grid-searched baseline logistic regression performed slightly worse than the default (untuned) logistic regression.
- For polynomial features, the grid search did not improve on the default settings at all; the model is still not overfit.
- After a 2 hour grid search, the best model only equals the best score thus far, still without overfitting, but without any improvement either.
- Polynomial degree 2 + smote improved upon the baseline smote, but still performed worse than other models

- Adding the dates_passed column improved the model very slightly. I will now re-check cleaning/feature selection.
#### Returning to the baseline logistic regression model and modifying from there
- Baseline Logistic Regression continues to be the best model
- StandardScaler and MinMaxScaler produced exactly the same results
- Median fill and mean fill produced identical results