In [1]:
# To import configurations from config.ini files
import configparser
# For dataframe processes
import pandas as pd
import numpy as np

# For vizualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

# To create a quick model to look at Feature Importances
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# To save csv's with current date information
import datetime as dt

import sys 
sys.path.append('../')
from src.etl_functions import *
from sklearn.metrics import precision_recall_fscore_support


In [2]:
# import and read my config.ini file
config = configparser.ConfigParser()
config.read("../src/config.ini")

['../src/config.ini']

In [3]:
# Import my output path for saving data
output = config['paths']['data_path']

In [11]:
to_drop = ['extraction_type', 'extraction_type_group',
            'management_group',
            'payment_type',
            'quantity_group',
            'source_type','source_class', 
            'waterpoint_type_group',
            'district_code', 
            'construction_year',
            'num_private',
            'recorded_by',
            'id',
            'scheme_name', 
            'date_recorded']

fill_dict = {'funder':'Other',
                'installer': 'Other',
                'subvillage': 'Other', 
                'public_meeting': False,
                'scheme_management': 'Unknown',
                'permit': False}

dtype_dict = {'amount_tsh': 'float32',
            'funder': 'category',
            'gps_height': 'int16',
            'installer': 'category',
            'longitude': 'float16',
            'latitude': 'float16',
            'wpt_name': 'category',
            'num_private': 'int16',
            'basin': 'category',
            'subvillage': 'category',
            'region': 'category',
            'region_code': 'int8',
            'district_code': 'int8',
            'lga': 'category',
            'ward': 'category',
            'population': 'int16',
            'recorded_by': 'category',
            'scheme_management': 'category',
            'construction_year': 'int16',
            'extraction_type': 'category',
            'extraction_type_group': 'category',
            'extraction_type_class': 'category',
            'management': 'category',
            'management_group': 'category',
            'payment': 'category',
            'payment_type': 'category',
            'water_quality': 'category',
            'quality_group': 'category',
            'quantity': 'category',
            'quantity_group': 'category',
            'source': 'category',
            'source_type': 'category',
            'source_class': 'category',
            'waterpoint_type': 'category',
            'waterpoint_type_group': 'category'}

In [24]:
# Specified output path
output = config['paths']['data_path']
# Specified data paths
trn_data = import_me(config['paths']['train_data'], dtype_dict)
trn_lbls = pd.read_csv(config['paths']['train_labels'])
tst_data = import_me(config['paths']['test_data'], dtype_dict)
# Import submission format
sub_form = pd.read_csv(config['paths']['sub_form'])

In [25]:
trn_lbls.index=trn_lbls['id']
trn_lbls.drop(columns='id', inplace=True)

In [26]:
X_train, X_test, y_train,y_test = train_test_split(trn_data, trn_lbls,
                                                   test_size=.33,
                                                   random_state=42)

In [27]:
X_train, X_test, exp_output = get_cleaned_sets(X_train,X_test, to_drop, 
                                                output, fill_dict,
                                                dtype_dict=dtype_dict,
                                                return_output=True)



Cleaning successful.
Associated time is 280722_1041PM


In [46]:
rf = RandomForestClassifier(n_estimators=70, random_state=42)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
with open(f"{exp_output}experiment_notes.txt", 'a') as f:
        f.write(f"\n{str(rf.set_params())}")
outcome=str(precision_recall_fscore_support(y_test, preds, average='weighted'))
print(outcome)
with open(f"{exp_output}experiment_notes.txt", 'a') as f:
        f.write(f"\n{outcome}")

  rf.fit(X_train, y_train)


(0.7835664268852196, 0.78961330476482, 0.7851922011001878, None)


In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [59]:
models = [KNeighborsClassifier(), 
          ExtraTreesClassifier(random_state=42), 
          BaggingClassifier(random_state=42), 
          AdaBoostClassifier(random_state=42), 
          GradientBoostingClassifier(random_state=42)]

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    with open(f"{exp_output}experiment_notes.txt", 'a') as f:
            f.write(f"\n{str(model.set_params())}")
    outcome=str(precision_recall_fscore_support(y_test, preds, average='weighted'))
    print(outcome)
    with open(f"{exp_output}experiment_notes.txt", 'a') as f:
            f.write(f"\n{outcome}")

  return self._fit(X, y)


(0.6712860847548586, 0.6817671666156515, 0.6721154379221275, None)


  model.fit(X_train, y_train)


(0.7718766418584068, 0.7762983369043975, 0.7735402541527848, None)


  y = column_or_1d(y, warn=True)


(0.7771041126372281, 0.7817059483726151, 0.7778144066654447, None)


  y = column_or_1d(y, warn=True)


(0.7185841688673683, 0.7230894806652383, 0.6959207609554139, None)


  y = column_or_1d(y, warn=True)


(0.7624432776301867, 0.7532394653606774, 0.7328486649605241, None)


In [60]:
from sklearn.neural_network import MLPClassifier

In [61]:
model = MLPClassifier(random_state=42)

In [62]:
model.fit(X_train, y_train)
preds = model.predict(X_test)
with open(f"{exp_output}experiment_notes.txt", 'a') as f:
        f.write(f"\n{str(model.set_params())}")
outcome=str(precision_recall_fscore_support(y_test, preds, average='weighted'))
print(outcome)
with open(f"{exp_output}experiment_notes.txt", 'a') as f:
        f.write(f"\n{outcome}")

  y = column_or_1d(y, warn=True)


(0.7266251659669837, 0.7338026731966126, 0.7278167725113462, None)


In [63]:
from sklearn.model_selection import GridSearchCV

In [85]:
parameters = {'hidden_layer_sizes':[50,100,150],
              'activation':['identity', 'logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'learning_rate': ['constant', 'invscaling', 'adaptive']}

In [86]:
model = GridSearchCV(MLPClassifier(), parameters,
                     scoring = ['accuracy', 'f1', 'precision', 'recall'],
                     refit='recall',
                     return_train_score=True)


In [87]:
model.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
Traceback (most recent call last):
  File "C:\Users\Cristian\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Cristian\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\Cristian\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\Cristian\AppData\Roaming\Python\Python310\site-packages\sklearn\metrics\_classification.py

In [None]:
preds = model.predict(X_test)
with open(f"{exp_output}experiment_notes.txt", 'a') as f:
        f.write(f"\n{str(model.get_params())}")
with open(f"{exp_output}experiment_notes.txt", 'a') as f:
        f.write(f"\n{str(model.best_estimator_)}")
outcome=str(precision_recall_fscore_support(y_test, preds, average='weighted'))
print(outcome)
with open(f"{exp_output}experiment_notes.txt", 'a') as f:
        f.write(f"\n{outcome}")


(0.5235236303588116, 0.5756555453525151, 0.5331909005710738, None)
