In [1]:
# To import configurations from config.ini files
import configparser
# For dataframe processes
import pandas as pd
import numpy as np

# For vizualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

# To create a quick model to look at Feature Importances
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# To save csv's with current date information
import datetime as dt

import sys 
sys.path.append('../')
from src.etl_functions import *


In [2]:
# import and read my config.ini file
config = configparser.ConfigParser()
config.read("../src/config.ini")

['../src/config.ini']

In [3]:
# Import my output path for saving data
output = config['paths']['data_path']

In [4]:
# Dtype Dictionary used when importing csv files.
dtype_dict = {'amount_tsh': 'float32',
            'funder': 'category',
            'gps_height': 'int16',
            'installer': 'category',
            'longitude': 'float16',
            'latitude': 'float16',
            'wpt_name': 'category',
            'num_private': 'int16',
            'basin': 'category',
            'subvillage': 'category',
            'region': 'category',
            'region_code': 'int8',
            'district_code': 'int8',
            'lga': 'category',
            'ward': 'category',
            'population': 'int16',
            'recorded_by': 'category',
            'scheme_management': 'category',
            'construction_year': 'int16',
            'extraction_type': 'category',
            'extraction_type_group': 'category',
            'extraction_type_class': 'category',
            'management': 'category',
            'management_group': 'category',
            'payment': 'category',
            'payment_type': 'category',
            'water_quality': 'category',
            'quality_group': 'category',
            'quantity': 'category',
            'quantity_group': 'category',
            'source': 'category',
            'source_type': 'category',
            'source_class': 'category',
            'waterpoint_type': 'category',
            'waterpoint_type_group': 'category'}

In [5]:
trn_data = pd.read_csv(config['paths']['train_data'], dtype=dtype_dict)
trn_lbls = pd.read_csv(config['paths']['train_labels'], dtype=dtype_dict)
tst_data = pd.read_csv(config['paths']['test_data'], dtype=dtype_dict)

In [9]:
X_train, X_test, y_train,y_test = train_test_split(trn_data, trn_lbls,
                                                   test_size=.33,
                                                   random_state=42)

In [10]:
x_train, x_test = get_cleaned_sets(X_train, y_train, X_test, output, True)



Saved time: 280722_0246PM


In [12]:
x_train.shape

(39798, 268)

In [13]:
x_test.shape

(19602, 266)

In [15]:
x_train.index = x_train['id']
y_train = x_train['status_group']
x_train.drop(columns=['id', 'status_group'], inplace=True)

In [17]:
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
preds = rf.predict(x_test)

In [18]:
preds

array(['non functional', 'functional', 'functional', ..., 'functional',
       'non functional', 'functional'], dtype=object)

In [19]:
from sklearn.metrics import precision_recall_fscore_support

In [22]:
y_test.index = y_test['id']
y_test.drop(columns='id', inplace=True)
y_test

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
37098,non functional
14530,functional
62607,functional
46053,non functional
47083,functional
...,...
36974,functional
30037,non functional
67453,functional
46464,non functional


In [26]:
precision_recall_fscore_support(y_test, preds, average='weighted')

(0.7846104590345233, 0.7908376696255485, 0.7862769809678481, None)

# Main Model

In [4]:
train_data = pd.read_pickle(config['paths']['train_data_clean'])
test_data = pd.read_pickle(config['paths']['test_data_clean'])

# Import submission format
sub_form = pd.read_csv(config['paths']['sub_form'])

In [5]:
train_data.index = train_data['id']
train_lbls = train_data['status_group']
train_data.drop(columns=['id', 'status_group'], inplace=True)

In [6]:
rf = RandomForestClassifier(random_state=42)

In [7]:
rf.fit(train_data, train_lbls)

RandomForestClassifier(random_state=42)

In [8]:
preds = rf.predict(test_data)
sub_form['status_group'] = preds

In [9]:
current_time = dt.datetime.now().strftime("%d%m%Y_%I%M%p")
sub_form.to_csv(output+ 'my_submission'+current_time+'.csv',index=False)

In [10]:
sub_form

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional needs repair
2,17168,functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,non functional
14846,18990,functional
14847,28749,functional
14848,33492,functional


In [21]:
from sklearn.model_selection import GridSearchCV

In [26]:
rf = RandomForestClassifier()

In [32]:
param_grid = {'n_estimators':[50,100,150],
              'criterion': ['gini', 'entropy', 'log_loss'],
              'max_features': ['sqrt', 'log2'],
              'random_state':[42]
              }

In [35]:
gs = GridSearchCV(rf,param_grid=param_grid)

In [36]:
gs.fit(train_data, train_lbls)

In [37]:
gs.best_estimator_

In [38]:
rf = RandomForestClassifier(n_estimators=150,random_state=42)
rf.fit(train_data, train_lbls)
preds = rf.predict(test_data)
sub_form['status_group'] = preds
current_time = dt.datetime.now().strftime("%d%m%Y_%I%M%p")
sub_form.to_csv(output+ 'my_submission'+current_time+'.csv',index=False)