In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [52]:
import warnings
warnings.filterwarnings('ignore')
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [51]:
x_train = pd.read_csv('https://raw.githubusercontent.com/kayoyin/datasets/master/xtrain_clean.csv')
y_train = pd.read_csv('https://raw.githubusercontent.com/kayoyin/datasets/master/y_train.csv')
x_test = pd.read_csv('https://raw.githubusercontent.com/kayoyin/datasets/master/xtest_clean.csv')

In [28]:
# Turn all data types into categorical data types
def train_cats(df):
    for n,c in df.items():
        if is_string_dtype(c): 
            df[n] = pd.factorize(df[n])[0]
    return df

In [29]:
x_train = train_cats(x_train)

In [30]:
y_num = y_train.replace(['functional','non functional', 'functional needs repair'], [0,1,2]) # replace labels by numerical values


In [23]:
#x_train.drop('id', axis = 1, inplace = True)
#y_train.drop('id', axis = 1, inplace = True)
x_test.drop('id', axis = 1, inplace = True)

In [31]:
xtrain, xvalid, ytrain, yvalid = train_test_split(x_train, y_num, test_size=0.2)

In [32]:
xtrain.head()

Unnamed: 0.1,Unnamed: 0,id,gps_height,longitude,latitude,basin,public_meeting,scheme_management,construction_year,extraction_type_class,...,water_quality,quantity,source,source_class,waterpoint_type,year_recorded,date_recordedMonth,date_recordedElapsed,population_log,age
23052,23052,29713,1517.0,35.6,-7.836973,7,1,0,1985.0,0,...,0,0,0,0,0,2011,3,59,0.693147,26.0
30020,30020,64806,237.0,38.62,-5.288758,2,1,0,2006.0,2,...,3,2,5,0,2,2011,3,61,6.552508,5.0
20859,20859,48188,1484.0,30.22,-4.339542,5,1,7,2007.0,0,...,0,0,2,1,1,2013,1,755,6.329721,6.0
51297,51297,42856,1267.333333,30.62,-1.198261,1,1,0,2004.0,0,...,0,0,0,0,0,2011,8,213,5.860786,7.0
5087,5087,61200,1046.29734,30.84,-1.461216,1,1,2,1997.0,0,...,0,3,1,1,0,2011,7,202,5.525453,14.0


# Decision Tree Classifier

In [33]:
dtc = DecisionTreeClassifier(max_depth=5).fit(xtrain, ytrain)
pred1 = dtc.predict(xvalid)

In [34]:
print('Training accuracy: ', accuracy_score(ytrain, dtc.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred1))

Training accuracy:  0.7111952861952862
Validation accuracy:  0.7082491582491582


# Random Forest Classifier

In [35]:
rf = RandomForestClassifier(max_depth=100).fit(xtrain,ytrain)
pred2 = rf.predict(xvalid)

In [36]:
print('Training accuracy: ', accuracy_score(ytrain, rf.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred2))

Training ccuracy:  0.9840488215488216
Validation accuracy:  0.7871212121212121


# Support Vector Machine

In [38]:
svm = SVC().fit(xtrain,ytrain)
pred3 = svm.predict(xvalid)

In [39]:
print('Training accuracy: ', accuracy_score(ytrain, svm.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred3))

Training ccuracy:  1.0
Validation accuracy:  0.5353535353535354


# SGD

In [41]:
sgd = SGDClassifier(loss="hinge", penalty="l2").fit(xtrain,ytrain)
pred4 = sgd.predict(xvalid)

In [42]:
print('Training accuracy: ', accuracy_score(ytrain, sgd.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred4))

Training accuracy:  0.54503367003367
Validation accuracy:  0.5361111111111111


# KNN

In [44]:
knn = KNeighborsClassifier(n_neighbors=7).fit(xtrain,ytrain)
pred5 = knn.predict(xvalid)

In [45]:
print('Training accuracy: ', accuracy_score(ytrain, knn.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred5))

Training accuracy:  0.6333122895622896
Validation accuracy:  0.49595959595959593


# Naive Bayes

In [47]:
gnb = GaussianNB().fit(xtrain,ytrain)
pred6 = gnb.predict(xvalid)

In [48]:
print('Training accuracy: ', accuracy_score(ytrain, gnb.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred6))

Training accuracy:  0.6289772727272728
Validation accuracy:  0.6266835016835017


# Neural Network

In [49]:
mlp = MLPClassifier().fit(xtrain,ytrain)
pred7 = mlp.predict(xvalid)

In [50]:
print('Training accuracy: ', accuracy_score(ytrain, mlp.predict(xtrain)))
print('Validation accuracy: ', accuracy_score(yvalid, pred7))

Training accuracy:  0.4540824915824916
Validation accuracy:  0.4535353535353535


# Optimizing the Random Forest

In [54]:
param_grid = { 
    'n_estimators': [300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [5,10,30,50],
    'criterion' :['gini', 'entropy']
}

In [None]:
gridRF = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv= 5)
gridRF.fit(xtrain, ytrain)

In [None]:
gridRF.best_params_
