## Random Forest Model

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [29]:
# Load the cleaned training and test frames (paths are relative to the notebook).
df = pd.read_csv('../clean_train.csv')
df_test = pd.read_csv('../clean_test.csv')

# 'Unnamed: 0' looks like a row index saved along with the data (0, 1, 2, ...
# in the preview), not a measured attribute. Left in place it ends up in the
# feature matrix, so remove it from both frames to keep them consistent.
# errors='ignore' keeps the cell working if a CSV is re-exported without it.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')

df.head()

Unnamed: 0,id,gps_height,longitude,latitude,region_code,district_code,population,payment,status_group,recordDay,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,69572,1390.0,34.938093,-9.856322,11,5,109.0,1.0,functional,14,...,1,0,0,0,1,0,0,0,0,0
1,8776,1399.0,34.698766,-2.147466,20,2,280.0,0.0,functional,6,...,0,1,0,0,1,0,0,0,0,0
2,34310,686.0,37.460664,-3.821329,21,4,250.0,1.0,functional,25,...,0,1,0,0,0,1,0,0,0,0
3,67743,263.0,38.486161,-11.155298,90,63,58.0,0.0,non functional,28,...,1,0,0,0,0,1,0,0,0,0
4,19728,1300.0,31.130847,-1.825359,18,1,250.0,0.0,functional,13,...,0,1,0,0,1,0,0,0,0,0


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.metrics import confusion_matrix,accuracy_score


In [31]:
# Columns that exist in the training frame but not in the test frame.
df.columns.difference(df_test.columns)

Index(['extraction_type_other - mkulima/shinyanga', 'scheme_management_None',
       'status_group'],
      dtype='object')

In [32]:
# Remove the dummy columns that only the training frame has, so that the
# training features line up with the test frame.
# Rebinding `df` with errors='ignore' (instead of inplace=True) makes the cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(
    columns=['extraction_type_other - mkulima/shinyanga', 'scheme_management_None'],
    errors='ignore',
)

In [33]:
# Shapes of the training and test frames, shown side by side for comparison.
df.shape, df_test.shape

((59400, 63), (14850, 62))

### Modeling

In [34]:
# Separate the target from the predictors. `columns=` already selects the
# axis, so no additional `axis` argument is needed. train_test_split is
# imported once in the sklearn import cell near the top of the notebook.
# NOTE(review): X still contains the `id` column, which is presumably just an
# identifier rather than a predictive attribute - confirm before relying on it.
X = df.drop(columns=['status_group'])
y = df['status_group']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
# NOTE(review): the split is not stratified on y - consider stratify=y if the
# class proportions should be preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

print('Training set: ', X_train.shape, y_train.shape)
print('Testing set: ', X_test.shape, y_test.shape)

Training set:  (41580, 62) (41580,)
Testing set:  (17820, 62) (17820,)


## Random Forests

In [35]:
# Baseline forest with tree depth capped at 20. random_state pins the
# randomness used while growing the trees, so the reported metrics (and the
# submission built from this model) are the same on every run.
rfc = RandomForestClassifier(max_depth=20, random_state=1)
rfc.fit(X_train, y_train)

# Predictions for the hold-out rows; reused by the accuracy cell.
predictions = rfc.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[8790  109  651]
 [ 805  314  211]
 [1844   75 5021]]
                         precision    recall  f1-score   support

             functional       0.77      0.92      0.84      9550
functional needs repair       0.63      0.24      0.34      1330
         non functional       0.85      0.72      0.78      6940

               accuracy                           0.79     17820
              macro avg       0.75      0.63      0.65     17820
           weighted avg       0.79      0.79      0.78     17820



In [36]:
# Accuracy on the rows the forest was fitted on versus the hold-out rows;
# a large gap between the two points to over-fitting.
train_accuracy = accuracy_score(y_train, rfc.predict(X_train))
test_accuracy = accuracy_score(y_test, predictions)

print('Train set Accuracy: ', train_accuracy)
print('Test set accuracy = ', test_accuracy)

Train set Accuracy:  0.9319143819143819
Test set accuracy =  0.792648709315376


With GridSearchCV:

In [37]:
from sklearn.model_selection import GridSearchCV

# Only the number of trees is searched; every other forest setting stays at
# its default. The forest is seeded so candidates differ only in size.
params = {'n_estimators': [100, 200, 500, 1000]}
estimator = RandomForestClassifier(random_state=1)

# Candidates are ranked by accuracy; cross-validation is left at the library default.
clf = GridSearchCV(estimator=estimator, param_grid=params, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=1),
             param_grid={'n_estimators': [100, 200, 500, 1000]},
             scoring='accuracy')

In [38]:
# Parameter combination selected by the grid search.
clf.best_params_

{'n_estimators': 1000}

In [39]:
# Cross-validated score (accuracy, per the `scoring` argument) of the selected combination.
clf.best_score_

0.7915103415103415

In [40]:
# Score the hold-out rows with the fitted grid-search object.
clf_preds = clf.predict(X_test)

# Same two summaries as for the baseline forest: confusion matrix first,
# then the per-class precision / recall report.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, clf_preds))

[[8464  225  861]
 [ 686  418  226]
 [1543  120 5277]]
                         precision    recall  f1-score   support

             functional       0.79      0.89      0.84      9550
functional needs repair       0.55      0.31      0.40      1330
         non functional       0.83      0.76      0.79      6940

               accuracy                           0.79     17820
              macro avg       0.72      0.65      0.68     17820
           weighted avg       0.79      0.79      0.79     17820



### Submission File for DrivenData

Generate predictions on the test data and build the dataframe for the competition submission.

In [41]:
# Pick the test features by name, in exactly the column order the forest was
# fitted on. A bare `.values` array is matched by position only, so a different
# column order in the test CSV would silently give wrong predictions; selecting
# by name raises a KeyError instead if a training column is missing.
X2 = df_test[X_train.columns]

prediction_test = rfc.predict(X2)



In [42]:
# Wrap the predicted labels in a single-column frame.
df_pred = pd.DataFrame(data=prediction_test)

In [43]:
# The row counts of the test frame and of the prediction frame should agree.
df_test.shape, df_pred.shape

((14850, 62), (14850, 1))

In [44]:
# Give the single prediction column the name of the target.
df_pred.columns = ['status_group']

In [45]:
# Distinct labels that occur among the test-set predictions.
df_pred['status_group'].unique()

array(['non functional', 'functional', 'functional needs repair'],
      dtype=object)

In [46]:
# Only a cell's last expression is rendered automatically, so the preview is
# shown explicitly with display(); otherwise its result is silently discarded.
display(df_pred.head())

df_pred.shape

(14850, 1)

In [47]:
# Ids for the submission come from the original test file.
# NOTE(review): this assumes its rows are in the same order as the rows of
# clean_test.csv that were scored - confirm.
dfid = pd.read_csv('../testSet-indep.csv')

# concat(axis=1) aligns on the index and pads with NaN when the lengths
# differ, which would silently produce a corrupt submission; fail loudly.
if len(dfid) != len(df_pred):
    raise ValueError(
        f'id file has {len(dfid)} rows but there are {len(df_pred)} predictions'
    )

# NOTE(review): this rebinds `df`, replacing the training frame loaded at the
# top of the notebook; earlier cells cannot be re-run after this one without
# reloading the data. Consider a dedicated name for the submission frame.
df = pd.concat([dfid['id'], df_pred], axis=1)

df.to_csv('../submissions/Submission_rfc.csv', index=False)

In [48]:
# Shape of the submission frame (`df` was rebound to it in the previous cell).
df.shape

(14850, 2)