## GRADIENT BOOSTING

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [8]:
# Load the cleaned training and competition-test tables.
# The CSVs carry the saved DataFrame index as a column that pandas names
# 'Unnamed: 0'. It is only a row counter, yet it would otherwise end up in
# the feature matrix, so it is dropped by name (errors='ignore' keeps the
# cell working if a file is ever re-exported without that column).
# NOTE(review): `id` is still kept as a column here and therefore still
# reaches the model as a feature — confirm whether that is intended.
index_artifact = 'Unnamed: 0'

df = pd.read_csv('../clean_train.csv').drop(columns=[index_artifact], errors='ignore')
df_test = pd.read_csv('../clean_test.csv').drop(columns=[index_artifact], errors='ignore')

df.head()

Unnamed: 0,id,gps_height,longitude,latitude,region_code,district_code,population,payment,status_group,recordDay,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,69572,1390.0,34.938093,-9.856322,11,5,109.0,1.0,functional,14,...,1,0,0,0,1,0,0,0,0,0
1,8776,1399.0,34.698766,-2.147466,20,2,280.0,0.0,functional,6,...,0,1,0,0,1,0,0,0,0,0
2,34310,686.0,37.460664,-3.821329,21,4,250.0,1.0,functional,25,...,0,1,0,0,0,1,0,0,0,0
3,67743,263.0,38.486161,-11.155298,90,63,58.0,0.0,non functional,28,...,1,0,0,0,0,1,0,0,0,0
4,19728,1300.0,31.130847,-1.825359,18,1,250.0,0.0,functional,13,...,0,1,0,0,1,0,0,0,0,0


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

from sklearn import tree


In [10]:
# Columns that exist in the training frame but not in the test frame.
train_only_columns = df.columns.difference(df_test.columns)
train_only_columns

Index(['extraction_type_other - mkulima/shinyanga', 'scheme_management_None',
       'status_group'],
      dtype='object')

In [11]:
# Remove the dummy columns that only exist in the training data so both
# frames expose the same predictors (the target `status_group` stays).
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# the cell idempotent: running it a second time no longer raises KeyError.
columns_missing_from_test = [
    'extraction_type_other - mkulima/shinyanga',
    'scheme_management_None',
]
df = df.drop(columns=columns_missing_from_test, errors='ignore')

df.shape, df_test.shape

((59400, 63), (14850, 62))

### Modeling

In [12]:
# Separate the target from the predictors, then hold out 30% of the rows
# for evaluation. `train_test_split` is already imported in the earlier
# scikit-learn import cell, so it is not re-imported here.
target_column = 'status_group'
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

print('Training set: ', X_train.shape, y_train.shape)
print('Testing set: ', X_test.shape, y_test.shape)

Training set:  (41580, 62) (41580,)
Testing set:  (17820, 62) (17820,)


In [13]:
# Fit a gradient-boosting classifier with default hyper-parameters and
# evaluate it on the held-out split.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the same trees and the same scores; without
# it the estimator draws its seed from the global NumPy random state.
gbc = GradientBoostingClassifier(random_state=101)
gbc.fit(X_train, y_train)
predictions = gbc.predict(X_test)

# Rows of the confusion matrix are the true classes, columns the predicted
# classes, in sorted label order.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[8908   39  603]
 [1049  118  163]
 [2731   49 4160]]
                         precision    recall  f1-score   support

             functional       0.70      0.93      0.80      9550
functional needs repair       0.57      0.09      0.15      1330
         non functional       0.84      0.60      0.70      6940

               accuracy                           0.74     17820
              macro avg       0.71      0.54      0.55     17820
           weighted avg       0.75      0.74      0.71     17820



In [14]:
# Compare accuracy on the data the model was fitted on with accuracy on the
# held-out rows; a large gap would point to over-fitting.
train_accuracy = metrics.accuracy_score(y_train, gbc.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, predictions)

print('Train set Accuracy: ', train_accuracy)
print('Test set Accuracy: ', test_accuracy)

Train set Accuracy:  0.7473304473304473
Test set Accuracy:  0.7399551066217733


### Submission File for DrivenData

Create predictions on the test data and build a dataframe for the competition submission.

In [15]:
# Select the competition features by name, in exactly the column order the
# model was fitted on. Passing `df_test.values` threw the column names away
# and silently relied on both frames having identical column order; a
# mismatch would have produced wrong predictions without any error. With
# label-based selection a missing column fails loudly with a KeyError.
X2 = df_test[X_train.columns]

prediction_test = gbc.predict(X2)

# One row per test row; the single column is named in a later cell.
df_pred = pd.DataFrame(prediction_test)



In [16]:
# Sanity check: the prediction frame should have one row per test row.
test_and_prediction_shapes = (df_test.shape, df_pred.shape)
test_and_prediction_shapes

((14850, 62), (14850, 1))

In [17]:
# Give the single prediction column the name `status_group`.
df_pred.columns = ['status_group']

In [18]:
# Only the final expression of a cell is rendered automatically, so the
# preview needs an explicit display() call; as a bare statement in the
# middle of the cell its result was silently discarded.
display(df_pred.head())

df_pred.shape

(14850, 1)

In [19]:
# Build the submission file: the ids from the raw test set next to the
# predicted labels.
# The ids are kept in their own variable instead of overwriting `df`, so the
# training frame survives and earlier cells can still be re-run.
test_ids = pd.read_csv('../testSet-indep.csv', usecols=['id'])

# pd.concat(axis=1) pairs rows by index position, so a length mismatch would
# silently pad the file with NaN rows; fail early instead.
assert len(test_ids) == len(df_pred), (
    f'id file has {len(test_ids)} rows but there are {len(df_pred)} predictions'
)

# When the cleaned test frame still carries `id`, verify that both files list
# the waterpoints in the same order; otherwise labels would be attached to
# the wrong ids.
if 'id' in df_test.columns:
    assert np.array_equal(test_ids['id'].to_numpy(), df_test['id'].to_numpy()), (
        'row order of testSet-indep.csv differs from clean_test.csv'
    )

submission = pd.concat([test_ids['id'], df_pred], axis=1)

submission.to_csv('../submissions/Submission_gbc.csv', index=False)