### Logistic Regression

In [53]:
#import libraries
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
#preview the new data
import pandas as pd
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,basin,subvillage,region,local_government_area,ward,...,construction_year,extraction_type_group,management,management_group,payment_type,water_quality,quantity,source_type,waterpoint_type,status_group
0,6000.0,1300060800000000000,Roman,1390,Roman,Lake Nyasa,Mnyusi B,Iringa,Ludewa,Mundindi,...,1999,gravity,vwc,user-group,annually,soft,enough,spring,communal standpipe,functional
1,0.0,1362528000000000000,Grumeti,1399,GRUMETI,Lake Victoria,Nyamara,Mara,Serengeti,Natta,...,2010,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional
2,25.0,1361750400000000000,Lottery Club,686,world vision,Pangani,Majengo,Manyara,Simanjiro,Ngorika,...,2009,gravity,vwc,user-group,per bucket,soft,enough,dam,communal standpipe multiple,functional
3,0.0,1359331200000000000,Unicef,263,Unicef,Ruvuma / Southern Coast,Mahakamani,Mtwara,Nanyumbu,Nanyumbu,...,1986,submersible,vwc,user-group,never pay,soft,dry,borehole,communal standpipe multiple,non functional
4,0.0,1310515200000000000,Action In A,0,Artisan,Lake Victoria,Kyanyamisa,Kagera,Karagwe,Nyakasimbi,...,0,gravity,other,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,functional


In [4]:
#getting a copy of our dataframe
df1 = df.copy()

In [5]:
#checking the value count of target group
df1['status_group'].value_counts()

functional                 29657
non functional             21197
functional needs repair     3835
Name: status_group, dtype: int64

In [6]:
new_status_group = {'functional':0, 'non functional': 1, 'functional needs repair' : 2}
df1['status_group'] = df1['status_group'].replace(new_status_group)

In [7]:
df1['status_group'].value_counts()

0    29657
1    21197
2     3835
Name: status_group, dtype: int64

Since the data type is object, we need to convert it to an integer

In [8]:
df1['status_group'].dtypes

dtype('int64')

In [19]:
categorical = ['source_type','quantity','water_quality','payment_type','management_group','basin']
# one hot encode categoricals
ohe = pd.get_dummies(df[categorical], prefix=categorical, drop_first=True)
ohe

Unnamed: 0,source_type_dam,source_type_other,source_type_rainwater harvesting,source_type_river/lake,source_type_shallow well,source_type_spring,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,...,management_group_unknown,management_group_user-group,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu
0,0,0,0,0,0,1,1,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54684,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
54685,0,0,0,0,0,1,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
54686,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
54687,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0


In [10]:
ohe.dtypes

source_type_dam                     uint8
source_type_other                   uint8
source_type_rainwater harvesting    uint8
source_type_river/lake              uint8
source_type_shallow well            uint8
source_type_spring                  uint8
quantity_enough                     uint8
quantity_insufficient               uint8
quantity_seasonal                   uint8
quantity_unknown                    uint8
water_quality_fluoride              uint8
water_quality_fluoride abandoned    uint8
water_quality_milky                 uint8
water_quality_salty                 uint8
water_quality_salty abandoned       uint8
water_quality_soft                  uint8
water_quality_unknown               uint8
payment_type_monthly                uint8
payment_type_never pay              uint8
payment_type_on failure             uint8
payment_type_other                  uint8
payment_type_per bucket             uint8
payment_type_unknown                uint8
management_group_other            

In [20]:
#define x and y
X = ohe
y = df1['status_group']

In [21]:
#split data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [22]:
#preview head train
X_train.shape

(43751, 35)

In [32]:
y_train.shape

(43751,)

### Logistic Regression

In [39]:
#instantiate logistic regression
logreg = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', random_state=42)
#Build a pipeline with standard scaler and logistic regression
scaled_pipeline = Pipeline([('ss', StandardScaler()), 
                              ('logreg', LogisticRegression())])
# Fit the training data to pipeline
scaled_pipeline.fit(X_train, y_train)

# Print the accuracy on test set
scaled_pipeline.score(X_test, y_test)

0.6693179740354727

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2


Our model predicts the test data correctly with an accuracy of 67%

In [None]:
# Instantiate KNeighborsClassifier
clf = KNeighborsClassifier()

#### Train the decision treee

In [24]:
#create a classifier
clf = DecisionTreeClassifier(criterion='entropy')
#fit train data to classifier
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

#### Plot the decision tree

In [26]:
y_pred = clf.predict(X_test)

In [27]:
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.7004936917169501


Our model predicts the test data correctly with an accuracy of 70%

### K Nearest Neighbors

In [43]:
#instantiate knn
Knn = KNeighborsClassifier(n_neighbors=1, p=100)
#Build a pipeline with knn and scaler
scaled_knn_pipeline = Pipeline([('ss', StandardScaler()), 
                              ('knn', KNeighborsClassifier())])
#fit knn pipeline
scaled_knn_pipeline.fit(X_train, y_train)
#check the score
print("accuracy score :", scaled_knn_pipeline.score(X_test, y_test))

accuracy score : 0.6832144816236972


Our model predicts the test data correctly with an accuracy of 68%

### Random Forest

In [44]:
# Build a pipeline with StandardScaler and RandomForestClassifier
random_pipeline = Pipeline([('ss', StandardScaler()), 
                              ('RF', RandomForestClassifier(random_state=0))])
#fit 
random_pipeline.fit(X_train, y_train)
#check the score
print('accuracy :', random_pipeline.score(X_test, y_test))

accuracy : 0.7015907844212836


Our model predicts the test data correctly with an accuracy of 70%

In [49]:
# Instantiate XGBClassifier
clf = XGBClassifier()

# Fit XGBClassifier
clf.fit(X_train, y_train)

# Predict on training and test sets
training_preds = clf.predict(X_train)
test_preds = clf.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 71.56%
Validation accuracy: 70.42%


In [51]:
param_grid = {
    'learning_rate': [0.1, 0.2],
    'max_depth': [6],
    'min_child_weight': [1, 2],
    'subsample': [0.5, 0.7],
    'n_estimators': [100],
}

In [None]:
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X_train, y_train)

best_parameters = grid_clf.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))

training_preds = grid_clf.predict(X_train)
test_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_scor
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X_train, y_train)

best_parameters = grid_clf.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))

training_preds = grid_clf.predict(X_train)
test_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('')
print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))