In [1]:
# imports
import pandas as pd
# from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier
import statsmodels.api as sm


In [2]:
X_train = pd.read_csv('data/Training_set_values.csv', index_col='id')
X_test = pd.read_csv('data/Test_set_values.csv', index_col='id')
y_train = pd.read_csv('data/Training_set_labels.csv', index_col='id')

In [3]:
X_train.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
y_train.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
69572,functional
8776,functional
34310,functional
67743,non functional
19728,functional


## Merging the X and y train data.

In [5]:
df = X_train.merge(y_train, on='id')

We merged the X train and y train data to build a dataframe that we could use to testing our different models on. The data set from where the data came did not have a y test dataset. We will split the merged dataset and create a hold out group later.

In [6]:
df.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


# Data Cleaning

In [7]:
df.isna().sum()

amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity

In [8]:
df = df.drop(['scheme_name', 'date_recorded', 'wpt_name', 'subvillage', 'lga', 'ward', 'recorded_by', 'quantity_group', 'quantity_group'], axis = 1)
df = df.fillna('missing', axis = 1)

We are dropping scheme name, and filling the NaN values with a string `missing`. We dropped scheme name since 28,000 values are missing. We replaced NaN values with `missing` to keep the rows in our dataframe. We will onehotencode the data frame later.

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   funder                 59400 non-null  object 
 2   gps_height             59400 non-null  int64  
 3   installer              59400 non-null  object 
 4   longitude              59400 non-null  float64
 5   latitude               59400 non-null  float64
 6   num_private            59400 non-null  int64  
 7   basin                  59400 non-null  object 
 8   region                 59400 non-null  object 
 9   region_code            59400 non-null  int64  
 10  district_code          59400 non-null  int64  
 11  population             59400 non-null  int64  
 12  public_meeting         59400 non-null  object 
 13  scheme_management      59400 non-null  object 
 14  permit                 59400 non-null  object 
 15

We are creating a hold out data set which we will test our final model on.

In [10]:
df, holdout = train_test_split(df, test_size = .1)

In [11]:
X = df.drop('status_group', axis=1)
y = df['status_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [12]:
X_train.head()

Unnamed: 0_level_0,amount_tsh,funder,gps_height,installer,longitude,latitude,num_private,basin,region,region_code,...,payment,payment_type,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58149,0.0,missing,0,missing,33.82223,-9.170746,0,Lake Nyasa,Mbeya,12,...,never pay,never pay,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe
72555,0.0,Pidp,0,Community,33.461386,-3.101254,0,Lake Victoria,Mwanza,19,...,never pay,never pay,soft,good,enough,shallow well,shallow well,groundwater,hand pump,hand pump
20959,0.0,Government Of Tanzania,0,Government,33.149092,-3.953637,0,Internal,Shinyanga,17,...,never pay,never pay,soft,good,enough,machine dbh,borehole,groundwater,communal standpipe,communal standpipe
53369,10.0,Tasaf,964,J LH CO LTD,35.867217,-10.545548,0,Rufiji,Ruvuma,10,...,pay per bucket,per bucket,soft,good,enough,shallow well,shallow well,groundwater,hand pump,hand pump
52843,0.0,Halmashauri,0,Halmashauri/Quick win project,32.618733,-4.833787,0,Lake Tanganyika,Tabora,14,...,never pay,never pay,soft,good,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump


In [13]:
#X_train.to_csv('X_train.csv')
#X_test.to_csv('X_test.csv')
#y_train.to_csv('y_train.csv')
#y_test.to_csv('y_test.csv')
#
#df.to_csv('holdout_data.csv')

# Baseline Dummy Model

In [14]:
dummy_model = DummyClassifier(strategy='most_frequent', random_state=42)
dummy_model.fit(X_train, y_train)

DummyClassifier(random_state=42, strategy='most_frequent')

In [15]:
print('Accuracy Score Train:', dummy_model.score(X_train, y_train))
print('Accuracy Score Test:', dummy_model.score(X_test, y_test))

Accuracy Score Train: 0.5437835141538845
Accuracy Score Test: 0.543808454919566


In [16]:
log_loss_dummy = cross_val_score(dummy_model, X_train, y_train, scoring='neg_log_loss')
log_loss_dummy = -log_loss_dummy.mean()
print('Log Loss:', log_loss_dummy)

Log Loss: 15.757159192310917


# Logistic Regression Model -Sanjit

Importing relevant tools

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score

import seaborn as sns

In [18]:
logreg = LogisticRegression()
ss = StandardScaler()

X_train_num = X_train.select_dtypes(exclude= object)
X_test_num = X_test.select_dtypes(exclude= object)

X_train_num_scaled = ss.fit_transform(X_train_num)
X_test_num_scaled = ss.transform(X_test_num)

logreg.fit(X_train_num_scaled, y_train)

preds = logreg.predict(X_test_num_scaled)

In [19]:
test_accuracy = accuracy_score(y_test, preds)
print(f'Model Accuracy on Test Data = {round(test_accuracy*100,2)}%')

Model Accuracy on Test Data = 55.89%


In [20]:
print('Accuracy Score Train:', logreg.score(X_train_num_scaled,y_train))
print('Accuracy Score Test:', logreg.score(X_test_num_scaled,y_test))


Accuracy Score Train: 0.5585484474373363
Accuracy Score Test: 0.5589225589225589


In [21]:
log_loss_logreg = cross_val_score(logreg, X_train_num_scaled, y_train, scoring='neg_log_loss')
log_loss_logreg = -log_loss_logreg.mean()
print('Log Loss:', log_loss_logreg)

Log Loss: 0.8686400323694308


Our scores have not improved as much on our dummy model. Let's be more selective with the columns we use.

# Finding the best predictor columns

OneHotEncoding our well status

In [22]:
status_df = pd.get_dummies(df['status_group'])

status_df

Unnamed: 0_level_0,functional,functional needs repair,non functional
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
39800,1,0,0
17195,0,1,0
28312,0,1,0
19884,0,0,1
30229,1,0,0
...,...,...,...
47600,0,0,1
38777,1,0,0
47927,0,0,1
58707,0,0,1


'funder' and 'installer' have 1557 and 1748 unique values respectively. I would prefer not to OneHotEncode these columns. So I will drop these columns. 

In [23]:
new_df_objectonly = df.select_dtypes(include= object).drop(['funder','installer'],axis=1)
new_df_objectonly

Unnamed: 0_level_0,basin,region,public_meeting,scheme_management,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,...,payment_type,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39800,Rufiji,Iringa,True,VWC,True,gravity,gravity,gravity,vwc,user-group,...,annually,soft,good,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
17195,Internal,Shinyanga,True,WUG,True,nira/tanira,nira/tanira,handpump,wug,user-group,...,never pay,soft,good,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional needs repair
28312,Internal,Singida,True,WUA,True,gravity,gravity,gravity,wua,user-group,...,per bucket,soft,good,insufficient,dam,dam,surface,communal standpipe multiple,communal standpipe,functional needs repair
19884,Wami / Ruvu,Pwani,True,VWC,True,other,other,other,vwc,user-group,...,never pay,soft,good,enough,river,river/lake,surface,hand pump,hand pump,non functional
30229,Pangani,Kilimanjaro,True,Water authority,True,gravity,gravity,gravity,water board,user-group,...,per bucket,soft,good,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47600,Pangani,Kilimanjaro,True,Water authority,True,gravity,gravity,gravity,vwc,user-group,...,never pay,soft,good,enough,river,river/lake,surface,communal standpipe,communal standpipe,non functional
38777,Wami / Ruvu,Morogoro,True,VWC,True,swn 80,swn 80,handpump,vwc,user-group,...,never pay,salty,salty,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional
47927,Internal,Manyara,True,VWC,True,gravity,gravity,gravity,vwc,user-group,...,never pay,soft,good,enough,river,river/lake,surface,communal standpipe,communal standpipe,non functional
58707,Pangani,Arusha,True,VWC,True,gravity,gravity,gravity,vwc,user-group,...,unknown,fluoride,fluoride,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,non functional


Let's look at how many unique values are in each column

In [24]:
[len(new_df_objectonly[a].value_counts()) for a in new_df_objectonly.columns]

[9, 21, 3, 13, 3, 18, 13, 7, 12, 5, 7, 7, 8, 6, 5, 10, 7, 3, 7, 6, 3]

Below is a list of the column names in the same corresponding order as the list of value_counts above.

In [25]:
[a for a in new_df_objectonly.columns]

['basin',
 'region',
 'public_meeting',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group',
 'status_group']

## 'source_class' as a predictor

In [26]:
source_class_df = pd.get_dummies(df['source_class'])

logreg = LogisticRegression()
ss = StandardScaler()

X_trainSource, X_testSource, y_trainSource, y_testSource = train_test_split(source_class_df, status_df['functional'], stratify=y, random_state=42, test_size= 0.2)
logreg.fit(X_trainSource, y_trainSource)

preds = logreg.predict(X_testSource)
print (f'Model Accuracy Score : {round((accuracy_score(y_testSource, preds))*100,4)}%')

log_loss_logregnew = cross_val_score(logreg, X_trainSource, y_trainSource, scoring='neg_log_loss')
log_loss_logregnew = -log_loss_logregnew.mean()
print('Log Loss:', log_loss_logregnew)

Model Accuracy Score : 54.3771%
Log Loss: 0.6893358245599099


## 'management_group' as a predictor

In [27]:
management_group_df = pd.get_dummies(df['management_group'])

logreg = LogisticRegression()
ss = StandardScaler()

X_train_MgtGr, X_test_MgtGr, y_train_MgtGr, y_test_MgtGr = train_test_split(management_group_df[['commercial','parastatal','user-group']], status_df['functional'], stratify=y, random_state=42, test_size= 0.2)

logreg.fit(X_trainSource, y_trainSource)


preds = logreg.predict(X_test_MgtGr)
print (f'Model Accuracy Score : {round((accuracy_score(y_test_MgtGr, preds))*100,4)}%')

log_loss_logreg_MgtGr = cross_val_score(logreg, X_train_MgtGr, y_train_MgtGr, scoring='neg_log_loss')
log_loss_logreg_MgtGr = -log_loss_logreg_MgtGr.mean()
print('Log Loss:', log_loss_logreg_MgtGr)

Model Accuracy Score : 54.3771%
Log Loss: 0.6885533500789895


## Iterating through all columns as predictors

We used 'source_class' and 'management_group' as a predictor. Let's just try this on every other column with dtype= object.

In [28]:
def find_predict(columns):
    
    status_df = pd.get_dummies(df['status_group'])
    
    for a in columns.columns:

        predictor = pd.get_dummies(df[a])

        logreg = LogisticRegression()
        ss = StandardScaler()

        X_trainAA, X_testAA, y_trainAA, y_testAA = train_test_split(predictor, status_df['functional'], stratify=y, random_state=42, test_size= 0.2)
        logreg.fit(X_trainAA, y_trainAA)

        preds = logreg.predict(X_testAA)
    

        log_loss_logregnew = cross_val_score(logreg, X_trainAA, y_trainAA, scoring='neg_log_loss')
        log_loss_logregnew = -log_loss_logregnew.mean()
        
        
        print (f'Column Used: {a}')
        print (f'Model Accuracy Score : {round((accuracy_score(y_testSource, preds))*100,4)}%')
        print('Log Loss:', log_loss_logregnew)
        print('-----------------------------')

In [29]:
find_predict(new_df_objectonly)

Column Used: basin
Model Accuracy Score : 57.4729%
Log Loss: 0.675667827794823
-----------------------------
Column Used: region
Model Accuracy Score : 58.4736%
Log Loss: 0.6639323844925578
-----------------------------
Column Used: public_meeting
Model Accuracy Score : 55.5836%
Log Loss: 0.6867102967082627
-----------------------------
Column Used: scheme_management
Model Accuracy Score : 54.798%
Log Loss: 0.6798568081630718
-----------------------------
Column Used: permit
Model Accuracy Score : 54.3771%
Log Loss: 0.6890153240388963
-----------------------------
Column Used: extraction_type
Model Accuracy Score : 63.1687%
Log Loss: 0.6444428922219083
-----------------------------
Column Used: extraction_type_group
Model Accuracy Score : 63.5428%
Log Loss: 0.6446205585453606
-----------------------------
Column Used: extraction_type_class
Model Accuracy Score : 63.468%
Log Loss: 0.6460415541679936
-----------------------------
Column Used: management
Model Accuracy Score : 54.957%
Log