In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [85]:
 # Function to view all columns/rows to be used with head() or tail().

def display_all(df):
   with pd.option_context('display.max_rows', 1000):
       with pd.option_context('display.max_columns', 1000):
           display(df)

In [86]:
df = pd.read_csv('../clean_train.csv')
df_test = pd.read_csv('../clean_test.csv')

df.columns = df.columns.str.replace(' ', '')
df_test.columns = df_test.columns.str.replace(' ', '')

df.columns = df.columns.str.replace('-', '')
df_test.columns = df_test.columns.str.replace('-', '')

df.columns = df.columns.str.replace('/', '')
df_test.columns = df_test.columns.str.replace('/', '')

df['status_group']=df['status_group'].replace("functional",2)
df['status_group']=df['status_group'].replace("non functional",1)
df['status_group']=df['status_group'].replace("functional needs repair",0)

In [87]:
df.shape, df_test.shape

((59400, 65), (14850, 62))

In [88]:
df.columns.difference(df_test.columns)

Index(['extraction_type_othermkulimashinyanga', 'scheme_management_None',
       'status_group'],
      dtype='object')

In [89]:
df.drop(columns = ['extraction_type_othermkulimashinyanga', 'scheme_management_None'], inplace=True)

In [90]:
df.shape, df_test.shape

((59400, 63), (14850, 62))

### Logistic Regression.

In [91]:
df_train, df_pretest = train_test_split(df,test_size=0.3)


In [92]:
df_train.shape, df_pretest.shape

((41580, 63), (17820, 63))

In [93]:
df_train.columns

Index(['id', 'gps_height', 'longitude', 'latitude', 'region_code',
       'district_code', 'population', 'payment', 'status_group', 'recordDay',
       'recordMonth', 'recordYear', 'public_meeting_True',
       'scheme_management_Company', 'scheme_management_Other',
       'scheme_management_Parastatal', 'scheme_management_Privateoperator',
       'scheme_management_SWC', 'scheme_management_Trust',
       'scheme_management_VWC', 'scheme_management_WUA',
       'scheme_management_WUG', 'scheme_management_WaterBoard',
       'scheme_management_Waterauthority', 'permit_True',
       'extraction_type_afridev', 'extraction_type_cemo',
       'extraction_type_climax', 'extraction_type_gravity',
       'extraction_type_indiamarkii', 'extraction_type_indiamarkiii',
       'extraction_type_ksb', 'extraction_type_mono',
       'extraction_type_niratanira', 'extraction_type_other',
       'extraction_type_otherplaypump', 'extraction_type_otherropepump',
       'extraction_type_otherswn81', 'extr

In [94]:
model = smf.mnlogit(formula="status_group ~ gps_height+longitude+latitude+region_code+district_code+population+payment+recordDay+recordMonth+recordYear+public_meeting_True+scheme_management_Company+scheme_management_Other+scheme_management_Parastatal+scheme_management_Privateoperator+scheme_management_SWC+scheme_management_Trust+scheme_management_VWC+scheme_management_WUA+scheme_management_WUG+scheme_management_WaterBoard+scheme_management_Waterauthority+permit_True+extraction_type_afridev+extraction_type_cemo+extraction_type_climax+extraction_type_gravity+extraction_type_indiamarkii+extraction_type_indiamarkiii+extraction_type_ksb+extraction_type_mono+extraction_type_niratanira+extraction_type_other+extraction_type_otherplaypump+extraction_type_otherropepump+extraction_type_otherswn81+extraction_type_submersible+extraction_type_swn80+extraction_type_walimi+extraction_type_windmill+quality_group_colored+quality_group_fluoride+quality_group_good+quality_group_milky+quality_group_salty+quality_group_unknown+quantity_group_dry+quantity_group_enough+quantity_group_insufficient+quantity_group_seasonal+quantity_group_unknown+source_class_groundwater+source_class_surface+source_class_unknown+waterpoint_type_cattletrough+waterpoint_type_communalstandpipe+waterpoint_type_communalstandpipemultiple+waterpoint_type_dam+waterpoint_type_handpump+waterpoint_type_improvedspring+waterpoint_type_other",data=df_train)
fitted = model.fit()
print(fitted.summary2())

predictions = fitted.predict(df_pretest)
predictions['status_group']=predictions.idxmax(axis=1)
predictions.drop(columns=[0,1,2], inplace=True)

predictions['status_group']=predictions['status_group'].replace(2,"functional")
predictions['status_group']=predictions['status_group'].replace(1,"non functional")
predictions['status_group']=predictions['status_group'].replace(0,"functional needs repair")

df_pretest['status_group']=df_pretest['status_group'].replace(2,"functional")
df_pretest['status_group']=df_pretest['status_group'].replace(1,"non functional")
df_pretest['status_group']=df_pretest['status_group'].replace(0,"functional needs repair")
print(classification_report(y_pred=predictions,y_true=df_pretest.status_group))

         Current function value: 0.683106
         Iterations: 35


  bse = np.sqrt(np.diag(self.cov_params()))


                                           Results: MNLogit
Model:                           MNLogit                      Pseudo R-squared:             0.234     
Dependent Variable:              status_group                 AIC:                          57043.0836
Date:                            2023-04-03 16:53             BIC:                          58062.0578
No. Observations:                41580                        Log-Likelihood:               -28404.   
Df Model:                        116                          LL-Null:                      -37073.   
Df Residuals:                    41462                        LLR p-value:                  0.0000    
Converged:                       0.0000                       Scale:                        1.0000    
No. Iterations:                  35.0000                                                              
------------------------------------------------------------------------------------------------------
             

In [95]:
from sklearn.metrics import (confusion_matrix, accuracy_score)

# confusion matrix
cm = confusion_matrix(df_pretest.status_group, predictions)
print ("Confusion Matrix : \n", cm)

# accuracy score of the model
print('Test accuracy = ', accuracy_score(df_pretest.status_group, predictions))


Confusion Matrix : 
 [[8800   28  956]
 [1038   43  186]
 [2843   21 3905]]
Test accuracy =  0.7153759820426487


### Submission File for DrivenData

Create prediction on test data and create dataframe for competition submission.

In [96]:
predictions = fitted.predict(df_test)
predictions['status_group']=predictions.idxmax(axis=1)
predictions.drop(columns=[0,1,2], inplace=True)

predictions['status_group']=predictions['status_group'].replace(2,"functional")
predictions['status_group']=predictions['status_group'].replace(1,"non functional")
predictions['status_group']=predictions['status_group'].replace(0,"functional needs repair")

df_disp= pd.read_csv('../testSet-indep.csv')
df_pred = pd.concat([df_disp['id'], predictions], axis=1)

df = pd.DataFrame(df_pred)

In [97]:
df

Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,functional
14846,18990,functional
14847,28749,functional
14848,33492,functional


In [98]:
df.to_csv('../submissions/Submission_logreg.csv', index=False)