# Import Library

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

In [2]:
# Let pandas show up to 100 columns when a data frame is displayed
pd.set_option('display.max_columns', 100)

# Read the training features, the test features, and the training labels
# straight into data frames
X_train = pd.read_csv('train_features.csv')
X_test = pd.read_csv('test_features.csv')
y_train = pd.read_csv('train_labels.csv')

# Preview the first rows to see what type of data each column holds
X_train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


# Drop ID for y_train

In [3]:
# The target is the status group, so the id column is removed from the labels
y_train = y_train.drop('id', axis='columns')

# Split to Training and Validation Sets

In [4]:
# Hold out a validation set (test_size=.2); the fixed random_state keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=.2,
    random_state=42,
)

# Encode Categorical Values

In [5]:
# Ordinal encoder for the categorical features
encoder = ce.OrdinalEncoder()

# Learn the encoding from the training features, then apply it to them
encoder.fit(X_train)
X_train = encoder.transform(X_train)

# Baseline Random Forest Classifier Model

In [6]:
model = RandomForestClassifier(n_jobs=-1, random_state=42)

# Fit training data to model.
# y_train is a data frame here; np.ravel hands scikit-learn a 1-D label array so
# the fit no longer emits a DataConversionWarning about a column-vector y.
model.fit(X_train, np.ravel(y_train))

# Encode X_val categorical values with the encoder that was fitted on X_train
X_val = encoder.transform(X_val)

# Predict X_val
y_pred = model.predict(X_val)

# Print Accuracy Score
print('Validation Set Accuracy Score:', accuracy_score(y_val, y_pred))

  after removing the cwd from sys.path.


Validation Set Accuracy Score: 0.7965488215488216


# Permutation Importance

In [7]:
# Find unimportant features by shuffling each feature's values
## A weight close to 0 means the feature is less important and is a candidate to drop
import eli5
from eli5.sklearn import PermutationImportance

# Column names used to label the weights
feature_names = list(X_val.columns)

# Instantiate. cv='prefit' because the model has already been fitted
permuter = PermutationImportance(
    model,
    scoring='accuracy',
    cv='prefit',
    n_iter=3,
    random_state=42,
)

# Fit on the validation set
permuter.fit(X_val, y_val)

# Display the weights
eli5.show_weights(permuter, top=None, feature_names=feature_names)

Weight,Feature
0.0703  ± 0.0027,quantity_group
0.0362  ± 0.0017,extraction_type_class
0.0261  ± 0.0027,construction_year
0.0253  ± 0.0030,longitude
0.0195  ± 0.0034,latitude
0.0176  ± 0.0041,waterpoint_type
0.0150  ± 0.0025,funder
0.0146  ± 0.0038,amount_tsh
0.0139  ± 0.0016,population
0.0135  ± 0.0014,quantity


# Retest

In [8]:
# Keep only the features whose permutation importance is greater than 0.
# The mask has one entry per column the permuter was fitted on, i.e. per entry
# of `feature_names` (captured in the permutation-importance cell). Indexing
# that full list, rather than the columns of the frame this cell overwrites,
# gives the same selection on the first run and keeps the cell safe to re-run.
mask = permuter.feature_importances_ > 0
feature = pd.Index(feature_names)[mask]
X_train = X_train[feature]
X_val = X_val[feature]

# Refit the model on the reduced feature set.
# np.ravel hands scikit-learn a 1-D label array so the fit does not emit a
# DataConversionWarning about a column-vector y.
model.fit(X_train, np.ravel(y_train))

# Predict X_val
y_pred = model.predict(X_val)

# Print Accuracy Score
print('Retest Validation Set Accuracy Score:', accuracy_score(y_val, y_pred))


  


Retest Validation Set Accuracy Score: 0.798063973063973


In [None]:
# The baseline random forest classifier gave a fairly good validation accuracy of 0.7965.
# Retesting the model after using permutation importance to remove the columns whose weight was not greater than 0 yielded 0.7980.
# I believe we can do better by cleaning up the data and adding features.
# Check out my Water_Pump_Best notebook.

# -----------------------------------------------------------------