# Import Library

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

In [None]:
# Let pandas show up to 100 columns so wide frames are not truncated
pd.set_option('display.max_columns', 100)

# Read the training features, test features and training labels into data frames
X_train = pd.read_csv('train_features.csv')
X_test = pd.read_csv('test_features.csv')
y_train = pd.read_csv('train_labels.csv')

# Preview the first rows to see which columns and data types we are working with
X_train.head()

# Drop ID for y_train

In [None]:
# The target is the status group, so the id column is removed from the labels
y_train = y_train.drop('id', axis=1)

# Split to Training and Validation Sets

In [None]:
# Hold out 20% of the rows as a validation set; the seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Encode Categorical Values

In [None]:
# Ordinal-encode the categorical features.
# The encoder is fitted on the training split and reused later for the validation split.
encoder = ce.OrdinalEncoder()
X_train = encoder.fit(X_train).transform(X_train)

# Baseline Random Forest Classifier Model

In [None]:
# Baseline: random forest with default hyper-parameters, all cores, fixed seed
model = RandomForestClassifier(n_jobs=-1, random_state=42)

# Fit training data to model.
# y_train is a DataFrame (the labels CSV minus `id`, presumably a single column);
# passing it as-is makes scikit-learn emit a DataConversionWarning about a
# column-vector y, so it is flattened to the expected 1-D shape first.
model.fit(X_train, np.ravel(y_train))

# Encode X_val with the encoder that was fitted on the training split
X_val = encoder.transform(X_val)

# Predict the validation set
y_pred = model.predict(X_val)

# Print Accuracy Score
print('Validation Set Accuracy Score:', accuracy_score(y_val, y_pred))

# Permutation Importance

In [None]:
# Permutation importance: shuffle one feature's values at a time and see how much
# the score changes. A weight close to 0 means the feature matters little and is
# a candidate to drop.
import eli5
from eli5.sklearn import PermutationImportance

# cv='prefit' tells the permuter that `model` has already been fitted
permuter = PermutationImportance(
    model,
    scoring='accuracy',
    cv='prefit',
    n_iter=3,
    random_state=42,
)

# Importances are measured on the validation split
permuter.fit(X_val, y_val)

# Display the weight of every feature (top=None means no cut-off)
feature_names = list(X_val.columns)
eli5.show_weights(permuter, top=None, feature_names=feature_names)

# Retest

In [None]:
# Keep only the features whose permutation importance is strictly greater than 0
mask = permuter.feature_importances_ > 0
feature = X_train.columns[mask]
X_train = X_train[feature]
X_val = X_val[feature]

# Refit on the reduced feature set.
# y_train is a DataFrame (presumably one column); flatten it to 1-D so
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
model.fit(X_train, np.ravel(y_train))

# Predict the validation set
y_pred = model.predict(X_val)

# Print Accuracy Score
print('Retest Validation Set Accuracy Score:', accuracy_score(y_val, y_pred))


In [None]:
# The baseline random forest classifier gave us a pretty good score of 0.7965.
# Retesting the model after dropping the features with permutation importance <= 0 yielded 0.7980.
# I believe we can do better by cleaning up the data and adding features.
# Check out my Water_Pump_Best notebook.

# -----------------------------------------------------------------

# Restart Kernel - Test with Dropped P.Imp <= 0

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Let pandas show up to 100 columns so wide frames are not truncated
pd.set_option('display.max_columns', 100)

# Read the training features, test features and training labels into data frames
X_train = pd.read_csv('train_features.csv')
X_test = pd.read_csv('test_features.csv')
y_train = pd.read_csv('train_labels.csv')

# Column-drop experiments for the baseline, kept for reference only (all disabled).
# Each set extends the previous one; the number is the score noted for that run.
#   1st drop -> 0.79857: recorded_by, id, num_private, wpt_name
#   2nd drop -> 0.79969: + basin, source, subvillage, management_group, source_class
#   3rd drop -> 0.79816: + water_quality, scheme_management, permit,
#                          extraction_type_group, district_code

# Test Feature Engineer

In [2]:
def _add_engineered_features(features):
    """Return a copy of `features` with the engineered columns added.

    Adds three columns and rewrites one:
      * ``Water_per_person`` - ``amount_tsh / population``; rows where the
        division is undefined (inf from x/0, NaN from 0/0 or missing values)
        are set to 0.
      * ``week`` and ``month`` - taken from ``date_recorded`` after parsing it
        as a datetime.
      * ``date_recorded`` - stored back as a string, because the random forest
        cannot consume a datetime column directly.

    The input frame is not modified.
    """
    out = features.copy()

    # Water per person; dividing by 0 creates inf values, which are mapped to
    # NaN and then, together with any other NaN, to 0
    ratio = out['amount_tsh'] / out['population']
    out['Water_per_person'] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Parse the date once and derive the calendar features from it.
    # `infer_datetime_format` is intentionally not passed: it is deprecated in
    # pandas 2.x and only ever affected parsing speed.
    recorded = pd.to_datetime(out['date_recorded'])
    # Per-element Timestamp.week is kept because Series.dt.week is not
    # available in every pandas version
    out['week'] = recorded.apply(lambda ts: ts.week)
    out['month'] = recorded.dt.month

    # Change back to string or else RandomForestClassifier can't process it
    out['date_recorded'] = recorded.astype(str)
    return out


# Apply the same feature engineering to the training and the test features
X_train = _add_engineered_features(X_train)
X_test = _add_engineered_features(X_test)

# Test Drop

In [3]:
# Columns to drop for the model with n_estimators=400, max_depth=30.
# Reference scores: baseline 0.8131 // with features 0.81254

# Active set = 1st drop: 0.8122; with features 0.8126 & test 0.81557
drop_cols = [
    'recorded_by',
    'id',
    'num_private',
    'wpt_name',
]

# Larger drop sets that were tried and scored lower (all disabled).
# Each set extends the previous one; the number is the score noted for that run.
#   2nd drop -> 0.8066: + installer, date_recorded, source, management_group,
#                         extraction_type, source_type, extraction_type_group,
#                         district_code, lga, management, source_class, funder,
#                         basin, scheme_management
#   3rd drop -> 0.8058: + payment, payment_type
#   4th drop -> 0.8054: + quality_group
#   5th drop -> 0.8060: + subvillage

# Drop, Encode, Model

In [4]:
# Remove the columns listed in drop_cols from both feature frames
X_train = X_train.drop(drop_cols, axis=1)
X_test = X_test.drop(drop_cols, axis=1)

In [5]:
# The id column is not a target, so remove it from the labels
y_train = y_train.drop('id', axis=1)

In [6]:
# Hold out 20% of the rows as a validation set; the seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Ordinal-encode the categorical features, fitting the encoder on the training split only
encoder = ce.OrdinalEncoder()
X_train = encoder.fit(X_train).transform(X_train)

In [7]:
# Random forest with 400 trees and depth capped at 30, all cores, fixed seed
model = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=400, max_depth=30)

# Fit training data to model.
# y_train is a DataFrame (the labels CSV minus `id`, presumably a single column);
# passing it as-is makes scikit-learn emit a DataConversionWarning about a
# column-vector y, so it is flattened to the expected 1-D shape first.
model.fit(X_train, np.ravel(y_train))

# Encode X_val with the encoder that was fitted on the training split
X_val = encoder.transform(X_val)

# Predict the validation set
y_pred = model.predict(X_val)

# Print Accuracy Score
print('Validation Set Accuracy Score:', accuracy_score(y_val, y_pred))

  after removing the cwd from sys.path.


Validation Set Accuracy Score: 0.8126262626262626


# Permutation Importance

In [8]:
# Permutation importance: shuffle one feature's values at a time and see how much
# the score changes. A weight close to 0 means the feature matters little and is
# a candidate to drop.
import eli5
from eli5.sklearn import PermutationImportance

# cv='prefit' tells the permuter that `model` has already been fitted
permuter = PermutationImportance(
    model,
    scoring='accuracy',
    cv='prefit',
    n_iter=3,
    random_state=42,
)

# Importances are measured on the validation split
permuter.fit(X_val, y_val)

# Display the weight of every feature (top=None means no cut-off)
feature_names = list(X_val.columns)
eli5.show_weights(permuter, top=None, feature_names=feature_names)

Weight,Feature
0.0318  ± 0.0030,quantity
0.0232  ± 0.0020,quantity_group
0.0122  ± 0.0022,extraction_type_class
0.0105  ± 0.0013,construction_year
0.0095  ± 0.0003,waterpoint_type
0.0042  ± 0.0019,longitude
0.0035  ± 0.0013,population
0.0022  ± 0.0004,latitude
0.0022  ± 0.0016,waterpoint_type_group
0.0019  ± 0.0017,amount_tsh


# Predict and Submit

In [9]:
# Run the test features through the encoder that was fitted on the training split
X_test = encoder.transform(X_test)

# Predict labels for the encoded test features with the fitted model
y_pred = model.predict(X_test)

In [10]:
### SUBMISSION ###

# Use the sample submission as the template and fill status_group with the predictions.
# NOTE(review): assumes its rows are in the same order as X_test - confirm.
sample_submission = pd.read_csv('sample_submission.csv')
submission = sample_submission.assign(status_group=y_pred)

# Write the submission file without the data frame index
submission.to_csv('Baseline_2_F-submission.csv', index=False)