In [215]:
import os 
import pandas as pd 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

CLEAN_DATA_DIR = "../data/clean/"
RESULT_DATA_DIR = "../data/model_result/"

# Train model with filled data

In [None]:
train_filled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))
train_filled.head()

Unnamed: 0.1,Unnamed: 0,psu_hh_idcode,hhid,rating_filled,q02,q03,q04,q05,q09,q23,Q01,Q03,Q06,Q07,Q08,Q11,Q19
0,0,30_8_1,3008,4.0,1,1,19680615,44,0,0,1.0,1.0,2.0,1.0,2.0,13.0,2.0
1,1,194_1_2,19401,1.0,2,2,19640910,48,0,0,1.0,1.0,2.0,0.0,2.0,13.0,2.0
2,2,224_6_1,22406,3.0,1,1,19510317,61,0,0,1.0,1.0,2.0,0.0,2.0,13.0,2.0
3,3,323_10_1,32310,5.0,1,1,19460402,66,0,0,1.0,1.0,2.0,0.0,2.0,13.0,2.0
4,4,428_10_1,42810,4.0,2,1,19400407,72,0,0,1.0,1.0,1.0,0.0,2.0,14.0,2.0


In [None]:
train_filled = train_filled.drop(train_filled.columns[0], axis=1)
y = train_filled['rating_filled']
feature_cols = list(train_filled.columns.difference(['psu_hh_idcode', 'hhid', 'rating_filled']))
X = train_filled[feature_cols]

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)


(16210, 13) (4053, 13) (16210,) (4053,)


# Apply Random Forest

### Hyperparameter Tuning

In [202]:
params = {
    'n_estimators':[200, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth' : [4, 6, 8],
    'min_samples_leaf' : [8, 12],
    'min_samples_split' : [8, 16]
}

grid_search = GridSearchCV(RandomForestClassifier(), params, cv=5, scoring='accuracy', return_train_score=True)
# Fit the model
grid_search.fit(train_x, train_y)

In [205]:
# Retrieve results as a DataFrame
results = pd.DataFrame(grid_search.cv_results_)

# Display the key columns
results_display = results[['params', 'mean_test_score', 'std_test_score']]
sorted_results = results_display.sort_values(by='mean_test_score', ascending=False)
# print(sorted_results)

#Get Best Model
best_model = grid_search.best_estimator_
probabilities = best_model.predict_proba(test_x)
unfilled_pred = best_model.predict(test_x)
print('Accuracy score of unfilled model: ', accuracy_score(test_y, unfilled_pred))

Accuracy score of unfilled model:  0.21274601686972822


In [None]:
# Train model with unfileld train data

In [213]:
train_unfilled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
print(train_unfilled.head())
y = train_unfilled['subjectivePoverty_rating']
feature_cols = list(train_unfilled.columns.difference(['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']))
X = train_unfilled[feature_cols]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

  psu_hh_idcode   hhid  subjectivePoverty_rating  q02  q03  q05  q09  q23  \
0        30_8_1   3008                         4    1    1   44    0    0   
1       194_1_2  19401                         1    2    2   48    0    0   
2       224_6_1  22406                         3    1    1   61    0    0   
3      323_10_1  32310                         5    1    1   66    0    0   
4      428_10_1  42810                         4    2    1   72    0    0   

   Q01  Q03  Q06  Q07  Q08   Q11  Q19  
0    1    1  2.0  1.0  2.0  13.0  2.0  
1    1    1  2.0  0.0  2.0  13.0  2.0  
2    1    1  2.0  0.0  2.0  13.0  2.0  
3    1    1  2.0  0.0  2.0  13.0  2.0  
4    1    1  1.0  0.0  2.0  14.0  2.0  
(4267, 12) (1067, 12) (4267,) (1067,)


### Hyeprparameter tuning with grid search

In [None]:
params = {
    'n_estimators':[10, 20, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(RandomForestClassifier(), params, cv=5, scoring='accuracy', return_train_score=True)
# Fit the model
grid_search.fit(train_x, train_y)

### get the prediction probability 

In [None]:
# Retrieve results as a DataFrame
results = pd.DataFrame(grid_search.cv_results_)

# Display the key columns
results_display = results[['params', 'mean_test_score', 'std_test_score']]
sorted_results = results_display.sort_values(by='mean_test_score', ascending=False)
sorted_results.to_csv(os.path.join(RESULT_DATA_DIR, "rf_unfillled.csv"), index=False)

best_model = grid_search.best_estimator_
probabilities = best_model.predict_proba(test_x)
unfilled_pred = best_model.predict(test_x)
print('Accuracy score of unfilled model: ', accuracy_score(test_y, unfilled_pred))

Accuracy score of unfilled model:  0.20243673851921273


### Feature Importance