# RandomForest

In [5]:
import os 
import pandas as pd 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, log_loss, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer

CLEAN_DATA_DIR = "../data/clean/"
RESULT_DATA_DIR = "../data/model_result/"

# Train model with unfilled train data

In [3]:
train_unfilled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
# print(train_unfilled.head())

y = train_unfilled['subjectivePoverty_rating']
print(y)

# y = pd.get_dummies(train_unfilled["subjectivePoverty_rating"], prefix="rating").astype(int)
# print(y)

feature_cols = list(train_unfilled.columns.difference(['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']))
X = train_unfilled[feature_cols]
# print(X.head())
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

0       4
1       1
2       3
3       5
4       4
       ..
5329    3
5330    4
5331    2
5332    5
5333    4
Name: subjectivePoverty_rating, Length: 5334, dtype: int64
(4267, 12) (1067, 12) (4267,) (1067,)


### Hyeprparameter tuning with grid search

In [6]:
params = {
    'n_estimators':[100, 200, 500, 700],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6],
    'min_samples_split': [2, 5, 50],
    'min_samples_leaf': [35, 42, 50],     # 1% of train_x size
}

log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

grid_search = GridSearchCV(RandomForestClassifier(), params, cv=5, scoring=log_loss_scorer, return_train_score=True)
# Fit the model
grid_search.fit(train_x, train_y)



### Get the prediction probability 

In [8]:

best_model = grid_search.best_estimator_
y_val_pred_proba = best_model.predict_proba(test_x)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

val_log_loss = log_loss(test_y, y_val_pred_proba, labels=best_model.classes_)
print("Validation Log Loss:", val_log_loss)


# Stroe the grid search results
results = grid_search.cv_results_
log_loss_scores = results['mean_test_score']  # Mean log loss (negative)
hyperparameters = results['params'] 

results_df = pd.DataFrame(hyperparameters)
results_df['Mean Log Loss'] = -log_loss_scores  # Convert back to positive (lower is better)

# Display the results sorted by Log Loss
results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)
results_df.to_csv(os.path.join(RESULT_DATA_DIR, "rf_unfillled.csv"), index=False)


Best Parameters: {'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 35, 'min_samples_split': 5, 'n_estimators': 500}
Validation Log Loss: 1.9590206849953014


### Class with highest prob

In [None]:
test_input = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
test_input_x = test_input.drop(columns=['psu_hh_idcode'])

col_order = train_x.columns.tolist()
test_input_x = test_input_x[col_order]

print(train_x.head())
print('\n')
print(test_input_x.head())

id = test_input['psu_hh_idcode']
y_val_pred_proba = best_model.predict_proba(test_input_x)

print(y_val_pred_proba)

column_names = [f"subjective_poverty_{i}" for i in range(1, 11)]
probs = pd.DataFrame(y_val_pred_proba, columns=column_names)
submission = pd.concat([id, probs], axis=1)

submission.to_csv(os.path.join(RESULT_DATA_DIR, "submission1.csv"), index=False)

      Q01  Q03  Q06  Q07  Q08   Q11  Q19  q02  q03  q05  q09  q23
3941    1    1  2.0  0.0  2.0  13.0  2.0    1    1   52    0    0
1897    1    1  9.0  2.0  2.0   4.0  2.0    2    4   41    0    1
2229    1    1  2.0  0.0  2.0   1.0  2.0    1    1   32    0    0
4757    1    1  2.0  0.0  1.0   NaN  2.0    1    1   34    3    1
2868    1    1  2.0  0.0  2.0  13.0  2.0    1    1   41    0    0


   Q01  Q03  Q06  Q07  Q08   Q11  Q19  q02  q03  q05  q09  q23
0    1    1  1.0  1.0  2.0  13.0  2.0    1    1   72    0    4
1    1    1  2.0  1.0  2.0  13.0  2.0    1    1   64    0    4
2    1    1  9.0  0.0  2.0  13.0  2.0    1    1   69    0    4
3    1    1  3.0  0.0  2.0   2.0  2.0    1    1   53    0    4
4    1    1  2.0  0.0  2.0   2.0  2.0    1    1   48    0    4
[[0.04056033 0.0851399  0.19301861 ... 0.0312927  0.00423989 0.00025077]
 [0.04114728 0.08614083 0.20324473 ... 0.03251226 0.00309554 0.0002995 ]
 [0.01375626 0.03847356 0.08622709 ... 0.12406508 0.01632572 0.00276134]
 ...
