In [1]:
import os 
import pandas as pd 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, log_loss, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer

# Directory the training CSV is read from (relative path, resolved against the
# notebook's working directory).
CLEAN_DATA_DIR = "../data/clean/"
# Directory the grid-search result table is written to.
RESULT_DATA_DIR = "../data/model_result/"

# Train model with unfilled train data

In [None]:
# Load the merged training table (presumably the variant whose missing values
# were not imputed -- inferred from the file name).
train_path = os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv")
train_unfilled = pd.read_csv(train_path)

# Target: the subjective poverty rating column.
y = train_unfilled['subjectivePoverty_rating']
print(y)

# Features: every remaining column once the two ID-like columns and the target
# are excluded. Index.difference returns the names in sorted order.
non_feature_cols = ['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']
feature_cols = train_unfilled.columns.difference(non_feature_cols).tolist()
X = train_unfilled.loc[:, feature_cols]

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

### Hyperparameter tuning with grid search

In [None]:
# Hyperparameter grid for the random forest: 4 * 2 * 3 * 3 * 3 = 216 candidates,
# each fitted on 5 CV folds.
params = {
    'n_estimators': [100, 200, 500, 700],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6],
    # NOTE(review): a node can only be split when it holds at least
    # 2 * min_samples_leaf samples, so with min_samples_leaf >= 35 every value
    # here (all <= 70) is non-binding and the three settings grow the same
    # trees. Consider collapsing this axis to cut the search to a third.
    'min_samples_split': [2, 5, 50],
    'min_samples_leaf': [35, 42, 50],     # 1% of train_x size
}

# Built-in negated log-loss scorer (higher is better, computed from
# predict_proba). Equivalent to make_scorer(log_loss, greater_is_better=False,
# needs_proba=True), but `needs_proba` is deprecated/removed in recent
# scikit-learn releases, while the scoring string works on every version.
log_loss_scorer = "neg_log_loss"

grid_search = GridSearchCV(
    # Fixed seed so the search (and the selected best model) is reproducible.
    RandomForestClassifier(random_state=42),
    params,
    cv=5,
    scoring=log_loss_scorer,
    return_train_score=True,
    # The candidate fits are independent; use all available cores.
    n_jobs=-1,
)
# Fit the model
grid_search.fit(train_x, train_y)

### Get the prediction probability 

In [None]:

# Best estimator found by the search (refit by GridSearchCV) and its parameters.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Log loss on the held-out split. `labels` is passed explicitly so the
# probability columns are matched to the model's classes even if a class is
# absent from test_y.
y_val_pred_proba = best_model.predict_proba(test_x)
val_log_loss = log_loss(test_y, y_val_pred_proba, labels=best_model.classes_)
print("Validation Log Loss:", val_log_loss)


# Store the grid search results
results = grid_search.cv_results_
log_loss_scores = results['mean_test_score']  # Mean over CV folds of the negated log loss
hyperparameters = results['params']

# One row per candidate: its hyperparameters plus the mean CV log loss.
results_df = pd.DataFrame(hyperparameters)
results_df['Mean Log Loss'] = -log_loss_scores  # Convert back to positive (lower is better)

# Best (lowest log loss) candidates first.
results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)

# Create the output directory if needed so a long search is not lost to a
# missing-folder error at write time.
os.makedirs(RESULT_DATA_DIR, exist_ok=True)
# NOTE(review): the file name is spelled "unfillled" (three l's); kept as-is in
# case other code reads this exact path.
results_df.to_csv(os.path.join(RESULT_DATA_DIR, "rf_unfillled.csv"), index=False)


### Class with the highest predicted probability

In [None]:
# Hard class predictions on the held-out split. They get their own name so the
# probability matrix held in y_val_pred_proba is not overwritten with labels.
y_val_pred = best_model.predict(test_x)
# Distinct classes the model actually predicts.
np.unique(y_val_pred)

array([3., 4., 5.])