In [315]:
import os 
import pandas as pd 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, log_loss, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer

# Directory the cleaned, merged training CSVs are read from (relative to the notebook).
CLEAN_DATA_DIR = "../data/clean/"
# Directory the grid-search result tables are written to as CSV.
RESULT_DATA_DIR = "../data/model_result/"

# Train model with unfilled train data

In [389]:
# Load the cleaned training set that still contains missing values.
unfilled_csv_path = os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv")
train_unfilled = pd.read_csv(unfilled_csv_path)

# Target: the subjective poverty rating column.
y = train_unfilled['subjectivePoverty_rating']
print(y)

# Features: every column except the two identifier columns and the target.
non_feature_cols = ['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']
feature_cols = list(train_unfilled.columns.difference(non_feature_cols))
X = train_unfilled[feature_cols]

# 80/20 hold-out split with a fixed seed so the split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

0       4
1       1
2       3
3       5
4       4
       ..
5329    3
5330    4
5331    2
5332    5
5333    4
Name: subjectivePoverty_rating, Length: 5334, dtype: int64
(4267, 12) (1067, 12) (4267,) (1067,)


### Hyperparameter tuning with grid search

In [384]:
# Hyperparameter grid for the random forest (4 * 2 * 3 * 3 * 3 = 216 candidates).
# NOTE: scikit-learn uses max(min_samples_split, 2 * min_samples_leaf) as the
# effective split threshold, so with min_samples_leaf >= 35 the three
# min_samples_split values below produce the same trees.
params = {
    'n_estimators': [100, 200, 500, 700],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6],
    'min_samples_split': [2, 5, 50],
    'min_samples_leaf': [35, 42, 50],     # about 1% of train_x size
}

# Built-in negated log-loss scorer (higher is better, computed from predict_proba).
# It replaces make_scorer(log_loss, greater_is_better=False, needs_proba=True):
# `needs_proba` was deprecated in scikit-learn 1.4 and removed in 1.6.
log_loss_scorer = 'neg_log_loss'

# A fixed random_state makes the forest, and therefore the search, reproducible.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    params,
    cv=5,
    scoring=log_loss_scorer,
    return_train_score=True,
)
# Fit the model
grid_search.fit(train_x, train_y)

  _data = np.array(data, dtype=dtype, copy=copy,


### Get the prediction probability 

In [386]:

# Evaluate the best forest found by the grid search on the hold-out split.
best_model = grid_search.best_estimator_
y_val_pred_proba = best_model.predict_proba(test_x)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# labels= keeps the probability columns matched to the model's classes even if
# some class does not occur in test_y.
val_log_loss = log_loss(test_y, y_val_pred_proba, labels=best_model.classes_)
print("Validation Log Loss:", val_log_loss)


# Store the grid search results
results = grid_search.cv_results_
log_loss_scores = results['mean_test_score']  # Mean log loss (negative)
hyperparameters = results['params']

results_df = pd.DataFrame(hyperparameters)
results_df['Mean Log Loss'] = -log_loss_scores  # Convert back to positive (lower is better)

# Sort by log loss (best first) and write the table to disk.
results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)
# Create the output directory if needed so the write cannot fail after the
# expensive search has finished.
os.makedirs(RESULT_DATA_DIR, exist_ok=True)
# File name is kept exactly as before so existing consumers still find it.
results_df.to_csv(os.path.join(RESULT_DATA_DIR, "rf_unfillled.csv"), index=False)


Best Parameters: {'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 35, 'min_samples_split': 50, 'n_estimators': 200}
Validation Log Loss: 1.9573764105959


In [388]:
# Hard class predictions (not probabilities). They get their own name so the
# probability matrix held in y_val_pred_proba is not overwritten.
y_val_pred = best_model.predict(test_x)
# Which rating classes the model ever predicts on the hold-out split.
np.unique(y_val_pred)

array([3, 4, 5, 6, 7])

#### test1
Best Parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Validation Log Loss: 1.9648590584269776

#### test2
Best Parameters: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 100}
Validation Log Loss: 1.956265596544104

#### test3
Best Parameters: {'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 35, 'min_samples_split': 50, 'n_estimators': 500}
Validation Log Loss: 1.9587117477784433

### Apply model on filled dataset

In [288]:
# Score the current best_model on a hold-out split of the filled dataset.
train_filled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))

filled_y = train_filled['rating_filled']
filled_feature_cols = list(train_filled.columns.difference(['psu_hh_idcode', 'hhid', 'rating_filled']))
filled_X = train_filled[filled_feature_cols]

ftrain_x, ftest_x, ftrain_y, ftest_y = train_test_split(filled_X, filled_y, test_size = 0.2, random_state = 42)
print(ftrain_x.shape, ftest_x.shape, ftrain_y.shape, ftest_y.shape)

# Pass the model exactly the columns it was fitted on, in the fitted order.
# The filled file may carry extra columns, which would otherwise make
# predict_proba fail on a feature-name mismatch.
model_feature_cols = list(best_model.feature_names_in_)
fy_val_pred_proba = best_model.predict_proba(ftest_x[model_feature_cols])
# NOTE(review): the filled file appears to contain the same households the
# model was trained on, so this score may be optimistic. Confirm.
fval_log_loss = log_loss(ftest_y, fy_val_pred_proba, labels=best_model.classes_)
print("Validation Log Loss:", fval_log_loss)



(16210, 12) (4053, 12) (16210,) (4053,)
Validation Log Loss: 1.9497426049116517


Best Parameters: {'max_depth': 4, 'max_features': 'log2', 'min_samples_leaf': 8, 'min_samples_split': 16, 'n_estimators': 200}
Validation Log Loss: 1.9485769679636464


# Train model with filled data

In [None]:
# Read the filled training set and show its first rows.
filled_csv_path = os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv")
train_filled = pd.read_csv(filled_csv_path)
train_filled.head()

Unnamed: 0.1,Unnamed: 0,psu_hh_idcode,hhid,rating_filled,q02,q03,q04,q05,q09,q23,Q01,Q03,Q06,Q07,Q08,Q11,Q19
0,0,30_8_1,3008,4.0,1,1,19680615,44,0,0,1.0,1.0,2.0,1.0,2.0,13.0,2.0
1,1,194_1_2,19401,1.0,2,2,19640910,48,0,0,1.0,1.0,2.0,0.0,2.0,13.0,2.0
2,2,224_6_1,22406,3.0,1,1,19510317,61,0,0,1.0,1.0,2.0,0.0,2.0,13.0,2.0
3,3,323_10_1,32310,5.0,1,1,19460402,66,0,0,1.0,1.0,2.0,0.0,2.0,13.0,2.0
4,4,428_10_1,42810,4.0,2,1,19400407,72,0,0,1.0,1.0,1.0,0.0,2.0,14.0,2.0


In [294]:
# Load the filled training set and build the train/validation split used for
# the models trained on filled data.
train_filled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))

filled_y = train_filled['rating_filled']

# Identifiers, the target, and any leftover "Unnamed: N" columns are not features.
# The "Unnamed" columns are row indices saved by an earlier to_csv call.
index_artifact_cols = [col for col in train_filled.columns if str(col).startswith('Unnamed')]
non_feature_cols = ['psu_hh_idcode', 'hhid', 'rating_filled'] + index_artifact_cols
filled_feature_cols = list(train_filled.columns.difference(non_feature_cols))
filled_X = train_filled[filled_feature_cols]

ftrain_x, ftest_x, ftrain_y, ftest_y = train_test_split(filled_X, filled_y, test_size = 0.2, random_state = 42)
print(ftrain_x.shape, ftest_x.shape, ftrain_y.shape, ftest_y.shape)

(16210, 12) (4053, 12) (16210,) (4053,)


## Apply Random Forest

### Hyperparameter Tuning

In [290]:
# Hyperparameter grid for the random forest on the filled data
# (2 * 3 * 3 * 2 * 2 = 72 candidates).
# NOTE: scikit-learn uses max(min_samples_split, 2 * min_samples_leaf) as the
# effective split threshold, so with min_samples_leaf in {8, 12} the
# min_samples_split values {8, 16} do not change the fitted trees.
params = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [8, 12],
    'min_samples_split': [8, 16],
}

# Built-in negated log-loss scorer (higher is better, computed from predict_proba).
# It replaces make_scorer(log_loss, greater_is_better=False, needs_proba=True):
# `needs_proba` was deprecated in scikit-learn 1.4 and removed in 1.6.
log_loss_scorer = 'neg_log_loss'

# A fixed random_state makes the forest, and therefore the search, reproducible.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    params,
    cv=5,
    scoring=log_loss_scorer,
    return_train_score=True,
)
# Fit the model
grid_search.fit(ftrain_x, ftrain_y)

In [293]:
# Evaluate the best forest found by the grid search.
best_model = grid_search.best_estimator_
# NOTE(review): this scores on test_x / test_y rather than ftest_x / ftest_y.
# Presumably that is deliberate, but confirm the intended validation set.
y_val_pred_proba = best_model.predict_proba(test_x)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# labels= keeps the probability columns matched to the model's classes even if
# some class does not occur in test_y.
val_log_loss = log_loss(test_y, y_val_pred_proba, labels=best_model.classes_)
print("Validation Log Loss:", val_log_loss)


# Store the grid search results
results = grid_search.cv_results_
log_loss_scores = results['mean_test_score']  # Mean log loss (negative)
hyperparameters = results['params']

results_df = pd.DataFrame(hyperparameters)
results_df['Mean Log Loss'] = -log_loss_scores  # Convert back to positive (lower is better)

# Sort by log loss (best first) and write the table to disk.
results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)
# Create the output directory if needed so the write cannot fail after the
# expensive search has finished.
os.makedirs(RESULT_DATA_DIR, exist_ok=True)
# File name is kept exactly as before so existing consumers still find it.
results_df.to_csv(os.path.join(RESULT_DATA_DIR, "rf_fillled.csv"), index=False)

Best Parameters: {'max_depth': 4, 'max_features': 'log2', 'min_samples_leaf': 8, 'min_samples_split': 16, 'n_estimators': 200}
Validation Log Loss: 1.9485769679636464


# SVM

## Apply SVM on unfilled dataset

### Missing Values

In [348]:
# Work on a copy: a plain assignment would only alias train_unfilled, so the
# fill below would also overwrite its NaNs and make this cell give a different
# result (no missing columns found) when run a second time.
svm_unfilled = train_unfilled.copy()

missing_columns = svm_unfilled.columns[svm_unfilled.isnull().any()].tolist()
print( 'Columns with na values are: ', missing_columns)

# Replace NaN with the sentinel value -1 in every column that has missing values.
# The list is taken from the data instead of being hard-coded.
for col in missing_columns:
    svm_unfilled[col] = svm_unfilled[col].fillna(-1)

Columns with na values are:  ['Q06', 'Q07', 'Q08', 'Q11', 'Q19']


In [350]:
# One-hot encode the columns that had missing values (now filled with a sentinel),
# treating each distinct value as its own category.
encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded = encoder.fit_transform(svm_unfilled[missing_columns])


# Wrap the encoded array in a DataFrame that reuses the source frame's index.
# The column-wise concat below aligns on index labels, so a default 0..n-1
# index here would misalign rows if svm_unfilled were ever filtered or re-indexed.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(missing_columns),
    index=svm_unfilled.index,
)
numerical_df = svm_unfilled.drop(columns=missing_columns)

# Combine the untouched columns with the one-hot encoded ones
processed_df = pd.concat([numerical_df, encoded_df], axis=1)

print("\nAfter One-Hot Encoding:")
print(processed_df)


After One-Hot Encoding:
     psu_hh_idcode   hhid  subjectivePoverty_rating  q02  q03  q05  q09  q23  \
0           30_8_1   3008                         4    1    1   44    0    0   
1          194_1_2  19401                         1    2    2   48    0    0   
2          224_6_1  22406                         3    1    1   61    0    0   
3         323_10_1  32310                         5    1    1   66    0    0   
4         428_10_1  42810                         4    2    1   72    0    0   
...            ...    ...                       ...  ...  ...  ...  ...  ...   
5329       571_8_1  57108                         3    2    1   73    0    0   
5330       601_5_1  60105                         4    1    1   60    0    0   
5331       782_1_1  78201                         2    1    1   55    0    0   
5332       606_3_1  60603                         5    1    1   53    0    1   
5333       450_4_1  45004                         4    1    1   78    0    0   

      Q01  Q03

In [353]:
# Target and feature matrix for the SVM, taken from the one-hot encoded frame.
y = processed_df['subjectivePoverty_rating']
print(y)

feature_cols = list(processed_df.columns.difference(['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']))
X = processed_df[feature_cols]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

# Standardize the features for the RBF-kernel SVM. The scaler is fitted on the
# training split only, so no hold-out statistics leak into training. The SVM
# cells read X_train_scaled / X_test_scaled.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_x)
X_test_scaled = scaler.transform(test_x)

0       4
1       1
2       3
3       5
4       4
       ..
5329    3
5330    4
5331    2
5332    5
5333    4
Name: subjectivePoverty_rating, Length: 5334, dtype: int64
(4267, 45) (1067, 45) (4267,) (1067,)


In [372]:
# GridSearch CV over C and gamma for an RBF-kernel SVM (4 * 5 = 20 candidates).
param_grid = {
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

# Built-in negated log-loss scorer (higher is better). The previous
# make_scorer(..., needs_proba=True) object was never passed to the search, and
# `needs_proba` was removed in scikit-learn 1.6, so that line would raise there.
log_loss_scorer = 'neg_log_loss'
# probability=True is required so SVC exposes predict_proba for the scorer.
optimal_params = GridSearchCV(SVC(probability=True, random_state=42), param_grid, cv=5, scoring=log_loss_scorer, verbose=2)
optimal_params.fit(X_train_scaled, train_y)
print("Best Parameters:", optimal_params.best_params_)
# best_score_ is the negated log loss; flip the sign to report the real value.
print("Best Log Loss:", -optimal_params.best_score_)



# Recorded from an earlier run (printed before the sign was flipped):
# Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
# Best Log Loss: -1.948786842738619




Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.4s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.3s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.3s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.3s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.3s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   2.8s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   2.8s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   3.0s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   3.0s
[CV] END .........................C=0.5, gamma=1, kernel=rbf; total time=   3.0s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   2.8s
[CV] END .......................C=0.5, gamma=0.

In [374]:
# Refit an SVM with the best grid-search parameters and score it on the hold-out split.
# Recorded from an earlier grid search (negated log loss):
# Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
# Best Log Loss: -1.948546531080508
svm_model = SVC(C=10, gamma=0.01, kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train_scaled, train_y)
pred = svm_model.predict_proba(X_test_scaled)
# Keep and print the score: a bare expression in the middle of a cell is discarded.
# labels= keeps the probability columns matched to the model's classes even if
# some class does not occur in test_y.
svm_val_log_loss = log_loss(test_y, pred, labels=svm_model.classes_)
print("Validation Log Loss:", svm_val_log_loss)
# Earlier run: 1.9642149345026059

# `.size` is an integer attribute, so calling it raised TypeError; show the shape instead.
X_test_scaled.shape

KeyboardInterrupt: 

In [365]:
# Scratch cell holding a bare number; presumably the SVM validation log loss
# copied from an earlier run. TODO confirm, or delete this cell.
1.9642149345026059

1.9642149345026059