In [1]:
import numpy as np
import os
import pandas as pd
CLEAN_DATA_DIR = "../data/clean/"

# Read the two cleaned training splits from CLEAN_DATA_DIR in one pass;
# the generator is unpacked positionally, so the tuple order below fixes
# which file ends up in train_A and which in train_B.
train_A, train_B = (
    pd.read_csv(os.path.join(CLEAN_DATA_DIR, split_file))
    for split_file in ("TRAIN_A.csv", "TRAIN_B.csv")
)

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, log_loss

def generate_best_RF_model(train_A, random_state=101):
    """Grid-search a random forest on ``train_A`` and return the best estimator.

    Parameters
    ----------
    train_A : pandas.DataFrame
        Training frame. Must contain the identifier column 'psu_hh_idcode'
        (dropped before fitting) and the target column
        'subjectivePoverty_rating'. Every remaining column is used as a
        feature.
    random_state : int or None, default 101
        Seed handed to ``RandomForestClassifier`` so repeated runs select and
        fit the same forest. The default matches the seed used by the XGBoost
        search in this notebook. Pass ``None`` to get unseeded forests.

    Returns
    -------
    RandomForestClassifier
        ``best_estimator_`` of the grid search, i.e. the candidate with the
        best mean cross-validated ``neg_log_loss``.
    """
    X = train_A.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])
    y = train_A['subjectivePoverty_rating']

    # Hyper-parameter grid explored exhaustively (2 * 2 * 2 * 3 * 3 = 72 candidates).
    # NOTE(review): with min_samples_leaf >= 35 a node needs at least 70 samples
    # before any split is allowed, so the min_samples_split values below
    # (all < 70) never bind -- two thirds of the candidates are effectively
    # duplicates. The grid is left unchanged here; consider pruning it.
    params = {
        'n_estimators': [ 200, 500],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [4, 5],
        'min_samples_split': [2, 5, 50],
        'min_samples_leaf': [35, 42, 50],
    }

    # 'neg_log_loss' is scikit-learn's built-in probabilistic scorer, so no
    # hand-built make_scorer wrapper is needed. Train scores are not requested
    # because cv_results_ is never inspected -- only the best estimator is
    # returned -- and computing them would only add run time.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        param_grid=params,
        scoring='neg_log_loss',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )

    grid_search.fit(X, y)
    return grid_search.best_estimator_

# def predict_ratings_RF(model, data):
#     test_input_x = data.drop(columns=['psu_hh_idcode'])

#     id = data['psu_hh_idcode']
#     y_val_pred_proba = model.predict_proba(test_input_x)

#     column_names = [f"subjective_poverty_{i}" for i in range(1, 11)]
#     probs = pd.DataFrame(y_val_pred_proba, columns=column_names)
#     pred = pd.concat([id, probs], axis=1)

#     return pred

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

def generate_best_XGB_model(train_data):
    """Grid-search an XGBoost multi-class classifier and return the best one.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain 'psu_hh_idcode' (dropped before fitting) and the target
        'subjectivePoverty_rating'; all other columns are used as features.

    Returns
    -------
    xgboost.XGBClassifier
        ``best_estimator_`` of a 5-fold grid search scored by 'neg_log_loss'.
        The model is trained on the target minus one, so its class indices
        are one lower than the values in 'subjectivePoverty_rating'.
    """
    target_col = 'subjectivePoverty_rating'
    features = train_data.drop(columns=['psu_hh_idcode', target_col])
    # Labels are shifted down by one before fitting
    # (presumably because ratings start at 1 -- TODO confirm).
    shifted_labels = train_data[target_col] - 1

    search_space = {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [1, 3, 5],
        'n_estimators': [50, 100, 200],
        'subsample': [0.3, 0.5, 0.7],
        'colsample_bytree': [0.4, 0.6, 0.8],
    }

    # Soft-probability multi-class booster with a fixed seed.
    booster = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=101,
    )

    # Exhaustive search over search_space, ranked by cross-validated log loss.
    search = GridSearchCV(
        estimator=booster,
        param_grid=search_space,
        scoring='neg_log_loss',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(features, shifted_labels)

    return search.best_estimator_

In [16]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def predict_ratings_RF(model, train_B_X):
    """Return per-class probabilities from ``model`` next to the household ids.

    Parameters
    ----------
    model : object exposing ``predict_proba``
        Called once with ``train_B_X`` minus the 'psu_hh_idcode' column.
    train_B_X : pandas.DataFrame
        Feature frame that includes a 'psu_hh_idcode' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        First column 'psu_hh_idcode', followed by one column per predicted
        class named 'subjective_poverty_1', 'subjective_poverty_2', ... in the
        positional order returned by ``predict_proba``.
    """
    id_column = 'psu_hh_idcode'
    household_ids = train_B_X[id_column].values
    probabilities = model.predict_proba(train_B_X.drop(columns=[id_column]))

    # Columns are numbered from 1 by position in the probability matrix.
    n_classes = probabilities.shape[1]
    class_columns = [f'subjective_poverty_{rank}' for rank in range(1, n_classes + 1)]

    result = pd.DataFrame(probabilities, columns=class_columns)
    result.insert(0, id_column, household_ids)  # ids go in front
    return result

def predict_ratings_XGB(model, train_B_X):
    """Build a probability table from ``model.predict_proba`` keyed by household id.

    Parameters
    ----------
    model : object exposing ``predict_proba``
        Receives every column of ``train_B_X`` except 'psu_hh_idcode'.
    train_B_X : pandas.DataFrame
        Input features plus the 'psu_hh_idcode' column. Left untouched.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' first, then 'subjective_poverty_1' ... one column per
        column of the probability matrix, numbered from 1 by position.
    """
    ids = train_B_X['psu_hh_idcode']
    feature_frame = train_B_X.drop(columns=['psu_hh_idcode'])
    class_probs = model.predict_proba(feature_frame)

    header = []
    for position in range(class_probs.shape[1]):
        # Position 0 becomes label 1, position 1 becomes label 2, and so on.
        header.append(f'subjective_poverty_{position + 1}')

    table = pd.DataFrame(class_probs, columns=header)
    # Raw id values (not the Series) so the original index is ignored.
    table.insert(0, 'psu_hh_idcode', ids.values)
    return table

def predict_ratings_SVM(model, train_B_X, encoder=None, scaler=None):
    """One-hot encode, scale, and score ``train_B_X`` with ``model``.

    The features are preprocessed first (categorical columns one-hot encoded,
    then everything standardised) and ``model.predict_proba`` is called exactly
    once, on the preprocessed matrix.

    Parameters
    ----------
    model : object exposing ``predict_proba``
        Classifier that expects the encoded and scaled feature matrix.
    train_B_X : pandas.DataFrame
        Feature frame that includes a 'psu_hh_idcode' column. Not modified.
    encoder : fitted one-hot encoder, optional
        Encoder fitted at training time. When it exposes ``feature_names_in_``
        those columns are encoded; otherwise the columns containing the value
        -1 are treated as categorical, as in the fallback below.
    scaler : fitted scaler, optional
        Scaler fitted at training time.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' followed by 'subjective_poverty_1', ... one column per
        class probability, numbered from 1 by position.

    Notes
    -----
    When ``encoder`` or ``scaler`` is omitted, a fresh one is fitted on
    ``train_B_X`` itself. That keeps the two-argument call working, but the
    resulting columns and scaling statistics are only guaranteed to match what
    ``model`` was trained on if the training-time transformers are passed in.
    """
    test_ids = train_B_X['psu_hh_idcode']
    features = train_B_X.drop(columns=['psu_hh_idcode'])

    # Decide which columns get one-hot encoded.
    if encoder is not None and hasattr(encoder, 'feature_names_in_'):
        categorical_columns = list(encoder.feature_names_in_)
    else:
        # Fallback heuristic: a column is categorical if the value -1 occurs in it.
        categorical_columns = [col for col in features.columns if (features[col] == -1).any()]

    numerical_df = features.drop(columns=categorical_columns)

    if categorical_columns:
        if encoder is None:
            encoder = OneHotEncoder(sparse_output=False, drop=None)
            encoded = encoder.fit_transform(features[categorical_columns])
        else:
            encoded = encoder.transform(features[categorical_columns])
        if hasattr(encoded, 'toarray'):
            # A caller-supplied encoder may emit a sparse matrix.
            encoded = encoded.toarray()

        # Reuse the input index so concat lines rows up one-to-one; a default
        # RangeIndex would misalign (and introduce NaNs) whenever the input
        # frame is not indexed 0..n-1.
        encoded_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out(categorical_columns),
            index=features.index,
        )
        processed = pd.concat([numerical_df, encoded_df], axis=1)
    else:
        # Nothing to encode: the encoder cannot be fitted on zero columns.
        processed = numerical_df

    if scaler is None:
        # An unfitted StandardScaler cannot transform; fit it here as a fallback.
        scaler = StandardScaler()
        scaled = scaler.fit_transform(processed)
    else:
        scaled = scaler.transform(processed)

    # Score the preprocessed matrix; these are the probabilities that are returned.
    preds_proba = model.predict_proba(scaled)

    output_df = pd.DataFrame(
        preds_proba,
        columns=[f'subjective_poverty_{i+1}' for i in range(preds_proba.shape[1])],
    )
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df
    

In [27]:
# Run the random-forest grid search on train_A and keep the estimator it returns.
model_rf = generate_best_RF_model(train_A)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [38]:
# Bare expression: shows the feature names stored on the fitted model as the cell output.
model_rf.feature_names_in_

array(['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03', 'Q06', 'Q07',
       'Q08', 'Q11', 'Q19'], dtype=object)

In [30]:
# Features of the second split: train_B without the target column.
train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])
P_RF = predict_ratings_RF(model_rf, train_B_X)

# to_csv does not create missing folders, so make sure the output directory
# exists before writing; the path is built from its parts instead of wrapping
# a single finished string in os.path.join (which is a no-op).
PREDS_DIR = "../data/train_B_preds"
os.makedirs(PREDS_DIR, exist_ok=True)
P_RF.to_csv(os.path.join(PREDS_DIR, "train_B_preds_rf.csv"), index=False)

In [29]:
# Save model
from joblib import dump

# dump() writes to the given path but does not create missing parent folders,
# so create the target directory first (no-op when it already exists).
os.makedirs("saved_models", exist_ok=True)
dump(model_rf, "saved_models/rf_trained_on_filled_A.joblib")

['saved_models/rf_trained_on_filled_A.joblib']