In [1]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import log_loss
import xgboost as xgb
import pandas as pd
import numpy as np
import os

from bayes_opt import BayesianOptimization
import xgboost as xgb
MODEL_TRAINING_DATA_DIR = "../data/model_training/"

In [19]:
def predict_ratings_XGB(model, train_B_X):
    test_ids = train_B_X['psu_hh_idcode']
    sp = train_B_X['subjectivePoverty_rating']
    train_B_X = train_B_X.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])
    preds_proba = model.predict_proba(train_B_X)
    print(log_loss(sp, preds_proba))
    output_df = pd.DataFrame(preds_proba, columns=[f'subjective_poverty_{i+1}' for i in range(preds_proba.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

In [None]:
def generate_best_XGB_model_bayesian(train_data):

    # Prepare training data
    X_train = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'], axis='columns')
    y_train = train_data['subjectivePoverty_rating'] - 1
    
    # Split into training and validation sets for Bayesian Optimization
    X_train_bayes, X_val, y_train_bayes, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)
    
    # Define the objective function for Bayesian Optimization
    def xgb_evaluate(learning_rate, max_depth, n_estimators, gamma, colsample_bytree, subsample, eta, reg_lambda, reg_alpha, min_child_weight):
        params = {
            'learning_rate': learning_rate,
            'max_depth': int(max_depth),  # Must be an integer
            'gamma': gamma,
            'colsample_bytree': colsample_bytree,
            'subsample': subsample,
            'eta': eta,
            'objective': 'multi:softprob',
            'num_class': len(np.unique(y_train)),
            'eval_metric': 'mlogloss',
            'reg_lambda': reg_lambda,
            'reg_alpha': reg_alpha,
             'min_child_weight': min_child_weight,
            'use_label_encoder': False,
            'verbosity': 0,
            'random_state': 42,
            'n_estimators': int(n_estimators),
        }
        
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_bayes, y_train_bayes, eval_set=[(X_val, y_val)], verbose=False)
        y_pred = model.predict_proba(X_val)
        return -log_loss(y_val, y_pred)  # Negative log loss (BayesianOptimization maximizes the function)
    
    # Define the bounds for hyperparameters
    param_bounds = {
        'learning_rate': (0.01, 0.3),
        'max_depth': (3, 10),
        'n_estimators': (100, 500),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
        'gamma': (0, 0.5),
        'reg_alpha': (0, 1),
        'reg_lambda': (0.1, 10),
        'min_child_weight': (1, 5),
        'eta': (0.01, 0.2)
    }
    
    # Initialize Bayesian Optimizer
    optimizer = BayesianOptimization(f=xgb_evaluate, pbounds=param_bounds, random_state=42, verbose=2)
    
    # Maximize the objective function
    optimizer.maximize(init_points=10, n_iter=30)

    best_params = optimizer.max['params']
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['n_estimators'] = int(best_params['n_estimators'])

    best_model_xgb = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42,
        **best_params
    )

    best_model_xgb.fit(X_train, y_train)

    return best_model_xgb


In [31]:
train_data = pd.read_csv("../data/model_training/TRAIN_MERGED_UNFILLED_encoded.csv") # FILLED OR UNFILLED
train_A, train_B = train_test_split(train_data, test_size=0.25, stratify=train_data['subjectivePoverty_rating'], random_state=42)
xgb_model = generate_best_XGB_model_bayesian(train_A)

|   iter    |  target   | colsam... |    eta    |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------


ValueError: too many values to unpack (expected 2)

In [22]:
# train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])
pred = predict_ratings_XGB(xgb_model, train_B)
display(pred)

1.9448122734844189


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,125_11_1,0.053198,0.147047,0.205009,0.227987,0.166126,0.101554,0.053130,0.029398,0.009781,0.006770
1,129_9_1,0.037607,0.095309,0.125541,0.204742,0.234721,0.175700,0.067691,0.039480,0.011303,0.007906
2,800_8_1,0.024932,0.062987,0.134589,0.238510,0.202400,0.162890,0.100470,0.046250,0.020191,0.006780
3,472_1_1,0.017377,0.033107,0.073200,0.158520,0.204500,0.161609,0.145780,0.174740,0.024756,0.006410
4,309_11_1,0.024061,0.070031,0.145830,0.235331,0.270069,0.124617,0.079546,0.034306,0.009698,0.006510
...,...,...,...,...,...,...,...,...,...,...,...
1329,588_5_1,0.073820,0.132011,0.184935,0.209218,0.176477,0.114375,0.066254,0.026385,0.009803,0.006724
1330,566_4_1,0.031436,0.079376,0.213136,0.204183,0.209229,0.167167,0.052214,0.025665,0.010352,0.007241
1331,220_7_1,0.023395,0.067403,0.134976,0.206999,0.200921,0.191419,0.116847,0.040777,0.010424,0.006840
1332,144_5_2,0.042511,0.031137,0.173549,0.118494,0.187920,0.209207,0.119438,0.079145,0.031575,0.007025
