In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
# Relative path to the directory holding the model-training data. Being
# relative, it resolves against the kernel's current working directory.
MODEL_TRAINING_DATA_DIR = "../data/model_training/"

In [19]:
def predict_ratings_XGB(model, train_B_X):
    """Predict class probabilities for a labelled frame and report its log loss.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``.
    train_B_X : pandas.DataFrame
        Frame containing the ``psu_hh_idcode`` and
        ``subjectivePoverty_rating`` columns; every other column is passed
        to the model as a feature. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        ``psu_hh_idcode`` followed by one ``subjective_poverty_<k>`` column
        per probability column returned by the model (``k`` starts at 1).

    Side effects
    ------------
    Prints the multi-class log loss of the predictions against
    ``subjectivePoverty_rating``.
    """
    id_column = 'psu_hh_idcode'
    target_column = 'subjectivePoverty_rating'

    household_ids = train_B_X[id_column]
    true_ratings = train_B_X[target_column]
    features = train_B_X.drop(columns=[id_column, target_column])

    preds_proba = model.predict_proba(features)
    n_classes = preds_proba.shape[1]

    # Probability column i is reported below as rating i + 1, so pass that
    # mapping to log_loss explicitly. Without `labels`, log_loss infers the
    # class set from the ratings present in this frame and fails whenever a
    # rating is absent from it.
    # NOTE(review): assumes ratings are the consecutive integers 1..n_classes
    # - confirm against the data.
    rating_labels = np.arange(1, n_classes + 1)
    print(log_loss(true_ratings, preds_proba, labels=rating_labels))

    output_df = pd.DataFrame(
        preds_proba,
        columns=[f'subjective_poverty_{i+1}' for i in range(n_classes)],
    )
    # Put the household ID first; `.values` discards the source index so the
    # IDs line up positionally with the prediction rows.
    output_df.insert(0, 'psu_hh_idcode', household_ids.values)
    return output_df

In [None]:
def generate_best_XGB_model(train_data, param_grid=None, cv=3):
    """Grid-search an XGBoost multi-class classifier and return the best model.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame containing ``psu_hh_idcode`` (dropped),
        ``subjectivePoverty_rating`` (the target) and the feature columns.
    param_grid : dict, optional
        Hyper-parameter grid handed to ``GridSearchCV``. When omitted, the
        built-in grid below is used: 3888 combinations, i.e. 11664 fits with
        the default 3 folds, so expect a long run time.
    cv : int or cross-validation splitter, optional
        Forwarded to ``GridSearchCV`` as ``cv``. Defaults to 3.

    Returns
    -------
    xgboost.XGBClassifier
        ``best_estimator_`` of the completed search.

    Side effects
    ------------
    Prints progress messages, the best parameters and the best
    cross-validated log loss.
    """
    X_train = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])
    # Shift the ratings down by one so the class labels start at 0
    # (assumes the ratings start at 1 - TODO confirm).
    y_train = train_data['subjectivePoverty_rating'] - 1

    if param_grid is None:
        param_grid = {
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'n_estimators': [100, 200, 300, 500],
            'subsample': [0.5, 0.7, 0.9],
            'colsample_bytree': [0.6, 0.8],
            'gamma': [0, 0.1, 0.3],
            'reg_alpha': [0, 0.1, 0.5],
            'reg_lambda': [1, 5],
        }

    # Base estimator; the searched hyper-parameters are overridden per candidate.
    xgb_model = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42
    )

    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        scoring='neg_log_loss',  # Use log loss as the evaluation metric
        cv=cv,
        verbose=1,
        n_jobs=-1
    )

    print("Starting GridSearchCV...")
    grid_search.fit(X_train, y_train)
    print("GridSearchCV Completed...")

    best_model_xgb = grid_search.best_estimator_
    print("Best Parameters:", grid_search.best_params_)
    # The scorer is the negated log loss, so flip the sign back for display.
    print("Best Log Loss Score:", -grid_search.best_score_)

    return best_model_xgb