# xgBoost

In [3]:
import pandas as pd
import numpy as np 
import os 

CLEAN_DATA_DIR = "../data/clean/"

In [31]:
from sklearn.model_selection import train_test_split

data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
X_train, X_test, y_train, y_test = train_test_split(data.drop(['psu_hh_idcode', 'subjectivePoverty_rating'], axis='columns'), data['subjectivePoverty_rating'], test_size=0.2, random_state=101)

y_train = y_train - 1
y_test = y_test - 1

from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [1, 3, 5],
    'n_estimators': [50, 100, 200],
    'subsample': [0.3, 0.5, 0.7],
    'colsample_bytree': [0.4, 0.6, 0.8]
}

# Create the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', random_state=101)

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_log_loss',  # Use log loss as the evaluation metric
    cv=5,                    
    verbose=1,               
    n_jobs=-1                
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Log Loss Score:", -grid_search.best_score_)

best_model = grid_search.best_estimator_
preds = best_model.predict(X_test)
preds_proba = best_model.predict_proba(X_test)

# Evaluate log loss on the test data
from sklearn.metrics import log_loss
print(f"\nLog Loss from test: {log_loss(y_test, preds_proba)}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.4, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.7}
Best Log Loss Score: 1.8725519127904497

Log Loss from test: 1.7991903277100496


In [30]:
from sklearn.model_selection import train_test_split

data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))
X_train, X_test, y_train, y_test = train_test_split(data.drop(['psu_hh_idcode', 'subjectivePoverty_rating'], axis='columns'), data['subjectivePoverty_rating'], test_size=0.2, random_state=101)

y_train = y_train - 1
y_test = y_test - 1

from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [1, 3, 5],
    'n_estimators': [50, 100, 20],
    'subsample': [0.3, 0.5, 0.7],
    'colsample_bytree': [0.4, 0.6, 0.8]
}

# Create the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', random_state=101)

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_log_loss',  # Use log loss as the evaluation metric
    cv=5,                    
    verbose=1,               
    n_jobs=-1                
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Log Loss Score:", -grid_search.best_score_)

best_model_filled = grid_search.best_estimator_
preds = best_model_filled.predict(X_test)
preds_proba = best_model_filled.predict_proba(X_test)

# Evaluate log loss on the test data
from sklearn.metrics import log_loss
print(f"\nLog Loss from test: {log_loss(y_test, preds_proba)}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
Best Log Loss Score: 1.9343637203324355

Log Loss from test: 1.9312889728928007


In [33]:
preds_proba.shape

(1067, 10)

# Generating Predictions on TEST_INPUT.csv

In [28]:
from sklearn.metrics import log_loss

def generate_predictions(model, test_file_path=os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv")):
    X_test = pd.read_csv(test_file_path)
    test_ids = X_test['psu_hh_idcode']
    X_test = X_test.drop(columns=['psu_hh_idcode'])      
    preds_proba = model.predict_proba(X_test)

    # Create the output DataFrame
    output_df = pd.DataFrame(preds_proba, columns=[f'subjective_poverty_{i+1}' for i in range(preds_proba.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

output = generate_predictions(best_model, os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
output.to_csv(os.path.join("../data/model_result/xgboost_unfilled.csv"), index=False)
output.head(3)

Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.042289,0.097317,0.163744,0.236323,0.245543,0.125196,0.054663,0.029011,0.004156,0.001758
1,1_8_1,0.040327,0.085762,0.194763,0.234413,0.232981,0.11393,0.062237,0.029503,0.004539,0.001545
2,1_10_1,0.011831,0.025004,0.064346,0.133497,0.281209,0.190594,0.127226,0.135708,0.022629,0.007955


Baseline log loss = $-log(\frac{1}{\#classes}) = -log(\frac{1}{10}) = 2.303$

In [None]:
print("unique values in test classes:", sorted(list(y_test.unique())))
print("unique values in predicted classes:", sorted(list(np.unique(preds))))

In [6]:
import matplotlib.pyplot as plt
import pandas as pd

# Assuming y_test is a Pandas Series

# Calculate the proportion of each value
value_counts = y_test.value_counts(normalize=True).sort_index() 
print(value_counts)

subjectivePoverty_rating
0    0.035614
1    0.087160
2    0.164948
3    0.192127
4    0.206186
5    0.157451
6    0.086223
7    0.057170
8    0.010309
9    0.002812
Name: proportion, dtype: float64
