# XGBoost

In [3]:
import pandas as pd
import numpy as np 
import os 

# Directory containing the cleaned CSV files read by this notebook
# (TRAIN_MERGED_UNFILLED.csv and TEST_INPUT.csv); relative to the working directory.
CLEAN_DATA_DIR = "../data/clean/"

In [5]:
from sklearn.model_selection import train_test_split

# Load the merged training table.
data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))

# Hold out 20% of the rows for evaluation. The two ID-like columns and the
# target itself are removed from the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']),
    data['subjectivePoverty_rating'],
    test_size=0.2,
    random_state=101,
)

# Shift the labels down by one so that the class indices start at 0.
y_train, y_test = y_train - 1, y_test - 1

from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Hyper-parameter grid: 4 values for each of 5 parameters -> 4**5 = 1024 candidates.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[1, 3, 5, 7],
    n_estimators=[50, 100, 200, 500],
    subsample=[0.3, 0.5, 0.7, 0.9],
    colsample_bytree=[0.4, 0.6, 0.8, 1.0],
)

# Multi-class XGBoost classifier that outputs per-class probabilities.
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=101,
)

# Exhaustive 5-fold cross-validated search, ranked by (negated) log loss.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_log_loss',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration. best_score_ is negated because the search
# was scored with 'neg_log_loss'.
print("Best Parameters:", grid_search.best_params_)
print("Best Log Loss Score:", -grid_search.best_score_)

# Use the best estimator found by the search for the hold-out predictions.
best_model = grid_search.best_estimator_
preds = best_model.predict(X_test)
preds_proba = best_model.predict_proba(X_test)

# Evaluate log loss on the test data
from sklearn.metrics import log_loss
# Pass the model's class list explicitly: without `labels`, log_loss infers the
# classes from y_test and raises if a rare class is absent from the hold-out
# split (the probability matrix would then have more columns than y_test has classes).
print(f"\nLog Loss from test: {log_loss(y_test, preds_proba, labels=best_model.classes_)}")

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.5}
Best Log Loss Score: 1.9470490347548448

Log Loss from test: 1.9298221246999632


In [22]:
from sklearn.metrics import log_loss

def generate_predictions(model, test_file_path=os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv")):
    """Predict class probabilities for every row of a test CSV.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``.
    test_file_path
        Path to a CSV containing a ``psu_hh_idcode`` column plus the feature columns.

    Returns
    -------
    pandas.DataFrame
        ``psu_hh_idcode`` as the first column, followed by one
        ``subjective_poverty_<k>`` column per probability column (k starts at 1).
    """
    test_frame = pd.read_csv(test_file_path)
    test_ids = test_frame['psu_hh_idcode']
    features = test_frame.drop(columns=['psu_hh_idcode'])

    # XGBoost rejects a frame whose columns are in a different order than at fit
    # time ("feature_names mismatch"). When the file holds exactly the fitted
    # columns, put them in the fitted order; otherwise leave the frame untouched
    # so the model raises its own error for genuinely missing or extra columns.
    fitted_columns = getattr(model, 'feature_names_in_', None)
    if fitted_columns is not None and set(fitted_columns) == set(features.columns):
        features = features[list(fitted_columns)]

    preds_proba = model.predict_proba(features)

    # Create the output DataFrame
    proba_columns = [f'subjective_poverty_{i+1}' for i in range(preds_proba.shape[1])]
    output_df = pd.DataFrame(preds_proba, columns=proba_columns)
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

# Predict on TEST_INPUT.csv with the tuned model.
output = generate_predictions(best_model, os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))

# Make sure the result directory exists before writing; to_csv does not create it.
output_path = "../data/model_result/xgboost_predictions.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output.to_csv(output_path, index=False)


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.042289,0.097317,0.163744,0.236323,0.245543,0.125196,0.054663,0.029011,0.004156,0.001758
1,1_8_1,0.040327,0.085762,0.194763,0.234413,0.232981,0.11393,0.062237,0.029503,0.004539,0.001545
2,1_10_1,0.011831,0.025004,0.064346,0.133497,0.281209,0.190594,0.127226,0.135708,0.022629,0.007955
3,2_3_1,0.027705,0.065376,0.147587,0.19848,0.24481,0.16229,0.111907,0.031666,0.008698,0.00148
4,3_1_1,0.053478,0.092919,0.185346,0.197593,0.242531,0.130524,0.065278,0.025638,0.005289,0.001405


In [13]:
# Load the dataset
data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
id_column = data['psu_hh_idcode']

# Prepare the training and testing datasets
                    
# X_train, X_test, y_train, y_test, test_ids = train_test_split(
#     data.drop(['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating'], axis='columns'),
#     data['subjectivePoverty_rating'],
#     id_column,
#     test_size=0,
#     random_state=101
# )

# Use the entire dataset for training
X_train = data.drop(['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating'], axis='columns')
y_train = data['subjectivePoverty_rating'] - 1 # Adjust the target variable to start from 0 (for multi-class classification)

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [1, 3, 5, 7],
    'n_estimators': [50, 100, 200, 500],
    'subsample': [0.3, 0.5, 0.7, 0.9],
    'colsample_bytree': [0.4, 0.6, 0.8, 1.0]
}

# Create the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', random_state=101)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_log_loss',  # Use log loss as the evaluation metric
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Perform gridsearch to find the best model
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Log Loss Score:", -grid_search.best_score_)

best_model = grid_search.best_estimator_

# Predicting on actual test data

test_data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
test_X = test_data.drop(columns=['psu_hh_idcode'])
test_ids = test_data['psu_hh_idcode']

# reorder cols in teest_dat for alignment with training data

# test_data.to_csv(os.path.join(CLEAN_DATA_DIR, TE))
# Predict probabilities for the test dataset
preds_proba = best_model.predict_proba(test_X)

# Evaluate log loss on the test data
# print(f"\nLog Loss from test: {log_loss(y_test, preds_proba)}")

# Create the output DataFrame
output_df = pd.DataFrame(preds_proba, columns=[f'subjectivePoverty_rating_{i+1}' for i in range(preds_proba.shape[1])])
output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start

# Save to CSV
output_file = "test_predictions.csv"
output_df.head()
# output_df.to_csv(output_file, index=False)
# print(f"Predictions saved to {output_file}")


Fitting 5 folds for each of 1024 candidates, totalling 5120 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.3}
Best Log Loss Score: 1.9381333016168707


ValueError: feature_names mismatch: ['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03', 'Q06', 'Q07', 'Q08', 'Q11', 'Q19'] ['q02', 'q03', 'q09', 'q05', 'q23', 'Q01', 'Q03', 'Q06', 'Q07', 'Q08', 'Q11', 'Q19']

Baseline log loss (predicting a uniform distribution over the 10 classes) $= -\log\left(\frac{1}{\#\text{classes}}\right) = -\log\left(\frac{1}{10}\right) \approx 2.303$

In [None]:
# Compare which classes occur in the hold-out labels with which classes the
# model actually predicts.
print("unique values in test classes:", sorted(y_test.unique()))
print("unique values in predicted classes:", sorted(np.unique(preds)))

In [6]:
import matplotlib.pyplot as plt
import pandas as pd

# y_test is expected to be a pandas Series of class labels.

# Relative frequency of each class, then ordered by class label.
value_counts = y_test.value_counts(normalize=True)
value_counts = value_counts.sort_index()
print(value_counts)

subjectivePoverty_rating
0    0.035614
1    0.087160
2    0.164948
3    0.192127
4    0.206186
5    0.157451
6    0.086223
7    0.057170
8    0.010309
9    0.002812
Name: proportion, dtype: float64
