In [6]:
import numpy as np
import os
import pandas as pd
CLEAN_DATA_DIR = "../data/clean/"

In [None]:
# train_A: the labelled rows from TRAIN_MERGED_UNFILLED.csv without the 'hhid' column.
# (Result of inner joining with SubjectivePoverty on the unfilled data => ~5300 rows.)
unfilled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
train_A = unfilled.drop(columns=['hhid'], errors='ignore')

# Households already covered by train_A; they are removed from the filled data below.
subjects_in_train_A = set(train_A['psu_hh_idcode'])

# train_B: rows of TRAIN_MERGED_FILLED.csv whose household is not in train_A,
# without the 'hhid' and 'rating_filled' columns.
# (Original author's note: these are without labels, roughly ~15000 rows — unverified.)
filled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))
filled = filled.loc[~filled['psu_hh_idcode'].isin(subjects_in_train_A)]
filled_X = filled.drop(columns=['hhid', 'rating_filled'], errors='ignore')
train_B = filled_X

# Quick look at both sets: shape first, then the leading rows.
print("train_A: ", train_A.shape)
display(train_A.head())
print("train_B: ", train_B.shape)
display(train_B.head())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, log_loss

def generate_best_RF_model(train_A, random_state=42):
    """Grid-search a RandomForestClassifier on ``train_A`` and return the best fit.

    Parameters
    ----------
    train_A : pandas.DataFrame
        Labelled training rows. Must contain 'subjectivePoverty_rating' (the
        target). 'psu_hh_idcode' and 'hhid' are left out of the features when
        they are present.
    random_state : int, default 42
        Seed passed to the forest so that repeated searches are reproducible.

    Returns
    -------
    RandomForestClassifier
        ``best_estimator_`` of a 5-fold grid search scored with negative log
        loss (refit on all of ``train_A`` by GridSearchCV).
    """
    # Keep the frame's own column order. Index.difference() would sort the
    # names, and the fitted model would then reject prediction frames whose
    # columns arrive in file order.
    excluded = {'psu_hh_idcode', 'hhid', 'subjectivePoverty_rating'}
    feature_cols = [col for col in train_A.columns if col not in excluded]
    X = train_A[feature_cols]
    y = train_A['subjectivePoverty_rating']

    # Parameter grid. min_samples_split is intentionally not searched: with
    # min_samples_leaf >= 35 a node needs at least 70 samples before it can be
    # split, so values such as 2, 5 or 50 never change the trees and would only
    # triple the number of fits.
    params = {
        'n_estimators': [100, 200, 500, 700],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [4, 5, 6],
        'min_samples_leaf': [35, 42, 50],
    }

    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        param_grid=params,
        # Built-in scorer; replaces make_scorer(..., needs_proba=True), whose
        # needs_proba argument is deprecated/removed in newer scikit-learn.
        scoring='neg_log_loss',
        cv=5,
        # Train scores were computed but never read; skipping them saves time.
        n_jobs=-1,
    )

    grid_search.fit(X, y)
    return grid_search.best_estimator_

def predict_ratings_RF(model, data):
    """Return per-class probabilities from ``model`` for every row of ``data``.

    NOTE: a later cell defines ``predict_ratings_RF`` again; once both cells
    have run, that later definition is the one in effect.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    data : pandas.DataFrame
        Must contain 'psu_hh_idcode'; the remaining columns are the features.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' followed by one 'subjective_poverty_<k>' column per
        class returned by ``predict_proba`` (k starts at 1), one row per input
        row, in input order, with a fresh 0..n-1 index.
    """
    household_ids = data['psu_hh_idcode']
    features = data.drop(columns=['psu_hh_idcode'])

    # If the model recorded the columns it was fit on, pass exactly those, in
    # that order, so extra or re-ordered columns in `data` do not break predict.
    trained_on = getattr(model, 'feature_names_in_', None)
    if trained_on is not None:
        features = features[list(trained_on)]

    proba = model.predict_proba(features)

    # Size the header from the actual output instead of assuming 10 classes.
    column_names = [f"subjective_poverty_{i}" for i in range(1, proba.shape[1] + 1)]
    pred = pd.DataFrame(proba, columns=column_names)
    # Insert ids positionally (.values). Concatenating the id Series would
    # align on index and misplace rows whenever `data` has a non-default index.
    pred.insert(0, 'psu_hh_idcode', household_ids.values)
    return pred

In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb

def generate_best_XGB_model(train_data):
    """Grid-search an XGBoost multi-class classifier and return the best estimator.

    ``train_data`` must contain 'psu_hh_idcode' and 'subjectivePoverty_rating';
    every other column is used as a feature. The label is shifted down by one
    before fitting (assumes ratings start at 1 — TODO confirm), so the k-th
    probability column of the returned model corresponds to rating k + 1.

    The search is a 5-fold GridSearchCV scored with negative log loss and run
    on all available cores.
    """
    labels = train_data['subjectivePoverty_rating'] - 1
    features = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])

    # 3 values per axis -> 243 candidate configurations.
    search_space = dict(
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[1, 3, 5],
        n_estimators=[50, 100, 200],
        subsample=[0.3, 0.5, 0.7],
        colsample_bytree=[0.4, 0.6, 0.8],
    )

    base_estimator = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=101,
    )

    search = GridSearchCV(
        base_estimator,
        search_space,
        scoring='neg_log_loss',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(features, labels)
    # search.best_params_ and -search.best_score_ hold the winning settings
    # and their cross-validated log loss if they need to be inspected.
    return search.best_estimator_

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def predict_ratings_RF(model, train_B_X):
    """Predict per-class probabilities with the random-forest learner.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    train_B_X : pandas.DataFrame
        Must contain 'psu_hh_idcode'; the remaining columns are candidate
        features. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' followed by 'subjective_poverty_1' ..
        'subjective_poverty_<n_classes>', one row per input row, in input order.
    """
    test_ids = train_B_X['psu_hh_idcode']
    features = train_B_X.drop(columns=['psu_hh_idcode'])

    # generate_best_RF_model fits on a specific subset/order of columns
    # (e.g. without 'hhid'). When the model recorded those names, feed it
    # exactly that selection so predict does not fail on a name/order mismatch.
    trained_on = getattr(model, 'feature_names_in_', None)
    if trained_on is not None:
        features = features[list(trained_on)]

    preds_proba = model.predict_proba(features)
    output_df = pd.DataFrame(
        preds_proba,
        columns=[f'subjective_poverty_{i+1}' for i in range(preds_proba.shape[1])],
    )
    # .values -> positional insert, independent of the input frame's index.
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)
    return output_df

def predict_ratings_XGB(model, train_B_X):
    """Predict per-class probabilities with the XGBoost learner.

    ``train_B_X`` must contain 'psu_hh_idcode'; all other columns are passed to
    ``model.predict_proba`` unchanged. Returns a new DataFrame whose first
    column is 'psu_hh_idcode' and whose remaining columns are
    'subjective_poverty_1' .. 'subjective_poverty_<n_classes>'. The input
    frame is left untouched.
    """
    household_ids = train_B_X['psu_hh_idcode']
    features = train_B_X.drop(columns=['psu_hh_idcode'])

    class_probabilities = model.predict_proba(features)
    n_classes = class_probabilities.shape[1]

    column_labels = ['subjective_poverty_' + str(k) for k in range(1, n_classes + 1)]
    result = pd.DataFrame(class_probabilities, columns=column_labels)
    # Positional insert of the ids as the leading column.
    result.insert(0, 'psu_hh_idcode', household_ids.values)
    return result

def predict_ratings_SVM(model, train_B_X, encoder=None, scaler=None):
    """Predict per-class probabilities with the SVM learner on encoded, scaled features.

    The features are one-hot encoded (categorical columns only), standardised,
    and then passed to ``model.predict_proba`` exactly once.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    train_B_X : pandas.DataFrame
        Must contain 'psu_hh_idcode'; the remaining columns are the features.
        The caller's frame is not modified.
    encoder : fitted one-hot encoder, optional
        Object with ``transform`` and ``get_feature_names_out``. If it exposes
        ``feature_names_in_``, those columns are encoded; otherwise the columns
        containing the value -1 are treated as categorical.
    scaler : fitted scaler, optional
        Object with ``transform``.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' followed by 'subjective_poverty_1' ..
        'subjective_poverty_<n_classes>', one row per input row, in input order.

    Notes
    -----
    Pass the encoder and scaler that were fitted on the training data. When
    they are omitted, new ones are fitted on ``train_B_X`` itself so the
    function can run, but that preprocessing may not match what the model saw
    during training.
    """
    test_ids = train_B_X['psu_hh_idcode']
    # Fresh 0..n-1 index so the column-wise concat below pairs rows by position.
    features = train_B_X.drop(columns=['psu_hh_idcode']).reset_index(drop=True)

    # Decide which columns get one-hot encoded.
    if encoder is not None and hasattr(encoder, 'feature_names_in_'):
        categorical_cols = list(encoder.feature_names_in_)
    else:
        categorical_cols = [col for col in features.columns if (features[col] == -1).any()]

    numerical_df = features.drop(columns=categorical_cols)

    if categorical_cols:
        if encoder is None:
            encoder = OneHotEncoder(sparse_output=False, drop=None)
            encoded = encoder.fit_transform(features[categorical_cols])
        else:
            encoded = encoder.transform(features[categorical_cols])
        if hasattr(encoded, 'toarray'):
            # A caller-supplied encoder may return a sparse matrix.
            encoded = encoded.toarray()
        encoded_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out(categorical_cols),
            index=features.index,
        )
        processed = pd.concat([numerical_df, encoded_df], axis=1)
    else:
        # Nothing to encode; an encoder cannot be fitted on zero columns.
        processed = numerical_df

    if scaler is None:
        # An unfitted StandardScaler cannot transform; fit it here as a fallback.
        scaler = StandardScaler()
        scaled = scaler.fit_transform(processed)
    else:
        scaled = scaler.transform(processed)

    # Predict once, on the processed features, and report that result.
    preds_proba = model.predict_proba(scaled)
    output_df = pd.DataFrame(
        preds_proba,
        columns=[f'subjective_poverty_{i+1}' for i in range(preds_proba.shape[1])],
    )
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # ID column first
    return output_df
    

In [7]:
# Stratified 75/25 split of the filled training data into train_A (base-learner
# training) and train_B (held out for the stacking step). The fixed seed keeps
# the split identical between runs.
train_data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))
train_A, train_B = train_test_split(
    train_data,
    test_size=0.25,
    stratify=train_data['subjectivePoverty_rating'],
    random_state=42,
)

# Persist both halves next to the other cleaned data files.
train_A.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_A.csv"), index=False)
train_B.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_B.csv"), index=False)

In [None]:
# Grid-search the random-forest base learner on train_A.
model_rf = generate_best_RF_model(train_A)

In [10]:
# Grid-search the XGBoost base learner on train_A.
model_xgb = generate_best_XGB_model(train_A)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [12]:
# Keep a second name bound to the fitted XGBoost model. This is a plain
# assignment, so both names refer to the same object (no copy is made).
model_xgb_backup = model_xgb

In [13]:
# Strip the label column so only the id and feature columns reach the predictor.
train_B_X = train_B.drop('subjectivePoverty_rating', axis='columns')
# Per-class probabilities from the XGBoost learner for every train_B row.
P_XGB = predict_ratings_XGB(model=model_xgb, train_B_X=train_B_X)

In [18]:
# Persist the XGBoost train_B probabilities for the stacking step.
# (os.path.join with a single argument was a no-op, so the path is used directly.)
xgb_preds_path = "../data/train_B_preds/train_B_preds_xgb.csv"
# Create the output folder on first use; to_csv does not create directories.
os.makedirs(os.path.dirname(xgb_preds_path), exist_ok=True)
P_XGB.to_csv(xgb_preds_path, index=False)

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# LEARNING the stacking weights from training data:
#   1. train every base learner on train_A
#   2. predict class probabilities for train_B with each learner
#   3. fit a logistic regression on those probabilities against the train_B labels

train_data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))
train_A, train_B = train_test_split(train_data, test_size=0.25, stratify=train_data['subjectivePoverty_rating'], random_state=42)

# Train base learners on train_A
model_rf = generate_best_RF_model(train_A)
model_xgb = generate_best_XGB_model(train_A)
# NOTE(review): generate_best_SVM_model is not defined in any visible cell, and
# model_svm is not used below (the SVM probabilities are read from CSV) —
# confirm where this function lives, or drop the call.
model_svm = generate_best_SVM_model(train_A)


ids = train_B['psu_hh_idcode']
train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])

# predict ratings for train_B

P_RF = predict_ratings_RF(model_rf, train_B_X) # returns dataframe containing id,subjective_poverty_1, ...,subjective_poverty_10
P_XGB = predict_ratings_XGB(model_xgb, train_B_X)
P_SVM = pd.read_csv("../data/model_result/train_B_predictions_svm.csv") # predict_ratings_SVM(model_svm, train_B_X)

# Labels are matched to predictions by row position, so every prediction frame
# must list the same households, in the same order, as train_B itself (this
# matters most for P_SVM, which comes from a file).
expected_ids = ids.tolist()
assert P_RF['psu_hh_idcode'].tolist() == expected_ids, "RF predictions are not aligned with train_B"
assert P_XGB['psu_hh_idcode'].tolist() == expected_ids, "XGB predictions are not aligned with train_B"
assert P_SVM['psu_hh_idcode'].tolist() == expected_ids, "SVM predictions are not aligned with train_B"

# logistic regression on probabilities of each learner on train_B
X_stack = pd.concat([
    P_RF.drop(columns=['psu_hh_idcode']),
    P_XGB.drop(columns=['psu_hh_idcode']),
    P_SVM.drop(columns=['psu_hh_idcode'])
    ], axis=1)

y_stack = train_B['subjectivePoverty_rating']
# multi_class='multinomial' is deprecated in recent scikit-learn; the default
# solver already fits a multinomial model for multi-class targets.
stack_model = LogisticRegression(max_iter=1000)
stack_model.fit(X_stack, y_stack)

final_predictions = stack_model.predict(X_stack)
final_probabilities = stack_model.predict_proba(X_stack)
# This is an in-sample score: it is measured on the same train_B rows the
# stacker was fit on.
log_loss_score = log_loss(y_stack, final_probabilities)
print(f"Log Loss from Train_B labels: {log_loss_score:.4f}")

# At this point, we have our stacked model which we can use to generate predictions for our test set.


KeyboardInterrupt



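At this point we have three files in `../data/train_B_preds/`, one per base learner, each holding that learner's predicted class probabilities for train_B:
- train_B_preds_xgb.csv
- train_B_preds_rf.csv
- train_B_preds_svm.csv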

In [None]:
# Load each base learner's train_B probabilities from disk.
P_RF = pd.read_csv("../data/train_B_preds/train_B_preds_rf.csv")
P_XGB = pd.read_csv("../data/train_B_preds/train_B_preds_xgb.csv")
P_SVM = pd.read_csv("../data/train_B_preds/train_B_preds_svm.csv")

# Labels come from train_B and are matched to the predictions by row position,
# so every file must list the same households, in the same order, as train_B
# (not merely agree with the other files).
expected_ids = train_B['psu_hh_idcode'].tolist()
assert P_RF['psu_hh_idcode'].tolist() == expected_ids, "RF predictions are not aligned with train_B"
assert P_XGB['psu_hh_idcode'].tolist() == expected_ids, "XGB predictions are not aligned with train_B"
assert P_SVM['psu_hh_idcode'].tolist() == expected_ids, "SVM predictions are not aligned with train_B"

# logistic regression on probabilities of each learner on train_B
X_stack = pd.concat([
    P_RF.drop(columns=['psu_hh_idcode']),
    P_XGB.drop(columns=['psu_hh_idcode']),
    P_SVM.drop(columns=['psu_hh_idcode'])
    ], axis=1)

y_stack = train_B['subjectivePoverty_rating']
# multi_class='multinomial' is deprecated in recent scikit-learn; the default
# solver already fits a multinomial model for multi-class targets.
stack_model = LogisticRegression(max_iter=1000)
stack_model.fit(X_stack, y_stack)

final_predictions = stack_model.predict(X_stack)
final_probabilities = stack_model.predict_proba(X_stack)
# In-sample score: measured on the same train_B rows the stacker was fit on.
log_loss_score = log_loss(y_stack, final_probabilities)
print(f"Log Loss from Train_B labels: {log_loss_score:.4f}")


In [None]:
def generate_predictions(model, test_file_path=os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"), base_predictions=None):
    """Predict per-class probabilities for the rows of a test CSV.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    test_file_path : str
        CSV with a 'psu_hh_idcode' column plus feature columns.
    base_predictions : list of pandas.DataFrame, optional
        For a stacked (meta) model: one frame per base learner, each holding
        'psu_hh_idcode' plus that learner's probability columns for the test
        rows. The frames must be given in the order used to build the
        meta-model's training matrix. When supplied, ``model`` is fed these
        probabilities (ids dropped, concatenated column-wise) instead of the
        raw features from the CSV. When omitted, the raw features are used.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' followed by 'subjective_poverty_1' ..
        'subjective_poverty_<n_classes>', one row per test row.

    Raises
    ------
    ValueError
        If ``base_predictions`` is an empty list, or if any frame's ids do not
        match the test file's ids in the same order.
    """
    X_test = pd.read_csv(test_file_path)
    test_ids = X_test['psu_hh_idcode']

    if base_predictions is None:
        model_input = X_test.drop(columns=['psu_hh_idcode'])
    else:
        if len(base_predictions) == 0:
            raise ValueError("base_predictions must contain at least one prediction frame")
        expected_ids = test_ids.tolist()
        stacked_parts = []
        for position, frame in enumerate(base_predictions):
            # Rows are combined by position, so the ids have to line up exactly.
            if frame['psu_hh_idcode'].tolist() != expected_ids:
                raise ValueError(
                    f"base_predictions[{position}] is not aligned with the ids in {test_file_path}"
                )
            stacked_parts.append(frame.drop(columns=['psu_hh_idcode']).reset_index(drop=True))
        model_input = pd.concat(stacked_parts, axis=1)

    preds_proba = model.predict_proba(model_input)

    # Create the output DataFrame
    output_df = pd.DataFrame(preds_proba, columns=[f'subjective_poverty_{i+1}' for i in range(preds_proba.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

# Write the stacked model's test-set probabilities as the submission file.
# NOTE(review): stack_model is fit on X_stack (the concatenated base-learner
# probability columns), but generate_predictions passes it the feature columns
# read from TEST_INPUT.csv — confirm the test-set inputs are stacked the same
# way before relying on this submission.
submission_predictions = generate_predictions(stack_model, os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
submission_predictions.to_csv(os.path.join("../data/submissions/", "submission_stacked.csv"), index=False)