In [1]:
import numpy as np
import os
import pandas as pd
from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, log_loss
from sklearn.linear_model import LogisticRegression

from learners import generate_best_RF_model, generate_best_XGB_model, generate_best_SVM_model
from learners import predict_ratings_RF, predict_ratings_XGB, predict_ratings_SVM

CLEAN_DATA_DIR = "../data/clean/"

### Split train set into A and B

In [3]:
# Split train_A and train_B
train_data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))
train_A, train_B = train_test_split(train_data, test_size=0.25, stratify=train_data['subjectivePoverty_rating'], random_state=42)

train_A.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_A.csv"), index=False)
train_A_X = train_A.drop(columns=['subjectivePoverty_rating'])
train_A_y = train_A['subjectivePoverty_rating']

train_B.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_B.csv"), index=False)
train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])
train_B_y = train_B['subjectivePoverty_rating']

print("train_A:", train_A.shape)
print("train_B:", train_B.shape)

train_A: (14215, 14)
train_B: (4739, 14)


### getting the base learning models

In [4]:
# Retreiving model
if "rf_trained_on_filled_A_2.joblib" in os.listdir("saved_models"):
    model_rf = load("saved_models/rf_trained_on_filled_A_2.joblib")
else:
    model_rf = generate_best_RF_model(train_A)
    dump(model_rf, "saved_models/rf_trained_on_filled_A_2.joblib")

Fitting 5 folds for each of 162 candidates, totalling 810 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 42, 'min_samples_split': 2, 'n_estimators': 500}
Best Log Loss Score: 1.9345235638999985


In [5]:
P_RF = predict_ratings_RF(model_rf, train_B_X)
P_RF.to_csv(os.path.join("../data/train_B_preds/train_B_preds_rf_2.csv"), index=False)
P_RF.head(3)

Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.048157,0.099652,0.18146,0.2144,0.205025,0.142089,0.073226,0.029182,0.005514,0.001295
1,783_8_3,0.03866,0.082182,0.17576,0.206052,0.207858,0.16024,0.082656,0.038393,0.006211,0.001988
2,561_5_1,0.029078,0.088555,0.18267,0.293259,0.195626,0.117158,0.064869,0.024835,0.003673,0.000278


In [6]:
if "xgb_trained_on_filled_A_2.joblib" in os.listdir("saved_models"):
    model_xgb = load("saved_models/xgb_trained_on_filled_A_2.joblib")
else:
    model_xgb = generate_best_XGB_model(train_A)
    dump(model_xgb, "saved_models/xgb_trained_on_filled_A_2.joblib")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}
Best Log Loss Score: 1.9332040421519412


In [7]:
P_XGB = predict_ratings_XGB(model_xgb, train_B_X)
P_XGB.to_csv(os.path.join("../data/train_B_preds/train_B_preds_xgb_2.csv"), index=False)
print(P_XGB.shape)
P_XGB.head(3)

(4739, 11)


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.056401,0.117409,0.179836,0.211556,0.202344,0.14601,0.055165,0.02448,0.005842,0.000957
1,783_8_3,0.035917,0.074467,0.17847,0.202109,0.196128,0.203638,0.07206,0.025799,0.009168,0.002245
2,561_5_1,0.028957,0.075431,0.171144,0.374389,0.19144,0.092707,0.047474,0.013585,0.003999,0.000874


In [8]:
# train SVM model on train_B_X
if "svm_trained_on_filled_A_2.joblib" in os.listdir("saved_models"):
    model_svm = load("saved_models/svm_trained_on_filled_A_2.joblib")
else:
    model_svm = generate_best_SVM_model(train_A)
    dump(model_svm, "saved_models/svm_trained_on_filled_A_2.joblib")

In [10]:
# P_SVM = pd.read_csv("../data/train_B_preds/train_B_preds_rf.csv")
P_SVM = predict_ratings_SVM(model_svm, train_B_X)
P_SVM.to_csv("../data/train_B_preds/train_B_preds_svm_2.csv", index=False)
print(P_SVM.shape)
P_SVM.head(3)



# Training multinomial LogisticRegression model

In [12]:
# Training the stacking model
def stack_train(X_stack, y_stack):
    
    stack_model = LogisticRegression(multi_class='multinomial', max_iter=100000)
    stack_model.fit(X_stack, y_stack)

    #preds = stack_model.predict(X_stack)
    preds_proba = stack_model.predict_proba(X_stack)
    log_loss_score = log_loss(y_stack, preds_proba)
    print(f"Log Loss from Train_B labels: {log_loss_score:.4f}")
    dump(stack_model, "saved_models/stack_trained_on_filled.joblib")
    return stack_model


# aligned on id column. make sure each row corresponds to the same subject
assert len(P_RF) == len(P_XGB) == len(P_SVM)
assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

X_stack = pd.concat([
    P_RF.drop(columns=['psu_hh_idcode']),
    P_XGB.drop(columns=['psu_hh_idcode']), 
    P_SVM.drop(columns=['psu_hh_idcode'])
    ], axis=1)

y_stack = train_B_y
stack_model = stack_train(X_stack, y_stack)

# At this point, we have our stacked model which we can use to generate predictions for our test set.  



Log Loss from Train_B labels: 1.9307


# Stacking with existing predictions for Train_B

At this point we have the three files:
- train_B_preds_xgb.csv
- train_B_preds_rf.csv
- train_B_preds_svm.csv

In [13]:
from joblib import dump, load

def stack_predict(X_test):

    # Load previously trained models
    model_rf = load("saved_models/rf_trained_on_filled_A.joblib")
    P_RF = predict_ratings_RF(model_rf, X_test)

    model_xgb = load("saved_models/xgb_trained_on_filled_A.joblib")
    P_XGB = predict_ratings_XGB(model_xgb, X_test)

    model_svm = load("saved_models/svm_trained_on_filled_A.joblib")
    P_SVM = predict_ratings_SVM(model_svm, X_test)

    # aligned on id column. make sure each row corresponds to the same subject
    assert (len(P_RF) == len(P_XGB))
    assert (len(P_XGB) == len(P_SVM))
    assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
    assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

    test_ids = X_test['psu_hh_idcode']
    X_stack = pd.concat([
        P_RF.drop(columns=['psu_hh_idcode']),
        P_XGB.drop(columns=['psu_hh_idcode']), 
        P_SVM.drop(columns=['psu_hh_idcode'])
        ], axis=1)

    # final_predictions = stack_model.predict(X_stack)
    final_probabilities = stack_model.predict_proba(X_stack)
    output_df = pd.DataFrame(final_probabilities, columns=[f'subjective_poverty_{i+1}' for i in range(final_probabilities.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

X_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
submission_df = stack_predict(X_test)
submission_df.to_csv("../data/model_result/final_stack_rf_xgb.csv", index=False)
submission_df.head(3)



Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.034671,0.081322,0.161742,0.234755,0.220199,0.136774,0.082762,0.038544,0.007791,0.001439
1,1_8_1,0.037028,0.081293,0.17845,0.240566,0.2116,0.127807,0.077128,0.036956,0.007762,0.001409
2,1_10_1,0.019763,0.039266,0.083281,0.153959,0.207328,0.205292,0.171624,0.105075,0.012431,0.001982


1.0

# Trying Stacking with Unfilled Data

In [44]:
train_unfilled = pd.read_csv("../data/clean/TRAIN_MERGED_UNFILLED.csv")
train_A_unfilled, train_B_unfilled = train_test_split(train_unfilled, test_size=0.25, stratify=train_unfilled['subjectivePoverty_rating'], random_state=42)

train_A_unfilled_X = train_A.drop(columns=['subjectivePoverty_rating'])
train_A_unfilled_y = train_A['subjectivePoverty_rating']

train_B_unfilled_X = train_B.drop(columns=['subjectivePoverty_rating'])
train_B_unfilled_y = train_B['subjectivePoverty_rating']

print("train_A:", train_A_unfilled.shape)
print("train_B:", train_B_unfilled.shape)

train_A: (4000, 14)
train_B: (1334, 14)


In [45]:
model_rf_unfilled = generate_best_RF_model(train_A_unfilled)

P_RF = predict_ratings_RF(model_rf_unfilled, train_B_X)
P_RF.to_csv(os.path.join("../data/train_B_preds/train_B_preds_unfilled_rf.csv"), index=False)
P_RF.head(3)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 42, 'min_samples_split': 5, 'n_estimators': 200}
Best Log Loss Score: 1.9425785952905543


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.050181,0.102449,0.172558,0.214099,0.207975,0.142176,0.071901,0.032231,0.005211,0.001218
1,783_8_3,0.063795,0.071783,0.177513,0.213175,0.175172,0.18187,0.084609,0.024932,0.00662,0.00053
2,561_5_1,0.043903,0.099249,0.1841,0.241651,0.195154,0.133287,0.065821,0.033035,0.003474,0.000325


In [46]:
model_xgb_unfilled = generate_best_XGB_model(train_A_unfilled)
P_XGB = predict_ratings_XGB(model_xgb_unfilled, train_B_X)
P_XGB.head(3)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.3}
Best Log Loss Score: 1.942583005697204


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.053793,0.11367,0.184411,0.222228,0.193372,0.133627,0.062182,0.030279,0.005036,0.001402
1,783_8_3,0.14386,0.061194,0.174682,0.182356,0.118766,0.199183,0.084444,0.023572,0.010198,0.001746
2,561_5_1,0.048,0.092392,0.177748,0.273673,0.221296,0.105726,0.0533,0.023346,0.003297,0.001223


In [33]:
# aligned on id column. make sure each row corresponds to the same subject
assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
# assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

X_stack = pd.concat([
    P_RF.drop(columns=['psu_hh_idcode']),
    P_XGB.drop(columns=['psu_hh_idcode']), 
    #P_SVM.drop(columns=['psu_hh_idcode'])
    ], axis=1)

y_stack = train_B_unfilled_y
stack_model_unfilled = stack_train(X_stack, y_stack)



Log Loss from Train_B labels: 1.9310


In [34]:
X_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
submission_df = stack_predict(X_test)
submission_df.to_csv("../data/model_result/final_stack_rf_xgb_unfilled.csv", index=False)

In [35]:
submission_df.head()

Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.034926,0.081487,0.161801,0.234052,0.21976,0.137408,0.082477,0.038709,0.007912,0.001467
1,1_8_1,0.037256,0.081517,0.178275,0.240017,0.211253,0.128337,0.076902,0.037127,0.007877,0.001439
2,1_10_1,0.019409,0.039248,0.083486,0.152764,0.205995,0.207236,0.173612,0.103876,0.012397,0.001978
3,2_3_1,0.02899,0.066281,0.138853,0.200827,0.214893,0.177242,0.108326,0.053365,0.009576,0.001646
4,3_1_1,0.040487,0.09167,0.184278,0.2265,0.204341,0.13431,0.073386,0.035698,0.00789,0.00144
