In [1]:
import numpy as np
import os
import pandas as pd
from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, log_loss
from sklearn.linear_model import LogisticRegression

from learners import generate_best_RF_model, generate_best_XGB_model, generate_best_SVM_model
from learners import predict_ratings_RF, predict_ratings_XGB, predict_ratings_SVM

CLEAN_DATA_DIR = "../data/clean/"

import warnings
warnings.filterwarnings("ignore")

### Split train set into A and B

In [2]:
# Split train_A and train_B
train_data = pd.read_csv("../data/model_training/TRAIN_MERGED_FILLED_encoded.csv") # FILLED OR UNFILLED
train_A, train_B = train_test_split(train_data, test_size=0.25, stratify=train_data['subjectivePoverty_rating'], random_state=42)

train_A.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_A.csv"), index=False)
train_A_X = train_A.drop(columns=['subjectivePoverty_rating'])
train_A_y = train_A['subjectivePoverty_rating']

train_B.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_B.csv"), index=False)
train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])
train_B_y = train_B['subjectivePoverty_rating']

print("train_A:", train_A.shape)
print("train_B:", train_B.shape)

train_A: (14215, 41)
train_B: (4739, 41)


### getting the base learning models

In [6]:
# Retreiving RF model
model_file_rf = "rf_trained_on_filled_A_encoded_calibrated_<enter-score>.joblib"
if model_file_rf in os.listdir("saved_models/trained_on_filled/rf"):
    model_rf = load("saved_models/trained_on_filled/rf/"+model_file_rf)
else:
    model_rf = generate_best_RF_model(train_A)
    dump(model_rf, f"saved_models/trained_on_filled/rf/{model_file_rf}")

Fitting 3 folds for each of 160 candidates, totalling 480 fits
Best Parameters: {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 500}
Best Log Loss Score (Uncalibrated): 1.9414782887699105
Calibrating the Random Forest for better probabilities...
Calibrated Log Loss Score (on Calibration Set): 1.9008488217221724


In [11]:
P_RF = predict_ratings_RF(model_rf, train_B_X)
#P_RF.to_csv(os.path.join("../data/train_B_preds/train_B_preds_rf_2.csv"), index=False)
P_RF.shape

(4739, 11)

In [3]:
model_file_xgb = "xgb_trained_on_unfilled_A_encoded_calibrated_<score>.joblib"
if model_file_xgb in os.listdir("saved_models/trained_on_filled/xgb"):
    model_xgb = load(f"saved_models/trained_on_filled/xgb/{model_file_xgb}")
else:
    model_xgb = generate_best_XGB_model(train_A)
    dump(model_xgb, f"saved_models/trained_on_filled/xgb/{model_file_xgb}")

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

KeyboardInterrupt: 

In [13]:
P_XGB = predict_ratings_XGB(model_xgb, train_B_X)
P_XGB.to_csv(os.path.join("../data/train_B_preds/train_B_preds_xgb_2.csv"), index=False)
print(P_XGB.shape)

(4739, 11)


In [14]:
# train SVM model on train_B_X
model_file_svm = "svm_trained_on_filled_A_encoded_score.joblib"
if model_file_svm in os.listdir("saved_models/trained_on_filled/svm"):
    model_svm = load(f"saved_models/trained_on_filled/svm/{model_file_svm}")
else:
    model_svm = generate_best_SVM_model(train_A)
    dump(model_svm, f"saved_models/trained_on_filled/svm/{model_file_svm}")

In [16]:
# P_SVM = pd.read_csv("../data/train_B_preds/train_B_preds_rf.csv")
P_SVM = predict_ratings_SVM(model_svm, train_B_X)
# P_SVM.to_csv("svm_trained_on_filled_A_encoded_pp_ss_1.945.joblib"", index=False)
print(P_SVM.shape)

(4739, 11)


# Choosing base models and stack model for Final Submission

At this point we have the three files:
- train_B_preds_xgb.csv
- train_B_preds_rf.csv
- train_B_preds_svm.csv

In [11]:
from joblib import dump, load

def stack_predict(X_test, models=[]):

    # Load previously trained models
    model_rf = load(model_rf_file)
    P_RF = predict_ratings_RF(model_rf, X_test)

    model_xgb = load(model_xgb_file)
    P_XGB = predict_ratings_XGB(model_xgb, X_test)

    model_svm = load(model_svm_file)
    P_SVM = predict_ratings_SVM(model_svm, X_test)

    # aligned on id column. make sure each row corresponds to the same subject
    assert (len(P_RF) == len(P_XGB))
    assert (len(P_XGB) == len(P_SVM))
    assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
    assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

    test_ids = X_test['psu_hh_idcode']
    X_stack = pd.concat([
        P_RF.drop(columns=['psu_hh_idcode']),
        P_XGB.drop(columns=['psu_hh_idcode']), 
        P_SVM.drop(columns=['psu_hh_idcode'])
        ], axis=1)

    # final_predictions = stack_model.predict(X_stack)
    final_probabilities = stack_model.predict_proba(X_stack)
    output_df = pd.DataFrame(final_probabilities, columns=[f'subjective_poverty_{i+1}' for i in range(final_probabilities.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

def stack_train(X_stack, y_stack):
    
    stack_model = LogisticRegression(multi_class='multinomial', max_iter=100000)
    stack_model.fit(X_stack, y_stack)

    #preds = stack_model.predict(X_stack)
    preds_proba = stack_model.predict_proba(X_stack)
    log_loss_score = log_loss(y_stack, preds_proba)
    print(f"Log Loss from Training Logistic Regression: {log_loss_score:.3f}")
    #dump(stack_model, f"saved_models/trained_on_filled/stack/stack_trained_on_filled_rf_xgb_svm{round(log_loss_score,4)}.joblib")
    return stack_model


def final_predictions_stack(train_B, X_test, base_models=[]):
    
    train_B_X = train_B.drop(columns='subjectivePoverty_rating')
    train_B_y = train_B['subjectivePoverty_rating']

    probabilities = []
    
    # Train logistic Regression model on train_B
    for i, model in enumerate(base_models):
        pred_probs = model.predict_proba(train_B_X.drop(columns="psu_hh_idcode"))
        pred_probs_df = pd.DataFrame(
            pred_probs,
            columns=[f"{i}_{j}" for j in range(1, 11)]
        )
        probabilities.append(pred_probs_df)

    X_stack_B = pd.concat(probabilities, axis=1)
    print(f"Training Logistic Regression stack using {len(base_models)} base models")
    stack_model = stack_train(X_stack_B, train_B_y)

    # scoring the model on all rows of TRAIN_MERGED_UNFILLED_encoded.csv
    data_val = pd.read_csv("../data/model_training/TRAIN_MERGED_UNFILLED_encoded.csv")
    data_val_X = data_val.drop(columns=['subjectivePoverty_rating'])
    data_val_y = data_val['subjectivePoverty_rating']
    
    # scoring model on labelled training data
    probabilities = []

    for i, model in enumerate(base_models):
        pred_probs = model.predict_proba(data_val_X.drop(columns="psu_hh_idcode"))
        pred_probs_df = pd.DataFrame(
            pred_probs,
            columns=[f"{i}_{j}" for j in range(1, 11)]
        )
        probabilities.append(pred_probs_df)
    
    X_stack_val = pd.concat(probabilities, axis=1)
    val_probabilities = stack_model.predict_proba(X_stack_val)
    
    print("Log Loss from testing on labelled train data:", log_loss(data_val_y, val_probabilities))

    # Now, predicting for our test_data
    probabilities = []
    test_ids = X_test['psu_hh_idcode']
    for i, model in enumerate(base_models):
        pred_probs = model.predict_proba(X_test.drop(columns="psu_hh_idcode"))
        pred_probs_df = pd.DataFrame(
            pred_probs,
            columns=[f"{i}_{j}" for j in range(1, 11)]
        )
        probabilities.append(pred_probs_df)
    X_stack = pd.concat(probabilities, axis=1)
    final_probabilities = stack_model.predict_proba(X_stack)
    output_df = pd.DataFrame(final_probabilities, columns=[f'subjective_poverty_{i+1}' for i in range(final_probabilities.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

    



In [9]:
model_rf = load("saved_models/trained_on_filled/rf/rf_trained_on_filled_A_encoded_1.940.joblib")
model_rf_cal1 = load("saved_models/trained_on_filled/rf/rf_trained_on_filled_A_encoded_calibrated_1.788.joblib")
model_rf_cal2 = load("saved_models/trained_on_filled/rf/rf_trained_on_filled_A_encoded_calibrated_1.900.joblib")
model_rf_cal3 = load("saved_models/trained_on_filled/rf/rf_trained_on_filled_A_encoded_calibrated_unknown.joblib")

model_xgb = load("saved_models/trained_on_filled/xgb/xgb_trained_on_filled_A_encoded_1.933.joblib")
model_xgb1 = load("saved_models/trained_on_filled/xgb/xgb_trained_on_filled_A_encoded_reg_gs_eg1.joblib")
model_xgb_cal1 = load("saved_models/trained_on_filled/xgb/xgb_trained_on_unfilled_A_encoded_calibrated_1.933.joblib")

model_svm = load("saved_models/trained_on_filled/svm/svm_trained_on_filled_A_encoded_pp_ss_1.946.joblib")
model_svm1 = load("saved_models/trained_on_filled/svm/svm_trained_on_filled_A_encoded_pp_ss_1.945.joblib")


In [23]:
X_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT_encoded.csv"))
submission_df = final_predictions_stack(train_B, X_test, base_models=[model_rf_cal2, model_xgb1, model_svm])

print(submission_df.shape)
submission_df.head(3)

Training Logistic Regression stack using 3 base models
Log Loss from Training Logistic Regression: 1.927
Log Loss from testing on labelled train data: 1.9179941613654685
(1334, 11)


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.038928,0.091728,0.183292,0.2367,0.207836,0.127539,0.072393,0.033035,0.007203,0.001347
1,1_8_1,0.03987,0.091735,0.19456,0.240061,0.201972,0.125465,0.067179,0.030698,0.007143,0.001317
2,1_10_1,0.015553,0.029626,0.06405,0.13494,0.20859,0.230235,0.180896,0.120271,0.013565,0.002275


In [24]:
submission_df.to_csv("../data/submissions_today/final_stack_rfcal2_xgb1_svm_1.9178.csv", index=False)

# Output predictions using single base learning model

In [27]:
base_learner = model_xgb


# scoring model on labelled data
data_val = pd.read_csv("../data/model_training/TRAIN_MERGED_UNFILLED_encoded.csv")
data_val_X = data_val.drop(columns=['subjectivePoverty_rating'])
data_val_y = data_val['subjectivePoverty_rating']

val_probabilities = base_learner.predict_proba(data_val_X.drop(columns=['psu_hh_idcode']))
print("Log Loss from testing on labelled train data:", log_loss(data_val_y, val_probabilities))

X_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT_encoded.csv"))
test_ids = X_test['psu_hh_idcode']

final_probabilities = base_learner.predict_proba(X_test.drop(columns="psu_hh_idcode"))
output_df = pd.DataFrame(final_probabilities, columns=[f'subjective_poverty_{i+1}' for i in range(final_probabilities.shape[1])])
output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start



Log Loss from testing on labelled train data: 1.8940754314036365


In [28]:
output_df.to_csv("../data/submissions_today/final_stack_xgb_1.894.csv", index=False)

In [18]:
output_df.shape

(1334, 11)

# Trying Stacking with Unfilled Data

In [44]:
train_unfilled = pd.read_csv("../data/clean/TRAIN_MERGED_UNFILLED.csv")
train_A_unfilled, train_B_unfilled = train_test_split(train_unfilled, test_size=0.25, stratify=train_unfilled['subjectivePoverty_rating'], random_state=42)

train_A_unfilled_X = train_A.drop(columns=['subjectivePoverty_rating'])
train_A_unfilled_y = train_A['subjectivePoverty_rating']

train_B_unfilled_X = train_B.drop(columns=['subjectivePoverty_rating'])
train_B_unfilled_y = train_B['subjectivePoverty_rating']

print("train_A:", train_A_unfilled.shape)
print("train_B:", train_B_unfilled.shape)

train_A: (4000, 14)
train_B: (1334, 14)


In [9]:
import tensorflow as tf
import keras_tuner as kt
print(kt.__version__)

1.4.7


In [45]:
model_rf_unfilled = generate_best_RF_model(train_A_unfilled)

P_RF = predict_ratings_RF(model_rf_unfilled, train_B_X)
P_RF.to_csv(os.path.join("../data/train_B_preds/train_B_preds_unfilled_rf.csv"), index=False)
P_RF.head(3)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 42, 'min_samples_split': 5, 'n_estimators': 200}
Best Log Loss Score: 1.9425785952905543


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.050181,0.102449,0.172558,0.214099,0.207975,0.142176,0.071901,0.032231,0.005211,0.001218
1,783_8_3,0.063795,0.071783,0.177513,0.213175,0.175172,0.18187,0.084609,0.024932,0.00662,0.00053
2,561_5_1,0.043903,0.099249,0.1841,0.241651,0.195154,0.133287,0.065821,0.033035,0.003474,0.000325


In [46]:
model_xgb_unfilled = generate_best_XGB_model(train_A_unfilled)
P_XGB = predict_ratings_XGB(model_xgb_unfilled, train_B_X)
P_XGB.head(3)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.3}
Best Log Loss Score: 1.942583005697204


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.053793,0.11367,0.184411,0.222228,0.193372,0.133627,0.062182,0.030279,0.005036,0.001402
1,783_8_3,0.14386,0.061194,0.174682,0.182356,0.118766,0.199183,0.084444,0.023572,0.010198,0.001746
2,561_5_1,0.048,0.092392,0.177748,0.273673,0.221296,0.105726,0.0533,0.023346,0.003297,0.001223


In [33]:
# aligned on id column. make sure each row corresponds to the same subject
assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
# assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

X_stack = pd.concat([
    P_RF.drop(columns=['psu_hh_idcode']),
    P_XGB.drop(columns=['psu_hh_idcode']), 
    #P_SVM.drop(columns=['psu_hh_idcode'])
    ], axis=1)

y_stack = train_B_unfilled_y
stack_model_unfilled = stack_train(X_stack, y_stack)



Log Loss from Train_B labels: 1.9310


In [34]:
X_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
submission_df = stack_predict(X_test)
submission_df.to_csv("../data/model_result/final_stack_rf_xgb_unfilled.csv", index=False)

In [35]:
submission_df.shape

Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.034926,0.081487,0.161801,0.234052,0.21976,0.137408,0.082477,0.038709,0.007912,0.001467
1,1_8_1,0.037256,0.081517,0.178275,0.240017,0.211253,0.128337,0.076902,0.037127,0.007877,0.001439
2,1_10_1,0.019409,0.039248,0.083486,0.152764,0.205995,0.207236,0.173612,0.103876,0.012397,0.001978
3,2_3_1,0.02899,0.066281,0.138853,0.200827,0.214893,0.177242,0.108326,0.053365,0.009576,0.001646
4,3_1_1,0.040487,0.09167,0.184278,0.2265,0.204341,0.13431,0.073386,0.035698,0.00789,0.00144


In [4]:
file = pd.read_csv("../data/model_result/final_stack_rf_xgb_svm_today1.csv")
file.shape

(22406, 11)

In [8]:
from sklearn.metrics import log_loss

# Testing for log_loss score
test_data = pd.read_csv("../data/clean/TRAIN_MERGED_UNFILLED_encoded.csv")
test_X = test_data.drop(columns=['subjectivePoverty_rating'])
test_y = test_data['subjectivePoverty_rating']

for model_xgb in os.listdir('../saved_models/trained_on_filled/xgb'):

    for model_rf in os.listdir('../saved_models/trained_on_filled/rf'):

        for model_svm in os.listdir('../saved_models/trained_on_filled/svm'):

            
# base_model = load()
# P_1 = predict_ratings_XXX(test_X)
# base_model2 = load()
# P_2 = predict_ratings_XXX(test_X)
# base_model3 = load() 
# P_3 = predict_ratings_XXX(test_X)

# stack_model = load(stack_model.joblib)

X_stack = pd.concat([
    P_1.drop(columns=['psu_hh_idcode']),
    P_2.drop(columns=['psu_hh_idcode']), 
    P_3.drop(columns=['psu_hh_idcode'])
    ], axis=1)

stack_model = stack_train(train_B_X, train_B_y)
preds_probs = stack_model.predict_proba(X_stack)
submission_df = stack_predict(X_test)


print("Log loss: ", -log_loss(test_y, preds_probs))


(5334, 40)