In [1]:
import numpy as np
import os
import pandas as pd
from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, log_loss
from sklearn.linear_model import LogisticRegression

from learners import generate_best_RF_model, generate_best_XGB_model, generate_best_SVM_model, generate_best_NN_model
from learners import predict_ratings_RF, predict_ratings_XGB, predict_ratings_SVM

CLEAN_DATA_DIR = "../data/clean/"

2024-11-21 17:21:14.031867: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Split train set into A and B

In [2]:
# Split train_A and train_B
train_data = pd.read_csv("../data/model_training/TRAIN_MERGED_UNFILLED_encoded.csv") # FILLED OR UNFILLED
train_A, train_B = train_test_split(train_data, test_size=0.25, stratify=train_data['subjectivePoverty_rating'], random_state=42)

train_A.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_A.csv"), index=False)
train_A_X = train_A.drop(columns=['subjectivePoverty_rating'])
train_A_y = train_A['subjectivePoverty_rating']

train_B.to_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_B.csv"), index=False)
train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])
train_B_y = train_B['subjectivePoverty_rating']

print("train_A:", train_A.shape)
print("train_B:", train_B.shape)

train_A: (4000, 41)
train_B: (1334, 41)


### getting the base learning models

In [3]:
# Retreiving model
model_file_rf = "rf_trained_on_filled_A_encoded.joblib"
if model_file_rf in os.listdir("saved_models"):
    model_rf = load(model_file_rf)
else:
    model_rf = generate_best_RF_model(train_A)
    dump(model_rf, f"saved_models/{model_file_rf}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 35, 'min_samples_split': 5, 'n_estimators': 200}
Best Log Loss Score: 1.939987538181482


In [None]:
P_RF = predict_ratings_RF(model_rf, train_B_X)
#P_RF.to_csv(os.path.join("../data/train_B_preds/train_B_preds_rf_2.csv"), index=False)
P_RF.head(3)

In [3]:
model_file_xgb = "xgb_trained_on_unfilled_A_encoded_reg_pru.joblib"
if model_file_xgb in os.listdir("saved_models"):
    model_xgb = load(f"saved_models/{model_file_xgb}")
else:
    model_xgb = generate_best_XGB_model(train_A)
    dump(model_xgb, f"saved_models/{model_file_xgb}")

Starting RandomizedSearchCV...
Fitting 3 folds for each of 100 candidates, totalling 300 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

KeyboardInterrupt: 

In [None]:
P_XGB = predict_ratings_XGB(model_xgb, train_B_X)
P_XGB.to_csv(os.path.join("../data/train_B_preds/train_B_preds_xgb_2.csv"), index=False)
print(P_XGB.shape)
P_XGB.head(3)

In [None]:
# train SVM model on train_B_X
model_file_svm = "svm_trained_on_filled_A_encoded.joblib"
if model_file_svm in os.listdir("saved_models/trained_on_filled"):
    model_svm = load(f"saved_models/{model_file_svm}")
else:
    model_svm = generate_best_SVM_model(train_A)
    dump(model_svm, f"saved_models/{model_file_svm}")

In [7]:
# P_SVM = pd.read_csv("../data/train_B_preds/train_B_preds_rf.csv")
P_SVM = predict_ratings_SVM(model_svm, train_B_X)
P_SVM.to_csv("../data/train_B_preds/train_B_preds_svm.csv", index=False)
print(P_SVM.shape)
P_SVM.head(3)

(4739, 11)


Unnamed: 0,psu_hh_idcode,subjective_poverty1,subjective_poverty2,subjective_poverty3,subjective_poverty4,subjective_poverty5,subjective_poverty6,subjective_poverty7,subjective_poverty8,subjective_poverty9,subjective_poverty10
0,785_6_1,0.042777,0.08721,0.175174,0.234578,0.212712,0.133957,0.067138,0.037819,0.006467,0.00217
1,783_8_3,0.033835,0.076405,0.156816,0.192379,0.216627,0.186597,0.087163,0.039594,0.008834,0.001749
2,561_5_1,0.039653,0.092173,0.179529,0.238296,0.208596,0.126913,0.073349,0.03343,0.006513,0.001548


# Training multinomial LogisticRegression model

In [14]:
# Training the stacking model
def stack_train(X_stack, y_stack):
    
    stack_model = LogisticRegression(multi_class='multinomial', max_iter=100000)
    stack_model.fit(X_stack, y_stack)

    #preds = stack_model.predict(X_stack)
    preds_proba = stack_model.predict_proba(X_stack)
    log_loss_score = log_loss(y_stack, preds_proba)
    print(f"Log Loss from Train_B labels: {log_loss_score:.4f}")
    dump(stack_model, "saved_models/stack_trained_on_filled.joblib")
    return stack_model


# aligned on id column. make sure each row corresponds to the same subject
assert len(P_RF) == len(P_XGB) == len(P_SVM)
assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

X_stack = pd.concat([
    P_RF.drop(columns=['psu_hh_idcode']),
    P_XGB.drop(columns=['psu_hh_idcode']), 
    P_SVM.drop(columns=['psu_hh_idcode'])
    ], axis=1)

y_stack = train_B_y
stack_model = stack_train(X_stack, y_stack)

# At this point, we have our stacked model which we can use to generate predictions for our test set.  

Log Loss from Train_B labels: 1.9294




# Stacking with existing predictions for Train_B

At this point we have the three files:
- train_B_preds_xgb.csv
- train_B_preds_rf.csv
- train_B_preds_svm.csv

In [12]:
from joblib import dump, load

def stack_predict(X_test):

    # Load previously trained models
    model_rf = load("saved_models/rf_trained_on_filled_A.joblib")
    P_RF = predict_ratings_RF(model_rf, X_test)

    model_xgb = load("saved_models/xgb_trained_on_filled_A.joblib")
    P_XGB = predict_ratings_XGB(model_xgb, X_test)

    model_svm = load("saved_models/svm_trained_on_filled_A.joblib")
    P_SVM = predict_ratings_SVM(model_svm, X_test)

    # aligned on id column. make sure each row corresponds to the same subject
    assert (len(P_RF) == len(P_XGB))
    assert (len(P_XGB) == len(P_SVM))
    assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
    assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

    test_ids = X_test['psu_hh_idcode']
    X_stack = pd.concat([
        P_RF.drop(columns=['psu_hh_idcode']),
        P_XGB.drop(columns=['psu_hh_idcode']), 
        P_SVM.drop(columns=['psu_hh_idcode'])
        ], axis=1)

    # final_predictions = stack_model.predict(X_stack)
    final_probabilities = stack_model.predict_proba(X_stack)
    output_df = pd.DataFrame(final_probabilities, columns=[f'subjective_poverty_{i+1}' for i in range(final_probabilities.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

X_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
submission_df = stack_predict(X_test)
submission_df.to_csv("../data/model_result/final_stack_rf_xgb.csv", index=False)
submission_df.head(3)



Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.035,0.080942,0.162766,0.238063,0.217361,0.135634,0.082854,0.038136,0.007873,0.001371
1,1_8_1,0.037324,0.080091,0.181152,0.237227,0.212783,0.129285,0.076822,0.036187,0.007788,0.001341
2,1_10_1,0.018671,0.038077,0.082158,0.150919,0.206934,0.208681,0.174767,0.105468,0.012469,0.001858


1.0

# Trying Stacking with Unfilled Data

In [44]:
train_unfilled = pd.read_csv("../data/clean/TRAIN_MERGED_UNFILLED.csv")
train_A_unfilled, train_B_unfilled = train_test_split(train_unfilled, test_size=0.25, stratify=train_unfilled['subjectivePoverty_rating'], random_state=42)

train_A_unfilled_X = train_A.drop(columns=['subjectivePoverty_rating'])
train_A_unfilled_y = train_A['subjectivePoverty_rating']

train_B_unfilled_X = train_B.drop(columns=['subjectivePoverty_rating'])
train_B_unfilled_y = train_B['subjectivePoverty_rating']

print("train_A:", train_A_unfilled.shape)
print("train_B:", train_B_unfilled.shape)

train_A: (4000, 14)
train_B: (1334, 14)


In [9]:
import tensorflow as tf
import keras_tuner as kt
print(kt.__version__)

1.4.7


In [45]:
model_rf_unfilled = generate_best_RF_model(train_A_unfilled)

P_RF = predict_ratings_RF(model_rf_unfilled, train_B_X)
P_RF.to_csv(os.path.join("../data/train_B_preds/train_B_preds_unfilled_rf.csv"), index=False)
P_RF.head(3)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 42, 'min_samples_split': 5, 'n_estimators': 200}
Best Log Loss Score: 1.9425785952905543


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.050181,0.102449,0.172558,0.214099,0.207975,0.142176,0.071901,0.032231,0.005211,0.001218
1,783_8_3,0.063795,0.071783,0.177513,0.213175,0.175172,0.18187,0.084609,0.024932,0.00662,0.00053
2,561_5_1,0.043903,0.099249,0.1841,0.241651,0.195154,0.133287,0.065821,0.033035,0.003474,0.000325


In [46]:
model_xgb_unfilled = generate_best_XGB_model(train_A_unfilled)
P_XGB = predict_ratings_XGB(model_xgb_unfilled, train_B_X)
P_XGB.head(3)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.3}
Best Log Loss Score: 1.942583005697204


Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,785_6_1,0.053793,0.11367,0.184411,0.222228,0.193372,0.133627,0.062182,0.030279,0.005036,0.001402
1,783_8_3,0.14386,0.061194,0.174682,0.182356,0.118766,0.199183,0.084444,0.023572,0.010198,0.001746
2,561_5_1,0.048,0.092392,0.177748,0.273673,0.221296,0.105726,0.0533,0.023346,0.003297,0.001223


In [33]:
# aligned on id column. make sure each row corresponds to the same subject
assert all(P_RF['psu_hh_idcode'] == P_XGB['psu_hh_idcode'])
# assert all(P_XGB['psu_hh_idcode'] == P_SVM['psu_hh_idcode'])

X_stack = pd.concat([
    P_RF.drop(columns=['psu_hh_idcode']),
    P_XGB.drop(columns=['psu_hh_idcode']), 
    #P_SVM.drop(columns=['psu_hh_idcode'])
    ], axis=1)

y_stack = train_B_unfilled_y
stack_model_unfilled = stack_train(X_stack, y_stack)



Log Loss from Train_B labels: 1.9310


In [34]:
X_test = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
submission_df = stack_predict(X_test)
submission_df.to_csv("../data/model_result/final_stack_rf_xgb_unfilled.csv", index=False)

In [35]:
submission_df.head()

Unnamed: 0,psu_hh_idcode,subjective_poverty_1,subjective_poverty_2,subjective_poverty_3,subjective_poverty_4,subjective_poverty_5,subjective_poverty_6,subjective_poverty_7,subjective_poverty_8,subjective_poverty_9,subjective_poverty_10
0,1_7_1,0.034926,0.081487,0.161801,0.234052,0.21976,0.137408,0.082477,0.038709,0.007912,0.001467
1,1_8_1,0.037256,0.081517,0.178275,0.240017,0.211253,0.128337,0.076902,0.037127,0.007877,0.001439
2,1_10_1,0.019409,0.039248,0.083486,0.152764,0.205995,0.207236,0.173612,0.103876,0.012397,0.001978
3,2_3_1,0.02899,0.066281,0.138853,0.200827,0.214893,0.177242,0.108326,0.053365,0.009576,0.001646
4,3_1_1,0.040487,0.09167,0.184278,0.2265,0.204341,0.13431,0.073386,0.035698,0.00789,0.00144
