In [3]:
import numpy as np
import os
import pandas as pd
# Relative path of the directory holding the cleaned CSV files that the cells below read
# via os.path.join(CLEAN_DATA_DIR, <file name>).
CLEAN_DATA_DIR = "../data/clean/"

In [22]:
# train_A: the training set produced by inner joining with SubjectivePoverty
# (the "unfilled" data) => ~5300 rows. 'hhid' is left out when the file has it.
unfilled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
train_A = unfilled.drop(columns=['hhid'], errors='ignore')

# Households already covered by train_A; used to exclude them from the filled file.
subjects_in_train_A = set(train_A['psu_hh_idcode'])

# train_B: every row of the "filled" file whose household id is NOT in train_A
# (author's note: these rows are without labels => ~15000 ish rows?).
filled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_FILLED.csv"))
already_labelled = filled['psu_hh_idcode'].isin(subjects_in_train_A)
filled = filled[~already_labelled]

# Keep only the feature/id columns: 'hhid' and 'rating_filled' are removed when present.
filled_X = filled.drop(columns=['hhid', 'rating_filled'], errors='ignore')
train_B = filled_X

# Show the shape and the first rows of each split.
for frame_name, frame in (("train_A: ", train_A), ("train_B: ", train_B)):
    print(frame_name, frame.shape)
    display(frame.head())

train_A:  (5334, 14)


Unnamed: 0,psu_hh_idcode,subjectivePoverty_rating,q02,q03,q05,q09,q23,Q01,Q03,Q06,Q07,Q08,Q11,Q19
0,30_8_1,4,1,1,44,0,0,1,1,2.0,1.0,2.0,13.0,2.0
1,194_1_2,1,2,2,48,0,0,1,1,2.0,0.0,2.0,13.0,2.0
2,224_6_1,3,1,1,61,0,0,1,1,2.0,0.0,2.0,13.0,2.0
3,323_10_1,5,1,1,66,0,0,1,1,2.0,0.0,2.0,13.0,2.0
4,428_10_1,4,2,1,72,0,0,1,1,1.0,0.0,2.0,14.0,2.0


train_B:  (14929, 13)


Unnamed: 0,psu_hh_idcode,q02,q03,q05,q09,q23,Q01,Q03,Q06,Q07,Q08,Q11,Q19
5334,1_2_2,2,2,47,0,0,1.0,1.0,2.0,1.0,2.0,13.0,2.0
5335,1_3_2,2,2,56,0,0,1.0,1.0,2.0,1.0,2.0,13.0,2.0
5336,1_3_3,1,4,27,0,2,1.0,1.0,2.0,1.0,2.0,13.0,2.0
5337,1_5_2,2,2,49,0,0,1.0,1.0,1.0,0.0,2.0,2.0,2.0
5338,1_5_3,1,4,36,0,2,1.0,1.0,2.0,0.0,2.0,2.0,2.0


In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# LEARNING the stacking weights with training data:
#   1. fit every base learner on train_A
#   2. let every base learner predict class probabilities for train_B
#   3. fit a logistic regression (the meta-model) on those probabilities against
#      the true train_B labels

ID_COL = 'psu_hh_idcode'
LABEL_COL = 'subjectivePoverty_rating'

# The labelled rows live in TRAIN_MERGED_UNFILLED.csv. The FILLED file does not expose a
# 'subjectivePoverty_rating' column (see the train_B preview printed by the earlier cell),
# so stratifying on it / using it as the stacking target would raise a KeyError.
train_data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
train_A, train_B = train_test_split(
    train_data,
    test_size=0.25,
    stratify=train_data[LABEL_COL],
    random_state=42,
)

# Train base learners on train_A
model_rf = generate_best_RF_model(train_A)
model_xgb = generate_best_XGB_model(train_A)
model_svm = generate_best_SVM_model(train_A)

ids = train_B[ID_COL]
train_B_X = train_B.drop(columns=[LABEL_COL])

# Predict rating probabilities for train_B. Each helper returns a dataframe containing
# the id column followed by subjective_poverty_1, ..., subjective_poverty_10.
P_RF = predict_ratings_RF(model_rf, train_B_X)
P_XGB = predict_ratings_XGB(model_xgb, train_B_X)
P_SVM = predict_ratings_SVM(model_svm, train_B_X)

# Every row must describe the same subject in all three prediction frames ...
assert all(P_RF[ID_COL] == P_XGB[ID_COL])
assert all(P_XGB[ID_COL] == P_SVM[ID_COL])
# ... and in train_B itself, otherwise y_stack below would be paired with the wrong rows.
# Compared positionally because train_B keeps the shuffled index from the split.
assert np.array_equal(P_RF[ID_COL].to_numpy(), ids.to_numpy()), \
    "base-learner predictions are not in the same row order as train_B"

# Drop the id column exactly once (dropping it a second time raises a KeyError).
P_RF = P_RF.drop(columns=[ID_COL])
P_XGB = P_XGB.drop(columns=[ID_COL])
P_SVM = P_SVM.drop(columns=[ID_COL])

# Logistic regression on the probabilities of each learner on train_B.
# Column order is RF, then XGB, then SVM; the same order must be used at prediction time.
X_stack = pd.concat([P_RF, P_XGB, P_SVM], axis=1)

y_stack = train_B[LABEL_COL]
stack_model = LogisticRegression(multi_class='multinomial', max_iter=1000)
stack_model.fit(X_stack, y_stack)

# NOTE: this score is computed on the very rows the meta-model was fit on, so it is an
# optimistic (in-sample) estimate, not a held-out one.
final_predictions = stack_model.predict(X_stack)
final_probabilities = stack_model.predict_proba(X_stack)
log_loss_score = log_loss(y_stack, final_probabilities)
print(f"Log Loss from Train_B labels: {log_loss_score:.4f}")

# At this point, we have our stacked model which we can use to generate predictions for our test set.


KeyboardInterrupt



In [None]:
def generate_predictions(model, test_file_path=os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv")):
    """Score a test CSV with ``model`` and return the probabilities as a DataFrame.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)``
        Receives every column of the CSV except 'psu_hh_idcode'.
    test_file_path : str
        CSV file to score; it must contain a 'psu_hh_idcode' column. The default
        path is computed once, when this function is defined.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' first, followed by one column per column of the
        ``predict_proba`` output, named 'subjective_poverty_1',
        'subjective_poverty_2', ... in output order.
    """
    test_frame = pd.read_csv(test_file_path)

    # Set the identifiers aside: they label the output rows but are not model inputs.
    id_values = test_frame['psu_hh_idcode'].values
    features = test_frame.drop(columns=['psu_hh_idcode'])

    probabilities = model.predict_proba(features)

    # One 1-based column label per probability column.
    class_labels = []
    for position in range(probabilities.shape[1]):
        class_labels.append(f'subjective_poverty_{position+1}')

    result = pd.DataFrame(probabilities, columns=class_labels)
    result.insert(0, 'psu_hh_idcode', id_values)  # id column goes first
    return result

# stack_model was fit on the base learners' class probabilities (RF, XGB, SVM side by
# side), not on the raw survey features, so passing TEST_INPUT.csv straight to
# stack_model.predict_proba fails on a feature mismatch. Run the test rows through the
# base learners first, then feed the stacked probabilities to the meta-model.
# NOTE(review): assumes the predict_ratings_* helpers accept the test frame (id column
# included) the same way they accept train_B_X — confirm against their definitions.
test_X = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))

test_P_RF = predict_ratings_RF(model_rf, test_X)
test_P_XGB = predict_ratings_XGB(model_xgb, test_X)
test_P_SVM = predict_ratings_SVM(model_svm, test_X)

# Every row must describe the same subject in all three prediction frames.
assert all(test_P_RF['psu_hh_idcode'] == test_P_XGB['psu_hh_idcode'])
assert all(test_P_XGB['psu_hh_idcode'] == test_P_SVM['psu_hh_idcode'])
test_ids = test_P_RF['psu_hh_idcode'].values

# Same column order as at fit time: RF, then XGB, then SVM, ids removed.
test_X_stack = pd.concat([
    test_P_RF.drop(columns=['psu_hh_idcode']),
    test_P_XGB.drop(columns=['psu_hh_idcode']),
    test_P_SVM.drop(columns=['psu_hh_idcode']),
], axis=1)

stacked_proba = stack_model.predict_proba(test_X_stack)

# Same output layout as generate_predictions: id first, then one column per class.
submission_predictions = pd.DataFrame(
    stacked_proba,
    columns=[f'subjective_poverty_{i+1}' for i in range(stacked_proba.shape[1])],
)
submission_predictions.insert(0, 'psu_hh_idcode', test_ids)

# Make sure the target directory exists before writing the submission file.
os.makedirs("../data/submissions/", exist_ok=True)
submission_predictions.to_csv(os.path.join("../data/submissions/", "submission_stacked.csv"), index=False)