In [23]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import os
import pandas as pd
# Data directories, given as paths relative to the notebook's working directory.
# Only referenced from a commented-out load in the visible cells.
MODEL_TRAINING_DATA_DIR = "../data/model_training/"
# Location of the training CSV that is read below (TRAIN_MERGED_UNFILLED.csv).
CLEAN_DATA_DIR = "../data/clean/"
# Destination for the grid-search result CSV written by predict_ratings_SVM.
RESULT_DATA_DIR = "../data/model_result/"

In [24]:
def predict_ratings_SVM(train_data, param_grid=None, cv=5,
                        results_filename="ecx_svm_pipeline_unfilled.csv"):
    """Grid-search an RBF-kernel SVM pipeline and return the best fitted pipeline.

    The pipeline min-max scales the numerical and ordinal columns, passes the
    binary columns through unchanged and feeds the result to
    ``SVC(probability=True)``. Candidates are ranked by cross-validated
    negative log-loss.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain 'psu_hh_idcode', 'subjectivePoverty_rating' (the target)
        and every feature column named in the lists below.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``; keys use the ``classifier__`` prefix.
        Defaults to the C / gamma grid over the RBF kernel defined below.
    cv : int or CV splitter, optional
        Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
    results_filename : str, optional
        Name of the CSV written inside ``RESULT_DATA_DIR``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the search, refit on all of ``train_data``.

    Side effects
    ------------
    Prints the best (negative) log-loss and writes one row per candidate,
    sorted by mean log-loss, to ``RESULT_DATA_DIR/results_filename``.
    """
    # Create the output directory before the expensive search so that a
    # missing folder cannot discard the fitted model at the very last step.
    os.makedirs(RESULT_DATA_DIR, exist_ok=True)

    train_x = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])
    y = train_data['subjectivePoverty_rating']

    ordinal_col = ['q23', 'Q06', 'Q01']
    numerical_col = ['q05', 'q09', 'Q07']
    binary_col = ['q02', 'q03', 'Q03', 'Q08', 'Q11', 'Q19']

    # Columns not listed in any transformer are dropped
    # (ColumnTransformer's default is remainder='drop').
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_col),     # Scale numerical features
            ('ord', MinMaxScaler(), ordinal_col),       # Scale ordinal categorical features
            ('one_hot', 'passthrough', binary_col)      # Binary features are used as-is
        ]
    )

    # Complete pipeline with SVM; probability=True is required for predict_proba
    # and therefore for log-loss scoring.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC(probability=True, random_state=42))
    ])

    if param_grid is None:
        param_grid = {
            'classifier__C': [0.1, 0.5, 1, 10],
            'classifier__gamma': ['scale', 0.1, 0.01, 0.001],
            'classifier__kernel': ['rbf']
        }

    optimal_params = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=cv,
                                  scoring='neg_log_loss', verbose=1)

    # Fit the model
    optimal_params.fit(train_x, y)
    print("best score: ", optimal_params.best_score_)

    # One row per candidate; the scorer is negated log-loss, so flip the sign
    # back to a positive "lower is better" value before saving.
    results = optimal_params.cv_results_
    results_df = pd.DataFrame(results['params'])
    results_df['Mean Log Loss'] = -results['mean_test_score']
    results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)
    results_df.to_csv(os.path.join(RESULT_DATA_DIR, results_filename), index=False)
    return optimal_params.best_estimator_

['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03']
['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03']
['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03']

In [25]:
# Alternative input, kept for reference:
# train_data = pd.read_csv(os.path.join(MODEL_TRAINING_DATA_DIR, "TRAIN_MERGED_UNFILLED_encoded.csv"))
train_csv_path = os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv")
train_data = pd.read_csv(train_csv_path)

# 75 % / 25 % split, stratified on the target; the fixed seed keeps it reproducible.
train_A, train_B = train_test_split(
    train_data,
    test_size=0.25,
    stratify=train_data['subjectivePoverty_rating'],
    random_state=42,
)

# The grid search only ever sees the 75 % partition.
model = predict_ratings_SVM(train_A)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
best score:  -1.9624797401918197


In [26]:
def predictratings_SVM(model, train_B_X, y_true=None):
    """Score a fitted classifier on hold-out rows and return its class probabilities.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict_proba``. It receives ``train_B_X`` unchanged
        (id column included), so it has to select its own feature columns.
    train_B_X : pandas.DataFrame
        Hold-out rows; must contain 'psu_hh_idcode'.
    y_true : array-like, optional
        True ratings aligned row-for-row with ``train_B_X``. When omitted, the
        notebook-global ``train_B['subjectivePoverty_rating']`` is used, which
        keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' followed by one 'subjective_poverty<k>' column per
        probability column, numbered from 1 in the order ``predict_proba``
        returns them.

    Raises
    ------
    ValueError
        If ``y_true`` and ``train_B_X`` have different lengths.

    Side effects
    ------------
    Prints the hold-out log-loss.
    """
    test_ids = train_B_X['psu_hh_idcode']

    if y_true is None:
        # Backward-compatible fallback to the notebook-global hold-out frame.
        y_true = train_B['subjectivePoverty_rating']
    if len(y_true) != len(train_B_X):
        raise ValueError(
            f"y_true has {len(y_true)} rows but train_B_X has {len(train_B_X)}"
        )

    preds_proba = model.predict_proba(train_B_X)

    # Pass the classes seen during training so the probability columns stay
    # aligned even when the hold-out set lacks one of them. None (the sklearn
    # default) is used when the model does not expose classes_.
    print(log_loss(y_true, preds_proba, labels=getattr(model, 'classes_', None)))

    output_df = pd.DataFrame(
        preds_proba,
        columns=[f'subjective_poverty{i+1}' for i in range(preds_proba.shape[1])],
    )
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

In [27]:
# Build the hold-out feature frame: only the target is dropped here; the
# 'psu_hh_idcode' column stays because predictratings_SVM reads it.
train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])
pred = predictratings_SVM(model, train_B_X)
display(pred)
# Hold-out log-loss values noted from earlier runs of this cell.
# NOTE(review): the "X" / "0" flags are not defined anywhere in the visible
# notebook - presumably they mark whether the pipeline / encoding step was
# off or on; confirm with the author before relying on these numbers.
#1.9505287965222604 - pipeline X, encoded X
#1.9847676398932983 - pipeline X, encoded 0
#1.9525670441687637 - pipeline 0 , encoded 0 1.953911477755434
#1.9533630140413745 - pipeline 0 , encoded X


1.9575881303365301


Unnamed: 0,psu_hh_idcode,subjective_poverty1,subjective_poverty2,subjective_poverty3,subjective_poverty4,subjective_poverty5,subjective_poverty6,subjective_poverty7,subjective_poverty8,subjective_poverty9,subjective_poverty10
0,125_11_1,0.039290,0.091817,0.178437,0.214899,0.209453,0.143429,0.078440,0.034249,0.008007,0.001979
1,129_9_1,0.035739,0.091321,0.171501,0.214516,0.209784,0.151591,0.076997,0.037985,0.008212,0.002354
2,800_8_1,0.035694,0.072921,0.165916,0.213400,0.209383,0.154384,0.090034,0.047295,0.009538,0.001436
3,472_1_1,0.029825,0.063753,0.094181,0.191058,0.222054,0.199455,0.116729,0.066007,0.012675,0.004265
4,309_11_1,0.038018,0.088696,0.170608,0.210838,0.209003,0.147142,0.085202,0.041189,0.007888,0.001415
...,...,...,...,...,...,...,...,...,...,...,...
1329,588_5_1,0.040227,0.087402,0.175998,0.214224,0.206970,0.143691,0.084469,0.037193,0.008532,0.001293
1330,566_4_1,0.036206,0.089240,0.172897,0.211116,0.204938,0.150938,0.082090,0.041067,0.009077,0.002431
1331,220_7_1,0.037429,0.088212,0.164148,0.207216,0.208730,0.154948,0.086079,0.042369,0.008590,0.002279
1332,144_5_2,0.051523,0.058908,0.084303,0.157352,0.211073,0.191334,0.152542,0.076725,0.013287,0.002954
