In [107]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import os
import pandas as pd
from joblib import dump, load
# Directory the training CSV is read from (relative to the notebook's working directory).
MODEL_TRAINING_DATA_DIR = "../data/model_training/"
# Directory of the alternative input data; only referenced by a commented-out read in this notebook.
CLEAN_DATA_DIR = "../data/clean/"
# Directory the grid-search result table is written to by predict_ratings_SVM.
RESULT_DATA_DIR = "../data/model_result/"

In [126]:
def predict_ratings_SVM(train_data, param_grid=None, cv=5,
                        results_filename="svm_pipeline_filled_ss.csv"):
    """Grid-search an SVM pipeline on ``train_data`` and return the best fitted pipeline.

    The pipeline standardises the numerical columns, min-max scales the ordinal
    columns, passes the binary / one-hot columns through unchanged and fits an
    ``SVC(probability=True)``. Candidates are ranked by cross-validated
    ``neg_log_loss``. The per-candidate mean log loss is written as a CSV to
    ``RESULT_DATA_DIR`` and the best score is printed.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``'psu_hh_idcode'``, ``'subjectivePoverty_rating'`` (the
        target) and every feature column listed in the body.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``; keys address pipeline steps
        (e.g. ``'classifier__C'``). Defaults to the original RBF grid over
        ``C`` and ``gamma``.
    cv : int or CV splitter, optional
        Cross-validation strategy passed to ``GridSearchCV`` (default 5 folds).
    results_filename : str, optional
        File name, inside ``RESULT_DATA_DIR``, of the CSV holding the results.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the search, i.e. the best pipeline refit on all
        of ``train_data``.

    Raises
    ------
    KeyError
        If the id or target column is missing from ``train_data``.
    """
    train_x = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])
    y = train_data['subjectivePoverty_rating']

    ordinal_col = ['q23', 'Q06']
    numerical_col = ['q05', 'q09', 'Q07']
    binary_col = [
        'q02',
        'q03_1', 'q03_2', 'q03_3', 'q03_4', 'q03_5', 'q03_6', 'q03_7',
        'q03_8', 'q03_9', 'q03_10', 'q03_11', 'q03_12', 'q03_13', 'q03_14',
        'Q03', 'Q08',
        'Q11_1', 'Q11_2', 'Q11_3', 'Q11_4', 'Q11_5', 'Q11_6', 'Q11_7',
        'Q11_8', 'Q11_9', 'Q11_10', 'Q11_11', 'Q11_12', 'Q11_13',
        'Q19',
        'Q01_0', 'Q01_1', 'Q01_2',
    ]

    # Preprocessing: columns not listed in any transformer are dropped
    # (ColumnTransformer's default remainder='drop').
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_col),    # Scale numerical features
            ('ord', MinMaxScaler(), ordinal_col),        # Scale ordinal categorical features
            ('one_hot', 'passthrough', binary_col)       # Leave one-hot-encoded features unchanged
        ]
    )

    # Scaling lives inside the pipeline so it is re-fit on each CV training fold.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC(probability=True, random_state=42))
    ])

    if param_grid is None:
        param_grid = {
            'classifier__C': [0.5, 1, 10],
            'classifier__gamma': ['scale', 0.1, 0.01],
            'classifier__kernel': ['rbf']
        }

    optimal_params = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=cv, scoring='neg_log_loss', verbose=1)

    # Fit the model
    optimal_params.fit(train_x, y)
    print("best score: ", optimal_params.best_score_)

    # Build one row per candidate: its hyperparameters plus the mean CV log loss.
    results = optimal_params.cv_results_
    results_df = pd.DataFrame(results['params'])
    # The scorer is *negative* log loss; flip the sign so lower is better.
    results_df['Mean Log Loss'] = -results['mean_test_score']
    results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)

    # Create the output directory if needed so a missing folder cannot discard
    # the results of an expensive search.
    os.makedirs(RESULT_DATA_DIR, exist_ok=True)
    results_df.to_csv(os.path.join(RESULT_DATA_DIR, results_filename), index=False)

    return optimal_params.best_estimator_

In [127]:
# Load the encoded, filled training table.
# (Alternative input: "TRAIN_MERGED_UNFILLED.csv" under CLEAN_DATA_DIR.)
train_csv_path = os.path.join(MODEL_TRAINING_DATA_DIR, "TRAIN_MERGED_FILLED_encoded.csv")
train_data = pd.read_csv(train_csv_path)

# 75% (train_A) is used for the grid search; 25% (train_B) is held out.
# The split is stratified on the target and seeded.
train_A, train_B = train_test_split(
    train_data,
    test_size=0.25,
    stratify=train_data['subjectivePoverty_rating'],
    random_state=42,
)

model = predict_ratings_SVM(train_A)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to di

best score:  -1.9476993086970533


In [116]:
# Persist the fitted pipeline returned by predict_ratings_SVM.
saved_model_path = "../3-modelling/saved_models/svm_trained_on_filled_A_encoded_pp_ss_1.945.joblib"
dump(model, saved_model_path)

['../3-modelling/saved_models/svm_trained_on_filled_A_encoded_pp_ss_1.945.joblib']

In [96]:
def predictratings_SVM(model, train_B_X, y_true=None):
    """Predict class probabilities, print the log loss and return them with ids.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict_proba``; if it exposes ``classes_`` those are
        used to align the probability columns when scoring.
    train_B_X : pandas.DataFrame
        Rows to score. Must contain ``'psu_hh_idcode'`` plus whatever columns
        ``model`` needs.
    y_true : array-like, optional
        True labels for the rows of ``train_B_X``. When omitted, the labels are
        taken from the notebook-level ``train_B`` frame (the original
        behaviour), so ``train_B_X`` must then be derived from ``train_B``.

    Returns
    -------
    pandas.DataFrame
        ``'psu_hh_idcode'`` followed by one ``subjective_poverty{k}`` column per
        probability column, numbered from 1 in the order ``predict_proba``
        returns them.
    """
    test_ids = train_B_X['psu_hh_idcode']
    preds_proba = model.predict_proba(train_B_X)

    if y_true is None:
        # Backward-compatible fallback to the notebook global.
        y_true = train_B['subjectivePoverty_rating']

    # Pass the model's class list explicitly: without it log_loss infers the
    # labels from y_true and fails (or mis-aligns columns) when the evaluated
    # rows do not contain every class the model was trained on.
    print(log_loss(y_true, preds_proba, labels=getattr(model, 'classes_', None)))

    output_df = pd.DataFrame(preds_proba, columns=[f'subjective_poverty{i+1}' for i in range(preds_proba.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

In [97]:
# Score the held-out quarter: strip the target, predict, and show the table.
train_B_X = train_B.drop(labels=['subjectivePoverty_rating'], axis='columns')
pred = predictratings_SVM(model, train_B_X)
display(pred)
# Hold-out log loss recorded for each configuration tried.
# NOTE(review): "0" appears to mean "used" and "X" "not used" (the run shown
# here uses both the pipeline and the encoded data) -- confirm.
#   1.9505287965222604 - pipeline X, encoded X
#   1.9847676398932983 - pipeline X, encoded 0
#   1.9533662772985925 - pipeline 0, encoded 0
#   1.9575881303365301 - pipeline 0, encoded X

1.9533662772985925


Unnamed: 0,psu_hh_idcode,subjective_poverty1,subjective_poverty2,subjective_poverty3,subjective_poverty4,subjective_poverty5,subjective_poverty6,subjective_poverty7,subjective_poverty8,subjective_poverty9,subjective_poverty10
0,125_11_1,0.039890,0.092944,0.180141,0.218821,0.209198,0.141438,0.074578,0.033406,0.007520,0.002064
1,129_9_1,0.036594,0.090774,0.169973,0.217414,0.210214,0.149580,0.076856,0.038560,0.007703,0.002332
2,800_8_1,0.034935,0.070608,0.151127,0.212988,0.210181,0.157492,0.098014,0.052061,0.011088,0.001506
3,472_1_1,0.029240,0.061385,0.091293,0.191307,0.225495,0.198943,0.118426,0.065822,0.013824,0.004264
4,309_11_1,0.038074,0.090968,0.175809,0.211597,0.210086,0.144472,0.080585,0.039616,0.007243,0.001550
...,...,...,...,...,...,...,...,...,...,...,...
1329,588_5_1,0.039886,0.087919,0.178151,0.218339,0.206414,0.141980,0.083736,0.033901,0.008274,0.001399
1330,566_4_1,0.034881,0.089563,0.181598,0.214946,0.198427,0.148075,0.079150,0.041630,0.009236,0.002493
1331,220_7_1,0.037741,0.088074,0.163341,0.209686,0.210014,0.153953,0.085054,0.041674,0.008252,0.002212
1332,144_5_2,0.037947,0.056039,0.088604,0.096079,0.216874,0.191699,0.170707,0.115162,0.020928,0.005961
