In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import os
import pandas as pd
# Directory the grid-search result table is written to.
RESULT_DATA_DIR = "../data/model_result/"
# Directory the training CSV is read from.
MODEL_TRAINING_DATA_DIR = "../data/model_training/"

In [None]:
def predict_ratings_SVM(train_data, result_filename="svm_scaler_filled.csv"):
    """Grid-search an RBF-kernel SVC on ``train_data`` and return the best fit.

    Despite its name, this function trains a model; it does not predict.
    It standardises the ordinal/numerical columns, concatenates them with the
    remaining feature columns, runs a 5-fold ``GridSearchCV`` scored by
    negative log loss, prints the best score, and writes the per-candidate
    mean log loss to a CSV file inside ``RESULT_DATA_DIR``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain 'psu_hh_idcode', the target column
        'subjectivePoverty_rating', and every feature column listed in the
        body of this function.
    result_filename : str, optional
        File name (inside ``RESULT_DATA_DIR``) for the grid-search table.
        Defaults to the name that was previously hard-coded, so existing
        callers are unaffected.

    Returns
    -------
    tuple
        ``(best_estimator, scaler)``: the best SVC found by the search and the
        ``StandardScaler`` fitted on the scalable columns of ``train_data``.
        The same scaler must be applied (``transform`` only) to those columns
        before calling the model on new rows.
    """
    train_x = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])
    y = train_data['subjectivePoverty_rating']

    # Columns that are standardised (ordinal + numerical).
    ordinal_col = ['q23', 'Q06']
    numerical_col = ['q05', 'q09', 'Q07']
    scalable_col = ordinal_col + numerical_col
    scalable_x = train_x[scalable_col]

    # NOTE(review): the scaler is fitted on every row before cross-validation,
    # so each fold's validation rows contribute to the scaling statistics.
    # Returning the scaler separately is part of this function's contract, so
    # it is left unchanged here; a Pipeline would remove the leak if the
    # interface may change.
    scaler = StandardScaler()
    scaled_x = pd.DataFrame(
        scaler.fit_transform(scalable_x),
        columns=scalable_x.columns,
        index=scalable_x.index,
    )

    # These columns are passed through unchanged (no encoding happens here;
    # presumably they were already encoded upstream -- confirm against the
    # data-preparation step).
    ohc_col = ['q02', 'q03_1', 'q03_2', 'q03_3', 'q03_4', 'q03_5', 'q03_6', 'q03_7', 'q03_8', 'q03_9', 'q03_10',
    'q03_11', 'q03_12', 'q03_13', 'q03_14', 'Q03', 'Q08', 'Q11_1', 'Q11_2', 'Q11_3', 'Q11_4',
    'Q11_5', 'Q11_6', 'Q11_7', 'Q11_8', 'Q11_9', 'Q11_10', 'Q11_11', 'Q11_12', 'Q11_13', 'Q19',
    'Q01_0', 'Q01_1', 'Q01_2']
    processed_x = pd.concat([scaled_x, train_x[ohc_col]], axis=1)

    # Hyper-parameter grid for the RBF kernel.
    param_grid = {
        'C': [0.5, 1, 10],
        'gamma': ['scale', 0.1, 0.01],
        'kernel': ['rbf']
    }

    # probability=True is required so the estimator exposes predict_proba,
    # which the 'neg_log_loss' scorer needs.
    optimal_params = GridSearchCV(SVC(probability=True, random_state=42), param_grid, n_jobs=-1, cv=5, scoring='neg_log_loss')
    optimal_params.fit(processed_x, y)
    print("best score: ", optimal_params.best_score_)

    # One row per candidate; the scorer is negated log loss, so flip the sign
    # back to a positive loss (lower is better) and sort best-first.
    results = optimal_params.cv_results_
    results_df = pd.DataFrame(results['params'])
    results_df['Mean Log Loss'] = -results['mean_test_score']
    results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)

    # Create the output directory if needed so a missing folder does not throw
    # away the finished grid search at the very last step.
    os.makedirs(RESULT_DATA_DIR, exist_ok=True)
    results_df.to_csv(os.path.join(RESULT_DATA_DIR, result_filename), index=False)

    return optimal_params.best_estimator_, scaler

In [33]:
# Load the training table from the model-training data directory.
train_data = pd.read_csv(
    os.path.join(MODEL_TRAINING_DATA_DIR, "TRAIN_MERGED_UNFILLED_encoded.csv")
)

# Hold out 25% of the rows (train_B) for validation; the split is stratified
# on the target and seeded so it is reproducible. train_A is used for fitting.
train_A, train_B = train_test_split(
    train_data,
    stratify=train_data['subjectivePoverty_rating'],
    test_size=0.25,
    random_state=42,
)

# Tune and fit the SVM on train_A only; keep the fitted scaler for later use.
model, scaler = predict_ratings_SVM(train_A)

best score:  -1.9566210198631904


In [27]:
# `model` and `scaler` are unpacked directly from predict_ratings_SVM(train_A)
# in the training cell. This cell used to read them from `result`, a name that
# no visible cell assigns, so it raised NameError on a fresh kernel
# (Restart & Run All). It now only sanity-checks the two objects that the
# prediction step relies on.
assert hasattr(model, "predict_proba"), "model must expose predict_proba"
assert hasattr(scaler, "transform"), "scaler must expose transform"

In [29]:
# Validation features: every column of train_B except the target.
# 'psu_hh_idcode' is kept on purpose; the prediction function reads it.
train_B_X = train_B.drop(columns='subjectivePoverty_rating')

def predictratings_SVM(model, scaler, train_B_X, y_true=None):
    """Predict per-class probabilities for ``train_B_X`` with a fitted model.

    The scalable columns are transformed with the already-fitted ``scaler``
    (never re-fitted here), concatenated with the pass-through feature
    columns, and fed to ``model.predict_proba``.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict_proba`` and ``classes_``.
    scaler : transformer
        Fitted scaler exposing ``transform``; applied to the ordinal and
        numerical columns only.
    train_B_X : pandas.DataFrame
        Feature rows; must contain 'psu_hh_idcode' and every feature column
        listed in the body of this function.
    y_true : array-like, optional
        True labels aligned row-for-row with ``train_B_X``. When given, the
        log loss of the predictions is printed. When omitted, no score is
        printed. Previously the labels were read from the global ``train_B``,
        which silently scored against the wrong rows (or failed) whenever the
        function was called on any other frame.

    Returns
    -------
    pandas.DataFrame
        'psu_hh_idcode' followed by one probability column per class, named
        'subjective_poverty1' ... 'subjective_povertyK'.
    """
    test_ids = train_B_X['psu_hh_idcode']

    # Columns that were standardised at training time.
    ordinal_col = ['q23', 'Q06']
    numerical_col = ['q05', 'q09', 'Q07']
    scalable_col = ordinal_col + numerical_col
    scalable_x = train_B_X[scalable_col]

    # transform only: reuse the statistics learned on the training rows.
    scaled_x = pd.DataFrame(
        scaler.transform(scalable_x),
        columns=scalable_x.columns,
        index=scalable_x.index,
    )

    # Pass-through feature columns (no encoding is performed here).
    ohc_col = ['q02', 'q03_1', 'q03_2', 'q03_3', 'q03_4', 'q03_5', 'q03_6', 'q03_7', 'q03_8', 'q03_9', 'q03_10',
    'q03_11', 'q03_12', 'q03_13', 'q03_14', 'Q03', 'Q08', 'Q11_1', 'Q11_2', 'Q11_3', 'Q11_4',
    'Q11_5', 'Q11_6', 'Q11_7', 'Q11_8', 'Q11_9', 'Q11_10', 'Q11_11', 'Q11_12', 'Q11_13', 'Q19',
    'Q01_0', 'Q01_1', 'Q01_2']
    processed_x = pd.concat([scaled_x, train_B_X[ohc_col]], axis=1)

    preds_proba = model.predict_proba(processed_x)

    if y_true is not None:
        # labels=model.classes_ keeps the probability columns aligned with the
        # classes even if y_true happens to lack one of the ratings.
        print(log_loss(y_true, preds_proba, labels=model.classes_))

    # Column k is named by position (k + 1); this equals the rating value only
    # if model.classes_ is 1..K in order -- TODO confirm against the data.
    output_df = pd.DataFrame(preds_proba, columns=[f'subjective_poverty{i+1}' for i in range(preds_proba.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df


# Score on the held-out split by passing its labels explicitly.
pred = predictratings_SVM(model, scaler, train_B_X, y_true=train_B['subjectivePoverty_rating'])
display(pred)




1.9525670441687637


Unnamed: 0,psu_hh_idcode,subjective_poverty1,subjective_poverty2,subjective_poverty3,subjective_poverty4,subjective_poverty5,subjective_poverty6,subjective_poverty7,subjective_poverty8,subjective_poverty9,subjective_poverty10
0,125_11_1,0.042326,0.100207,0.198878,0.220306,0.190766,0.131592,0.074284,0.032160,0.007809,0.001672
1,129_9_1,0.034815,0.089819,0.148711,0.215838,0.219608,0.163542,0.078356,0.038659,0.008513,0.002139
2,800_8_1,0.030884,0.066006,0.142376,0.212844,0.217293,0.167825,0.097959,0.051879,0.011014,0.001920
3,472_1_1,0.028243,0.059863,0.091540,0.176005,0.227839,0.200532,0.132426,0.065874,0.013144,0.004533
4,309_11_1,0.036398,0.101143,0.176531,0.214061,0.219556,0.133580,0.075188,0.033903,0.008071,0.001568
...,...,...,...,...,...,...,...,...,...,...,...
1329,588_5_1,0.041144,0.092536,0.176661,0.219909,0.210417,0.136500,0.079856,0.033037,0.008334,0.001606
1330,566_4_1,0.033879,0.088505,0.178952,0.208840,0.209351,0.152723,0.077726,0.038333,0.008811,0.002880
1331,220_7_1,0.035826,0.083817,0.156081,0.212037,0.217032,0.159349,0.087401,0.037977,0.008284,0.002197
1332,144_5_2,0.034504,0.057781,0.137760,0.156903,0.195478,0.166863,0.134209,0.098464,0.015511,0.002527


psu_hh_idcode	subjective_poverty1	subjective_poverty2	subjective_poverty3	subjective_poverty4	subjective_poverty5	subjective_poverty6	subjective_poverty7	subjective_poverty8	subjective_poverty9	subjective_poverty10
0	125_11_1	0.025525	0.059814	0.087623	0.220639	0.233748	0.182541	0.121753	0.053184	0.011969	0.003203
1	129_9_1	0.037075	0.058612	0.088723	0.214832	0.184983	0.169812	0.163631	0.064149	0.014398	0.003784
2	800_8_1	0.027463	0.057546	0.080168	0.217410	0.228223	0.184984	0.132869	0.054868	0.012673	0.003796
3	472_1_1	0.034906	0.057764	0.085594	0.217024	0.193675	0.172937	0.157929	0.062606	0.014007	0.003559
4	309_11_1	0.027289	0.057630	0.078263	0.222137	0.226802	0.182085	0.132389	0.057131	0.012434	0.003839
...	...	...	...	...	...	...	...	...	...	...	...
1329	588_5_1	0.026330	0.058408	0.083609	0.219034	0.233413	0.183294	0.125962	0.054159	0.012287	0.003504
1330	566_4_1	0.036910	0.058539	0.088463	0.214998	0.185711	0.170098	0.163133	0.064025	0.014363	0.003761
1331	220_7_1	0.031674	0.057190	0.081302	0.220262	0.208272	0.177602	0.146896	0.059892	0.013415	0.003495
1332	144_5_2	0.024264	0.065964	0.090802	0.210098	0.245841	0.185363	0.109233	0.054836	0.011278	0.002322
1333	18_7_1	0.034787	0.057585	0.085540	0.217065	0.194416	0.173364	0.157189	0.062621	0.013908	0.003525
