In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import os
import pandas as pd
# Relative directory that the cleaned CSV inputs are read from (joined with a file name via os.path.join).
CLEAN_DATA_DIR = "../data/clean/"

In [2]:
def encoder(data):
  """One-hot encode every column of ``data`` that contains the value -1.

  A column is selected for encoding when -1 appears among its values; all
  other columns are carried over unchanged. The encoder is fitted on the
  frame passed in, so the set of output columns depends on the categories
  present in ``data``.

  Parameters
  ----------
  data : pandas.DataFrame
      Input features. Not modified.

  Returns
  -------
  pandas.DataFrame
      New frame holding the untouched columns followed by the one-hot
      columns (named by ``get_feature_names_out``), on the same index as
      ``data``.
  """
  # Columns holding a -1 value are the ones this function encodes.
  cat_cols = [name for name in data.columns if -1 in data[name].values]

  if not cat_cols:
    # Nothing to encode: skip fitting an encoder on a zero-column frame and
    # still hand back a fresh object, like the concat below would.
    return data.copy()

  # Dense output so the result can be wrapped directly in a DataFrame.
  one_hot = OneHotEncoder(sparse_output=False, drop=None)
  encoded = one_hot.fit_transform(data[cat_cols])

  # Keep the original index so the concat below aligns row-for-row.
  encoded_df = pd.DataFrame(
      encoded,
      columns=one_hot.get_feature_names_out(cat_cols),
      index=data.index,
  )
  untouched_df = data.drop(columns=cat_cols)

  return pd.concat([untouched_df, encoded_df], axis=1)


def encode_filler(data):
  """Return ``data`` restricted to a fixed list of columns, zero-filling absent ones.

  Any column from the fixed list that is missing in ``data`` is created
  with the constant 0. Columns of ``data`` that are not in the list are
  dropped, and the result follows the list's column order.

  Parameters
  ----------
  data : pandas.DataFrame
      Input frame. It is NOT modified; a new frame is returned.

  Returns
  -------
  pandas.DataFrame
      Frame with exactly the expected columns, in the fixed order.
  """
  expected_cols = ['Q11_5.0', 'Q11_9.0', 'Q11_14.0',
              'Q11_2.0', 'Q08_-1.0', 'Q08_2.0', 'q02', 'Q11_3.0', 'Q03',
              'Q11_4.0', 'Q11_7.0', 'Q11_13.0', 'Q19_2.0', 'Q11_-1.0', 'Q11_10.0',
              'Q11_12.0', 'Q19_-1.0',  'Q11_1.0', 'Q19_1.0', 'Q11_8.0', 'Q08_1.0',  'q03',
               'Q11_11.0', ]

  # reindex selects/orders the expected columns and creates the absent ones
  # filled with 0, without writing new columns into the caller's frame.
  return data.reindex(columns=expected_cols, fill_value=0)

In [None]:
def predict_ratings_SVM(train_data):
    """Grid-search an RBF-kernel SVC pipeline on ``train_data`` and return the best one.

    The target is the ``subjectivePoverty_rating`` column; ``psu_hh_idcode``
    and the target are removed from the feature frame. Features are routed
    through a ColumnTransformer (two groups standardised, one group passed
    through) into ``SVC(probability=True)``. A 4 x 4 grid over ``C`` and
    ``gamma`` is evaluated with 5-fold cross-validation, scored by negative
    log loss, using all available cores.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``psu_hh_idcode``, ``subjectivePoverty_rating`` and the
        feature columns listed below.

    Returns
    -------
    The ``best_estimator_`` of the fitted GridSearchCV (the full pipeline).
    """
    target = train_data['subjectivePoverty_rating']
    features = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])

    # Column groups consumed by the ColumnTransformer; columns not listed
    # in any group are not forwarded to the classifier.
    scaled_numeric = ['q05', 'q09', 'Q07']
    scaled_ordinal = ['q23', 'Q01', 'Q06']
    untouched = ['q02', 'q03', 'Q03', 'Q08', 'Q11', 'Q19']

    column_steps = [
        ('num', StandardScaler(), scaled_numeric),   # standardise numerical features
        ('ord', StandardScaler(), scaled_ordinal),   # standardise ordinal features
        # NOTE(review): this step is named 'one_hot' but performs no encoding;
        # the columns reach the SVM exactly as they appear in train_data.
        # Confirm they are already in model-ready form.
        ('one_hot', 'passthrough', untouched),
    ]

    svm_pipeline = Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=column_steps)),
        # probability=True is what makes predict_proba / log-loss scoring available.
        ('classifier', SVC(probability=True, random_state=42)),
    ])

    # Hyper-parameter grid: 4 values of C x 4 values of gamma, RBF kernel only.
    search_space = {
        'classifier__C': [0.5, 1, 10, 100],
        'classifier__gamma': ['scale', 0.1, 0.01, 0.001],
        'classifier__kernel': ['rbf'],
    }

    search = GridSearchCV(
        svm_pipeline,
        search_space,
        n_jobs=-1,
        cv=5,
        scoring='neg_log_loss',
        verbose=1,
    )
    search.fit(features, target)

    return search.best_estimator_

In [17]:
# Output directory for model results (not referenced by the other cells visible here).
RESULT_DATA_DIR = "../data/model_result/"

# Load the merged training table from the clean-data directory.
train_csv_path = os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv")
train_data = pd.read_csv(train_csv_path)

# 75/25 split, stratified on the rating so both parts keep its distribution;
# the fixed seed makes the split repeatable.
train_A, train_B = train_test_split(
    train_data,
    test_size=0.25,
    stratify=train_data['subjectivePoverty_rating'],
    random_state=42,
)

# Tune and fit the SVM pipeline on the larger part only; train_B stays held out.
model = predict_ratings_SVM(train_A)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03']
['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03']
['q02', 'q03', 'q05', 'q09', 'q23', 'Q01', 'Q03']

In [22]:
# Hold-out features: train_B without the target column (psu_hh_idcode is kept).
train_B_X = train_B.drop(columns=['subjectivePoverty_rating'])

def predictratings_SVM(model, train_B_X, y_true=None):
    """Predict class probabilities, print the log loss, and return them as a table.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba``. If it also exposes ``classes_``, those
        labels are passed to ``log_loss`` so the probability columns stay
        aligned with the classes seen at fit time even when ``y_true`` does
        not contain every class.
    train_B_X : pandas.DataFrame
        Feature frame to score; must contain a ``psu_hh_idcode`` column,
        which is copied into the output as the row identifier.
    y_true : array-like, optional
        True ratings for the rows of ``train_B_X``. When omitted, the labels
        are read from the notebook-level ``train_B['subjectivePoverty_rating']``
        (the previous behaviour); pass it explicitly to avoid depending on
        that global.

    Returns
    -------
    pandas.DataFrame
        ``psu_hh_idcode`` followed by one ``subjective_poverty<k>`` column per
        probability column, with ``k`` counting from 1.
    """
    test_ids = train_B_X['psu_hh_idcode']
    preds_proba = model.predict_proba(train_B_X)

    if y_true is None:
        # Legacy fallback: the labels of the notebook's hold-out frame.
        y_true = train_B['subjectivePoverty_rating']

    # Without explicit labels, log_loss infers the class set from y_true,
    # which no longer matches the probability columns if a class is absent.
    class_labels = getattr(model, 'classes_', None)
    print(log_loss(y_true, preds_proba, labels=class_labels))

    proba_columns = [f'subjective_poverty{i+1}' for i in range(preds_proba.shape[1])]
    output_df = pd.DataFrame(preds_proba, columns=proba_columns)
    # .values drops the source index so IDs are assigned positionally.
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

# Score the fitted model on the hold-out features (the call prints the log loss)
# and show the returned per-class probability table.
pred = predictratings_SVM(model, train_B_X)
display(pred)

# Value noted from an earlier run, labelled "scale X": 1.9505287965222604



1.9533630140413727


Unnamed: 0,psu_hh_idcode,subjective_poverty1,subjective_poverty2,subjective_poverty3,subjective_poverty4,subjective_poverty5,subjective_poverty6,subjective_poverty7,subjective_poverty8,subjective_poverty9,subjective_poverty10
0,125_11_1,0.039803,0.093217,0.182597,0.224901,0.204218,0.137551,0.075938,0.031804,0.007923,0.002049
1,129_9_1,0.033954,0.089941,0.156894,0.213108,0.213700,0.161301,0.077689,0.042275,0.008087,0.003051
2,800_8_1,0.031121,0.068129,0.157122,0.217648,0.209474,0.160807,0.091368,0.052482,0.010940,0.000910
3,472_1_1,0.029062,0.058346,0.091228,0.168264,0.231946,0.205058,0.130607,0.066442,0.013639,0.005408
4,309_11_1,0.038624,0.090862,0.174254,0.213264,0.210572,0.144435,0.082904,0.036531,0.007472,0.001081
...,...,...,...,...,...,...,...,...,...,...,...
1329,588_5_1,0.042078,0.088893,0.178973,0.222101,0.200386,0.140713,0.081242,0.036272,0.008636,0.000706
1330,566_4_1,0.034969,0.089865,0.176643,0.213351,0.199546,0.150787,0.081267,0.040948,0.008889,0.003736
1331,220_7_1,0.036896,0.086409,0.161406,0.207002,0.211835,0.156614,0.086781,0.041645,0.008455,0.002959
1332,144_5_2,0.036821,0.065292,0.133330,0.202747,0.202078,0.168880,0.115258,0.064309,0.010154,0.001129


psu_hh_idcode	subjective_poverty1	subjective_poverty2	subjective_poverty3	subjective_poverty4	subjective_poverty5	subjective_poverty6	subjective_poverty7	subjective_poverty8	subjective_poverty9	subjective_poverty10
0	125_11_1	0.025525	0.059814	0.087623	0.220639	0.233748	0.182541	0.121753	0.053184	0.011969	0.003203
1	129_9_1	0.037075	0.058612	0.088723	0.214832	0.184983	0.169812	0.163631	0.064149	0.014398	0.003784
2	800_8_1	0.027463	0.057546	0.080168	0.217410	0.228223	0.184984	0.132869	0.054868	0.012673	0.003796
3	472_1_1	0.034906	0.057764	0.085594	0.217024	0.193675	0.172937	0.157929	0.062606	0.014007	0.003559
4	309_11_1	0.027289	0.057630	0.078263	0.222137	0.226802	0.182085	0.132389	0.057131	0.012434	0.003839
...	...	...	...	...	...	...	...	...	...	...	...
1329	588_5_1	0.026330	0.058408	0.083609	0.219034	0.233413	0.183294	0.125962	0.054159	0.012287	0.003504
1330	566_4_1	0.036910	0.058539	0.088463	0.214998	0.185711	0.170098	0.163133	0.064025	0.014363	0.003761
1331	220_7_1	0.031674	0.057190	0.081302	0.220262	0.208272	0.177602	0.146896	0.059892	0.013415	0.003495
1332	144_5_2	0.024264	0.065964	0.090802	0.210098	0.245841	0.185363	0.109233	0.054836	0.011278	0.002322
1333	18_7_1	0.034787	0.057585	0.085540	0.217065	0.194416	0.173364	0.157189	0.062621	0.013908	0.003525
