In [19]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
import pandas as pd
# Directory the cleaned input CSV files are read from. It is a relative path,
# so it is resolved against the kernel's current working directory.
CLEAN_DATA_DIR = "../data/clean/"

In [20]:
def encoder(data):
  """One-hot encode every column of ``data`` that contains the value -1.

  A column is treated as categorical when -1 occurs among its values; every
  other column is passed through unchanged. The encoder is fitted on ``data``
  itself, so the generated columns depend on the categories present in this
  particular frame.

  Parameters
  ----------
  data : pandas.DataFrame
      Input feature table.

  Returns
  -------
  pandas.DataFrame
      A new frame with the pass-through columns first, followed by the one-hot
      columns (named by ``OneHotEncoder.get_feature_names_out``), carrying the
      same index as ``data``.
  """
  # The presence of -1 is the marker used to pick the categorical columns.
  categorical_cols = [name for name in data.columns if -1 in data[name].values]

  # Nothing to encode: hand back a copy instead of fitting an encoder on a
  # frame with zero columns.
  if not categorical_cols:
    return data.copy()

  # Deliberately not named ``encoder`` so the local does not shadow this function.
  one_hot = OneHotEncoder(sparse_output=False, drop=None)
  encoded = one_hot.fit_transform(data[categorical_cols])

  # Wrap the dense array in a DataFrame aligned with the input rows.
  encoded_df = pd.DataFrame(
      encoded,
      columns=one_hot.get_feature_names_out(categorical_cols),
      index=data.index,
  )
  numerical_df = data.drop(columns=categorical_cols)

  # Pass-through columns first, encoded columns after.
  return pd.concat([numerical_df, encoded_df], axis=1)


def encode_filler(data):
  """Align ``data`` to a fixed list of feature columns.

  Columns from the list that are absent from ``data`` are added and filled
  with 0, columns that are not in the list are dropped, and the result follows
  the order of the list. The input frame is left unmodified.

  Parameters
  ----------
  data : pandas.DataFrame
      Feature table, typically the output of ``encoder``.

  Returns
  -------
  pandas.DataFrame
      A new frame containing exactly the columns in ``na_col``, in that order.
  """
  na_col = ['Q06_11.0', 'Q11_5.0', 'Q06_10.0', 'Q11_9.0', 'Q06_9.0', 'Q07_-1.0', 'Q06_2.0', 'Q01', 'Q11_14.0', 'Q06_5.0',
              'Q11_2.0', 'Q06_4.0', 'Q08_-1.0', 'Q07_4.0', 'Q08_2.0', 'Q07_0.0', 'q02', 'Q06_8.0', 'Q07_2.0', 'Q11_3.0', 'Q03',
              'Q11_4.0', 'q23', 'Q11_7.0', 'Q11_13.0', 'Q06_1.0', 'Q19_2.0', 'Q06_-1.0', 'Q11_-1.0', 'Q11_10.0', 'q05', 'Q07_1.0',
              'Q11_12.0', 'Q19_-1.0', 'Q06_7.0', 'Q11_1.0', 'Q19_1.0', 'Q11_8.0', 'Q08_1.0', 'Q06_0.0', 'q03', 'Q06_3.0', 'q09',
              'Q07_3.0', 'Q11_11.0', 'Q06_6.0']

  missing = [col for col in na_col if col not in data.columns]

  # assign() works on a copy, so the caller's frame does not silently gain
  # the zero-filled columns as a side effect.
  filled_data = data.assign(**dict.fromkeys(missing, 0))

  # Select (and order) exactly the expected columns.
  return filled_data[na_col]

In [21]:
def predict_ratings_SVM(train_data):
    """Tune an RBF-kernel SVC on ``train_data`` and return the best estimator.

    The id column and the target are removed from the features, the remainder
    is run through ``encoder`` and ``encode_filler``, and a 5-fold grid search
    over ``C`` and ``gamma`` (scored by negative log loss) is fitted against
    ``subjectivePoverty_rating``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``psu_hh_idcode`` and ``subjectivePoverty_rating``.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    target = train_data['subjectivePoverty_rating']

    # Features: drop id + target, one-hot encode, then align to the fixed column set.
    raw_features = train_data.drop(columns=['psu_hh_idcode', 'subjectivePoverty_rating'])
    features = encode_filler(encoder(raw_features))

    # NOTE: the features are passed to the SVM unscaled; no StandardScaler step is applied.
    search = GridSearchCV(
        estimator=SVC(probability=True, random_state=42),
        param_grid={
            'C': [0.5, 1, 10, 100],
            'gamma': ['scale', 0.1, 0.01, 0.001],
            'kernel': ['rbf'],
        },
        scoring='neg_log_loss',
        cv=5,
        n_jobs=-1,
    )
    search.fit(features, target)

    return search.best_estimator_

In [16]:
# Presumably the directory model outputs are written to; it is not used in the
# cells visible here.
RESULT_DATA_DIR = "../data/model_result/"

# Load the merged, unfilled training table from the clean-data directory.
train_data = pd.read_csv(
    os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv")
)

# Hold out 25% of the rows, stratified on the target, with a fixed seed.
train_A, train_B = train_test_split(
    train_data,
    test_size=0.25,
    stratify=train_data['subjectivePoverty_rating'],
    random_state=42,
)

# Grid-search the SVM on the 75% split.
model = predict_ratings_SVM(train_A)

In [None]:
# Features of the held-out split: every column except the target.
train_B_X = train_B.drop(columns='subjectivePoverty_rating')

def predictratings_SVM(model, train_B_X, y_true=None):
    """Predict class probabilities for a held-out split and print its log loss.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
        Typically the estimator returned by ``predict_ratings_SVM``.
    train_B_X : pandas.DataFrame
        Held-out features; must contain ``psu_hh_idcode``.
    y_true : array-like, optional
        True ratings for the rows of ``train_B_X``. When omitted, the labels
        are read from the notebook-level ``train_B`` frame, which is what this
        function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``psu_hh_idcode`` followed by one ``subjective_poverty<k>`` column per
        column of ``predict_proba`` (k starting at 1).
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook global. Prefer passing
        # y_true explicitly so the score cannot silently be computed against a
        # different split than the one in train_B_X.
        y_true = train_B['subjectivePoverty_rating']

    test_ids = train_B_X['psu_hh_idcode']

    # Same preprocessing as training: one-hot encode, then align the columns.
    features = encode_filler(encoder(train_B_X))
    preds_proba = model.predict_proba(features)

    # Pass the classifier's own class list so the probability columns are
    # matched to labels explicitly instead of being inferred from y_true
    # (which breaks if the held-out split lacks one of the classes).
    print(log_loss(y_true, preds_proba, labels=getattr(model, 'classes_', None)))

    output_df = pd.DataFrame(preds_proba, columns=[f'subjective_poverty{i+1}' for i in range(preds_proba.shape[1])])
    output_df.insert(0, 'psu_hh_idcode', test_ids.values)  # Insert the ID column at the start
    return output_df

# Score the held-out split and show the per-class probability table.
pred = predictratings_SVM(model=model, train_B_X=train_B_X)
display(pred)

# Recorded result for this run (presumably the log loss printed above):
# 1.9505287965222604 - scale X



In [None]:
# Second experiment: same pipeline, run on the pre-encoded training table.
MODEL_TRAINING_DATA_DIR = "../data/model_training/"

train_data = pd.read_csv(
    os.path.join(MODEL_TRAINING_DATA_DIR, "TRAIN_MERGED_UNFILLED_encoded.csv")
)

# Same 75/25 stratified split and seed as the previous experiment.
train_A, train_B = train_test_split(
    train_data,
    test_size=0.25,
    stratify=train_data['subjectivePoverty_rating'],
    random_state=42,
)

model = predict_ratings_SVM(train_A)

# Held-out features (target removed), then score and display the predictions.
train_B_X = train_B.drop(columns='subjectivePoverty_rating')
pred = predictratings_SVM(model=model, train_B_X=train_B_X)
display(pred)

# Recorded log loss for this run (matches the value printed below):
# 1.9847676398932983 - scale o

1.9847676398932983


Unnamed: 0,psu_hh_idcode,subjective_poverty1,subjective_poverty2,subjective_poverty3,subjective_poverty4,subjective_poverty5,subjective_poverty6,subjective_poverty7,subjective_poverty8,subjective_poverty9,subjective_poverty10
0,125_11_1,0.037714,0.088038,0.162156,0.206846,0.206600,0.153097,0.087770,0.046170,0.009439,0.002170
1,129_9_1,0.044272,0.099092,0.177918,0.225486,0.119236,0.167930,0.096035,0.053468,0.012200,0.004363
2,800_8_1,0.037609,0.086254,0.162107,0.206880,0.207113,0.153115,0.089702,0.046112,0.009003,0.002106
3,472_1_1,0.037549,0.084711,0.162122,0.206384,0.205959,0.152950,0.092459,0.046198,0.009504,0.002163
4,309_11_1,0.037491,0.084642,0.162476,0.206771,0.207688,0.154264,0.091087,0.046064,0.007451,0.002066
...,...,...,...,...,...,...,...,...,...,...,...
1329,588_5_1,0.037643,0.087379,0.162128,0.206865,0.206794,0.153098,0.088508,0.046156,0.009258,0.002172
1330,566_4_1,0.041635,0.093976,0.171549,0.217925,0.150267,0.161936,0.093466,0.050683,0.011375,0.007188
1331,220_7_1,0.037614,0.083467,0.162170,0.206784,0.207284,0.153168,0.092627,0.046017,0.008822,0.002047
1332,144_5_2,0.038440,0.085921,0.162641,0.205601,0.205517,0.153733,0.090047,0.045666,0.008296,0.004137
