# SVM

In [3]:
import os 
import pandas as pd 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, log_loss, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer

CLEAN_DATA_DIR = "../data/clean/"
RESULT_DATA_DIR = "../data/model_result/"

## Apply SVM on unfilled dataset
### Missing Values

In [4]:
svm_unfilled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))

missing_columns = svm_unfilled.columns[svm_unfilled.isnull().any()].tolist()
print( 'Columns with na values are: ', missing_columns)

# Fill na values with 'missing'
svm_unfilled['Q06'] = svm_unfilled['Q06'].fillna(-1)
svm_unfilled['Q07'] = svm_unfilled['Q07'].fillna(-1)
svm_unfilled['Q08'] = svm_unfilled['Q08'].fillna(-1)
svm_unfilled['Q11'] = svm_unfilled['Q11'].fillna(-1)
svm_unfilled['Q19'] = svm_unfilled['Q19'].fillna(-1)

Columns with na values are:  ['Q06', 'Q07', 'Q08', 'Q11', 'Q19']


In [5]:
# One-hot encode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded = encoder.fit_transform(svm_unfilled[missing_columns])


# Convert to DataFrame and combine with numerical features
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(missing_columns))
numerical_df = svm_unfilled.drop(columns=missing_columns)

# Combine numerical and encoded categorical data
processed_df = pd.concat([numerical_df, encoded_df], axis=1)

print("\nAfter One-Hot Encoding:")
print(processed_df)


After One-Hot Encoding:
     psu_hh_idcode   hhid  subjectivePoverty_rating  q02  q03  q05  q09  q23  \
0           30_8_1   3008                         4    1    1   44    0    0   
1          194_1_2  19401                         1    2    2   48    0    0   
2          224_6_1  22406                         3    1    1   61    0    0   
3         323_10_1  32310                         5    1    1   66    0    0   
4         428_10_1  42810                         4    2    1   72    0    0   
...            ...    ...                       ...  ...  ...  ...  ...  ...   
5329       571_8_1  57108                         3    2    1   73    0    0   
5330       601_5_1  60105                         4    1    1   60    0    0   
5331       782_1_1  78201                         2    1    1   55    0    0   
5332       606_3_1  60603                         5    1    1   53    0    1   
5333       450_4_1  45004                         4    1    1   78    0    0   

      Q01  Q03

### Divide data to train/test

In [None]:
y = processed_df['subjectivePoverty_rating']
print(y)

feature_cols = list(processed_df.columns.difference(['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']))
X = processed_df[feature_cols]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_x)
X_test_scaled = scaler.transform(test_x)

0       4
1       1
2       3
3       5
4       4
       ..
5329    3
5330    4
5331    2
5332    5
5333    4
Name: subjectivePoverty_rating, Length: 5334, dtype: int64
(4267, 45) (1067, 45) (4267,) (1067,)
      Q01  Q03  Q06_-1.0  Q06_0.0  Q06_1.0  Q06_10.0  Q06_11.0  Q06_2.0  \
1158    1    1       0.0      0.0      1.0       0.0       0.0      0.0   
4359    2    1       0.0      0.0      1.0       0.0       0.0      0.0   
3088    2    1       0.0      0.0      0.0       0.0       0.0      1.0   
803     1    1       0.0      0.0      0.0       0.0       0.0      0.0   
168     1    1       0.0      0.0      0.0       0.0       0.0      1.0   
...   ...  ...       ...      ...      ...       ...       ...      ...   
644     1    1       0.0      0.0      0.0       0.0       0.0      0.0   
1643    1    1       0.0      0.0      0.0       0.0       0.0      1.0   
3934    1    1       0.0      0.0      0.0       0.0       0.0      0.0   
2810    1    1       0.0      0.0      0.0 

### Hyeprparameter tuning with grid search

In [8]:
# GridSearch CV
param_grid = {
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
optimal_params = GridSearchCV(SVC(probability=True, random_state=42), param_grid, cv=5, scoring='neg_log_loss', verbose=2)
optimal_params.fit(X_train_scaled, train_y)

print("Best Parameters:", optimal_params.best_params_)
print("Best Log Loss:", optimal_params.best_score_)

# Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
# Best Log Loss: -1.948786842738619



Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.5s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.4s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.4s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.4s
[CV] END .....................C=0.5, gamma=scale, kernel=rbf; total time=   2.4s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END .......................C=0.5, gamma=0.1, kernel=rbf; total time=   2.4s
[CV] END ......................C=0.5, gamma=0.01, kernel=rbf; total time=   2.3s
[CV] END ......................C=0.5, gamma=0.01

In [None]:
results = optimal_params.cv_results_
log_loss_scores = results['mean_test_score']  # Mean log loss (negative)
hyperparameters = results['params'] 
results_df = pd.DataFrame(hyperparameters)
results_df['Mean Log Loss'] = -log_loss_scores  # Convert back to positive (lower is better)

# Display the results sorted by Log Loss
results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)
results_df.head()

#Save the results to csv file
results_df.to_csv(os.path.join(RESULT_DATA_DIR, "svm_unfillled.csv"), index=False)

### Predict on test data with model

In [39]:
test_input = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))
test_input_x = test_input.drop(columns=['psu_hh_idcode'])
missing_columns = test_input_x.columns[test_input_x.isnull().any()].tolist()
test_input_x = test_input_x.fillna(-1)

# One-hot encode categorical columns
encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded = encoder.fit_transform(test_input_x[missing_columns])

# Convert to DataFrame and combine with numerical features
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(missing_columns))
numerical_df = test_input_x.drop(columns=missing_columns)

# Combine numerical and encoded categorical data
test_input_x = pd.concat([numerical_df, encoded_df], axis=1)

print("After One-Hot Encoding:\n", test_input_x)
# scaler = StandardScaler()
# scaler.fit_transform(train_x)
# test_input_x_trans = scaler.transform(test_input_x)

# print("After Transformation: ", test_input_x_trans)

# # pred = optimal_params.predict_proba(test_input_x)
# # print(test_input_x.head())
# # print('--------')
# # print(test_x.head())


After One-Hot Encoding:
       q02  q03  q09  q05  q23  Q01  Q03  Q06_-1.0  Q06_0.0  Q06_1.0  ...  \
0       1    1    0   72    4    1    1       0.0      0.0      1.0  ...   
1       1    1    0   64    4    1    1       0.0      0.0      0.0  ...   
2       1    1    0   69    4    1    1       0.0      0.0      0.0  ...   
3       1    1    0   53    4    1    1       0.0      0.0      0.0  ...   
4       1    1    0   48    4    1    1       0.0      0.0      0.0  ...   
...   ...  ...  ...  ...  ...  ...  ...       ...      ...      ...  ...   
1329    1    1    0   61    4    1    1       0.0      0.0      0.0  ...   
1330    2    2    0   49    4    1    1       0.0      0.0      0.0  ...   
1331    2    2    0   35    4    1    1       0.0      0.0      0.0  ...   
1332    2    2    0   69    4    1    1       0.0      0.0      0.0  ...   
1333    2    2    0   60    4    1    1       0.0      0.0      0.0  ...   

      Q11_3.0  Q11_4.0  Q11_8.0  Q11_10.0  Q11_12.0  Q11_13.0 

In [None]:
#For my reference, predict with optimized parameters HARD CODED Version

svm_model = SVC(C=10, gamma=0.01, kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train_scaled, train_y)
pred = svm_model.predict_proba(X_test_scaled)
log_loss(test_y, pred)
#1.9642149345026059