# SVM

In [1]:
import os 
import pandas as pd 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, log_loss, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer

# Directory (relative path) from which this notebook reads its cleaned input CSVs.
CLEAN_DATA_DIR = "../data/clean/"
# Directory (relative path) to which this notebook writes grid-search results
# and the submission file.
RESULT_DATA_DIR = "../data/model_result/"

## Apply SVM on unfilled dataset
### Missing Values

In [52]:
# Load the merged training data in which missing answers are still NaN.
svm_unfilled = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))

# Columns containing at least one NaN; this list also drives the one-hot
# encoding step further down, so it must be computed before the fill.
missing_columns = svm_unfilled.columns[svm_unfilled.isnull().any()].tolist()
print( 'Columns with na values are: ', missing_columns)

# Replace NaN with the sentinel value -1 so "missing" becomes its own category
# once the column is one-hot encoded. Every detected column is filled (rather
# than a hard-coded list) so that no NaN can reach the encoder if the input
# data changes.
for col in missing_columns:
    svm_unfilled[col] = svm_unfilled[col].fillna(-1)

Columns with na values are:  ['Q06', 'Q07', 'Q08', 'Q11', 'Q19']


In [56]:
# One-hot encode the columns that originally had missing answers (they now
# carry the -1 sentinel as an extra category).
# handle_unknown='ignore' does not change the result of fitting here; it only
# makes the fitted encoder safe to apply to data containing unseen categories
# (those rows get all-zero dummies instead of raising).
encoder = OneHotEncoder(sparse_output=False, drop=None, handle_unknown='ignore')
encoded = encoder.fit_transform(svm_unfilled[missing_columns])

# Wrap the encoded array in a DataFrame. The index is passed explicitly because
# pd.concat(axis=1) aligns on index labels: without it the result is only
# correct while svm_unfilled happens to carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(missing_columns),
    index=svm_unfilled.index,
)
numerical_df = svm_unfilled.drop(columns=missing_columns)

# Combine the untouched columns with the one-hot encoded ones.
processed_df = pd.concat([numerical_df, encoded_df], axis=1)
assert len(processed_df) == len(svm_unfilled), "concat changed the number of rows"

display(svm_unfilled)
display(encoded_df)
display(numerical_df)

Unnamed: 0,psu_hh_idcode,subjectivePoverty_rating,hhid,q02,q03,q05,q09,q23,Q01,Q03,Q06,Q07,Q08,Q11,Q19
0,1_2_1,2,102,1,1,52,0,0,1,1,2.0,1.0,2.0,13.0,2.0
1,1_3_1,4,103,1,1,58,0,0,1,1,9.0,1.0,2.0,13.0,2.0
2,1_5_1,6,105,1,1,54,0,0,1,1,1.0,0.0,2.0,2.0,2.0
3,1_11_1,6,111,1,1,44,0,0,1,1,3.0,1.0,2.0,13.0,2.0
4,1_12_1,4,112,2,1,54,0,0,1,1,3.0,1.0,2.0,13.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5329,834_5_1,5,83405,1,1,60,0,0,1,1,2.0,1.0,2.0,4.0,2.0
5330,834_6_1,5,83406,1,1,69,0,0,2,1,2.0,1.0,2.0,2.0,2.0
5331,834_7_1,5,83407,1,1,42,0,0,1,1,9.0,2.0,2.0,4.0,2.0
5332,834_8_1,6,83408,1,1,56,0,0,1,1,2.0,2.0,2.0,4.0,2.0


Unnamed: 0,Q06_-1.0,Q06_0.0,Q06_1.0,Q06_2.0,Q06_3.0,Q06_4.0,Q06_5.0,Q06_6.0,Q06_7.0,Q06_8.0,...,Q11_7.0,Q11_8.0,Q11_10.0,Q11_11.0,Q11_12.0,Q11_13.0,Q11_14.0,Q19_-1.0,Q19_1.0,Q19_2.0
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5329,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5330,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5332,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,psu_hh_idcode,subjectivePoverty_rating,hhid,q02,q03,q05,q09,q23,Q01,Q03
0,1_2_1,2,102,1,1,52,0,0,1,1
1,1_3_1,4,103,1,1,58,0,0,1,1
2,1_5_1,6,105,1,1,54,0,0,1,1
3,1_11_1,6,111,1,1,44,0,0,1,1
4,1_12_1,4,112,2,1,54,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
5329,834_5_1,5,83405,1,1,60,0,0,1,1
5330,834_6_1,5,83406,1,1,69,0,0,2,1
5331,834_7_1,5,83407,1,1,42,0,0,1,1
5332,834_8_1,6,83408,1,1,56,0,0,1,1


### Divide data to train/test

In [54]:
# Target column.
y = processed_df['subjectivePoverty_rating']
print(y)

# Features are every column except the two identifier columns and the target.
# Index.difference returns the remaining names in sorted order.
non_feature_cols = ['psu_hh_idcode', 'hhid', 'subjectivePoverty_rating']
feature_cols = processed_df.columns.difference(non_feature_cols).tolist()
X = processed_df[feature_cols]

# Hold out 20% of the rows; the seed is fixed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

# Standardise features: the scaler learns its statistics from the training
# split only and is then applied to both splits.
scaler = StandardScaler()
scaler.fit(train_x)
X_train_scaled = scaler.transform(train_x)
X_test_scaled = scaler.transform(test_x)

0       2
1       4
2       6
3       6
4       4
       ..
5329    5
5330    5
5331    5
5332    6
5333    5
Name: subjectivePoverty_rating, Length: 5334, dtype: int64
(4267, 45) (1067, 45) (4267,) (1067,)


### Hyperparameter tuning with grid search

In [None]:
# Grid search over RBF-kernel SVC hyperparameters.
param_grid = {
    'C': [0.5, 1, 10, 100],
    'gamma': ['scale', 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

# probability=True makes predict_proba available, which the log-loss scorer
# needs. The built-in 'neg_log_loss' scorer string is used directly; the
# previously defined (and never used) make_scorer(..., needs_proba=True) object
# was dropped because `needs_proba` is deprecated and rejected by recent
# scikit-learn releases.
optimal_params = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid,
    cv=5,
    scoring='neg_log_loss',
    verbose=2,
)
optimal_params.fit(X_train_scaled, train_y)

print("Best Parameters:", optimal_params.best_params_)
# best_score_ holds the *negated* log loss (scorers are maximised), so flip the
# sign to report the actual log loss (lower is better).
print("Best Log Loss:", -optimal_params.best_score_)

# Recorded from an earlier run:
# Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
# Best Log Loss: 1.948786842738619

In [9]:
# Build a table of every hyperparameter combination tried by the grid search
# together with its mean cross-validated log loss.
results = optimal_params.cv_results_
log_loss_scores = results['mean_test_score']  # Mean negated log loss per combination
hyperparameters = results['params']
results_df = pd.DataFrame(hyperparameters)
results_df['Mean Log Loss'] = -log_loss_scores  # Convert back to positive (lower is better)

# Best (lowest log loss) combinations first.
results_df = results_df.sort_values(by='Mean Log Loss', ascending=True)

# Save the results to a CSV file; create the output directory if it is missing
# so a fresh checkout does not fail on the write.
os.makedirs(RESULT_DATA_DIR, exist_ok=True)
results_df.to_csv(os.path.join(RESULT_DATA_DIR, "svm_unfillled.csv"), index=False)

# Must be the last expression of the cell for the notebook to render it.
results_df.head()

### Predict on test data with model

In [61]:
# Predict class probabilities for the competition test set and write the
# submission file. The test rows must go through exactly the same
# preprocessing as the training rows: same categorical columns, same one-hot
# categories/column names, same scaler statistics.
test_input = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TEST_INPUT.csv"))

# Categorical columns are the ones that were one-hot encoded for training
# (`missing_columns` from the training-data cell). They are NOT re-detected on
# the test set: a column that happens to contain no -1 in the test data must
# still be encoded.
categorical_cols = list(missing_columns)

# Learn the one-hot categories from the training rows so the test set yields
# exactly the training dummy columns. Categories that only occur in the test
# set are ignored (all-zero dummies) instead of creating new columns.
test_encoder = OneHotEncoder(sparse_output=False, drop=None, handle_unknown='ignore')
test_encoder.fit(svm_unfilled[categorical_cols])

# Mirror the training preprocessing: NaN -> -1 sentinel, and the same dtypes as
# the training columns so values compare equal to the learned categories.
# NOTE(review): assumes TEST_INPUT.csv stores these columns as numbers - confirm.
test_categorical = (
    test_input[categorical_cols]
    .fillna(-1)
    .astype(svm_unfilled[categorical_cols].dtypes.to_dict())
)
encoded_test_df = pd.DataFrame(
    test_encoder.transform(test_categorical),
    columns=test_encoder.get_feature_names_out(categorical_cols),
    index=test_input.index,
)

# Combine the untouched columns with the encoded ones.
test_combined = pd.concat(
    [test_input.drop(columns=['psu_hh_idcode'] + categorical_cols), encoded_test_df],
    axis=1,
)

# Training features absent from the test file are filled with 0 (as before);
# report them so the substitution is not silent.
absent_cols = [col for col in train_x.columns if col not in test_combined.columns]
if absent_cols:
    print("Training features absent from test data (filled with 0):", absent_cols)

# Select the training features in the training column order.
test_input_x = test_combined.reindex(columns=train_x.columns, fill_value=0)

# Reuse the scaler that was fit on the training split; the model was trained on
# features standardised with exactly these statistics.
test_input_x_trans = scaler.transform(test_input_x)

household_ids = test_input['psu_hh_idcode']
y_val_pred_proba = optimal_params.predict_proba(test_input_x_trans)
assert np.allclose(y_val_pred_proba.sum(axis=1), 1.0), "probabilities must sum to 1 per row"

# predict_proba columns follow optimal_params.classes_, so name each column by
# its actual class label instead of by position. The submission keeps the fixed
# subjective_poverty_1..10 layout; a rating never seen in training gets
# probability 0, and a label outside 1..10 fails loudly.
expected_ratings = list(range(1, 11))
assert set(optimal_params.classes_) <= set(expected_ratings), "unexpected class label"
probs = pd.DataFrame(
    y_val_pred_proba,
    columns=[f"subjective_poverty_{label}" for label in optimal_params.classes_],
    index=test_input.index,
)
column_names = [f"subjective_poverty_{i}" for i in expected_ratings]
probs = probs.reindex(columns=column_names, fill_value=0.0)
submission = pd.concat([household_ids, probs], axis=1)

# Sanity check: probabilities of one row should sum to 1.
print(y_val_pred_proba[1].sum())

os.makedirs(RESULT_DATA_DIR, exist_ok=True)
submission.to_csv(os.path.join(RESULT_DATA_DIR, "submission_svm.csv"), index=False)


1.0


In [58]:
# For reference: refit an SVC with the best grid-search parameters hard-coded
# and report its log loss on the held-out split.
svm_model = SVC(C=10, gamma=0.01, kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train_scaled, train_y)
pred = svm_model.predict_proba(X_test_scaled)

# predict_proba columns follow svm_model.classes_. Passing them as `labels`
# keeps log_loss from inferring the class set from test_y alone, which fails
# when a class seen in training is absent from the hold-out split.
log_loss(test_y, pred, labels=svm_model.classes_)
# Recorded from an earlier run: 1.9642149345026059

1.9645313925346224