In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
import pandas as pd
# Relative path of the directory from which the cleaned training CSV is read.
CLEAN_DATA_DIR = "../data/clean/"

In [84]:
# Directory that receives the prediction files produced by the models.
RESULT_DATA_DIR = "../data/model_result/"

# Survey columns whose missing answers are replaced by the sentinel value -1.
columns_to_fill = ['Q06', 'Q07', 'Q08', 'Q11', 'Q19']

# Load the merged training table, patch the gaps in the columns listed above
# and discard the household id column.
train_data = pd.read_csv(os.path.join(CLEAN_DATA_DIR, "TRAIN_MERGED_UNFILLED.csv"))
train_data[columns_to_fill] = train_data[columns_to_fill].fillna(value=-1)
train_data = train_data.drop('hhid', axis=1)

# 75/25 hold-out split, stratified on the target rating and seeded so the
# partition is the same on every run.
train_A, train_B = train_test_split(
    train_data,
    stratify=train_data['subjectivePoverty_rating'],
    test_size=0.25,
    random_state=42,
)


In [None]:


def _one_hot_expand(features, categorical_cols, encoder, fit=False):
    """Replace `categorical_cols` in `features` with their one-hot encoding.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table; it is not modified.
    categorical_cols : list of str
        Columns to encode. May be empty, in which case a copy of `features`
        is returned unchanged.
    encoder : OneHotEncoder
        Encoder to use. It is fitted on `features` when `fit` is True and
        must already be fitted otherwise.
    fit : bool, default False
        Whether to fit `encoder` before transforming.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns followed by the one-hot columns, on the
        same index as `features`.
    """
    if not categorical_cols:
        # The encoder cannot be fitted on a frame with zero columns.
        return features.copy()

    raw = features[categorical_cols]
    encoded = encoder.fit_transform(raw) if fit else encoder.transform(raw)
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=features.index,
    )
    return pd.concat([features.drop(columns=categorical_cols), encoded_df], axis=1)


def predict_ratings_SVM(train_data, test_data):
    """Train an RBF-kernel SVM on `train_data` and predict rating probabilities for `test_data`.

    Every feature column of `train_data` that contains the sentinel value -1
    is one-hot encoded; the remaining feature columns are used as they are.
    All features are then standardised, and `C` / `gamma` are selected by
    5-fold cross-validated grid search scored with negative log loss. The
    encoder and the scaler are fitted on the training data only and re-used
    for the test data, so both tables go through the same transformation.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training table. Must contain 'subjectivePoverty_rating' (the target).
        'psu_hh_idcode' is excluded from the features when present.
    test_data : pandas.DataFrame
        Table to predict for. Must contain 'psu_hh_idcode' and every feature
        column of `train_data`; any other column (including the target) is
        ignored.

    Returns
    -------
    pandas.DataFrame
        One row per row of `test_data`, on the same index, with
        'psu_hh_idcode' followed by 'subjective_poverty_1' ...
        'subjective_poverty_10'. A rating that never occurs in `train_data`
        gets probability 0.

    Raises
    ------
    KeyError
        If a required column is missing from either table.
    ValueError
        If the training target contains a value outside 1..10.
    """
    id_col = 'psu_hh_idcode'
    target_col = 'subjectivePoverty_rating'
    # The output schema assumes the rating is an integer scale from 1 to 10.
    expected_ratings = list(range(1, 11))

    y = train_data[target_col]
    raw_feature_cols = list(train_data.columns.difference([id_col, target_col]))
    train_features = train_data[raw_feature_cols]
    # Select the test features by the training column list so both tables
    # are guaranteed to have the same columns in the same order.
    test_features = test_data[raw_feature_cols]

    # Decide which columns are categorical from the training data alone; the
    # test data must not influence how features are encoded.
    categorical_cols = [col for col in raw_feature_cols if -1 in train_features[col].values]

    # handle_unknown='ignore' encodes categories unseen during fitting as an
    # all-zero row instead of raising at prediction time.
    encoder = OneHotEncoder(sparse_output=False, drop=None, handle_unknown='ignore')
    X_train = _one_hot_expand(train_features, categorical_cols, encoder, fit=True)
    X_test = _one_hot_expand(test_features, categorical_cols, encoder)

    # Scale the features; the statistics come from the training data only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define the hyperparameter grid
    param_grid = {
        'C': [10, 100],
        'gamma': [0.1, 0.01],
        'kernel': ['rbf'],
    }

    # Set up GridSearchCV; 'neg_log_loss' is the built-in probability-based
    # log-loss scorer (higher is better).
    grid_search = GridSearchCV(
        estimator=SVC(probability=True, random_state=42),
        param_grid=param_grid,
        scoring='neg_log_loss',
        cv=5,
    )

    # Fit the model
    grid_search.fit(X_train_scaled, y)

    unexpected = set(grid_search.classes_) - set(expected_ratings)
    if unexpected:
        raise ValueError(
            f"Training target contains ratings outside 1..10: {sorted(unexpected)}"
        )

    # The model was trained on scaled features, so predict on scaled features.
    y_pred_proba = grid_search.predict_proba(X_test_scaled)

    # Label the probability columns by the classes the model actually learned
    # and keep the test index so the rows line up with the ids below.
    probs = pd.DataFrame(
        y_pred_proba, columns=grid_search.classes_, index=test_data.index
    ).reindex(columns=expected_ratings, fill_value=0.0)
    probs.columns = [f"subjective_poverty_{i}" for i in expected_ratings]

    ids = test_data[id_col]
    submission = pd.concat([ids, probs], axis=1)
    return submission

In [90]:
# Train on split A and predict rating probabilities for the held-out split B.
submission = predict_ratings_SVM(train_A, train_B)

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing.
os.makedirs(RESULT_DATA_DIR, exist_ok=True)
submission.to_csv(os.path.join(RESULT_DATA_DIR, "train_B_predssssss_svm.csv"), index=False)

