# SVM Training

In [2]:
%load_ext autoreload
%autoreload 2

import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

from NILib import *

In [5]:
def train_svm(train_file, valid_file, test_file, output_model_file):
    
    exp = pd.DataFrame(columns=['C', 'avg_binary_log_loss'])
    
    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    cat_fx = np.where(train.columns.isin(cat_fx))[0]
    cat_fx = list([int(x) for x in cat_fx])  
    encoder = OneHotEncoder(categorical_features=cat_fx)

    X_train = encoder.fit_transform( train.iloc[:,:-1].values )
    y_train = train.iloc[:,-1].values
    # y_train[y_train == -1] = 0
    
    X_valid = encoder.transform( valid.iloc[:,:-1].values )
    y_valid = valid.iloc[:,-1].values
    
    X_test = encoder.transform( test.iloc[:,:-1].values )
    y_test = test.iloc[:,-1].values

    for c in [0.001, 0.01]: #, 0.1, 1.0, 10, 100, 1000]:
        
        print ("Training with C:", c)
        
        model = SVC(kernel='rbf', probability=True, max_iter=1000)
        model.fit(X_train, y_train)
        
        y_preds = 2*model.predict_proba(X_valid)[:,1] - 1.0
        cur_avg_binary_log_loss = np.mean(binary_log_loss(y_preds, y_valid))
        
        model_file_name = "{:s}_C{:04d}.model".format(output_model_file, int(c * 1000))
        
        with open(model_file_name, 'wb') as fout:
            pickle.dump(model, fout)
        
        print ("Model saved to", model_file_name)
        
        # update experimental results
        exp = exp.append({'C': c, 
                          'avg_binary_log_loss':cur_avg_binary_log_loss},
                         ignore_index=True)
    
    return exp

In [6]:
# enable/disable
if True:
    experiments = train_svm ( "../data/census/train_ori.csv.bz2",
                             "../data/census/valid_ori.csv.bz2",
                             "../data/census/test_ori.csv.bz2",
                             "../out/models/svm_census")  

    experiments.to_csv('../out/models/svm_census.csv', index=False)

    print (experiments)

Loading pre-processed files...


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Training with C: 0.001




Model saved to ../out/models/svm_census_C0001.model
Training with C: 0.01
Model saved to ../out/models/svm_census_C0010.model
       C  avg_binary_log_loss
0  0.001             0.595056
1  0.010             0.595915
