# Training Models

This notebook contains the code used for training the following learning models:

-  **Standard GBDT** (_baseline 1_)
-  **Adversarial Boosting** (_baseline 2_)
-  **Non-Interferent GBDT** (our proposal)

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Data Preparation

In [2]:
def label_encode(dataset, categorical_features):
    dataset_le = dataset.copy()
    for column in dataset_le.columns:
        if column in categorical_features:
            dataset_le[column] = dataset_le[column].astype('category')
            dataset_le[column] = dataset_le[column].cat.codes.astype(np.int32)
    return dataset_le

def load_atk_train_valid_test(atk_train_file, atk_valid_file, atk_test_file, 
                              train_split=0.6, valid_split=0.2, force=False):
    
    
    if  (force or 
          not os.path.exists(atk_train_file+".cat.bz2") or
          not os.path.exists(atk_valid_file+".cat.bz2") or
          not os.path.exists(atk_test_file+".cat.bz2") or 
          not os.path.exists(atk_train_file+".cat.json") ):
    
        print ("Pre-processing original files...")

        print ("Loading:", atk_train_file)
        print ("Loading:", atk_valid_file)
        print ("Loading:", atk_test_file)

        train = pd.read_csv(atk_train_file)
        valid = pd.read_csv(atk_valid_file)
        test  = pd.read_csv(atk_test_file)
        
        print ("Train/Valid/Test sizes:", train.shape, valid.shape, test.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( train.shape[0]/(train.shape[0]+valid.shape[0]+test.shape[0]),
                            valid.shape[0]/(train.shape[0]+valid.shape[0]+test.shape[0]),
                            test.shape[0] /(train.shape[0]+valid.shape[0]+test.shape[0]) ) )


        # split-back into train valid test
        if 'instance_id' in train.columns.values:
            print ('   ... with instance ids')
            valid['instance_id'] += train.iloc[-1,0]
            test['instance_id']  += valid.iloc[-1,0]
            assert max(train['instance_id'])<min(valid['instance_id']), "Instance ID mismatch"
            assert max(valid['instance_id'])<min(test['instance_id']), "Instance ID mismatch"
            
            groups = np.concatenate( [ train['instance_id'].value_counts().sort_index().values,
                                       valid['instance_id'].value_counts().sort_index().values,
                                       test['instance_id'].value_counts().sort_index().values ] )
            
            num_train_groups = int( len(groups)*train_split )
            train_size = sum(groups[:num_train_groups])
            num_valid_groups = int( len(groups)*valid_split )
            valid_size = sum(groups[num_train_groups:num_train_groups+num_valid_groups])
        else:
            full_size = len(train) + len(valid) + len(test)
            train_size = int( full_size*train_split )
            valid_size = int( full_size*valid_split )
        
        # concat to process correctly label encoding
        full = pd.concat( [train, valid, test] )

        # get index of categorical features (-1 because of instance_id)
        cat_fx = full.columns.values[np.where(full.dtypes=='object')[0]]
        cat_fx = list(cat_fx)    
        full = label_encode(full, cat_fx)
        with open(atk_train_file+".cat.json", 'w') as fp:
            json.dump(cat_fx, fp)
        print ("CatFX:", cat_fx)

        train_cat = full.iloc[0:train_size,:]
        valid_cat = full.iloc[train_size:train_size+valid_size,:]
        test_cat  = full.iloc[train_size+valid_size:,:]
        
        assert len(train_cat)+len(valid_cat)+len(test_cat)==len(full), "Split sizes mismatch"
        

        print ("Train/Valid/Test sizes:", train_cat.shape, valid_cat.shape, test_cat.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( train_cat.shape[0]/(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]),
                            valid_cat.shape[0]/(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]),
                            test_cat.shape[0] /(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]) ) )

        # save to file
        print ("Saving processed files *.cat.bz2")
        train_cat.to_csv(atk_train_file+".cat.bz2", compression="bz2", index=False)
        valid_cat.to_csv(atk_valid_file+".cat.bz2", compression="bz2", index=False)
        test_cat.to_csv (atk_test_file+".cat.bz2",  compression="bz2", index=False)
        
    else:
        print ("Loading pre-processed files...")

        train_cat = pd.read_csv(atk_train_file+".cat.bz2")
        valid_cat = pd.read_csv(atk_valid_file+".cat.bz2")
        test_cat  = pd.read_csv(atk_test_file+".cat.bz2")
        
        with open(atk_train_file+".cat.json", 'r') as fp:
            cat_fx = json.load(fp)
    
    # return data
    return train_cat, valid_cat, test_cat, cat_fx


# Training SVM

In [3]:
# Our custom metrics
def binary_log_loss(pred, true_label):

    return np.log(1.0 + np.exp(-pred * true_label))

In [10]:
from sklearn.preprocessing import OneHotEncoder

def train_svm(train_file, valid_file, test_file, output_model_file):
    
    exp = pd.DataFrame(columns=['C', 'avg_binary_log_loss'])
    
    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    cat_fx = np.where(train.columns.isin(cat_fx))[0]
    cat_fx = list([int(x) for x in cat_fx])  
    encoder = OneHotEncoder(categorical_features=cat_fx)

    X_train = encoder.fit_transform( train.iloc[:,:-1].values )
    y_train = train.iloc[:,-1].values
    # y_train[y_train == -1] = 0
    
    X_valid = encoder.transform( valid.iloc[:,:-1].values )
    y_valid = valid.iloc[:,-1].values
    
    X_test = encoder.transform( test.iloc[:,:-1].values )
    y_test = test.iloc[:,-1].values

    for c in [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]:
        
        model = SVC(kernel='rbf', probability=True, max_iter=1000)
        model.fit(X_train, y_train)
        
        y_preds = 2*model.predict_proba(X_valid)[:,1] - 1.0
        cur_avg_binary_log_loss = np.mean(binary_log_loss(y_preds, y_valid))
        
        model_file_name = "{:s}_C{:04d}.model".format(output_model_file, int(c * 1000))
        
        with open(model_file_name, 'wb') as fout:
            pickle.dump(model, fout)
        
        print ("Model saved to", model_file_name)
        
        # update experimental results
        exp = exp.append({'C': c, 
                          'avg_binary_log_loss':cur_avg_binary_log_loss},
                         ignore_index=True)
    
    return exp

In [11]:
# enable/disable LGBM Baseline
if True:
    experiments = train_svm ( "../data/census/train_ori.csv.bz2",
                             "../data/census/valid_ori.csv.bz2",
                             "../data/census/test_ori.csv.bz2",
                             "../out/models/svm_census")  

    experiments.to_csv('../out/models/svm_census.csv', index=False)

    print (experiments)

Loading pre-processed files...


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Model saved to ../out/models/svm_census_C0001.model
Model saved to ../out/models/svm_census_C0010.model
Model saved to ../out/models/svm_census_C0100.model
Model saved to ../out/models/svm_census_C1000.model
Model saved to ../out/models/svm_census_C10000.model
Model saved to ../out/models/svm_census_C100000.model
Model saved to ../out/models/svm_census_C1000000.model
          C  avg_binary_log_loss
0     0.001             0.596194
1     0.010             0.596547
2     0.100             0.597062
3     1.000             0.595874
4    10.000             0.596707
5   100.000             0.596940
6  1000.000             0.596249
