In [1]:
import os
# Move the working directory two levels above the directory the kernel
# started in. The starting directory is remembered on first execution so that
# re-running this cell does not climb a further two levels each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '..', '..'))

In [2]:
import sys
import pandas as pd

import pickle

import matplotlib.pyplot as plot

from os import listdir
from os.path import isfile, join

import lightgbm as lgb

import glob

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from modules.ml_efficacy.LGBMOrdinal_base import LGBMOrdinal, LGBMRegressor, LGBMClassifier, LightGBMCV, emse, emae

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# For the Python notebook
%matplotlib inline
# Re-import modified modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

# Functions

In [3]:
def process_real_df(df, cat_cols=None, ord_cols=None, cont_cols=None, enc=None):
    """Fit ``enc`` on the discrete columns of ``df`` and encode them in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to encode. It is modified in place: the categorical and ordinal
        columns are overwritten with the encoder output.
    cat_cols : list of str, optional
        Names of the categorical columns.
    ord_cols : list of str, optional
        Names of the ordinal columns. ``None`` is treated as "no ordinal
        columns".
    cont_cols : list of str, optional
        Names of the continuous columns. They are not transformed; the
        argument only takes part in deciding whether column types were given.
    enc : object with a ``fit_transform`` method, optional
        Encoder that is fitted on, and applied to, ``cat_cols + ord_cols``.
        Required whenever the column types are given.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If column types are given but ``enc`` is ``None``.
    """
    if cat_cols and cont_cols:
        if enc is None:
            raise ValueError('An encoder (`enc`) is required to encode the '
                             'categorical and ordinal columns.')
        # `ord_cols` defaults to None, which cannot be concatenated to a list.
        encoded_cols = list(cat_cols) + list(ord_cols or [])
        df[encoded_cols] = enc.fit_transform(df[encoded_cols])
    else:
        print('Automated inference of column types to be implemented!')
    return df

def process_syn_df(df, cat_cols, ord_cols, cont_cols, enc=None):
    """Encode the discrete columns of ``df`` in place with ``enc.transform``.

    The categorical and ordinal columns (``cat_cols + ord_cols``) are replaced
    by the output of ``enc.transform``; no fitting happens here. ``cont_cols``
    is accepted but not used. The same ``df`` object is returned.
    """
    discrete_cols = cat_cols + ord_cols
    encoded_values = enc.transform(df[discrete_cols])
    df[discrete_cols] = encoded_values
    return df

def check_low_appearing_vars(df, max_levels=20, min_share=0.01, ignore=('choice',)):
    """Print every low-cardinality column that contains a rarely seen value.

    A column is reported when it has fewer than ``max_levels`` distinct values
    and at least one of them accounts for less than ``min_share`` of the rows.
    For each reported column the name, the per-value shares and a blank line
    are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    max_levels : int, default 20
        Columns with this many distinct values or more are skipped.
    min_share : float, default 0.01
        Share of rows below which a value counts as rare.
    ignore : collection of column names, default ``('choice',)``
        Columns that are never reported.

    Returns
    -------
    None
    """
    n_rows = len(df)
    for c in df.columns:
        if c in ignore:
            continue
        counts = df[c].value_counts()
        if len(counts) >= max_levels:
            continue
        # Shares are relative to the total number of rows in the frame.
        shares = counts / n_rows
        if (shares < min_share).any():
            print(c)
            print(shares)
            print()
                
def _cap_count_column(df, column, cap):
    """Turn ``df[column]`` into strings, pooling every value >= ``cap`` into '<cap>+'."""
    top_label = '{}+'.format(cap)

    def _relabel(value):
        # Labels produced by an earlier call are strings; pass them through so
        # that running the function twice on the same frame is harmless.
        if isinstance(value, str):
            return value
        return top_label if value >= cap else str(value)

    # Assign the column back instead of calling an in-place method on
    # `df[column]`: the latter is chained assignment and does not update `df`
    # when pandas Copy-on-Write is active.
    df[column] = df[column].map(_relabel)


def replace_low_appearing_values(df, dataset):
    """Pool rarely occurring values of selected columns, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are rewritten.
    dataset : str
        Dataset name. If it contains ``'Chicago'``, the columns
        ``hh_vehicles`` (cap 5), ``hh_size`` (cap 6) and ``hh_bikes`` (cap 6)
        are converted to strings with all values at or above the cap pooled
        into ``'<cap>+'``. Otherwise, if it contains ``'LPMC'``,
        ``pt_n_interchanges`` is capped at 2 in the same way and three
        ``fueltype`` labels are merged. Any other name leaves ``df`` untouched.

    Returns
    -------
    None
    """
    if 'Chicago' in dataset:
        for column, cap in (('hh_vehicles', 5), ('hh_size', 6), ('hh_bikes', 6)):
            _cap_count_column(df, column, cap)

    elif 'LPMC' in dataset:
        _cap_count_column(df, 'pt_n_interchanges', 2)

        fuel_groups = {
            'Diesel_LGV': 'LGV',
            'Petrol_LGV': 'LGV',
            'Hybrid_Car': 'Average_Car'
        }
        df['fueltype'] = df['fueltype'].replace(fuel_groups)

# Compute results

In [4]:
dataset = 'Chicago'

input_folder = '../synth_data/{}/'.format(dataset)

# Models to evaluate, given as file names without the '.csv' extension.
# Leave the list empty to evaluate every file found directly inside
# `input_folder` instead.
selected_models = ['WGAN_WI_01_NO_01', 'TEST']

if selected_models:
    models = list(selected_models)
    files_ = {m: join(input_folder, m) + '.csv' for m in models}
else:
    models = []
    files_ = {}
    # Sorted so that the evaluation order does not depend on the file system.
    for f in sorted(listdir(input_folder)):
        if isfile(join(input_folder, f)):
            m = f.split('.')[0]
            models.append(m)
            files_[m] = join(input_folder, f)

# The real data is evaluated alongside the synthetic datasets under the name
# 'original'. Its folder is the dataset name up to the first underscore.
models.append('original')
files_['original'] = '../data/' + dataset.split('_')[0] + '/data.csv'

In [5]:
# Display the list of datasets that will be evaluated.
models

['WGAN_WI_01_NO_01', 'TEST', 'original']

In [6]:
# Load the real data from the path registered under 'original' in `files_`
# (the same '../data/<dataset>/data.csv' location).
df_orig = pd.read_csv(files_['original'])

In [7]:
# Pool the rare values of the real data; `df_orig` is modified in place.
replace_low_appearing_values(df=df_orig, dataset=dataset)

In [8]:
# Print any low-cardinality column that still contains a rare value.
check_low_appearing_vars(df=df_orig)

In [9]:
# Column typing per dataset: continuous and ordinal columns are listed by
# hand; every remaining column of the real data is treated as categorical.
if 'Chicago' in dataset:
    cont_cols = ['distance', 'age', 'departure_time']
    ord_cols = ['hh_vehicles', 'hh_size', 'hh_bikes', 'hh_income', 'education_level']
elif 'LPMC' in dataset:
    cont_cols = ['start_time_linear', 'age', 'distance', 'dur_walking', 
                 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 
                 'dur_pt_int', 'dur_driving', 'cost_transit', 
                 'cost_driving_fuel', 'driving_traffic_percent']
    ord_cols = ['travel_year', 'travel_month', 'travel_date', 
                'day_of_week', 'pt_n_interchanges', 'car_ownership']
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError('No column typing defined for dataset {!r}'.format(dataset))

cat_cols = [col for col in df_orig.columns if col not in cont_cols + ord_cols]

In [10]:
enc = OrdinalEncoder()
# `process_real_df` encodes its argument in place. Encode a copy so that
# `df_orig` keeps its labels and re-running this cell fits the encoder on the
# same values every time instead of on already-encoded codes.
ori = process_real_df(df_orig.copy(), cat_cols, ord_cols, cont_cols, enc)

In [11]:
filepath = './notebooks/tests/ml_efficacy/'
filename = 'cv_result_{}.pickle'.format(dataset)
cv_modelscores = {}
params={'n_estimators': 5000}

# Resume from a previous run when a result file exists.
# NOTE: unpickling can execute code embedded in the file, so only load result
# files that were written by this notebook.
try:
    with open(f'{filepath}{filename}', 'rb') as cache_file:
        cv_modelscores = pickle.load(cache_file)
    print('Found previous pickel file, using that')
except FileNotFoundError:
    print('No previous results found, starting fresh')
    # Make sure the output folder exists for the results written later.
    os.makedirs(filepath, exist_ok=True)
except (EOFError, pickle.UnpicklingError) as err:
    # A truncated or corrupt file is reported instead of being silently
    # treated as "no previous results".
    cv_modelscores = {}
    print('Previous results could not be read ({!r}), starting fresh'.format(err))

No previous results found, starting fresh


In [12]:
# For every dataset in `models` and for every column in turn, cross-validate
# a LightGBM model that predicts that column from all the others. The model is
# fitted on the dataset and additionally observed on the real data
# (`observe_sets`). Results are stored per model in `cv_modelscores` and
# pickled after each model so that a later run can skip finished models.
for model in models:

    if model in cv_modelscores:
        print(f'Previous results for {model}')
        continue

    print(f'Getting results for {model}')

    # Load and encode the data only when results actually have to be computed.
    tmp_df = pd.read_csv(files_[model])
    replace_low_appearing_values(tmp_df, dataset)
    v_df = process_syn_df(tmp_df, cat_cols, ord_cols, cont_cols, enc)

    # Collect into a local dict and publish it only once every column is
    # done; otherwise an interrupted run would leave a partial entry that the
    # check above treats as finished.
    model_scores = {}
    for j, ycol in enumerate(ori.columns):
        info = '    Column: {} ({}/{})'.format(ycol, j+1, len(ori.columns))
        print(info, end="")
        sys.stdout.flush()

        Xcols = [c for c in ori.columns if c!=ycol]

        y_synth = v_df[ycol]
        X_synth = v_df[Xcols]
        y_real = ori[ycol]
        X_real = ori[Xcols]

        observe_sets = {'original': (X_real, y_real)}
        ccols = [c for c in cat_cols if c!=ycol]

        if ycol in cat_cols + ord_cols:
            lgbm_type = 'LGBMClassifier'
            kf = StratifiedKFold(shuffle=True, random_state=42)
            eval_metric = ['error']
        elif ycol in cont_cols:
            lgbm_type = 'LGBMRegressor'
            kf = KFold(shuffle=True, random_state=42)
            eval_metric = ['l2', 'l1']
        else:
            # Without this, the settings of the previous column would be
            # reused silently.
            raise ValueError(f'Column {ycol!r} is not listed in cat_cols, ord_cols or cont_cols')

        cv = LightGBMCV(lgbm_type=lgbm_type,
            splitter = kf,
            eval_metric = eval_metric,
            observe_sets = observe_sets,
            separate_observation_split = True,
            early_stopping_rounds = 5,
            return_cv_models = False,
            refit_model = False,
            verbose = True)
        cv.fit(X_synth, y_synth, categorical_feature=ccols, params=params)
        model_scores[ycol] = cv.result_dict_

        # Blank out the progress line and return the cursor to its start.
        print(' '*len(info), end='\r')

    cv_modelscores[model] = model_scores

    # Persist after every model; the handle is closed before moving on.
    with open(join(filepath, filename), 'wb') as cache_file:
        pickle.dump(cv_modelscores, cache_file)


print("\033[1mFINISHED!\033[0m")

Getting results for WGAN_WI_01_NO_01
Getting results for TEST           
Getting results for original       
[1mFINISHED![0m                  


In [13]:
# Result containers, all keyed by model name.
internal, external, external_normalised = {}, {}, {}
cont_scores, cat_scores = {}, {}

discrete_cols = cat_cols + ord_cols
baseline = cv_modelscores['original']

# Reference score per column, taken from the 'original' entry: log-loss for
# the categorical/ordinal columns, l2 for the continuous ones.
ori_scores = {
    **{col: baseline[col]['test_log_loss'] for col in discrete_cols},
    **{col: baseline[col]['test_l2'] for col in cont_cols},
}

for model in models:
    scores = cv_modelscores[model]

    # 'test_*' and 'original_*' are keys of the stored CV result dicts;
    # presumably the held-out score and the score on the 'original' observe
    # set respectively - TODO confirm against LightGBMCV.
    in_sample = {col: scores[col]['test_log_loss'] for col in discrete_cols}
    on_real = {col: scores[col]['original_log_loss'] for col in discrete_cols}
    # Discrete columns are normalised by difference to the reference ...
    relative = {col: on_real[col] - ori_scores[col] for col in discrete_cols}

    in_sample.update({col: scores[col]['test_l2'] for col in cont_cols})
    on_real.update({col: scores[col]['original_l2'] for col in cont_cols})
    # ... continuous columns by ratio to the reference.
    relative.update({col: on_real[col] / ori_scores[col] for col in cont_cols})

    internal[model] = in_sample
    external[model] = on_real
    external_normalised[model] = relative

    # One aggregate per column family: sum of the normalised scores.
    cont_scores[model] = sum(on_real[col] / ori_scores[col] for col in cont_cols)
    cat_scores[model] = sum(on_real[col] - ori_scores[col] for col in discrete_cols)

In [14]:
# One row per model, one column per variable.
df = pd.DataFrame(external_normalised).transpose()

In [15]:
# Write the normalised scores next to the cached CV results.
scores_csv_path = '{}/model_scores_external_{}.csv'.format(filepath, dataset)
df.to_csv(scores_csv_path)

In [16]:
def _score_of(entry):
    """Return the score of a ``(model, score)`` pair."""
    return entry[1]

# Rank the models by ascending aggregate score, separately per column family.
cat_sorted = sorted(cat_scores.items(), key=_score_of)
cont_sorted = sorted(cont_scores.items(), key=_score_of)

In [17]:
# Print the two rankings side by side, one rank per row.
row_template = '{:>2} | {:<30} | {:<30}'
entry_template = '{:<12}: {:.3f}'

print('   | {:<30} | {:<30}'.format('categorical', 'continuous'))
print('-----------------------------------------------------------')
for rank, (cat_entry, cont_entry) in enumerate(zip(cat_sorted, cont_sorted), start=1):
    cat_cell = entry_template.format(*cat_entry)
    cont_cell = entry_template.format(*cont_entry)
    print(row_template.format(rank, cat_cell, cont_cell))

   | categorical                    | continuous                    
-----------------------------------------------------------
 1 | original    : -2.111           | original    : 2.559           
 2 | TEST        : 0.730            | TEST        : 3.061           
 3 | WGAN_WI_01_NO_01: 0.875        | WGAN_WI_01_NO_01: 3.117       
