In [1]:
import os
# Move the working directory two levels up; every relative path used later in
# this notebook ('../synth_data/...', '../data/...', './notebooks/results/...')
# is resolved from that new location.
# NOTE: the path is relative, so running this cell again moves up two more levels.
os.chdir('../..')

In [2]:
import sys
import pandas as pd

import pickle

import matplotlib.pyplot as plot

from os import listdir
from os.path import isfile, join

import numpy as np

import lightgbm as lgb

import glob

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from modules.ml_efficacy.LGBMOrdinal_base import LGBMOrdinal, LGBMRegressor, LGBMClassifier, LightGBMCV, emse, emae

import warnings
warnings.filterwarnings('ignore')

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Functions

In [3]:
def process_real_df(df, cat_cols=None, ord_cols=None, cont_cols=None, enc=None):
    """Fit ``enc`` on the categorical and ordinal columns of ``df`` and encode them.

    The selected columns of ``df`` are overwritten with the encoder output, so
    the caller's DataFrame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to encode.
    cat_cols, ord_cols, cont_cols : list of str, optional
        Column names per type. Encoding only happens when both ``cat_cols``
        and ``cont_cols`` are non-empty; ``ord_cols`` may be omitted.
    enc : object providing ``fit_transform``
        Encoder fitted on ``df[cat_cols + ord_cols]``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If column types are given but ``enc`` is None.
    """
    if cat_cols and cont_cols:
        if enc is None:
            raise ValueError('An encoder (enc) is required to encode the columns')
        # ord_cols defaults to None: treat that as "no ordinal columns" instead
        # of failing on `list + None`.
        encoded_cols = list(cat_cols) + list(ord_cols or [])
        df[encoded_cols] = enc.fit_transform(df[encoded_cols])
    else:
        print('Automated inference of column types to be implemented!')
    return df

def process_syn_df(df, cat_cols, ord_cols, cont_cols, enc=None):
    """Encode the categorical and ordinal columns of ``df`` with ``enc.transform``.

    No fitting happens here: only ``enc.transform`` is called. The selected
    columns of ``df`` are overwritten and the same DataFrame is returned.
    ``cont_cols`` is accepted but not used.
    """
    encoded_cols = cat_cols + ord_cols
    encoded_values = enc.transform(df[encoded_cols])
    df[encoded_cols] = encoded_values
    return df

def is_a_DATGAN(name):
    """Return False when ``name`` contains one of the non-DATGAN tags, True otherwise.

    The check is a plain substring test against each tag.
    """
    other_tags = ('TGAN', 'CTGAN', 'TVAE', 'FULL', 'TRANSRED', 'LINEAR', 'NOLINKS', 'PREDICTION')
    for tag in other_tags:
        if tag in name:
            return False
    return True
    
def check_low_appearing_vars(df):
    """Print the rare values of every low-cardinality column of ``df``.

    A column is inspected when it has fewer than 20 distinct values and is not
    named 'choice'. For such a column, every value that covers less than 1% of
    the rows is printed with its share (in percent) and its number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.

    Returns
    -------
    None
        The report is written to standard output.
    """
    n_rows = len(df)

    for c in df.columns:
        counts = df[c].value_counts()
        if len(counts) >= 20 or c == 'choice':
            continue

        rare = counts[counts / n_rows < 0.01]
        if rare.empty:
            continue

        print('Variable {}: '.format(c))
        for idx, count in rare.items():
            # Report the integer count itself: rebuilding it from the float
            # share (int(share * n_rows)) can truncate to count - 1.
            print('  {} - {:.2f}% ({:d})'.format(idx, 100 * (count / n_rows), int(count)))
        print()
                
def _cap_count_levels(df, col, cap):
    """Recode ``df[col]`` to strings, merging every value >= ``cap`` into '<cap>+'."""
    top_label = '{}+'.format(cap)
    mapping = {v: (top_label if v >= cap else str(v)) for v in df[col].unique()}
    # Assign the result back to the frame. Calling replace(inplace=True) on
    # df[col] is a chained in-place update, which does not modify `df` when
    # pandas Copy-on-Write is active.
    df[col] = df[col].replace(mapping)


def replace_low_appearing_values(df, dataset):
    """Merge rarely observed levels of some columns of ``df``, in place.

    Which columns are recoded depends on the substring found in ``dataset``:

    * 'Chicago': 'hh_vehicles' is capped at '5+', 'hh_size' and 'hh_bikes'
      at '6+'; the remaining values become their string representation.
    * 'LPMC': 'pt_n_interchanges' is capped at '2+', and in 'fueltype' both
      LGV types become 'LGV' while 'Hybrid_Car' becomes 'Average_Car'.

    Any other dataset name leaves ``df`` untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to recode; modified in place.
    dataset : str
        Name of the dataset.

    Returns
    -------
    None
    """
    if 'Chicago' in dataset:
        _cap_count_levels(df, 'hh_vehicles', 5)
        _cap_count_levels(df, 'hh_size', 6)
        _cap_count_levels(df, 'hh_bikes', 6)

    elif 'LPMC' in dataset:
        _cap_count_levels(df, 'pt_n_interchanges', 2)

        fuel_groups = {
            'Diesel_LGV': 'LGV',
            'Petrol_LGV': 'LGV',
            'Hybrid_Car': 'Average_Car'
        }
        df['fueltype'] = df['fueltype'].replace(fuel_groups)

# Get all models and associated files

In [4]:
dataset = 'LPMC'
input_folder = '../synth_data/{}/'.format(dataset)
n_models = 5
n_data = 5

# Models for testing all DATGANS: the three baselines plus every combination
# of the three name components.
models = ['CTGAN', 'TGAN', 'TVAE']
models.extend(
    '{}_{}_{}'.format(first, second, third)
    for first in ['WGAN', 'SGAN', 'WGGP']
    for second in ['WI', 'OR', 'WO']
    for third in ['NO', 'BO', 'OD', 'OC']
)

# Models for testing different DAGs
if 'DAG' in dataset:
    models = ['FULL', 'TRANSRED', 'LINEAR', 'NOLINKS', 'PREDICTION']

models.sort()

# 1-based index pairs used in the file names: the first runs over n_models,
# the second over n_data (the first one varies slowest).
run_ids = [(m_idx, d_idx)
           for m_idx in range(1, n_models + 1)
           for d_idx in range(1, n_data + 1)]

# Map every model name to the list of CSV files holding its synthetic datasets.
files_ = {}

for name in models:
    if is_a_DATGAN(name):
        # DATGAN file names carry the first index between the second and
        # third name components.
        parts = name.split('_')
        files_[name] = [
            input_folder + '{}_{}_{:0>2}_{}_{:0>2}.csv'.format(parts[0], parts[1], m_idx, parts[2], d_idx)
            for m_idx, d_idx in run_ids
        ]
    else:
        files_[name] = [
            input_folder + '{}_{:0>2}_{:0>2}.csv'.format(name, m_idx, d_idx)
            for m_idx, d_idx in run_ids
        ]

# The real dataset is handled like one more "model" with a single file.
models.append('original')

files_['original'] = ['../data/' + dataset.split('_')[0] + '/data.csv' ]

In [5]:
# Load the original dataset; only the part of `dataset` before the first '_'
# is used as the folder name.
df_orig = pd.read_csv('../data/' + dataset.split('_')[0] + '/data.csv')

In [6]:
# Display the column names of the loaded dataset.
df_orig.columns

Index(['travel_mode', 'purpose', 'fueltype', 'faretype', 'bus_scale',
       'travel_year', 'travel_month', 'travel_date', 'day_of_week',
       'start_time_linear', 'age', 'female', 'driving_license',
       'car_ownership', 'distance', 'dur_walking', 'dur_cycling',
       'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int',
       'pt_n_interchanges', 'dur_driving', 'cost_transit', 'cost_driving_fuel',
       'cost_driving_con_charge', 'driving_traffic_percent'],
      dtype='object')

In [7]:
# Merge the rare levels of the dataset-specific columns; df_orig itself is modified.
replace_low_appearing_values(df_orig, dataset)

In [8]:
# Print the values that still cover less than 1% of the rows
# (only columns with fewer than 20 distinct values are inspected).
check_low_appearing_vars(df_orig)

In [9]:
# Declare which columns are continuous and which are ordinal for the selected
# dataset; every other column of df_orig is treated as categorical.
if 'Chicago' in dataset:
    cont_cols = ['distance', 'age', 'departure_time']
    ord_cols = ['hh_vehicles', 'hh_size', 'hh_bikes', 'hh_income', 
                'education_level']
elif 'LPMC' in dataset:
    cont_cols = ['start_time_linear', 'age', 'distance', 'dur_walking', 
                 'dur_cycling', 'dur_pt_access', 'dur_pt_rail', 'dur_pt_bus', 
                 'dur_pt_int', 'dur_driving', 'cost_transit', 
                 'cost_driving_fuel', 'driving_traffic_percent']
    ord_cols = ['travel_year', 'travel_month', 'travel_date', 
                'day_of_week', 'pt_n_interchanges', 'car_ownership']
else:
    # Fail here instead of silently reusing lists left over in the kernel
    # (or hitting a NameError in a later cell).
    raise ValueError("No column types defined for dataset '{}'".format(dataset))

# NOTE(review): for LPMC, 'cost_driving_con_charge' is in neither list above and
# therefore ends up in cat_cols - confirm this is intended.
cat_cols = [col for col in df_orig.columns if col not in cont_cols + ord_cols]

In [10]:
# Fit the encoder on the categorical and ordinal columns of the real data.
# process_real_df overwrites those columns of df_orig and returns the same
# DataFrame, so `ori` and `df_orig` refer to one object.
enc = OrdinalEncoder()
ori = process_real_df(df_orig, cat_cols, ord_cols, cont_cols, enc)

# Generate results

In [11]:
filepath = './notebooks/results/{}/'.format(dataset)
filename = 'cv_result_ml.pickle'
cv_modelscores = {}
params={'n_estimators': 5000}

# Resume from the results of a previous run when they exist.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# result files that were written by this notebook.
try:
    with open(f'{filepath}{filename}', 'rb') as fh:
        cv_modelscores = pickle.load(fh)
    print('Found previous pickel file, using that')
except FileNotFoundError:
    print('No previous results found, starting fresh')
    os.makedirs(filepath, exist_ok=True)
except (EOFError, pickle.UnpicklingError) as err:
    # An unreadable (e.g. truncated) file still means starting over, but say
    # so explicitly; any other error is no longer swallowed.
    cv_modelscores = {}
    print('Previous results file could not be read ({!r}), starting fresh'.format(err))

Found previous pickel file, using that


In [12]:
# File receiving the results after every processed dataset (filepath already
# ends with a separator, so this is the same path as filepath + filename).
results_path = os.path.join(filepath, filename)

for i, model in enumerate(models):
    
    if model in cv_modelscores.keys():
        print("Results for model \033[1m{}\033[0m ({}/{}) already exists".format(model, i+1, len(models)))
    else:
        print("Getting results for model \033[1m{}\033[0m ({}/{})".format(model, i+1, len(models)))
        cv_modelscores[model] = []
        
    # Resume support: one entry is stored per processed file, so the files
    # that already have an entry are skipped.
    n_files_done = len(cv_modelscores[model])
        
    for j, f in enumerate(files_[model][n_files_done:]):
        
        print("  Processing file {}/{}".format(j+1+n_files_done, len(files_[model])) + " "*20)
        
        # Apply to the file the same recoding and encoding as the real data
        # (the encoder is only applied here, not re-fitted).
        tmp_df = pd.read_csv(f)
        replace_low_appearing_values(tmp_df, dataset)
        v_df = process_syn_df(tmp_df, cat_cols, ord_cols, cont_cols, enc)
        
        # Every column in turn is the target, predicted from all other columns.
        tmp = {}
        for k, ycol in enumerate(ori.columns):
            info = '    Column: {} ({}/{})'.format(ycol, k+1, len(ori.columns))
            print(info, end="")
            sys.stdout.flush()
            Xcols = [c for c in ori.columns if c!=ycol]

            y_synth = v_df[ycol]
            X_synth = v_df[Xcols]
            y_real = ori[ycol]
            X_real = ori[Xcols]

            # The real data is handed to LightGBMCV as an observation set named
            # 'original' (presumably scored next to each CV fold - see LightGBMCV).
            observe_sets = {'original': (X_real, y_real)}
            ccols = [c for c in cat_cols if c!=ycol]

            # Classification for categorical/ordinal targets, regression for
            # continuous ones.
            # NOTE(review): the classifier metric is 'error' while the analysis
            # cell reads '*_log_loss' entries - confirm LightGBMCV records both.
            if ycol in cat_cols + ord_cols:
                lgbm_type = 'LGBMClassifier'
                kf = StratifiedKFold(shuffle=True, random_state=42)
                eval_metric = ['error']
            elif ycol in cont_cols:
                lgbm_type = 'LGBMRegressor'
                kf = KFold(shuffle=True, random_state=42)
                eval_metric = ['l2', 'l1']
            cv = LightGBMCV(lgbm_type=lgbm_type,
                splitter = kf,
                eval_metric = eval_metric,
                observe_sets = observe_sets,
                separate_observation_split = True,
                early_stopping_rounds = 5,
                return_cv_models = False,
                refit_model = False,
                verbose = True)
            cv.fit(X_synth, y_synth, categorical_feature=ccols, params=params)
            tmp[ycol] = cv.result_dict_
            
            # Blank the progress text and return to the start of the line.
            print(' '*len(info), end='\r')
            
        cv_modelscores[model].append(tmp)
            
        # Write to a temporary file and rename it over the old one, so an
        # interruption during the dump cannot corrupt the results saved so far.
        tmp_path = results_path + '.tmp'
        with open(tmp_path, 'wb') as fh:
            pickle.dump(cv_modelscores, fh)
        os.replace(tmp_path, results_path)

print("\033[1mFINISHED!\033[0m")

Results for model [1mCTGAN[0m (1/40) already exists
Results for model [1mSGAN_OR_BO[0m (2/40) already exists
Results for model [1mSGAN_OR_NO[0m (3/40) already exists
Results for model [1mSGAN_OR_OC[0m (4/40) already exists
Results for model [1mSGAN_OR_OD[0m (5/40) already exists
Results for model [1mSGAN_WI_BO[0m (6/40) already exists
Results for model [1mSGAN_WI_NO[0m (7/40) already exists
Results for model [1mSGAN_WI_OC[0m (8/40) already exists
Results for model [1mSGAN_WI_OD[0m (9/40) already exists
Results for model [1mSGAN_WO_BO[0m (10/40) already exists
Results for model [1mSGAN_WO_NO[0m (11/40) already exists
Results for model [1mSGAN_WO_OC[0m (12/40) already exists
Results for model [1mSGAN_WO_OD[0m (13/40) already exists
Results for model [1mTGAN[0m (14/40) already exists
Results for model [1mTVAE[0m (15/40) already exists
Results for model [1mWGAN_OR_BO[0m (16/40) already exists
Results for model [1mWGAN_OR_NO[0m (17/40) already exists
Result

  Processing file 18/25                    
  Processing file 19/25                    
  Processing file 20/25                    
  Processing file 21/25                    
  Processing file 22/25                    
  Processing file 23/25                    
  Processing file 24/25                    
  Processing file 25/25                    
Getting results for model [1mWGGP_WI_OC[0m (34/40)
  Processing file 1/25                    
  Processing file 2/25                     
  Processing file 3/25                     
  Processing file 4/25                     
  Processing file 5/25                     
  Processing file 6/25                     
  Processing file 7/25                     
  Processing file 8/25                     
  Processing file 9/25                     
  Processing file 10/25                    
  Processing file 11/25                    
  Processing file 12/25                    
  Processing file 13/25                    
  Processing file 14/25 

In [13]:
# For each group of columns: the columns, the key of the cross-validation
# score and the key of the score measured on the 'original' observation set.
score_keys = [
    (cat_cols + ord_cols, 'test_log_loss', 'original_log_loss'),
    (cont_cols, 'test_l2', 'original_l2'),
]

# Reference values: the test score of the single run stored under 'original'.
ori_scores = {}
for cols, test_key, _ in score_keys:
    for col in cols:
        ori_scores[col] = cv_modelscores['original'][0][col][test_key]

internal = {}
external = {}
external_normalised = {}
cont_scores = {}
cat_scores = {}

for model in models:

    runs = cv_modelscores[model]

    internal[model] = {}
    external[model] = {}
    external_normalised[model] = {}

    for cols, test_key, orig_key in score_keys:
        for col in cols:
            # Mean and standard deviation over all stored runs of the model.
            test_vals = [run[col][test_key] for run in runs]
            internal[model][col] = {'avg': np.mean(test_vals), 'std': np.std(test_vals)}

            orig_vals = [run[col][orig_key] for run in runs]
            external[model][col] = {'avg': np.mean(orig_vals), 'std': np.std(orig_vals)}

            external_normalised[model][col] = external[model][col]['avg'] - ori_scores[col]

    # Aggregate per model: ratios to the reference for continuous columns,
    # differences to the reference for categorical and ordinal columns.
    cont_scores[model] = sum(external[model][col]['avg']/ori_scores[col] for col in cont_cols)
    cat_scores[model] = sum(external[model][col]['avg']-ori_scores[col] for col in cat_cols + ord_cols)

In [14]:
# Rank the models by aggregated score, lowest first, as (model, score) pairs.
cat_sorted = [(name, cat_scores[name]) for name in sorted(cat_scores, key=cat_scores.get)]
cont_sorted = [(name, cont_scores[name]) for name in sorted(cont_scores, key=cont_scores.get)]

In [15]:
# Print both rankings side by side, one rank per row.
row_fmt = '{:>2} | {:<30} | {:<30}'
cell_fmt = '{:<12}: {:.3f}'

print('   | {:<30} | {:<30}'.format('categorical', 'continuous'))
print('-----------------------------------------------------------')
for rank, (cat_entry, cont_entry) in enumerate(zip(cat_sorted, cont_sorted), start=1):
    cat_cell = cell_fmt.format(cat_entry[0], cat_entry[1])
    cont_cell = cell_fmt.format(cont_entry[0], cont_entry[1])
    print(row_fmt.format(rank, cat_cell, cont_cell))

   | categorical                    | continuous                    
-----------------------------------------------------------
 1 | original    : -3.596           | original    : 9.249           
 2 | WGGP_WI_NO  : 2.191            | WGGP_OR_BO  : 19.557          
 3 | WGGP_WI_OC  : 2.213            | WGGP_OR_OC  : 20.266          
 4 | WGGP_OR_OD  : 2.499            | WGGP_WI_BO  : 20.283          
 5 | WGGP_OR_BO  : 2.558            | WGGP_WI_OC  : 20.332          
 6 | WGGP_WO_NO  : 2.609            | WGGP_OR_OD  : 20.866          
 7 | WGGP_WO_OC  : 2.645            | WGGP_OR_NO  : 21.607          
 8 | CTGAN       : 2.891            | WGGP_WI_NO  : 22.041          
 9 | TGAN        : 2.907            | WGGP_WI_OD  : 22.095          
10 | WGGP_OR_OC  : 2.935            | WGGP_WO_BO  : 23.068          
11 | WGGP_OR_NO  : 2.944            | WGGP_WO_OC  : 23.230          
12 | WGGP_WO_OD  : 3.037            | WGGP_WO_OD  : 25.663          
13 | WGGP_WO_BO  : 3.090            | WGGP_