# Creation of needed code for feature selection and model training

In [1]:
import pandas as pd

In [None]:
from ast import literal_eval
# Load the initial dataset; the path is relative to the notebook's location.
df = pd.read_csv('../data/dataset_initial.csv')
# The weather column holds Python-literal strings (later cells treat the parsed
# values as lists). Parse every non-missing entry and leave NaN untouched.
df['Weather_Condition_Arr'] = df['Weather_Condition_Arr'].apply(
    lambda raw: raw if str(raw) == 'nan' else literal_eval(raw)
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Separate the target column ('Severity') from the predictor columns.
y = df['Severity']
X = df.drop(columns='Severity')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

## Looking at data for additional processing and encoding needed

Columns of object type are categorical variables, some of them with more than 2 classes.

In [None]:
# Pick every column stored with the generic 'object' dtype.
object_cols = df.columns[df.dtypes=='object']
objects = df[object_cols]
# Preview a few rows rather than rendering the whole frame.
objects.head()

# Feature selection and encoding

## Feature selection

Here, feature selection refers to lowering the cardinality of categorical data: categories are sorted by frequency, the 10 most frequent ones are kept and encoded as binary features, and all remaining categories are collapsed into a single feature labeled 'Other'.

### Weather column feature selection

Checking the rows where the weather column lists more than one condition

In [None]:
# Number of conditions listed per row; rows with a missing value stay NaN
# and therefore never satisfy the "> 1" filter below.
condition_counts = df['Weather_Condition_Arr'].map(len, na_action='ignore')
df.loc[condition_counts > 1, 'Weather_Condition_Arr']

Rows with multiple weather conditions will also be one-hot encoded, with a 1 in the column of each condition that is present.

Example of top 20 weather conditions

In [None]:
from collections import Counter
from collections import OrderedDict
# Flatten the per-row condition lists to one condition per row, then tally them.
exploded_conditions = df['Weather_Condition_Arr'].explode()
counts = exploded_conditions.value_counts()

In [None]:
%matplotlib inline
# Plot only the 20 most frequent conditions, matching the "top 20" heading of
# this section (value_counts() is already sorted by descending frequency).
ax = counts.head(20).plot(kind='barh', title='Top 20 weather conditions')
ax.set_xlabel('Occurrences')
ax.set_ylabel('Weather condition');

Creating custom data transformer for weather

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter, OrderedDict

class WeatherConditionTransformator(BaseEstimator, TransformerMixin):
    """Keep the 10 most frequent weather conditions and collapse the rest.

    ``fit`` counts the conditions found in the ``Weather_Condition_Arr`` column
    of the frame it is given and remembers the 10 most frequent ones in
    ``top_weathers``. ``transform`` rewrites every condition list so that any
    condition outside that top 10 becomes ``'Weather_Other'``.
    """

    def fit(self, X, y=None):
        """Learn the 10 most frequent weather conditions of ``X``.

        The counts come from ``X`` itself rather than from the notebook-level
        ``df``, so a transformer fitted on a training fold never sees
        validation rows.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``Weather_Condition_Arr`` column of condition lists.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        counts = X['Weather_Condition_Arr'].explode().value_counts()
        # value_counts() is sorted by descending frequency, so the first ten
        # entries are the ten most frequent conditions.
        self.top_weathers = counts[:10]
        return self

    def _weather_condition_mapper(self, weather_condition_arr):
        """Map one list of conditions onto the learned top-10 vocabulary.

        Conditions outside the top 10 become ``'Weather_Other'``. Duplicates
        are removed and the first-seen order is kept, so the result is
        deterministic.
        """
        # `in` on a Series tests membership in its index (the condition names).
        mapped = (
            w if w in self.top_weathers else 'Weather_Other'
            for w in weather_condition_arr
        )
        return list(dict.fromkeys(mapped))

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the weather lists mapped to the top 10.

        Missing values in ``Weather_Condition_Arr`` are left untouched and the
        input frame is not modified.
        """
        mapped = X['Weather_Condition_Arr'].map(
            self._weather_condition_mapper, na_action='ignore'
        )
        return X.assign(Weather_Condition_Arr=mapped)

### States feature selection

In [None]:
%matplotlib inline
# Frequency of every state, drawn as a horizontal bar chart.
state_counts = df['State'].value_counts()
state_counts.plot(kind='barh')

In [None]:
class StateTranformator(BaseEstimator,TransformerMixin):
    """Keep the 10 most frequent states and label every other state 'Other'.

    ``fit`` stores the 10 most frequent values of the ``State`` column in
    ``top_states``; ``transform`` replaces every other state with ``'Other'``.
    """

    def fit(self, X, y=None):
        """Learn the 10 most frequent states of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with a ``State`` column.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # value_counts() is sorted by descending frequency.
        self.top_states = X['State'].value_counts()[:10].index
        return self

    def _map_state(self, state):
        """Return ``state`` if it is one of the top states, else ``'Other'``."""
        return state if state in self.top_states else 'Other'

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``State`` column uses the top-10 labels.

        Missing states are left untouched and the input frame is not modified.
        """
        mapped = X['State'].map(self._map_state, na_action='ignore')
        return X.assign(State=mapped)


### Final feature selector

Feature selection is composed of the weather feature selection and the state feature selection.

In [None]:
class FeatureSelector(TransformerMixin):
    """Apply the weather and the state cardinality reduction in sequence."""

    def __init__(self):
        self.wct = WeatherConditionTransformator()
        self.st = StateTranformator()

    def fit(self, X, y=None):
        """Fit both underlying transformers on ``X``.

        ``y`` is accepted (and ignored) so that ``fit_transform(X, y)`` and
        scikit-learn pipelines can call this method with a target.
        """
        self.wct.fit(X)
        self.st.fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` after the weather and then the state transformation."""
        X = self.wct.transform(X)
        X = self.st.transform(X)
        return X


In [None]:
import dtale

# Fit the feature selector on a random sample of 100000 rows and inspect the
# result (from row 1000 onwards) in D-Tale.
p = FeatureSelector()
X_sample = p.fit_transform(X.sample(100000).copy())

dtale.show(X_sample[1000:])

## Encoding

In [None]:
class Encoder(TransformerMixin):
    """One-hot encode the categorical columns and the weather-condition lists.

    ``fit`` records the category levels of every column in
    ``categorical_columns`` and the weather labels present in
    ``Weather_Condition_Arr``. ``transform`` then always emits the same dummy
    columns, with the same dropped reference level, whatever subset of levels
    the transformed frame contains. This keeps training and validation frames
    aligned.

    Calling ``transform`` on an instance that was never fitted keeps the
    earlier behaviour: the levels are taken from the frame being transformed.

    Parameters
    ----------
    categorical_columns : list of str
        Columns to replace with dummy columns (first level dropped).
    """

    def __init__(self,categorical_columns):
        self.categorical_columns=categorical_columns

    def fit(self, X, y=None):
        """Record the category levels and weather labels found in ``X``.

        ``y`` is ignored; it is accepted for scikit-learn API compatibility.
        """
        # pd.Categorical derives the levels the same way pd.get_dummies does,
        # so fit_transform on a frame yields the same columns as a plain
        # get_dummies call on that frame.
        self.categories_ = {
            column: pd.Categorical(X[column]).categories
            for column in self.categorical_columns
        }
        weather = X['Weather_Condition_Arr'].explode()
        self.weather_labels_ = pd.Index(pd.Categorical(weather).categories)
        return self

    def transform(self, X, y=None):
        """Return a new frame with the categorical and weather columns encoded.

        Values not seen during ``fit`` are encoded as all zeros for their
        column group. Weather labels not seen during ``fit`` are dropped.
        """
        categories = getattr(self, 'categories_', None)
        weather_labels = getattr(self, 'weather_labels_', None)

        dummy_frames = []
        for column in self.categorical_columns:
            values = X[column]
            if categories is not None:
                # Pin the levels learned in fit so the dummy columns and the
                # dropped reference level do not depend on this frame.
                values = pd.Series(
                    pd.Categorical(values, categories=categories[column]),
                    index=X.index,
                )
            dummy_frames.append(
                pd.get_dummies(values, prefix=column, drop_first=True)
            )

        # Every dummy frame shares X's index, so one concat replaces the
        # repeated index merges. Dummies are appended in column order.
        encoded = pd.concat(
            [X.drop(columns=list(self.categorical_columns)), *dummy_frames],
            axis=1,
        )

        # One row per (record, condition) -> dummies -> summed back per record,
        # which gives a multi-hot row for records with several conditions.
        weather_dummies = (
            pd.get_dummies(encoded['Weather_Condition_Arr'].explode())
            .groupby(level=0)
            .sum()
        )
        if weather_labels is not None:
            weather_dummies = weather_dummies.reindex(
                columns=weather_labels, fill_value=0
            )
        encoded = pd.merge(
            left=encoded,
            right=weather_dummies,
            left_index=True,
            right_index=True,
        )
        return encoded.drop(columns="Weather_Condition_Arr")

In [None]:
# Columns that are one-hot encoded by Encoder.
categorical_variables = [
    'Side',
    'State',
    'Amenity',
    'Bump',
    'Crossing',
    'Give_Way',
    'Junction',
    'No_Exit',
    'Railway',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
    'Turning_Loop',
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]

## Final feature selection and encoding: example usage and results

In [None]:
# Run feature selection followed by encoding on a random sample of 100000 rows
# and inspect the first 1000 encoded rows in D-Tale.
fs = FeatureSelector()
X_train_sample = fs.fit_transform(df.sample(100000).copy())
enc = Encoder(categorical_variables)
encoded_sample = enc.fit_transform(X_train_sample)
dtale.show(encoded_sample[:1000])

# Creating code infrastructure for k-fold validation

Custom code is necessary so that feature selection and encoding are fitted only on the k-1 training folds of each split, which ensures no data leakage. Fitting them once on the whole dataset would let information from the validation fold leak into training.

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

class BaseTrainer():
    """Shared train/validate loop for the model-specific trainers.

    Two validation schemes are supported, chosen by ``vtype``:

    * ``"k-fold"``: stratified k-fold. Feature selection and encoding are
      re-fitted on the training part of every fold.
    * ``"hold-out"``: a single stratified train/validation split that is
      selected and encoded once, inside ``__init__``.

    Subclasses implement ``_train_model`` and ``_validate_model``.
    """

    def __init__(self,X_train_val,Y_train_val,cat_vars=None,reg_vars=None,vtype="k-fold", k=3,split=0.8):
        """Store the data and prepare the chosen validation scheme.

        Parameters
        ----------
        X_train_val : pandas.DataFrame
            Predictor columns used for training and validation.
        Y_train_val : pandas.Series
            Target aligned with ``X_train_val``.
        cat_vars : list of str, optional
            Columns passed to ``Encoder``; defaults to an empty list.
        reg_vars : list, optional
            Accepted and forwarded to the init helpers, which do not use it.
        vtype : {"k-fold", "hold-out"}
            Validation scheme.
        k : int
            Number of folds for ``"k-fold"``.
        split : float
            Training fraction for ``"hold-out"``.

        Raises
        ------
        ValueError
            If ``vtype`` is not one of the supported schemes.
        """
        self.vtype=vtype
        # None instead of a shared mutable default list.
        self.cat_vars = [] if cat_vars is None else cat_vars
        if vtype == "k-fold":
            self._init_kfold(X_train_val,Y_train_val,cat_vars,reg_vars,k)
        elif vtype == "hold-out":
            self._init_hold_out(X_train_val,Y_train_val,cat_vars,reg_vars,split)
        else:
            raise ValueError(f"Invalid type {vtype!r}, supported types: k-fold and hold-out")

    def _init_kfold(self,X_train_val,Y_train_val,cat_vars,reg_vars,k):
        """Keep the raw data and build the stratified k-fold splitter."""
        self.X_train_val=X_train_val
        self.Y_train_val=Y_train_val
        self.kf = StratifiedKFold(n_splits=k)

    def _init_hold_out(self,X_train_val,Y_train_val,cat_vars,reg_vars,split):
        """Split once (stratified on the target) and encode both parts."""
        X_train,X_val,Y_train,Y_val = train_test_split(X_train_val,Y_train_val,train_size=split,stratify=Y_train_val)
        self.X_train,self.X_valid = self._select_and_encode(X_train, X_val)
        self.Y_train,self.Y_valid = self._encode_target(Y_train,Y_val)

    def _split_using_index(self,train_index,valid_index):
        """Return independent copies of the rows selected by positional indices."""
        X_train = self.X_train_val.iloc[train_index].copy()
        X_valid = self.X_train_val.iloc[valid_index].copy()
        Y_train = self.Y_train_val.iloc[train_index].copy()
        Y_valid = self.Y_train_val.iloc[valid_index].copy()
        return X_train, X_valid, Y_train, Y_valid

    @staticmethod
    def _copy_params(params):
        """Shallow-copy a params dict so ``_train_model`` may pop keys from it."""
        return dict(params) if isinstance(params, dict) else params

    def train_and_validate(self,params):
        """Train with ``params`` and return the validation score.

        For ``"k-fold"`` the score is the mean over the folds. ``params`` itself
        is never modified; every training call receives its own shallow copy.
        """
        if self.vtype == "k-fold":
            return self._train_and_validate_kfold(params)
        if self.vtype == "hold-out":
            return self._train_and_validate_hold_out(params)
        raise ValueError(f"Invalid type {self.vtype!r}, supported types: k-fold and hold-out")

    def _train_and_validate_kfold(self,params):
        """Train and validate once per fold; return the mean validation score."""
        valid_scores = []
        for train_index, valid_index in self.kf.split(self.X_train_val,self.Y_train_val):
            X_train, X_valid, Y_train, Y_valid = self._split_using_index(train_index,valid_index)
            Y_train, Y_valid = self._encode_target(Y_train,Y_valid)
            X_train, X_valid = self._select_and_encode(X_train,X_valid)
            # A fresh copy per fold: subclasses pop entries out of params, which
            # would otherwise leave later folds with missing keys.
            trained_model = self._train_model(self._copy_params(params),X_train,Y_train,X_valid,Y_valid)
            valid_score = self._validate_model(trained_model,X_valid,Y_valid)
            valid_scores.append(valid_score)
        return np.mean(valid_scores)

    def _train_and_validate_hold_out(self,params):
        """Train on the stored training part and score on the stored validation part."""
        trained_model = self._train_model(self._copy_params(params),self.X_train,self.Y_train,self.X_valid,self.Y_valid)
        valid_score = self._validate_model(trained_model,self.X_valid,self.Y_valid)
        return valid_score

    def _train_model(self,params,X_train,y_train,X_val=None,Y_val=None):
        """Train and return a model; must be implemented by subclasses."""
        raise NotImplementedError

    def _validate_model(self,model,X_val=None,Y_val=None):
        """Return the validation score of ``model``; must be implemented by subclasses."""
        raise NotImplementedError

    def _select_and_encode(self,X_train, X_valid):
        """Fit selection and encoding on the training part and apply them to both parts."""
        fs = FeatureSelector()
        X_train = fs.fit_transform(X_train)
        X_valid = fs.transform(X_valid)
        encoder = Encoder(self.cat_vars)
        X_train = encoder.fit_transform(X_train)
        # Only transform the validation part: fitting again on it would let
        # validation data define the encoding.
        X_valid = encoder.transform(X_valid)
        if not X_valid.columns.equals(X_train.columns):
            # Guarantee the model sees the same columns in the same order.
            # Dummy columns absent from the validation part are filled with 0.
            X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)
        return X_train,X_valid

    def _encode_target(self,Y_train, Y_valid):
        """Shift both targets down by one (presumably so the labels start at 0)."""
        return Y_train-1, Y_valid-1

# Model specific steps

## TabNet

In [None]:
from fastai.tabular.all import *
from fast_tabnet.core import *
from sklearn.utils.class_weight import compute_class_weight

class TabNetTrainer(BaseTrainer):
    """BaseTrainer that trains a fastai TabNet model with class-weighted loss."""

    # Columns treated as continuous; every other predictor is categorical.
    CONT_NAMES = ['Start_Lat','Start_Lng','End_Lat','End_Lng','Distance(mi)',
                  'Temperature(F)','Wind_Chill(F)','Humidity(%)','Pressure(in)',
                  'Visibility(mi)','Wind_Speed(mph)','Precipitation(in)','Wind_SN',
                  'Wind_EW']

    def __init__(self,X_train_val,Y_train_val,cat_vars=None,reg_vars=None,vtype="k-fold", k=3,split=0.8,epochs=50):
        """Same arguments as ``BaseTrainer`` plus ``epochs``, the number of
        epochs passed to ``fit_one_cycle``."""
        # Must be set before super().__init__, which may already start work.
        self.epochs=epochs
        super().__init__(
            X_train_val,
            Y_train_val,
            [] if cat_vars is None else cat_vars,
            [] if reg_vars is None else reg_vars,
            vtype,
            k,
            split,
        )

    def _train_model(self,params,X_train,y_train,X_val,Y_val):
        """Build the dataloaders and the TabNet model, train it, return the Learner.

        ``params`` must contain ``batch_size``, ``lr`` and ``optimizer``; all
        remaining entries are forwarded to ``TabNetModel``.
        """
        # Work on a copy so the caller's dict keeps its keys (k-fold trains
        # several times with the same params).
        params = dict(params)
        bs = params.pop('batch_size')
        lr = params.pop('lr')
        optimizer = params.pop('optimizer')

        to = self._fastaify_data(X_train,y_train,X_val,Y_val)
        dls = to.dataloaders(bs)

        model = TabNetModel(get_emb_sz(to), len(to.cont_names), dls.c,**params)
        # NOTE(review): the weights are created without an explicit device;
        # confirm they end up on the model's device when training on GPU.
        class_weights = self._get_weights(y_train)
        learn = Learner(dls, model,CrossEntropyLossFlat(weight=class_weights), opt_func=optimizer, lr=lr, metrics=[MatthewsCorrCoef()])
        learn.fit_one_cycle(self.epochs)
        return learn

    def _validate_model(self,model,X_val=None,Y_val=None):
        """Return element 1 of ``model.validate()`` as a float.

        Presumably that is the Matthews correlation coefficient, the only
        metric registered on the Learner. It is computed on the Learner's own
        validation split, so ``X_val`` / ``Y_val`` are not used.
        """
        return float(model.validate()[1])

    def _get_weights(self,Y_train):
        """Return 'balanced' class weights for labels 0..3 as a float tensor."""
        # compute_class_weight documents `classes` as an ndarray.
        class_weights=compute_class_weight('balanced',classes=np.array([0,1,2,3]),y=Y_train)
        class_weights=torch.tensor(class_weights,dtype=torch.float)
        return class_weights

    def _fastaify_data(self,X_train,Y_train,X_val,Y_val):
        """Combine the train and validation parts into one ``TabularPandas``.

        The training rows come first, followed by the validation rows; the
        ``splits`` passed to fastai are the matching positional ranges.

        Raises
        ------
        ValueError
            If a continuous column or the ``Severity`` target is missing.
        """
        train = pd.merge(
            left=X_train,
            right=Y_train,
            left_index=True,
            right_index=True,
        )
        val = pd.merge(
            left=X_val,
            right=Y_val,
            left_index=True,
            right_index=True,
        )

        train_len = len(train)
        val_len = len(val)
        splits = [list(range(train_len)),list(range(train_len,train_len+val_len))]

        # ignore_index gives the combined frame a fresh 0..n-1 index that
        # matches the positional split ranges above.
        train_val = pd.concat([train,val], ignore_index=True)

        cont_names = list(self.CONT_NAMES)
        required = cont_names + ['Severity']
        missing = [name for name in required if name not in train_val.columns]
        if missing:
            raise ValueError(f"Missing expected columns: {missing}")

        # Everything that is neither continuous nor the target is categorical.
        excluded = set(required)
        cat_names = [col for col in train_val.columns if col not in excluded]

        to = TabularPandas(
            train_val,
            [Categorify,FillMissing],
            cat_names, cont_names,
            y_names='Severity',
            y_block = CategoryBlock(),
            splits=splits
        )

        return to

### Hyperparameter optimization

Selecting the fraction of the data that is used for hyperparameter optimization

In [None]:
# Work on copies so the original X / y stay untouched.
X_train_val_test = X.copy()
Y_train_val_test = y.copy()
# Keep a stratified 10% of the data for the TabNet hyperparameter search;
# the remaining rows are returned as X_val / Y_val.
X_train_val, X_val, Y_train_val, Y_val = train_test_split(
    X_train_val_test,
    Y_train_val_test,
    train_size=0.1,
    stratify=Y_train_val_test,
)


In [None]:
from hyperopt import STATUS_OK,hp,tpe,Trials,fmin
from hyperopt.pyll import scope

# Hyperopt search space for TabNet. batch_size and virtual_batch_size are
# sampled as exponents; process_params later turns them into powers of two.
tabnet_large_space = {
    "lookahead": hp.choice("lookahead", [False, True]),
    # One entry per optimizer, each with its own weight-decay label
    # ('wdadam', 'wdsgd', 'wdradam').
    "optimizer": hp.choice('optimizer', [
        {
            "opttype": opt_name,
            "wd": hp.loguniform('wd' + opt_name.lower(), np.log(0.0001), np.log(0.3)),
        }
        for opt_name in ("Adam", "SGD", "RAdam")
    ]),
    "n": scope.int(hp.choice("n", [8, 64, 128])),
    "n_steps": scope.int(hp.quniform("n_steps", 3, 10, 1)),
    "gamma": hp.uniform("gamma", 1, 2),
    "momentum": hp.uniform("momentum", 0, 1),
    "lr": hp.choice("lr", [0.005, 0.01, 0.02, 0.025]),
    "batch_size": hp.quniform("batch_size", 12, 17, 1),
    "virtual_batch_size": hp.quniform("virtual_batch_size", 8, 11, 1),
}

In [None]:
def get_optimizer(opttype,opt_params,lookahead):
    """Return an optimizer factory for the requested optimizer type.

    ``opttype`` is one of "Adam", "RAdam" or "SGD". ``opt_params`` are keyword
    arguments bound to the optimizer constructor. With ``lookahead`` set, the
    constructed optimizer is wrapped in ``Lookahead``.
    """
    constructors = {"Adam": Adam, "RAdam": RAdam, "SGD": SGD}
    base_opt = constructors[opttype]

    if not lookahead:
        return partial(base_opt, **opt_params)

    def lookahead_opt(spliter, lr):
        # Build the base optimizer first, then wrap it.
        return Lookahead(base_opt(spliter, lr, **opt_params))

    return lookahead_opt

def process_params(params):
    """Turn one sample of the TabNet search space into trainer parameters.

    * ``batch_size`` and ``virtual_batch_size`` are sampled as exponents and
      converted to ``2 ** exponent``.
    * ``optimizer`` (a dict with ``opttype`` plus optimizer keyword arguments)
      and ``lookahead`` are replaced by one optimizer factory under
      ``optimizer``.
    * ``n`` is expanded into ``n_d`` and ``n_a``.

    The input dict and its nested optimizer dict are left untouched, so the
    same sample can be processed more than once; a new dict is returned.
    """
    params = dict(params)
    params['batch_size'] = int(np.power(2,params['batch_size']))
    params['virtual_batch_size'] = int(np.power(2,params['virtual_batch_size']))

    # Copy before popping so the caller's nested dict keeps 'opttype'.
    opt_params = dict(params.pop('optimizer'))
    opttype = opt_params.pop('opttype')
    lookahead = params.pop('lookahead')
    params['optimizer'] = get_optimizer(opttype,opt_params,lookahead)

    n = params.pop('n')
    params['n_d'] = n
    params['n_a'] = n
    return params
    

def tabnet_fn(params):
    """Hyperopt objective: negated hold-out validation score of a TabNet run.

    The score is negated because ``fmin`` minimizes its objective.
    """
    processed = process_params(params)
    print(processed)
    trainer = TabNetTrainer(X_train_val,Y_train_val,vtype="hold-out",split=0.8,epochs=1)
    score = trainer.train_and_validate(processed)
    return -score
    

In [None]:
#defaults.use_cuda=True

In [None]:
# Run 10 TPE evaluations of the TabNet objective; `trials` keeps the history.
trials = Trials()
best_hyperparams = fmin(
    fn=tabnet_fn,
    space=tabnet_large_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)
best_hyperparams

## XGBoost

In [None]:
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
import xgboost

class XGBoostTrainer(BaseTrainer):
    """BaseTrainer that trains an XGBoost classifier with balanced sample weights."""

    def _train_model(self,params,X_train,y_train,X_val,Y_val):
        """Fit and return an ``XGBClassifier``.

        ``params`` must contain ``num_round`` (number of boosting rounds); all
        remaining entries are forwarded to the ``XGBClassifier`` constructor.
        ``X_val`` and ``Y_val`` are not used during fitting.
        """
        # Copy so the caller's dict keeps 'num_round' for later training runs.
        params = dict(params)
        num_round = params.pop('num_round')
        # In the scikit-learn API the number of boosting rounds is the
        # constructor argument `n_estimators`; fit() has no `num_round`.
        model = xgboost.XGBClassifier(n_estimators=num_round, verbosity=2, **params)

        # One weight per row, inversely proportional to its class frequency.
        sample_weights = class_weight.compute_sample_weight(
            class_weight='balanced',
            y=y_train
        )

        model.fit(X_train, y_train, sample_weight=sample_weights)
        return model

    def _validate_model(self,model,X_val=None,Y_val=None):
        """Return the Matthews correlation coefficient on the validation data."""
        return matthews_corrcoef(Y_val,model.predict(X_val))

### Hyperparameter optimization

In [None]:
# Work on copies so the original X / y stay untouched.
X_train_val_test = X.copy()
Y_train_val_test = y.copy()
# Keep a stratified 70% of the data for the XGBoost hyperparameter search;
# the remaining rows are returned as X_val / Y_val.
X_train_val, X_val, Y_train_val, Y_val = train_test_split(
    X_train_val_test,
    Y_train_val_test,
    train_size=0.7,
    stratify=Y_train_val_test,
)

In [None]:
from hyperopt import STATUS_OK,hp,tpe,Trials,fmin
from hyperopt.pyll import scope


# Hyperopt search space for XGBoost. 'num_round' is consumed by
# XGBoostTrainer; every other entry is passed to the classifier.
xgboost_large_space = {
    "eta":hp.uniform("eta",0.01,0.3),
    "gamma":hp.uniform("gamma",0,10),
    "max_depth":scope.int(hp.quniform("max_depth",3,10,1)),
    "min_child_weight":hp.uniform("min_child_weight",0,10),
    "max_delta_step":hp.uniform("max_delta_step",1,10),
    "subsample":hp.uniform("subsample",0.3,1),
    "lambda":hp.uniform("lambda",0,5),
    "alpha":hp.uniform("alpha",0,5),
    # hp.quniform needs the quantization step q as its fourth argument;
    # q=1 samples whole numbers of rounds, like max_depth above.
    "num_round":scope.int(hp.quniform("num_round",50,200,1))
}

In [None]:
# Columns that XGBoostTrainer hands to Encoder for one-hot encoding.
categorical_variables = [
    'Side', 'State',
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
    'Railway', 'Roundabout', 'Station', 'Stop',
    'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
    'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight',
]

def xgboost_fn(params):
    """Hyperopt objective: negated hold-out validation score of an XGBoost run.

    The score is negated because ``fmin`` minimizes its objective.
    """
    trainer = XGBoostTrainer(
        X_train_val,
        Y_train_val,
        cat_vars=categorical_variables,
        vtype="hold-out",
        split=0.8,
    )
    score = trainer.train_and_validate(params)
    return -score


In [None]:
# Run 10 TPE evaluations of the XGBoost objective; `trials` keeps the history.
trials = Trials()
best_hyperparams = fmin(
    fn=xgboost_fn,
    space=xgboost_large_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)
best_hyperparams

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from hyperopt import space_eval

# No earlier cell defines `clf`, so train the final classifier here with the
# best hyperparameters found above. space_eval maps the raw fmin result back
# through the search space (e.g. the integer casts).
final_params = space_eval(xgboost_large_space, best_hyperparams)

# In "k-fold" mode the trainer only stores the data at construction time; it
# is used here just for its selection / encoding / training helpers.
final_trainer = XGBoostTrainer(X_train_val, Y_train_val, cat_vars=categorical_variables, vtype="k-fold")
# Selection and encoding are fitted on the training part and applied to the
# held-out part; the targets are shifted the same way as during training.
X_fit, X_eval = final_trainer._select_and_encode(X_train_val.copy(), X_val.copy())
Y_fit, Y_eval = final_trainer._encode_target(Y_train_val, Y_val)
clf = final_trainer._train_model(final_params, X_fit, Y_fit, X_eval, Y_eval)

cm = confusion_matrix(Y_eval, clf.predict(X_eval))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()


In [None]:
from xgboost import plot_importance
# Feature importance of the classifier bound to `clf`.
plot_importance(booster=clf)