# Import libraries

In [98]:
import random

import numpy as np
import pandas as pd
import torch as th
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Set up

Set seed value

In [99]:
# Single seed shared by the RNG seeding helper and every train_test_split call below.
SEED = 212

Set random elements to seeds where possible

In [100]:
def seed_torch(seed=SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    Parameters
    ----------
    seed : int
        Value used to seed every generator. Defaults to the notebook-wide SEED.

    Notes
    -----
    Also configures cuDNN for reproducible behaviour. `benchmark` must be
    disabled for that: when enabled, cuDNN auto-tunes and may choose a
    different algorithm from run to run, which works against the
    `deterministic` flag set on the next line.
    """
    random.seed(seed)  # Python random module.
    np.random.seed(seed)  # NumPy legacy global generator.
    th.manual_seed(seed)  # PyTorch default generator.
    th.cuda.manual_seed_all(seed)  # Every CUDA device (covers multi-GPU).
    th.backends.cudnn.benchmark = False
    th.backends.cudnn.deterministic = True

seed_torch()

Read in data set 

In [101]:
# Load the freMTPL2freq CSV from the sibling data directory (path is relative to the notebook's working directory).
df = pd.read_csv('../data/freMTPL2freq.csv')

## Transformations, corrections, and splitting

In [102]:
# Caps and corrections below follow M.V. Wuthrich's preprocessing.
# Each column is re-assigned from Series.clip instead of being written through
# `df[col].values[mask] = ...`: the latter only works while `.values` happens to
# be a writable view of the column, which pandas does not guarantee (under
# Copy-on-Write the array is read-only).
# NOTE: this cell is not idempotent - re-running it logs Density a second time.
# Re-run the read_csv cell first if it has to be executed again.
df['VehPower'] = df['VehPower'].astype(object)  # categorical ordinal
df['ClaimNb'] = df['ClaimNb'].clip(upper=4)  # corrected for unreasonable observations
df['VehAge'] = df['VehAge'].clip(upper=20)  # capped for NN training
df['DrivAge'] = df['DrivAge'].clip(upper=90)  # capped for NN training
df['BonusMalus'] = df['BonusMalus'].clip(upper=150)  # capped for NN training
df['Density'] = np.log(df['Density'])  # logged for NN training
df['Exposure'] = df['Exposure'].clip(upper=1)  # corrected for unreasonable observations
df_new = df.drop(columns=['IDpol'])  # identifier, not used as a feature

Check for missing values

In [103]:
# Percentage of missing values per column.
missing = df_new.isna().sum().div(len(df_new)).mul(100)
missing

ClaimNb       0.0
Exposure      0.0
Area          0.0
VehPower      0.0
VehAge        0.0
DrivAge       0.0
BonusMalus    0.0
VehBrand      0.0
VehGas        0.0
Density       0.0
Region        0.0
dtype: float64

Encode the data as per Wuthrich

In [104]:
# One-hot encode the nominal columns, dropping the first level of each.
# The dtype is pinned to uint8 (the default before pandas 2.0): newer pandas
# defaults to bool, which would be exported to CSV as True/False instead of 0/1.
df_new_encoded = pd.get_dummies(
    df_new,
    columns=['VehBrand', 'Region'],
    drop_first=True,
    dtype=np.uint8,
)

Label encode ordered categorical variables as per Wuthrich

In [105]:
# Integer codes for the ordered categorical columns, keyed by column name.
# Area levels A..F are ranked 1..6; VehGas is coded Regular=1, Diesel=2.
cleanup_nums = {
    "Area": {level: rank for rank, level in enumerate("ABCDEF", start=1)},
    "VehGas": {"Regular": 1, "Diesel": 2},
}

Apply label encoding - NOT ONE-HOT/DUMMY

In [106]:
# Map the category labels to their integer codes, then cast the encoded columns
# explicitly. Relying on `replace` to downcast the object columns to integers is
# deprecated behaviour; the explicit cast also raises immediately if a label
# missing from `cleanup_nums` is left in the data.
df_new_encoded = (
    df_new_encoded
    .replace(cleanup_nums)
    .astype({col: 'int64' for col in cleanup_nums})
)

Split data into X and y dataframes

In [107]:
# Select the target by name rather than by position, so a change in column
# order cannot silently swap the target for a feature.
y = df_new_encoded['ClaimNb']
X = df_new_encoded.drop(columns='ClaimNb')

Split data into Train(Val) and final Test set

In [108]:
# Hold out 20% of the rows as the final test set; seeded for reproducibility.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

Use Train(Val) to perform MinMax scaling. **Importantly**, this scaler is derived ONLY from the Train(Val) data to avoid data leakage, i.e. the Test data must only be transformed with the scaler fitted on the training data.

In [109]:
# Create the MinMax scaler shared by the Train(Val) and Test transformations.
scaler = MinMaxScaler()
# Fits on every column of X_trainval.
# NOTE(review): the next cell calls scaler.fit_transform on a subset of columns,
# which refits the scaler, so the parameters learned here are superseded.
scaler.fit(X_trainval)

MinMaxScaler()

In [110]:
# Columns that are MinMax scaled.
scaled_cols = ['Area', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']

# Fit the scaler on Train(Val) only ...
X_trainval[scaled_cols] = scaler.fit_transform(X_trainval[scaled_cols])
# ... and apply those fitted parameters to Test. Calling fit_transform here
# would refit the scaler on the test data, so Train(Val) and Test would be
# scaled with different min/max values.
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

Check transformations

In [111]:
# Summarise every column whose dtype is not `object`.
no_obj_cols = [
    col for col, col_dtype in X_trainval.dtypes.items() if col_dtype != 'object'
]
X_trainval[no_obj_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Exposure,542410.0,0.528417,0.364092,0.002732,0.17,0.49,0.99,1.0
Area,542410.0,0.457962,0.276476,0.0,0.2,0.4,0.6,1.0
VehPower,542410.0,0.223344,0.186676,0.0,0.090909,0.181818,0.272727,1.0
VehAge,542410.0,0.34871,0.269851,0.0,0.1,0.3,0.55,1.0
DrivAge,542410.0,0.381704,0.196197,0.0,0.222222,0.361111,0.513889,1.0
BonusMalus,542410.0,0.097645,0.156113,0.0,0.0,0.0,0.14,1.0
VehGas,542410.0,1.490207,0.499905,1.0,1.0,1.0,2.0,2.0
Density,542410.0,0.58621,0.183286,0.0,0.443157,0.585461,0.725176,1.0
VehBrand_B10,542410.0,0.026281,0.159969,0.0,0.0,0.0,0.0,1.0
VehBrand_B11,542410.0,0.02002,0.140068,0.0,0.0,0.0,0.0,1.0


Split Train(Val) into Train and Validation

In [112]:
# Carve 10% of Train(Val) off as the validation set; seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.1,
    random_state=SEED,
)

Join the features and target back together for each of the training and validation sets

In [113]:
# Put each feature frame and its target side by side (target becomes the last column).
train_array, val_array = (
    pd.concat(pair, axis=1)
    for pair in ((X_train, y_train), (X_val, y_val))
)

## Split data into different workers/companies

In [114]:
# Number of workers/companies the data is shared between.
NUM_AGENTS = 2

# Split the row positions with np.array_split and slice each frame with iloc.
# Passing the DataFrame itself to np.array_split goes through the deprecated
# DataFrame.swapaxes; splitting positions gives the same contiguous,
# near-equal chunks while every chunk stays a DataFrame.
train_array_split = [
    train_array.iloc[rows]
    for rows in np.array_split(np.arange(len(train_array)), NUM_AGENTS)
]
val_array_split = [
    val_array.iloc[rows]
    for rows in np.array_split(np.arange(len(val_array)), NUM_AGENTS)
]

Loop through and export

In [115]:
def _export_features_and_target(frame, kind, agent):
    """Write one agent's features and target to CSV and return them.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature columns followed by the target as the LAST column.
    kind : str
        Split label used in the file names ('train' or 'val').
    agent : int
        Agent index used in the file names.

    Returns
    -------
    tuple
        (features, target): every column but the last, and the last column.
        They are written to 'X_<kind>_<agent>.csv' and 'y_<kind>_<agent>.csv'.
    """
    # Slice relative to the end instead of hard-coding the column count (39),
    # so the export keeps working if the number of feature columns changes.
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1]
    features.to_csv('X_' + kind + '_' + str(agent) + '.csv', index=False)
    target.to_csv('y_' + kind + '_' + str(agent) + '.csv', index=False)
    return features, target


for i in range(NUM_AGENTS):
    for kind, parts in (('train', train_array_split), ('val', val_array_split)):
        part = parts[i]
        features, target = _export_features_and_target(part, kind, i)
        # Keep the per-agent module-level names the original loop created
        # (e.g. X_train_0, y_val_1), written through globals() explicitly.
        globals()['{}_array_splitted_{}'.format(kind, i)] = part
        globals()['X_{}_{}'.format(kind, i)] = features
        globals()['y_{}_{}'.format(kind, i)] = target