## 3.0 Transform Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import stats
import math

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

## 3.1 Load Data

In [2]:
# Run the data-loading notebook in this kernel so that the objects shown in its
# output listing (dt_train, dt_test, dt_sampleSubmit and the path_* strings)
# are available to the cells below.
%run "0_load.ipynb"

Variable          Type         Data/Info
----------------------------------------
PCA               ABCMeta      <class 'sklearn.decomposition.pca.PCA'>
StandardScaler    type         <class 'sklearn.preproces<...>ing.data.StandardScaler'>
dt_sampleSubmit   DataFrame                id  loss\n0  <...>[125546 rows x 2 columns]
dt_test           DataFrame                id cat1 cat2 <...>25546 rows x 131 columns]
dt_train          DataFrame                id cat1 cat2 <...>88318 rows x 132 columns]
math              module       <module 'math' from '/Use<...>3.5/lib-dynload/math.so'>
matplotlib        module       <module 'matplotlib' from<...>/matplotlib/__init__.py'>
np                module       <module 'numpy' from '/Us<...>kages/numpy/__init__.py'>
path_submit       str          ../data/Allstate_Claims_S<...>ity/sample_submission.csv
path_test         str          ../data/Allstate_Claims_Severity/test.csv
path_train        str          ../data/Allstate_Claims_Severity/train.csv
pd  

## 3.2 Basic Data Transform

In [3]:
# Column names kept in variables so the cell generalises to other datasets.
ID = 'id'
TARGET = 'loss'

# Target vector; must be extracted before TARGET is dropped from the frame.
y_train = dt_train[TARGET].values

# Test ids, kept for building the submission.
id_test = dt_test[ID].values

# Drop ID and TARGET so that only feature columns remain.
# NOTE: this mutates the frames created by the load notebook, so the cell can
# only be re-run after reloading the data.
dt_train.drop([ID, TARGET], axis = 1, inplace = True)
dt_test.drop([ID], axis = 1, inplace = True)

# Feature names, split by dtype.
features = dt_train.columns
col_cat = dt_train.select_dtypes(["object"]).columns.values
col_num = dt_train.select_dtypes(["float"]).columns.values
# np.setdiff1d also returns the names sorted and de-duplicated; the later
# cells index col_num by position, so this ordering is the one they rely on.
col_num = np.setdiff1d(col_num, [TARGET])

# Row counts, used to split the stacked frame back into train and test.
nrow_train = dt_train.shape[0]
nrow_test = dt_test.shape[0]

# Stack train on top of test with a fresh 0..n-1 index.
dt_all = pd.concat((dt_train, dt_test)).reset_index(drop = True)

# factorization (currently disabled)
# for col in col_cat:
#     dt_all[col] = pd.factorize(dt_all[col], sort = True)[0]

# Grouped target: bucket the target into N_TARGET_GROUPS bins of width
# TARGET_GROUP_WIDTH. Bins are half-open [lower, upper), so every value falls
# into exactly one group; the first bin is open below and the last one open above.
N_TARGET_GROUPS = 12
TARGET_GROUP_WIDTH = 1000

groupedTarget = np.zeros(nrow_train, dtype = "object")
for group in range(N_TARGET_GROUPS):
    lower = group * TARGET_GROUP_WIDTH
    upper = lower + TARGET_GROUP_WIDTH
    if group == 0:
        in_group = y_train < upper
    elif group == N_TARGET_GROUPS - 1:
        in_group = y_train >= lower
    else:
        in_group = (y_train >= lower) & (y_train < upper)
    groupedTarget[in_group] = group

# Feature matrices: the first nrow_train rows of dt_all are the training rows,
# every row from nrow_train onwards belongs to the test set.
x_train = np.array(dt_all.iloc[:nrow_train, :])
x_test = np.array(dt_all.iloc[nrow_train:, :])

## 3.3 Advanced Data Transform

### 3.3.1 Simple Interaction on col_num

In [4]:
def simpleInter_num(dt_num, interaction = ("+", "-", "*")):
    """Build pairwise interaction features between the columns of ``dt_num``.

    Every unordered pair of distinct columns ``(a, b)``, taken in column
    order, yields one feature per requested operator:

    * ``"+"`` -> ``sum_a_b``   = ``a + b``
    * ``"-"`` -> ``sub_a_b``   = ``a - b``
    * ``"*"`` -> ``times_a_b`` = ``a * b``

    Parameters
    ----------
    dt_num : pandas.DataFrame
        Columns to combine.
    interaction : iterable of str
        Operators to apply; any subset of ``"+"``, ``"-"``, ``"*"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dt_sum, dt_sub, dt_times)``, each indexed like ``dt_num``. The
        frame of an operator that was not requested has no columns, so the
        function always returns three frames.
    """
    columns = list(dt_num.columns)

    def pairwise(prefix, combine):
        # Collect all columns first and build the frame once, instead of
        # inserting them one at a time into a growing DataFrame.
        names = []
        data = {}
        for pos, left in enumerate(columns):
            for right in columns[pos + 1:]:
                if left == right:
                    continue
                name = "{}_{}_{}".format(prefix, left, right)
                if name in data:
                    # Two different pairs produced the same name; keep the first.
                    continue
                names.append(name)
                data[name] = combine(dt_num[left], dt_num[right])
        # ``columns = names`` pins the column order explicitly.
        return pd.DataFrame(data, index = dt_num.index, columns = names)

    operations = (
        ("+", "sum", lambda a, b: a + b),
        ("-", "sub", lambda a, b: a - b),
        ("*", "times", lambda a, b: a * b),
    )

    results = []
    for symbol, prefix, combine in operations:
        if symbol in interaction:
            results.append(pairwise(prefix, combine))
        else:
            results.append(pd.DataFrame(index = dt_num.index))

    return tuple(results)

In [5]:
# final output tables: pairwise sum / difference / product of all numeric features
dt_all_sum, dt_all_sub, dt_all_times = simpleInter_num(
    dt_all.loc[:, col_num]
)

In [6]:
# Shapes of the three interaction tables, in the order sum / sub / times
interaction_shapes = (dt_all_sum.shape, dt_all_sub.shape, dt_all_times.shape)
print("{} {} {}".format(*interaction_shapes))

(313864, 91) (313864, 91) (313864, 91)


### 3.3.2 highly correlated features

In [7]:
# Pairwise correlation matrix of the numeric features (rows/columns follow col_num)
correlations = dt_all.loc[:, col_num].corr()

# Minimum absolute correlation for a pair to be reported
threshold = 0.5

# Collected entries: [correlation, position of first feature, position of second feature]
corr_list = []

# Walk the upper triangle only, so every unordered pair is visited exactly once
n_num = len(col_num)
for i in range(n_num):
    for j in range(i + 1, n_num):
        corr_ij = correlations.iloc[i, j]
        strong_positive = threshold <= corr_ij < 1
        strong_negative = corr_ij < 0 and corr_ij <= -threshold
        if strong_positive or strong_negative:
            corr_list.append([corr_ij, i, j])

# Strongest correlations (by absolute value) first; ties keep their scan order
s_corr_list = sorted(corr_list, key=lambda entry: abs(entry[0]), reverse=True)

In [8]:
# One line per reported pair: "<feature> and <feature> = <correlation>"
for corr_value, first, second in s_corr_list:
    pair_names = (col_num[first], col_num[second])
    print ("%s and %s = %.2f" % (pair_names + (corr_value,)))

cont11 and cont12 = 0.99
cont1 and cont9 = 0.93
cont10 and cont6 = 0.88
cont13 and cont6 = 0.81
cont1 and cont10 = 0.81
cont6 and cont9 = 0.80
cont12 and cont6 = 0.79
cont10 and cont9 = 0.79
cont11 and cont6 = 0.77
cont1 and cont6 = 0.76
cont11 and cont7 = 0.75
cont12 and cont7 = 0.74
cont10 and cont12 = 0.72
cont10 and cont13 = 0.71
cont10 and cont11 = 0.70
cont6 and cont7 = 0.66
cont13 and cont9 = 0.64
cont12 and cont9 = 0.63
cont1 and cont12 = 0.61
cont11 and cont9 = 0.61
cont1 and cont11 = 0.60
cont1 and cont13 = 0.53
cont4 and cont8 = 0.53


**PCA** ---------------------

In [11]:
# Minimum absolute correlation for a pair to feed the PCA. The absolute value
# is used so that strongly negative pairs are treated like strongly positive
# ones, matching how s_corr_list is built and sorted.
high_corr = .7

# Every feature that takes part in at least one such pair.
col_high_corr = {
    col_num[k]
    for (v, i, j) in s_corr_list
    if abs(v) >= high_corr
    for k in (i, j)
}

# Sets have no stable iteration order, so sort the names to get a
# reproducible column order in the matrix.
np_high_corr = np.array(dt_all[sorted(col_high_corr)])

# Project the highly correlated features onto their first two principal components.
pca = PCA(n_components = 2)

# final output table
dt_all_pca = pd.DataFrame(pca.fit_transform(np_high_corr))

pca.explained_variance_ratio_

array([ 0.72148871,  0.13908504])

**Simple Interaction** ---------------------

In [12]:
# final output tables
# col_high_corr is a set; sorting its names fixes the column order, so the
# generated feature names (and the sign of the "sub_" features) are the same
# on every run instead of depending on set iteration order.
dt_all_highCorr_sum, dt_all_highCorr_sub, dt_all_highCorr_times = simpleInter_num(dt_all[sorted(col_high_corr)])

In [13]:
# Shapes of the high-correlation interaction tables, in the order sum / sub / times
highCorr_shapes = (dt_all_highCorr_sum.shape, dt_all_highCorr_sub.shape, dt_all_highCorr_times.shape)
print("{} {} {}".format(*highCorr_shapes))

(313864, 28) (313864, 28) (313864, 28)


### 3.3.3 prob of col_cat

In [14]:
# Replace every categorical value by its relative frequency over train + test.
# Series.map keeps the rows in their original order and yields NaN for values
# without a frequency, whereas an inner merge may reorder or drop rows and
# depends on how value_counts names its index.
prob_cat_names = [col + "_prob" for col in col_cat]
prob_cat_values = {}

for col in col_cat:
    # relative frequency of each level of this column
    prob = dt_all[col].value_counts(normalize = True)
    prob_cat_values[col + "_prob"] = dt_all[col].map(prob)

# final output table (built once; column order follows col_cat)
dt_all_prob_cat = pd.DataFrame(prob_cat_values, index = dt_all.index, columns = prob_cat_names)

### 3.3.4 entropy (optional, maybe too similar to prob)

In [15]:
# Replace every categorical value by its entropy contribution -p * log(p),
# where p is the relative frequency of the value over train + test.
# The columns are named "<col>_entropy" so they cannot be confused with, or
# collide with, the "<col>_prob" columns of the probability table.
entropy_cat_names = [col + "_entropy" for col in col_cat]
entropy_cat_values = {}

for col in col_cat:
    prob = dt_all[col].value_counts(normalize = True)
    entropy = - prob * np.log(prob)

    # map keeps the rows aligned with dt_all (NaN for values without an entry)
    entropy_cat_values[col + "_entropy"] = dt_all[col].map(entropy)

# final output table (built once; column order follows col_cat)
dt_all_entropy_cat = pd.DataFrame(entropy_cat_values, index = dt_all.index, columns = entropy_cat_names)

### 3.3.5 joint entropy

In [16]:
# For every categorical column, sum -p * log(p) over the target groups, where
# p is the joint relative frequency of (category level, target group) in the
# training data. Each row then receives the sum belonging to its level.
jointEntropy_names = [col + "_groupedTarget" for col in col_cat]
train_jointEntropy_values = {}
test_jointEntropy_values = {}

for col in col_cat:
    # category level and grouped target side by side; .values makes the
    # pairing positional, independent of dt_train's index
    dt_groupedTarget_cat = pd.DataFrame({col: dt_train[col].values
                                         , "groupedTarget": groupedTarget})

    # Joint frequency per (level, target group). Grouping on both columns keeps
    # the full level name, so levels longer than one character are handled too.
    freq = dt_groupedTarget_cat.groupby([col, "groupedTarget"]).size()
    freq = freq[freq > 0]  # guard against log(0) for unobserved combinations
    prob = freq / freq.sum()

    jointEntropy = - prob * np.log(prob)

    # total over the target groups, indexed by category level
    dt_jointEntropy_sum = jointEntropy.groupby(level = 0).sum()

    # map keeps one value per row in the original row order; levels that do
    # not occur in the training data become NaN in the test table
    train_jointEntropy_values[col + "_groupedTarget"] = dt_train[col].map(dt_jointEntropy_sum).values
    test_jointEntropy_values[col + "_groupedTarget"] = dt_test[col].map(dt_jointEntropy_sum).values

# final output tables (built once; column order follows col_cat)
dt_train_jointEntropy_cat = pd.DataFrame(train_jointEntropy_values, columns = jointEntropy_names)
dt_test_jointEntropy_cat = pd.DataFrame(test_jointEntropy_values, columns = jointEntropy_names)

### 3.3.6 target mean (maybe only apply to high cardinality feats; add random noise; leave one out?)

In [17]:
# Target-mean encoding of the categorical columns.
#   test : mean target of the level over the whole training set
#   train: out-of-fold mean, i.e. the mean is computed on the other folds only,
#          so a row's own target never leaks into its feature
# Levels that are missing from the data the mean was computed on become NaN.
N_FOLDS_TM = 4  # rows are assigned to folds in an interleaved way: row r -> fold r % N_FOLDS_TM

tm_names = [col + "_tm" for col in col_cat]
train_tm_values = {}
test_tm_values = {}

for col in col_cat:
    # cat + target, paired by position
    dt_cat_target = pd.DataFrame({col: dt_train[col].values
                                 , "target": y_train})

    # target mean per level over all training rows, used for the test set
    tm_full = dt_cat_target.groupby(col)["target"].mean()
    test_tm_values[col + "_tm"] = dt_test[col].map(tm_full).values

    # avoid leakage (by doing kfold tm)
    tm_oof = np.zeros(nrow_train)

    for fold in range(N_FOLDS_TM):
        holdout = np.arange(fold, nrow_train, N_FOLDS_TM)

        # rows of the remaining folds
        is_fit = np.ones(nrow_train, dtype = bool)
        is_fit[holdout] = False

        tm_fold = dt_cat_target[is_fit].groupby(col)["target"].mean()
        tm_oof[holdout] = dt_cat_target[col].iloc[holdout].map(tm_fold).values

    train_tm_values[col + "_tm"] = tm_oof

# final output tables (built once; column order follows col_cat)
dt_train_tm_cat = pd.DataFrame(train_tm_values, columns = tm_names)
dt_test_tm_cat = pd.DataFrame(test_tm_values, columns = tm_names)

### 3.3.7 n-way interaction

### 3.3.8 simple stats