# Porto Seguro’s Safe Driver Prediction

### Load python packages needed for analysis

In [None]:
%pylab inline

#working with the numbers
import numpy as np
import pandas as pd
#visualization
import seaborn as sns
sns.set(style="whitegrid")
import missingno as msn
#machine learning
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
import lightgbm as lgb

#other
import gc

### Load the datasets

In [None]:
# Read the competition CSVs. Forward slashes are used so the relative paths
# resolve on Linux/macOS as well as on Windows (a backslash is a path
# separator on Windows only).
df_train = pd.read_csv("input/train.csv")
df_test = pd.read_csv("input/test.csv")

# Report the size of each frame as a quick sanity check on the load.
for label, frame in (("Train data:", df_train), ("Test data:", df_test)):
    print(label)
    print("Columns: {}".format(len(frame.columns)))
    print("Rows: {}".format(len(frame)))

In [None]:
# Render the first rows of the train frame, then the test frame.
for preview in (df_train.head(), df_test.head()):
    display(preview)

### According to the data description:


"*In this competition, you will predict the probability that an auto insurance policy holder files a claim.
In the train and test data, features that belong to similar groupings are tagged as such in the feature names (e.g., ind, reg, car, calc). In addition, feature names include the postfix **bin** to indicate **binary features** and **cat** to indicate **categorical features**. Features without these designations are either continuous or ordinal. Values of **-1 indicate** that the feature was **missing** from the observation. The target columns signifies whether or not a claim was filed for that policy holder.*"


In [None]:
# Show the dtype pandas inferred for each column of the training frame.
df_train.dtypes 

### Let's start by looking a little bit at the distribution of the data and missing entries.

In [None]:
# The data description encodes missing values as -1; turn them into NaN so
# that pandas and missingno treat them as missing. `np.nan` is used because
# the `np.NaN` alias was removed in NumPy 2.0.
df_train2 = df_train.replace(-1, np.nan)
df_test2 = df_test.replace(-1, np.nan)

In [None]:
# Reorder the training rows with missingno's nullity_sort (sort='descending')
# and draw the nullity matrix of the reordered frame.
sorted_traindata = msn.nullity_sort(df_train2, sort='descending')
msn.matrix(sorted_traindata)

In [None]:
# missingno heatmap of the training frame (with -1 already replaced by NaN).
msn.heatmap(df_train2)

In [None]:
# Same nullity matrix as for the training data, this time for the test frame.
sorted_testdata = msn.nullity_sort(df_test2, sort='descending')
msn.matrix(sorted_testdata)

### Binary Features

In [None]:
def _columns_with_suffix(frame, suffix):
    """Return the column names of `frame` that end with `suffix`, in column order."""
    return [name for name in frame.columns if name.endswith(suffix)]


# Per the data description, the "bin" postfix marks binary features and the
# "cat" postfix marks categorical features.
binary_train = _columns_with_suffix(df_train2, "bin")
categorical_train = _columns_with_suffix(df_train2, "cat")

binary_test = _columns_with_suffix(df_test2, "bin")
categorical_test = _columns_with_suffix(df_test2, "cat")

In [None]:
# One count plot per binary training feature, laid out on a 6x3 grid.
plt.figure(figsize=(17,20))
for i, c in enumerate(binary_train):
    ax = plt.subplot(6,3,i+1)
    # Pass the column as `x=`: newer seaborn releases take `data` as the
    # first positional parameter, so a bare positional Series is not
    # interpreted as the variable to count.
    sns.countplot(x=df_train2[c], ax=ax)
    # Strip the top/right frame and the grid for a cleaner panel.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.grid(False)

In [None]:
# One count plot per binary test feature, laid out on a 6x3 grid.
plt.figure(figsize=(17,20))
for i, c in enumerate(binary_test):
    ax = plt.subplot(6,3,i+1)
    # Pass the column as `x=`: newer seaborn releases take `data` as the
    # first positional parameter, so a bare positional Series is not
    # interpreted as the variable to count.
    sns.countplot(x=df_test2[c], ax=ax)
    # Strip the top/right frame and the grid for a cleaner panel.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.grid(False)

### Categorical Features

In [None]:
# Print the number of missing (NaN) entries for every categorical feature,
# first for the training frame and then for the test frame.
print("Training Data")
for col in categorical_train:
    print(col)
    print(df_train2[col].isnull().sum())
# The header previously contained a stray double quote ('\n"Test Data').
print("\nTest Data")
for col in categorical_test:
    print(col)
    print(df_test2[col].isnull().sum())

### Ok, looking at the categorical features, we see that both features with a very high percentage of missing values are located here, namely ps_car_03_cat and ps_car_05_cat. I will drop them for the moment.

In [None]:
# Categorical features dropped because most of their values are missing.
to_drop = ["ps_car_03_cat","ps_car_05_cat"]

# Rebind instead of dropping in place, and tolerate already-removed columns,
# so that re-running this cell does not raise a KeyError.
df_train2 = df_train2.drop(columns=to_drop, errors="ignore")
df_test2 = df_test2.drop(columns=to_drop, errors="ignore")

# Keep the categorical column lists in sync with the frames.
categorical_train = [i for i in categorical_train if i not in to_drop]
categorical_test = [i for i in categorical_test if i not in to_drop]

In [None]:
# For each remaining categorical training feature, print its name followed
# by the frequency of every level.
for feature in categorical_train:
    level_counts = df_train2[feature].value_counts()
    print(feature, level_counts, sep="\n")

### The following features are only binary in the train dataset. Is this also true for test data?

In [None]:
# "cat" features under inspection as possible binary features.
new_bin = ["ps_ind_04_cat","ps_car_02_cat","ps_car_07_cat", "ps_car_08_cat"]

# Print their level frequencies in the test frame for comparison.
for feature in new_bin:
    level_counts = df_test2[feature].value_counts()
    print(feature, level_counts, sep="\n")

In [None]:
# Move the two-level "cat" features into the binary lists.
# `list.append(new_bin)` would add the whole list as a single nested element,
# so the individual column names would never be recognised as binary (and
# would later be classified as continuous). Extend with the names instead,
# skipping any already present so the cell can be re-run safely.
binary_train = binary_train + [c for c in new_bin if c not in binary_train]
binary_test = binary_test + [c for c in new_bin if c not in binary_test]

# Remove them from the categorical lists.
categorical_train = [i for i in categorical_train if i not in new_bin]
categorical_test = [i for i in categorical_test if i not in new_bin]

In [None]:
# One count plot per remaining categorical training feature on a 3x3 grid.
plt.figure(figsize=(17,10))
for i, c in enumerate(categorical_train):
    ax = plt.subplot(3,3,i+1)
    # Pass the column as `x=`: newer seaborn releases take `data` as the
    # first positional parameter, so a bare positional Series is not
    # interpreted as the variable to count.
    sns.countplot(x=df_train2[c], ax=ax)
    # Strip the top/right frame and the grid for a cleaner panel.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.grid(False)

In [None]:
# One count plot per remaining categorical test feature on a 3x3 grid.
plt.figure(figsize=(17,10))
for i, c in enumerate(categorical_test):
    ax = plt.subplot(3,3,i+1)
    # Pass the column as `x=`: newer seaborn releases take `data` as the
    # first positional parameter, so a bare positional Series is not
    # interpreted as the variable to count.
    sns.countplot(x=df_test2[c], ax=ax)
    # Strip the top/right frame and the grid for a cleaner panel.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.grid(False)

### Ok in a first attempt let's just impute the missing values with the most common one

In [None]:
def _fill_with_most_frequent(column):
    """Return `column` with its NaNs replaced by its most frequent value."""
    # value_counts() orders values by descending frequency, so the first
    # index entry is the most common non-missing value.
    most_frequent = column.value_counts().index[0]
    return column.fillna(most_frequent)


# Impute column by column; each frame uses its own most frequent values.
df_train3 = df_train2.apply(_fill_with_most_frequent)
df_test3 = df_test2.apply(_fill_with_most_frequent)

In [None]:
# One-hot encode the categorical features.
# The encoder is fitted on the training data only and then applied to the
# test data, so both matrices share the same column layout. Refitting on the
# test frame (as before) would produce an independent, incompatible encoding.
# handle_unknown="ignore" encodes a level that appears only in the test data
# as all zeros instead of raising.
enc = OneHotEncoder(handle_unknown="ignore")
train_cat_encoded = enc.fit_transform(df_train3[categorical_train])
# Use the training column list for both frames so that the feature order
# matches what the encoder was fitted on.
test_cat_encoded = enc.transform(df_test3[categorical_train])

# Show the shapes of the two encoded matrices as the cell output.
train_cat_encoded.shape, test_cat_encoded.shape

### Continuous features

In [None]:
# Everything that is neither binary, categorical, nor an id/target column is
# treated as a continuous (or ordinal) feature.
excluded_train = binary_train + categorical_train + ["target", "id"]
excluded_test = binary_test + categorical_test + ["id"]

continuous_train = [name for name in df_train3.columns if name not in excluded_train]
continuous_test = [name for name in df_test3.columns if name not in excluded_test]

In [None]:
# np.corrcoef treats each row as one variable, so transpose the frame to get
# one row per column of df_train3 before computing the correlation matrix.
columns_as_rows = df_train3.T
corr = np.corrcoef(columns_as_rows)
sns.heatmap(corr)

In [None]:
# Draw the same correlation matrix as a seaborn clustermap.
sns.clustermap(corr)

## To be continued... :D

In [None]:
# Explicitly run a garbage-collection pass at the end of the notebook.
gc.collect()