In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _subdirs, basenames in os.walk('/kaggle/input'):
    for basename in basenames:
        print(os.path.join(folder, basename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/porto-seguro-safe-driver-prediction/sample_submission.csv
/kaggle/input/porto-seguro-safe-driver-prediction/train.csv
/kaggle/input/porto-seguro-safe-driver-prediction/test.csv


# Introduction

This notebook aims to give good insight into the data for the Porto Seguro Safe Driver Prediction
competition. It also gives some tips and tricks for preparing your data for modeling. The notebook consists of the following main sections:

1. Visual inspection of your data
2. Defining the metadata
3. Descriptive statistics
4. Handling imbalanced classes
5. Data quality checks
6. Exploratory data visualization
7. Feature selection
8. Feature scaling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# Raise pandas' display limits so frames of up to 100 columns / 100 rows
# are rendered in full instead of being truncated.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100




In [3]:
# Debug switch: when True, the next cell limits NROWS to 50000 so only the
# first rows of each CSV are loaded; set to False to load the full files.
DEBUG = True


In [4]:
# Row cap passed to pd.read_csv: 50000 rows while debugging, None (= read everything) otherwise.
NROWS = 50000 if DEBUG else None

In [5]:
%%time
# Locations of the competition files inside the Kaggle input directory.
train_path = '/kaggle/input/porto-seguro-safe-driver-prediction/train.csv'
test_path = '/kaggle/input/porto-seguro-safe-driver-prediction/test.csv'

# Load both sets, reading at most NROWS rows of each (NROWS=None reads the whole file).
train = pd.read_csv(train_path, nrows=NROWS)
test = pd.read_csv(test_path, nrows=NROWS)

CPU times: user 552 ms, sys: 112 ms, total: 664 ms
Wall time: 828 ms


In [6]:
# Sanity-check the load: (rows, columns). With DEBUG on, at most NROWS rows are read.
train.shape

(50000, 59)

In [7]:
# Peek at the last 10 rows of the training data.
train.tail(10)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
49990,125377,0,1,1,2,0,0,1,0,0,0,0,0,0,0,0,4,1,0,0,0.2,0.3,-1.0,7,1,-1,0,-1,0,1,1,2,1,32,3,0.31607,0.413636,0.361939,0.0,0.1,0.6,0.2,2,3,7,4,7,3,10,2,1,2,6,0,1,1,0,0,0
49991,125383,0,3,1,7,0,4,0,1,0,0,0,0,0,0,0,11,1,0,0,0.9,0.1,0.456892,7,0,1,0,0,1,1,0,0,1,25,3,0.4,0.916448,-1.0,3.741657,0.4,0.7,0.4,2,1,9,2,9,3,5,3,0,3,7,0,1,1,0,0,0
49992,125386,0,1,2,5,1,0,0,1,0,0,0,0,0,0,0,4,1,0,0,0.3,0.0,0.559017,5,1,0,0,0,4,1,0,2,1,84,3,0.316228,0.831967,0.320156,3.605551,0.5,0.4,0.9,2,2,9,6,10,2,15,4,2,4,5,0,1,0,0,1,1
49993,125390,0,5,2,10,1,0,0,0,0,1,0,0,0,0,0,12,1,0,0,0.6,0.6,1.151901,7,1,-1,8,-1,4,1,1,2,1,6,3,0.4,1.028017,0.412189,3.605551,0.1,0.3,0.5,3,3,8,3,9,1,7,5,2,2,5,0,0,1,0,0,0
49994,125391,0,0,1,6,0,0,0,1,0,0,0,0,0,0,0,8,1,0,0,0.8,0.2,0.587899,5,1,1,8,0,4,-1,0,0,1,104,2,0.5,1.509269,-1.0,3.605551,0.8,0.5,0.6,2,2,8,4,10,1,10,6,1,1,7,0,1,0,1,0,0
49995,125393,0,2,2,1,0,0,0,0,1,0,0,0,0,0,0,8,1,0,0,0.5,0.4,0.633936,6,1,-1,0,1,14,1,1,0,1,29,3,0.3995,0.605536,0.398497,2.236068,0.1,0.0,0.8,4,4,6,1,9,2,6,4,2,2,4,1,0,0,0,1,0
49996,125395,0,0,1,5,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.9,0.3,0.646142,10,1,-1,0,0,11,1,1,0,1,99,2,0.316228,0.67328,0.368782,3.316625,0.8,0.1,0.5,4,2,8,1,9,3,10,6,2,1,8,1,0,0,0,0,0
49997,125399,0,0,1,6,1,0,0,1,0,0,0,0,0,0,0,7,1,0,0,0.1,0.2,-1.0,11,0,-1,0,-1,11,1,1,0,1,30,3,0.44699,1.034777,0.452769,3.464102,0.1,0.0,0.8,3,2,8,3,9,0,8,6,1,1,6,0,1,1,0,0,0
49998,125410,0,4,1,3,1,0,0,0,0,1,0,0,0,0,0,3,1,0,0,0.9,0.2,-1.0,2,1,-1,0,0,4,1,1,0,1,48,2,0.316228,0.708415,0.374833,3.464102,0.2,0.5,0.8,4,4,7,3,7,2,3,6,0,3,6,0,1,0,0,1,0
49999,125414,0,1,1,9,1,0,0,0,1,0,0,0,0,0,0,10,0,0,0,0.2,0.4,-1.0,7,1,-1,0,-1,14,1,1,0,1,68,3,0.4,0.954816,0.407431,3.464102,0.8,0.3,0.8,3,3,8,4,8,3,6,9,2,6,5,0,1,0,0,1,0


# Data at first sight

Here is an excerpt of the data description for the competition:

- Features that belong to similar groupings are tagged as such in the feature names (e.g., ind, reg, car, calc).

- feature names include the postfix bin to indicate binary features and cat to indicate categorical features.

- features without these designations are either continuous or ordinal.
 
- Values of -1 indicate that the feature was missing from the observation.

- The target column signifies whether or not a claim was filed for that policy holder.

OK, that's important information to get us started. Let's have a quick look at the first and last rows to confirm all of this.



We indeed see the following:
- binary variables
- categorical variables whose category values are integers
- other variables with integer or float values
- variables with -1 representing missing values
- the target variable and an ID variable

Let's look at the number of rows and columns in the train data

In [8]:
# Collect the categorical features. The data description marks them with the
# `cat` postfix, so match on the suffix rather than on any occurrence of the
# substring 'cat', which could also pick up unrelated column names.
cat_cols = [col for col in train.columns if col.endswith('_cat')]

In [9]:
# Display the collected categorical column names.
cat_cols

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [10]:
# Print the number of distinct values per categorical column. Missing values
# are encoded as -1 in this dataset, so where present -1 is counted as a level.
for cat_col in cat_cols:
    n_levels = train[cat_col].nunique()
    print(cat_col, n_levels)

ps_ind_02_cat 5
ps_ind_04_cat 3
ps_ind_05_cat 8
ps_car_01_cat 13
ps_car_02_cat 2
ps_car_03_cat 3
ps_car_04_cat 10
ps_car_05_cat 3
ps_car_06_cat 18
ps_car_07_cat 3
ps_car_08_cat 2
ps_car_09_cat 6
ps_car_10_cat 3
ps_car_11_cat 104


In [11]:
# (rows, columns) before the duplicate-row check in the next cell.
train.shape

(50000, 59)

In [12]:
# DataFrame.drop_duplicates() returns a new frame by default. The result must be
# bound back to `train`; otherwise the de-duplicated frame is thrown away and the
# shape below would simply repeat the shape of the untouched data.
train = train.drop_duplicates()
# Compare with the shape shown before this cell: an unchanged row count means
# there were no fully duplicated rows.
train.shape

(50000, 59)

In [13]:
# Summarise dtypes and non-null counts per column. Missing values are encoded
# as -1 rather than NaN in this dataset, so the non-null counts do not reveal them.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 59 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              50000 non-null  int64  
 1   target          50000 non-null  int64  
 2   ps_ind_01       50000 non-null  int64  
 3   ps_ind_02_cat   50000 non-null  int64  
 4   ps_ind_03       50000 non-null  int64  
 5   ps_ind_04_cat   50000 non-null  int64  
 6   ps_ind_05_cat   50000 non-null  int64  
 7   ps_ind_06_bin   50000 non-null  int64  
 8   ps_ind_07_bin   50000 non-null  int64  
 9   ps_ind_08_bin   50000 non-null  int64  
 10  ps_ind_09_bin   50000 non-null  int64  
 11  ps_ind_10_bin   50000 non-null  int64  
 12  ps_ind_11_bin   50000 non-null  int64  
 13  ps_ind_12_bin   50000 non-null  int64  
 14  ps_ind_13_bin   50000 non-null  int64  
 15  ps_ind_14       50000 non-null  int64  
 16  ps_ind_15       50000 non-null  int64  
 17  ps_ind_16_bin   50000 non-null 