In [20]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [21]:
dataset = pd.read_csv('test.csv')

In [22]:
dataset.head()

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
0,1,1,D3,26,0.487179,2,3,1,0.61194,0.781818,...,0,0,0,0,0,0,0,0,0,0
1,3,1,A2,26,0.076923,2,3,1,0.626866,0.727273,...,0,0,0,0,0,0,0,0,0,0
2,4,1,D3,26,0.144667,2,3,1,0.58209,0.709091,...,0,0,0,0,0,0,0,0,0,0
3,9,1,A1,26,0.151709,2,1,1,0.522388,0.654545,...,0,0,0,0,0,0,0,0,1,1
4,12,1,A1,26,0.076923,2,3,1,0.298507,0.672727,...,0,0,0,0,0,0,0,0,0,0


In [23]:
dataset.describe()

Unnamed: 0,Id,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
count,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,...,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0,19765.0
mean,39772.653883,1.024994,24.32173,0.324381,2.006527,2.655755,1.04341,0.414901,0.705846,0.292557,...,0.013964,0.057324,0.01108,0.045231,0.010068,0.008247,0.013559,0.008601,0.018315,0.056312
std,22942.157493,0.15611,5.184879,0.279439,0.080526,0.754993,0.291445,0.200997,0.074497,0.090343,...,0.117345,0.232466,0.10468,0.207817,0.099837,0.09044,0.115655,0.092345,0.134092,0.230528
min,1.0,1.0,2.0,0.0,2.0,1.0,1.0,0.0,0.345455,0.083682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19792.0,1.0,26.0,0.076923,2.0,3.0,1.0,0.253731,0.654545,0.225941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39898.0,1.0,26.0,0.230769,2.0,3.0,1.0,0.41791,0.709091,0.288703,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,59758.0,1.0,26.0,0.487179,2.0,3.0,1.0,0.58209,0.763636,0.345188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,79129.0,2.0,37.0,1.0,3.0,3.0,3.0,0.970149,1.0,0.878661,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
# discovering columns with missing values

empty_cols = [i for i in dataset.columns if dataset[i].isnull().any()]
empty_cols

['Employment_Info_1',
 'Employment_Info_4',
 'Employment_Info_6',
 'Insurance_History_5',
 'Family_Hist_2',
 'Family_Hist_3',
 'Family_Hist_4',
 'Family_Hist_5',
 'Medical_History_1',
 'Medical_History_10',
 'Medical_History_15',
 'Medical_History_24',
 'Medical_History_32']

In [25]:
# there can be 3 ways to repopulate missing values
# without considering outliers or exceptional cases

# the method being mean, median and mode

# mean and median for float values and integers - category 1
# mode is ideal for categorical data - category 2

In [26]:
# category 1

dataset['Employment_Info_1'] = dataset['Employment_Info_1'].fillna(dataset['Employment_Info_1'].mean())
dataset['Employment_Info_4'] = dataset['Employment_Info_4'].fillna(dataset['Employment_Info_4'].mean())
dataset['Employment_Info_6'] = dataset['Employment_Info_6'].fillna(dataset['Employment_Info_6'].mean())

dataset['Family_Hist_2'] = dataset['Family_Hist_2'].fillna(dataset['Family_Hist_2'].mean())
dataset['Family_Hist_3'] = dataset['Family_Hist_3'].fillna(dataset['Family_Hist_3'].mean())
dataset['Family_Hist_4'] = dataset['Family_Hist_4'].fillna(dataset['Family_Hist_4'].mean())
dataset['Family_Hist_5'] = dataset['Family_Hist_5'].fillna(dataset['Family_Hist_5'].mean())

In [27]:
# category 2

dataset['Medical_History_1'] = dataset['Medical_History_1'].fillna(dataset['Medical_History_1'].mode())
dataset['Medical_History_10'] = dataset['Medical_History_10'].fillna(dataset['Medical_History_10'].mode())
dataset['Medical_History_15'] = dataset['Medical_History_15'].fillna(dataset['Medical_History_15'].mode())
dataset['Medical_History_24'] = dataset['Medical_History_24'].fillna(dataset['Medical_History_24'].mode())
dataset['Medical_History_32'] = dataset['Medical_History_32'].fillna(dataset['Medical_History_32'].mode())

In [28]:
# studying multicollinearity can help us discard similar variables

# the technique that we shall use to solve multi-collinearity is Variance Inflation Factor (VIF)

# we shall derive a table while calculating the VIF for each feature
# features with similar VIF are highly correlated

In [29]:
print(dataset.corr())


                          Id  Product_Info_1  Product_Info_3  Product_Info_4  \
Id                  1.000000        0.008466       -0.003527        0.004513   
Product_Info_1      0.008466        1.000000        0.020570        0.044508   
Product_Info_3     -0.003527        0.020570        1.000000        0.051156   
Product_Info_4      0.004513        0.044508        0.051156        1.000000   
Product_Info_5      0.003295        0.059472        0.022358        0.031621   
...                      ...             ...             ...             ...   
Medical_Keyword_44  0.001045       -0.000265        0.012361       -0.010558   
Medical_Keyword_45 -0.000580       -0.004759       -0.052754       -0.014193   
Medical_Keyword_46 -0.008167       -0.007893       -0.007788       -0.036010   
Medical_Keyword_47  0.000528       -0.000115       -0.022376       -0.018861   
Medical_Keyword_48 -0.003441        0.007286        0.051006       -0.031149   

                    Product_Info_5  Pro

In [30]:
X = add_constant(dataset)
ds=pd.Series([variance_inflation_factor(X.values, i)
               for i in range(X.shape[1])],
              index=X.columns)
print(ds)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [37]:
print(dataset.dtypes)

dataset.dtypes

Id                      int64
Product_Info_1          int64
Product_Info_2         object
Product_Info_3          int64
Product_Info_4        float64
                       ...   
Medical_Keyword_44      int64
Medical_Keyword_45      int64
Medical_Keyword_46      int64
Medical_Keyword_47      int64
Medical_Keyword_48      int64
Length: 127, dtype: object


Id                      int64
Product_Info_1          int64
Product_Info_2         object
Product_Info_3          int64
Product_Info_4        float64
                       ...   
Medical_Keyword_44      int64
Medical_Keyword_45      int64
Medical_Keyword_46      int64
Medical_Keyword_47      int64
Medical_Keyword_48      int64
Length: 127, dtype: object