In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import preprocessing

%matplotlib inline



In [2]:
# Training data: keep the target aside, then remove the target and the row
# identifier from the features and replace missing values with the sentinel -1.
X = pd.read_csv('train.csv')
Y = X['Response']
X = X.drop(['Response', 'Id'], axis=1).fillna(-1)

# Test data gets the same identifier removal and -1 fill.
X_test = pd.read_csv('test.csv').drop('Id', axis=1).fillna(-1)

In [3]:
# Display the first rows of X for a quick visual check.
X.head()

Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
0,1,D3,10,0.076923,2,1,1,0.641791,0.581818,0.148536,...,0,0,0,0,0,0,0,0,0,0
1,1,A1,26,0.076923,2,3,1,0.059701,0.6,0.131799,...,0,0,0,0,0,0,0,0,0,0
2,1,E1,26,0.076923,2,3,1,0.029851,0.745455,0.288703,...,0,0,0,0,0,0,0,0,0,0
3,1,D4,10,0.487179,2,3,1,0.164179,0.672727,0.205021,...,0,0,0,0,0,0,0,0,0,0
4,1,D2,26,0.230769,2,3,1,0.41791,0.654545,0.23431,...,0,0,0,0,0,0,0,0,0,0


## Encoding string features
Only one feature has string values, although there are several others that are known to be categorical. Probably not the best approach.

In [None]:
# Product_Info_2 holds two-character codes such as 'D3' or 'A1'. Split each
# code into its first character (`_cat`) and its second character (`_num`,
# kept as a string). Both frames go through the same loop so train and test
# always end up with the same two columns; previously the test frame's second
# assignment overwrote `Product_Info_2_cat` and `Product_Info_2_num` was never
# created for it.
for frame in (X, X_test):
    frame['Product_Info_2_cat'] = frame['Product_Info_2'].str[0]
    frame['Product_Info_2_num'] = frame['Product_Info_2'].str[1]

#X.drop('Product_Info_2', axis=1, inplace=True)
#X_test.drop('Product_Info_2', axis=1, inplace=True)

In [None]:
# Print the dimensions of the feature frame and of the label series.
print(X.shape, Y.shape)

In [None]:
# Display summary statistics for X.
X.describe()

In [None]:
# Snapshot of the current feature set. Feature frames are written without
# their row index.
X.to_csv('basic_features_train.csv', index=False)
X_test.to_csv('basic_features_test.csv', index=False)
# Labels keep their row index as the first column and have no header row.
Y.to_csv('basic_features_labels.csv', header=False, index=True)

## Sum na values

In [None]:
# Per-row count of cells equal to -1, the value used earlier to fill missing
# entries.
# NOTE(review): a genuine -1 in the data would be counted too - confirm that
# no column uses -1 as a real value.
X['NumNan'] = (X == -1).sum(axis=1)
X_test['NumNan'] = (X_test == -1).sum(axis=1)
X.head()

In [None]:
# Snapshot of the features including the NumNan column.
# NOTE(review): the files carry an 'onehot_' prefix although the one-hot
# encoding step only appears further down in this notebook - confirm the
# naming is intended.
X.to_csv('onehot_nansum_features_train.csv', index=False)
X_test.to_csv('onehot_nansum_features_test.csv', index=False)
# Labels keep their row index as the first column and have no header row.
Y.to_csv('onehot_nansum_features_labels.csv', header=False, index=True)

## More engineering

In [None]:
# Multiplicative interaction features between age, height and weight, added
# identically to the train and test frames.
for frame in (X, X_test):
    age = frame['Ins_Age']
    height = frame['Ht']
    weight = frame['Wt']

    frame['Age_Ht'] = age * height
    frame['Age_Wt'] = age * weight
    frame['Age_Ht_Wt'] = age * height * weight

In [None]:
# Snapshot of the features including the engineered interaction columns.
# NOTE(review): the files carry an 'onehot_' prefix although the one-hot
# encoding step only appears further down in this notebook - confirm the
# naming is intended.
X.to_csv('onehot_eng_features_train.csv', index=False)
X_test.to_csv('onehot_eng_features_test.csv', index=False)
# Labels keep their row index as the first column and have no header row.
Y.to_csv('onehot_eng_features_labels.csv', header=False, index=True)

## One hot encoding

In [None]:
# Start over from the raw files; this time Response stays inside the training
# frame and only the row identifier is removed.
X = pd.read_csv('train.csv').drop('Id', axis=1)

# Give the test frame a placeholder Response of 0 so that it has the same
# columns as the training frame.
X_test = pd.read_csv('test.csv').drop('Id', axis=1)
X_test['Response'] = 0

In [None]:
# Columns treated as categorical and one-hot encoded further down. The list is
# assembled per column family; its order determines the order in which the
# dummy columns are generated.
cats = (
    # Raw product code plus the two columns derived from it.
    ['Product_Info_2', 'Product_Info_2_cat', 'Product_Info_2_num']
    + ['Product_Info_%d' % i for i in (1, 3, 5, 6, 7)]
    + ['Employment_Info_%d' % i for i in (2, 3, 5)]
    + ['InsuredInfo_%d' % i for i in range(1, 8)]
    + ['Insurance_History_%d' % i for i in (1, 2, 3, 4, 7, 8, 9)]
    + ['Family_Hist_1']
    # Medical_History_2 .. Medical_History_41, leaving out 10, 15, 24 and 32.
    + ['Medical_History_%d' % i for i in range(2, 42) if i not in (10, 15, 24, 32)]
)

In [None]:
# Train and test are encoded together so that both end up with exactly the
# same dummy columns, whichever category values each of them contains.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# ignore_index gives the stacked frame a unique row index (training rows come
# first, so they keep positions 0..len(X)-1).
all_dat = pd.concat([X, X_test], ignore_index=True)

# X and X_test were re-read from the CSV files, so the two columns derived
# from Product_Info_2 (first and second character of the code) no longer
# exist. `cats` lists them, so rebuild them here to avoid a KeyError.
all_dat['Product_Info_2_cat'] = all_dat['Product_Info_2'].str[0]
all_dat['Product_Info_2_num'] = all_dat['Product_Info_2'].str[1]

# Missing values become the sentinel -1, which then gets its own dummy column
# for every encoded feature in which it occurs.
all_dat = all_dat.fillna(-1)

# Replace every column in `cats` by its '<column>_<value>' indicator columns.
# This is equivalent to concatenating get_dummies(all_dat[cat], prefix=cat)
# and dropping `cat` one column at a time, without the `join_axes` argument
# that was removed from pd.concat in pandas 1.0.
all_dat = pd.get_dummies(all_dat, columns=cats)
all_dat.head()

In [None]:
# Undo the stacking: training rows carry a positive Response, test rows carry
# a Response below 1.
response = all_dat['Response']
is_train = response > 0
is_test = response < 1

Y = response[is_train]

# Response is the target, not a feature, so it is removed from both frames.
X = all_dat[is_train].drop('Response', axis=1)
X_test = all_dat[is_test].drop('Response', axis=1)
print(X.shape, X_test.shape, Y.shape)

In [None]:
# Display the first rows of X_test for a quick visual check.
X_test.head()

In [None]:
# Save the one-hot encoded feature set. Feature frames are written without
# their row index; labels keep the row index as the first column and have no
# header row.
X.to_csv('onehot_features_train.csv', index=False)
Y.to_csv('onehot_features_labels.csv', index=True, header=False)
# File name fixed: it used to be 'onehot_features_test.csvst.csv' (a stray
# duplicated suffix), which broke the '<name>_test.csv' pattern used by every
# other export in this notebook.
X_test.to_csv('onehot_features_test.csv', index=False)