In [1]:
import numpy as np
import pandas as pd

# machine learning models
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

#feature scaling
from sklearn.preprocessing import StandardScaler, RobustScaler

#Pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

#Cross validation
from sklearn.model_selection import cross_val_score, train_test_split

#Model persistence
from sklearn.externals import joblib

In [2]:
# Read both gzip-compressed CSVs, using the first column of each file as the index.
train, test = (
    pd.read_csv(path, index_col=0)
    for path in ('train.gz', 'test.gz')
)

In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train.describe()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,...,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,743803.6,0.036448,1.900378,1.358943,4.423318,0.416794,0.405188,0.393742,0.257033,0.163921,...,5.441382,1.441918,2.872288,7.539026,0.122427,0.62784,0.554182,0.287182,0.349024,0.153318
std,429367.8,0.187401,1.983789,0.664594,2.699902,0.493311,1.350642,0.488579,0.436998,0.370205,...,2.332871,1.202963,1.694887,2.746652,0.327779,0.483381,0.497056,0.452447,0.476662,0.360295
min,7.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,371991.5,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,743547.5,0.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,...,5.0,1.0,3.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,1115549.0,0.0,3.0,2.0,6.0,1.0,0.0,1.0,1.0,0.0,...,7.0,2.0,4.0,9.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1488027.0,1.0,7.0,4.0,11.0,1.0,6.0,1.0,1.0,1.0,...,19.0,10.0,13.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Print the full list of column names of the training frame.
print(train.columns.values)

['id' 'target' 'ps_ind_01' 'ps_ind_02_cat' 'ps_ind_03' 'ps_ind_04_cat'
 'ps_ind_05_cat' 'ps_ind_06_bin' 'ps_ind_07_bin' 'ps_ind_08_bin'
 'ps_ind_09_bin' 'ps_ind_10_bin' 'ps_ind_11_bin' 'ps_ind_12_bin'
 'ps_ind_13_bin' 'ps_ind_14' 'ps_ind_15' 'ps_ind_16_bin' 'ps_ind_17_bin'
 'ps_ind_18_bin' 'ps_reg_01' 'ps_reg_02' 'ps_reg_03' 'ps_car_01_cat'
 'ps_car_02_cat' 'ps_car_03_cat' 'ps_car_04_cat' 'ps_car_05_cat'
 'ps_car_06_cat' 'ps_car_07_cat' 'ps_car_08_cat' 'ps_car_09_cat'
 'ps_car_10_cat' 'ps_car_11_cat' 'ps_car_11' 'ps_car_12' 'ps_car_13'
 'ps_car_14' 'ps_car_15' 'ps_calc_01' 'ps_calc_02' 'ps_calc_03'
 'ps_calc_04' 'ps_calc_05' 'ps_calc_06' 'ps_calc_07' 'ps_calc_08'
 'ps_calc_09' 'ps_calc_10' 'ps_calc_11' 'ps_calc_12' 'ps_calc_13'
 'ps_calc_14' 'ps_calc_15_bin' 'ps_calc_16_bin' 'ps_calc_17_bin'
 'ps_calc_18_bin' 'ps_calc_19_bin' 'ps_calc_20_bin']


In [5]:
# Show the index range plus each column's non-null count and dtype.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
id                595212 non-null int64
target            595212 non-null int64
ps_ind_01         595212 non-null int64
ps_ind_02_cat     595212 non-null int64
ps_ind_03         595212 non-null int64
ps_ind_04_cat     595212 non-null int64
ps_ind_05_cat     595212 non-null int64
ps_ind_06_bin     595212 non-null int64
ps_ind_07_bin     595212 non-null int64
ps_ind_08_bin     595212 non-null int64
ps_ind_09_bin     595212 non-null int64
ps_ind_10_bin     595212 non-null int64
ps_ind_11_bin     595212 non-null int64
ps_ind_12_bin     595212 non-null int64
ps_ind_13_bin     595212 non-null int64
ps_ind_14         595212 non-null int64
ps_ind_15         595212 non-null int64
ps_ind_16_bin     595212 non-null int64
ps_ind_17_bin     595212 non-null int64
ps_ind_18_bin     595212 non-null int64
ps_reg_01         595212 non-null float64
ps_reg_02         595212 non-null float64
ps_re

In [6]:
# Drop all calculated (ps_calc_*) features from both frames.
# .copy() gives each subset its own data: these frames are modified in place
# further down (dtype downcasting), and assigning into a bare column subset
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
col = [c for c in train.columns if not c.startswith('ps_calc_')]
train = train[col].copy()
col = [c for c in test.columns if not c.startswith('ps_calc_')]
test = test[col].copy()

In [7]:
# Pairwise correlation of every column, then the "target" column of that
# matrix ranked from most positive to most negative.
corr_matrix = train.corr()
corr_matrix.loc[:, "target"].sort_values(ascending=False)

target           1.000000
ps_car_13        0.053899
ps_car_12        0.038790
ps_ind_17_bin    0.037053
ps_reg_02        0.034800
ps_ind_07_bin    0.034218
ps_car_04_cat    0.032900
ps_car_03_cat    0.032401
ps_reg_03        0.030888
ps_ind_05_cat    0.029165
ps_car_15        0.027667
ps_reg_01        0.022888
ps_car_05_cat    0.020754
ps_ind_01        0.018570
ps_car_01_cat    0.016256
ps_ind_08_bin    0.013147
ps_car_06_cat    0.011537
ps_ind_04_cat    0.009360
ps_ind_03        0.008360
ps_ind_12_bin    0.007810
ps_ind_14        0.007443
ps_car_11_cat    0.006129
ps_car_09_cat    0.005322
ps_ind_18_bin    0.004555
ps_ind_02_cat    0.004534
ps_ind_13_bin    0.002460
ps_ind_11_bin    0.002028
ps_ind_10_bin    0.001815
ps_car_10_cat    0.001038
id              -0.000188
ps_car_11       -0.001213
ps_car_14       -0.004474
ps_ind_09_bin   -0.008237
ps_car_08_cat   -0.020342
ps_ind_15       -0.021506
ps_ind_16_bin   -0.027778
ps_car_02_cat   -0.031534
ps_ind_06_bin   -0.034017
ps_car_07_ca

In [8]:
def change_datatype(df):
    """Downcast the integer columns of ``df`` in place to save memory.

    Each column returned by ``select_dtypes(include=['int'])`` is converted to
    the first of int8, int16, int32 whose value range contains both the
    column's minimum and its maximum. Columns that fit none of these (values
    beyond the int32 range, or columns with no rows) keep their current dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its columns are reassigned on this very object.

    Returns
    -------
    None
    """
    int_cols = list(df.select_dtypes(include=['int']).columns)
    for col in int_cols:
        # Scan the column once rather than once per candidate type.
        col_min = np.min(df[col])
        col_max = np.max(df[col])
        for candidate in (np.int8, np.int16, np.int32):
            bounds = np.iinfo(candidate)
            # Both ends must fit; comparisons with NaN (empty column) are
            # False, so such a column is simply left untouched.
            if bounds.min <= col_min and col_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

# Downcast the integer columns of both frames in place (the helper returns nothing).
change_datatype(train)
change_datatype(test) 

In [9]:
def change_datatype_float(df):
    """Convert the float columns of ``df`` to float32, in place.

    The columns are those returned by ``select_dtypes(include=['float'])``;
    each one is reassigned on ``df`` itself and nothing is returned.
    """
    selected = df.select_dtypes(include=['float'])
    for name in list(selected.columns):
        narrowed = df[name].astype(np.float32)
        df[name] = narrowed
        
# Convert the float columns of both frames to float32 in place.
change_datatype_float(train)
change_datatype_float(test)

In [10]:
# Hold out 10% of the labelled rows as a validation set (seeded, so the split
# is repeatable). Note that `train` is rebound to the remaining 90%: re-running
# this cell on its own splits the already-reduced frame again.
train, validation, target, target_val = train_test_split(
    train,
    train['target'],
    test_size=0.10,
    random_state=42,
)

In [11]:
# Drop id and target from the train and validation sets; only id is dropped from test
train_tr = train.drop(["target", "id"], axis=1)
validation_tr = validation.drop(["target", "id"], axis=1)
test_tr = test.drop(["id"], axis=1)

In [12]:
# Normalize the data
scaler = StandardScaler()

# Learn the scaling statistics from the training split only and reuse them
# for validation and test. Re-fitting on each split would scale the three
# sets with different statistics (so the model would see shifted inputs at
# prediction time) and would leave `scaler` fitted on the test data.
train_tr = scaler.fit_transform(train_tr)
validation_tr = scaler.transform(validation_tr)
test_tr = scaler.transform(test_tr)

In [13]:
# (rows, columns) of the three prepared feature matrices, in train / validation / test order.
tuple(part.shape for part in (train_tr, validation_tr, test_tr))

((535690, 37), (59522, 37), (892816, 37))

In [None]:
# Gaussian Naive Bayes
# Fit on the prepared training features, then predict class probabilities for
# the test features; each %time magic reports the timing of its statement.
gaussian = GaussianNB()
%time gaussian.fit(train_tr, target)
%time prediction = gaussian.predict_proba(test_tr)
# Keep the second probability column (index 1) as the score to submit.
# presumably the probability of target == 1 — confirm against gaussian.classes_
prediction_nb=prediction[:,1]

# Persist the fitted model to disk (compressed).
joblib.dump(gaussian, 'gaussian.pkl', compress=True) 

# Mean 5-fold cross-validated accuracy on the training features, as a percentage.
# NOTE(review): the describe() output above reports a target mean of ~0.036, so
# always predicting 0 would already score ~96% accuracy; a ranking metric such
# as scoring="roc_auc" would likely be more informative — confirm intent.
%time acc_gaussian = (cross_val_score(gaussian, train_tr, target, cv=5, scoring="accuracy").mean()) * 100
acc_gaussian

In [18]:
# Store the test-set probabilities in an .npz archive under the key "arr_0"
# (the name np.savez gives to the first positional array).
np.savez('prediction_nb.npz', arr_0=prediction_nb)

In [None]:
# Submission

# He didn't confess yet, but he will...

In [None]:
# Two-column submission file: the test ids next to the Naive Bayes
# probabilities, written without the frame index.
submission = pd.DataFrame({"id": test["id"]})
submission["target"] = prediction_nb
submission.to_csv('submission_nb.csv', index=False)