In [None]:
# %pip install xgboost catboost lightgbm joblib imbalanced-learn scikit-learn pandas matplotlib seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from dateutil import parser
import os
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from data_preprocessing_v2 import Data_Preprocessing
from model_v2 import model_fit, model_blending, feature_importance
from model_v2 import model_inference
import datetime
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')

In [None]:
def validate_dirs(dir):
    """Make sure the directory ``dir`` exists, creating missing parents too.

    This is a best-effort helper: a failure is reported on stdout instead of
    being raised, so the notebook keeps running.

    Parameters
    ----------
    dir : str or os.PathLike
        Directory path to create. (The name shadows the ``dir`` builtin; it is
        kept so existing callers are unaffected.)
    """
    try:
        # exist_ok=True makes the call idempotent and removes the race between
        # a separate os.path.exists() check and the actual creation.
        os.makedirs(dir, exist_ok=True)
    except OSError as exc:
        # Also reached when the path exists but is a regular file.
        print("Error: could not create directory '{}': {}".format(dir, exc))

In [None]:
# Output folders used further down for saved models, submission files and
# confusion-matrix CSVs.
for output_dir in ("models", "result", "cm"):
    validate_dirs(output_dir)

In [None]:
# Load the training and test sets from their CSV files.
train_csv_path, test_csv_path = 'train/train.csv', 'test/test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Preview the first two rows of the training data.
train_df.head(2)

In [None]:
# Number of missing values in each column of the training data.
train_df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row (all columns compared).
train_df.duplicated().sum()

In [None]:
# Summarise every column of the training data (name, number of distinct
# values, dtype) to help tell likely categorical features from numerical ones.
column_summary = [
    [column_name, column.nunique(), column.dtypes]
    for column_name, column in train_df.items()
]

unique_values = pd.DataFrame(column_summary)
unique_values.columns = ['Column name', 'Unique_values','Dtypes']
print(unique_values)

In [None]:
# Summary statistics of the training data (describe() covers the numeric
# columns by default).
train_df.describe()

In [None]:
data_processor  = Data_Preprocessing()

# Column roles handed to the preprocessing pipeline.
train_drop_colums = ['Response','id']
test_drop_colums = ['id']
label_column = 'Response'
key = 'id'
custom_encode_col = "Vehicle_Age"

# Training data: the pipeline result is unpacked into features X and labels Y.
X, Y = data_processor.data_processing_pipeline(
    train_df,
    train_drop_colums,
    label_column,
    key,
    data_type='Train',
    custom_encode_col=custom_encode_col,
)

# Test data: same pipeline in 'Test' mode; a single object (features) is kept.
X_test = data_processor.data_processing_pipeline(
    test_df,
    test_drop_colums,
    label_column,
    key,
    data_type='Test',
    custom_encode_col=custom_encode_col,
)

In [None]:
# Preview the first rows of the processed training features.
X.head()

In [None]:
# EDA frame: a copy of the processed features with the labels appended as "Target".
df = X.assign(Target=Y)

In [None]:
plt.rcParams['figure.figsize']=(15,8)
f, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1,5)

# One box plot per feature, split by the target class.
boxplot_features = ['Age', 'Annual_Premium', 'Region_Code', 'Policy_Sales_Channel', 'Vintage']

for axis, feature in zip((ax1, ax2, ax3, ax4, ax5), boxplot_features):
    # x is passed by keyword: recent seaborn releases take `data` as the first
    # positional parameter, so a positional 'Target' would clash with data=df.
    sns.boxplot(x='Target', y=feature, data=df, ax=axis)

f.tight_layout()

In [None]:
col = "Target"
hue = "Target"
plt.rcParams['figure.figsize']=(15,8)

# Distribution of two features (picked by column position in df), drawn in one
# panel per target class.
for feature in (df.columns[0], df.columns[4]):
    g = sns.FacetGrid(df, col= col, hue=hue)
    # sns.distplot is deprecated; a density histogram with a KDE overlay plus
    # a separate rug plot is the replacement for distplot(hist=True, rug=True).
    g.map(sns.histplot, feature, kde=True, stat="density")
    g.map(sns.rugplot, feature)

In [None]:
# Histograms of the numeric columns of df (processed features plus Target).
df.hist(figsize = (20, 20))
plt.show()

In [None]:
## Correlation Matrix
sns.set(style="dark")

# Compute the correlation matrix
corr = df.corr()

# Generate a mask for the upper triangle (diagonal included).
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 15))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(120, 17, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# XGBoost, CatBoost and LGB
* `KFold`
* `StratifiedShuffleSplit`
* `StratifiedKFold`


In [None]:
def save_result(result, file_name, sample_path='sample_submission.csv', out_dir='result'):
    """Write predictions to ``<out_dir>/<file_name>.csv`` in submission format.

    The sample submission file is used as a template: its ``Response`` column
    is replaced by ``result`` and every other column is kept as is.

    Parameters
    ----------
    result : array-like
        Predicted values, one per row of the sample submission.
    file_name : str
        Output file name without the ``.csv`` extension.
    sample_path : str, optional
        Path of the template submission CSV.
    out_dir : str, optional
        Directory the submission is written to; created if missing.
    """
    submission = pd.read_csv(sample_path)
    submission['Response'] = result
    os.makedirs(out_dir, exist_ok=True)
    # Use the `file_name` argument; the previous body read a global named
    # `filename`, so it only worked when the caller happened to define one.
    submission.to_csv(os.path.join(out_dir, file_name + '.csv'), index=False)

In [None]:
%%capture
# Fit XGBoost through model_fit (imported from model_v2). %%capture hides the
# cell's output.
# NOTE(review): the positional 5 is presumably the number of splits/folds, and
# the three returned values appear to be (test-set scores, train-set scores,
# fitted model) judging by how they are used below — confirm in model_v2.
split_type = "stratifiedsuffleSplit" # stratifiedsuffleSplit, stratifiedKFold, kfold
model_name_xgb = "xgboost"
probs_xgb, probs_xgb_train, model = model_fit(X, Y, X_test, 5, split_type, model_name_xgb)

In [None]:
# Feature importance of the XGBoost model.
# `model` is whatever the most recently executed model_fit cell returned, so
# this cell must run right after the XGBoost fit cell.
plt.style.use('ggplot')
plt.subplots(figsize=(15, 4))
feature_importance(model, X, model_name_xgb)

In [None]:
%%capture
# Fit CatBoost through model_fit (imported from model_v2). %%capture hides the
# cell's output. Note that `model` is overwritten here.
# NOTE(review): the positional 5 is presumably the number of splits/folds —
# confirm in model_v2.
split_type = "stratifiedsuffleSplit" # stratifiedsuffleSplit, stratifiedKFold, kfold
model_name_cb = "catboost"
probs_cb, probs_cb_train, model = model_fit(X, Y, X_test, 5, split_type, model_name_cb )

In [None]:
# Feature importance of the CatBoost model.
# `model` is whatever the most recently executed model_fit cell returned, so
# this cell must run right after the CatBoost fit cell.
fig, ax = plt.subplots(figsize=(15, 4))
feature_importance(model, X, model_name_cb)

In [None]:
%%capture
# Fit LightGBM through model_fit (imported from model_v2). %%capture hides the
# cell's output. Note that `model` is overwritten here.
# NOTE(review): the positional 5 is presumably the number of splits/folds —
# confirm in model_v2.
split_type = "stratifiedsuffleSplit" # stratifiedsuffleSplit, stratifiedKFold, kfold
model_name_lgb = "lgb"
probs_lgb, probs_lgb_train, model = model_fit(X, Y, X_test, 5, split_type, model_name_lgb)


In [None]:
# Feature importance of the LightGBM model.
# `model` is whatever the most recently executed model_fit cell returned, so
# this cell must run right after the LightGBM fit cell.
fig, ax = plt.subplots(figsize=(15, 4))
feature_importance(model, X, model_name_lgb)

In [None]:
# Divisor applied to the scores returned by model_fit; it equals the 5 passed
# to model_fit above.
# NOTE(review): presumably model_fit returns scores summed over the folds, so
# this division yields the per-fold mean — confirm in model_v2.
n_folds = 5

# Test-set probabilities
p_cb, p_xgb = probs_cb / n_folds, probs_xgb / n_folds
# Train-set probabilities
p_cb_train, p_xgb_train = probs_cb_train / n_folds, probs_xgb_train / n_folds

In [None]:
from model_v2 import model_blending as blend
from sklearn.metrics import roc_auc_score,confusion_matrix

# Open a figure first (presumably blend() draws into it — confirm in model_v2).
plt.subplots(figsize=(10, 4))

# Determine the blending weight from the train-set probabilities of the
# XGBoost and CatBoost models, then show the weight and its ROC score.
best_w, best_roc = blend(p_xgb_train, p_cb_train, Y)
display(best_w, best_roc)

In [None]:
# Blend the test-set probabilities with the weight found on the training set.
w = best_w
result = w * p_xgb + (1-w) * p_cb
# The time part uses '-' instead of ':' because colons are not allowed in
# Windows file names.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
filename = "model_blending_cb_xgb_"+split_type+"_best_w-" + str(w) + "-"+ timestamp
save_result(result, filename)

In [None]:
# Baseline blend with equal weights for XGBoost and CatBoost.
# NOTE(review): the file name still contains "_best_w-" although w is fixed
# at 0.5 here — consider renaming.
w = 0.5
result = w * p_xgb + (1-w) * p_cb
# The time part uses '-' instead of ':' because colons are not allowed in
# Windows file names.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
filename = "model_blending_cb_xgb_"+split_type+"_best_w-" + str(w) + "-"+ timestamp
save_result(result, filename)

# Model Load and Predict, predict_proba

In [None]:
def model_predict(X,model_prefix, prediction=None, predict_probs=None):
    """Sum the outputs of every saved model whose path matches ``model_prefix``.

    Parameters
    ----------
    X : array-like
        Samples handed unchanged to each model's ``predict`` / ``predict_proba``.
    model_prefix : str
        Glob pattern selecting the serialized model files to load.
    prediction : bool, optional
        If truthy, accumulate ``model.predict(X)``.
    predict_probs : bool, optional
        If truthy, accumulate ``model.predict_proba(X)``. Wins over
        ``prediction`` when both are set.

    Returns
    -------
    numpy.ndarray
        Element-wise SUM (not mean) of the model outputs; callers divide by
        the number of models themselves. The shape follows the model output.
        When no file matches the pattern, zeros of shape ``(len(X),)`` are
        returned and a warning is printed.

    Raises
    ------
    ValueError
        If neither ``prediction`` nor ``predict_probs`` is set.
    """
    import glob

    if not (prediction or predict_probs):
        raise ValueError("model_predict: set prediction=True or predict_probs=True")

    model_paths = sorted(glob.glob(model_prefix))
    if not model_paths:
        print("Warning: no model files match '{}'".format(model_prefix))
        return np.zeros(shape=(len(X),))

    # Imported only once there is something to load.
    import joblib

    total = None
    for path in model_paths:
        # joblib.load unpickles the file: only load model files you trust.
        model = joblib.load(path)
        if predict_probs:
            output = model.predict_proba(X)
        else:
            output = model.predict(X)
        output = np.asarray(output, dtype=float)
        # Start from the first output so 2-D predict_proba results can be
        # summed (a fixed 1-D accumulator cannot hold them).
        total = output if total is None else total + output
    return total


# Confusion Matrix

In [None]:
# XGBoost: confusion matrix on the training data, built from the saved models.
model_path_prefix_xgb = "models/{0}/{1}/{0}_{1}*".format(model_name_xgb, split_type)

# model_predict returns the SUM of the per-model class predictions.
# NOTE(review): dividing by 5 and truncating with astype(int) yields 1 only
# when all five models predict 1 — confirm this is intended rather than a
# majority vote.
summed_votes_xgb = model_predict(X, model_path_prefix_xgb, prediction=True, predict_probs=False)
y_pred_xgb = (summed_votes_xgb / 5).astype(int)

cm_xgb = confusion_matrix(Y, y_pred_xgb)
cm_df_xgb = pd.DataFrame(
    data=cm_xgb,
    columns=["Response_0", "Response_1"],
    index=["Response_0", "Response_1"],
)
cm_df_xgb.to_csv("cm/xgb_cm_" + split_type + ".csv")
print("::SAVED XGB CONFUSION MATRIX::")

In [None]:
# CatBoost: confusion matrix on the training data, built from the saved models.
model_path_prefix_cb = "models/{0}/{1}/{0}_{1}*".format(model_name_cb, split_type)

# model_predict returns the SUM of the per-model class predictions.
# NOTE(review): dividing by 5 and truncating with astype(int) yields 1 only
# when all five models predict 1 — confirm this is intended rather than a
# majority vote.
summed_votes_cb = model_predict(X, model_path_prefix_cb, prediction=True, predict_probs=False)
y_pred_cb = (summed_votes_cb / 5).astype(int)

cm_cb = confusion_matrix(Y, y_pred_cb)
cm_df_cb = pd.DataFrame(
    data=cm_cb,
    columns=["Response_0", "Response_1"],
    index=["Response_0", "Response_1"],
)
cm_df_cb.to_csv("cm/catboost_cm_" + split_type + ".csv")
print("::SAVED CATBOOST CONFUSION MATRIX::")

In [None]:
# LightGBM: confusion matrix on the training data, built from the saved models.
model_path_prefix_lgb = "models/" + model_name_lgb + "/" + split_type + "/" + model_name_lgb +"_" + split_type + "*"
# The result is stored in LightGBM-specific names. The earlier code assigned
# it to y_pred_cb and then read the undefined y_pred_lgb (NameError), and it
# also overwrote the CatBoost confusion-matrix variables.
# NOTE(review): dividing by 5 and truncating with astype(int) yields 1 only
# when all five models predict 1 — confirm this is intended rather than a
# majority vote.
y_pred_lgb = (model_predict(X, model_path_prefix_lgb, prediction=True, predict_probs=False)/5).astype(int)
cm_lgb = confusion_matrix(Y, y_pred_lgb)
cm_df_lgb = pd.DataFrame(data=cm_lgb, columns=["Response_0", "Response_1"], index=["Response_0", "Response_1"])
cm_df_lgb.to_csv("cm/" + model_name_lgb + "_cm_" + split_type + ".csv")
print("::SAVED LGB CONFUSION MATRIX::")