# Insurance Company Assessment

## 1. Import the train dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training data from the local CSV file.
df = pd.read_csv("exercise_01_train.csv")

# Lift pandas' display limits so every column and every row is shown.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) per column.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

Total number of columns: 101  
float data type: 94  
int data type: 1  
object data type: 6  

## 2. Preprocessing / Preparation

### Number of null values for each column (Top 5)

In [None]:
# Count nulls per column, sort descending, and show the five largest counts.
df.isnull().sum().sort_values(ascending=False).head()

Column 'x96' has the most missing values (15).   

### Find all categorical ('object') columns/features

In [None]:
# Keep only the object-dtype columns and remember their names for the cleaning steps.
object_columns = df.select_dtypes("object")
cat_cols = object_columns.columns
cat_cols

Columns with object data type: x34, x35, x41, x45, x68, x93

#### Number of missing values from only 'object' columns

In [None]:
# Null count for each object-dtype column.
pd.isnull(object_columns).sum()

'x41' and 'x45' can be converted to the float data type by removing the '$' and '%' signs; their missing values can then be replaced with the median (or mean).
The missing values in the other categorical features can be replaced with the most frequent category within the column. 

### For x41, x45, remove the '$' or '%' sign and change the data type to float

In [None]:
# Strip the currency / percent symbols and parse the remainder as floats.
# regex=False is required: under regex matching (the default before pandas 2.0)
# '$' is the end-of-string anchor, so the literal dollar sign would never be
# removed and the float conversion would fail.
df['x41'] = df['x41'].str.replace('$', '', regex=False).astype(float)
df['x45'] = df['x45'].str.replace('%', '', regex=False).astype(float)

In [None]:
# Recompute the object-dtype columns now that x41 and x45 have been converted to float.
object_columns = df.select_dtypes("object")
cat_cols = object_columns.columns
cat_cols

### Check unique values from category columns and fix them.

In [None]:
# Print the distinct values of every categorical column to spot inconsistent spellings.
for name in cat_cols:
    distinct = df[name].unique()
    print(f"{name}:", distinct, '\n')

In [None]:
# Upper-case x34 so differently-cased spellings of the same value collapse into one category.
df['x34'] = df['x34'].str.upper()
df['x34'].unique()  # display the remaining distinct values

In [None]:
# Map the lower-case short forms / misspellings in x35 onto full weekday names,
# then capitalise every label so the casing is uniform.
weekday_fixes = {
    'wed': 'Wednesday',
    'wednesday': 'Wednesday',
    'thur': 'Thursday',
    'thurday': 'Thursday',
    'fri': 'Friday',
    'friday': 'Friday',
}
df['x35'] = df['x35'].replace(weekday_fixes).str.capitalize()

In [None]:
# Bring the irregular month labels in x68 to three-letter form, then capitalise
# every label so the casing is uniform.
month_fixes = {
    'July': 'Jul',
    'sept.': 'Sep',
    'Dev': 'Dec',
    'January': 'Jan',
}
df['x68'] = df['x68'].replace(month_fixes).str.capitalize()

In [None]:
# Capitalise x93 so case variants of the same label merge.
df['x93'] = df['x93'].str.capitalize()

### Imputation: replace missing categorical values with the most frequent category

In [None]:
# Fill missing categorical values with each column's most frequent category.
# The filled column is assigned back as a whole: chained indexing
# (df[col][mask] = ...) can write to a temporary copy and has no effect
# under pandas Copy-on-Write.
for col in cat_cols:
    most_frequent = df[col].value_counts().index[0]
    df[col] = df[col].fillna(most_frequent)

### Separate dataset into categorical and numerical features

In [None]:
# categorical features
# select_dtypes avoids np.object, an alias removed in NumPy 1.24 (accessing it
# raises AttributeError there).
cat_df = df.select_dtypes(include="object")
cat_df.head()

In [None]:
# One-hot encoding - categorical features (one indicator column per category value)
onehot_cat_df = pd.get_dummies(cat_df)


In [None]:
# numerical features: every float64 column of df
num_df = df.select_dtypes(include="float64")

### Imputation: replace missing numerical features with the median

In [None]:
# Imputation with median for NA values in numerical features.
# The medians are taken from num_df itself: median() on the full frame also
# visits the remaining object columns, which raises TypeError on pandas >= 2.0.
num_df = num_df.fillna(num_df.median())

### Scaling: Standard Scaler 

In [None]:
# Standardise the numerical features with StandardScaler.
scalar = StandardScaler()
# fit_transform returns a bare ndarray; restore the column names and the row
# index so the later concat aligns row-for-row and the combined frame does not
# mix integer and string column labels (recent scikit-learn versions reject
# mixed-type feature names).
scaled_num_df = pd.DataFrame(
    scalar.fit_transform(num_df),
    columns=num_df.columns,
    index=num_df.index,
)

Almost all features are normally distributed but their scales are different (from .describe() and EDA).  
Standardized features help SVM perform better.

### Concatenate: combine dataframes into one dataframe

In [None]:
# Join the one-hot categorical block and the scaled numerical block column-wise (rows are aligned on the index).
X_df = pd.concat([onehot_cat_df, scaled_num_df], axis=1)

In [None]:
# Preview the assembled feature matrix.
X_df.head()

### Train/Test Split (from train set csv file)

In [None]:
# Target vector: the last column of df (assumes the label is stored last — TODO confirm).
y = df.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.20, random_state=42)

In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
# (rows, columns) of the held-out split.
X_test.shape

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

## 3. Modeling - Random Forest & Support Vector Machine

## Random Forest (with Hyperparameter tuning)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
##### Random Forest Hyperparameter Tuning

# Candidate values for the grid search: 3 * 3 * 2 * 2 = 36 combinations.
param_grid_rf = {
                'n_estimators': [200, 250, 300], 
                'max_depth': [16, 18, 20], 
                'min_samples_leaf': [8, 10],
                'min_samples_split': [2, 4],
}


In [None]:
# Base estimator for the search: fixed seed for reproducibility, n_jobs=-1 to use all CPU cores.
rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)

In [None]:
# Exhaustive search over param_grid_rf with 3-fold cross-validation, scored by ROC AUC.
grid_search_rf = GridSearchCV(rf_clf, param_grid=param_grid_rf, cv=3, scoring='roc_auc', n_jobs=-1, verbose = True)


In [None]:
# Run the search on the training split.
grid_search_rf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score.
grid_search_rf.best_params_

In [None]:
# Mean cross-validated ROC AUC of that combination.
grid_search_rf.best_score_

In [None]:
# Class probabilities for the held-out split.
grid_search_rf_proba = grid_search_rf.predict_proba(X_test)


In [None]:
# AUC uses the probabilities in column 1 (presumably the positive class — confirm against classes_);
# accuracy uses the hard class predictions.
print('AUC: ', roc_auc_score(y_test, grid_search_rf_proba[:,1]))
print('Accuracy: ', accuracy_score(y_test, grid_search_rf.predict(X_test)))

### Support Vector Machine - SVM (with Hyperparameter Tuning)

In [None]:
from sklearn import svm

# RBF-kernel grid over C and gamma: 4 * 4 = 16 combinations.
param_grid_svm = {'C': [1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001], 
              'kernel': ['rbf']}  
# probability=True enables predict_proba on the fitted SVC.
svm_clf = svm.SVC(probability=True)  
# 3-fold cross-validated search scored by ROC AUC.
grid_search_svm = GridSearchCV(svm_clf, param_grid_svm, verbose = True, cv=3, scoring='roc_auc', n_jobs=-1) 
  
# fitting the model for grid search 
grid_search_svm.fit(X_train, y_train) 

In [None]:
# Evaluate the tuned SVM on the held-out split: AUC from the column-1 probabilities,
# accuracy from the hard class predictions.
svm_clf_probs = grid_search_svm.predict_proba(X_test)
print('AUC: ', roc_auc_score(y_test, svm_clf_probs[:,1]))
print('Accuracy: ', accuracy_score(y_test, grid_search_svm.predict(X_test)))

### Logistic Regression (with Hyperparameter Tuning)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Grid over the penalty type and C (inverse regularisation strength): 2 * 5 = 10 combinations.
param_grid_lr = {'penalty': ['l1', 'l2'], 
                 'C': [0.01, 0.1, 1.0, 10, 100]}

In [None]:
# liblinear is chosen because it supports both the 'l1' and 'l2' penalties in the
# search grid; the default 'lbfgs' solver rejects 'l1', so every 'l1' candidate
# would fail to fit.
logreg = LogisticRegression(random_state=0, max_iter=10000, solver='liblinear')

In [None]:
# 3-fold cross-validated search over param_grid_lr, scored by ROC AUC.
grid_search_lr = GridSearchCV(logreg, param_grid=param_grid_lr, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)

In [None]:
# Run the search on the training split.
grid_search_lr.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score.
grid_search_lr.best_params_

In [None]:
# Mean cross-validated ROC AUC of that combination.
grid_search_lr.best_score_

In [None]:
# Evaluate the tuned logistic regression on the held-out split.
lr_clf_probs = grid_search_lr.predict_proba(X_test)

# AUC from the column-1 probabilities, accuracy from the hard class predictions.
print('AUC: ', roc_auc_score(y_test, lr_clf_probs[:,1]))
print('Accuracy: ', accuracy_score(y_test, grid_search_lr.predict(X_test)))

### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees with fixed hyperparameters (no grid search for this model).
xgb_clf = XGBClassifier(n_estimators = 400, learning_rate=0.1, max_depth=3)
xgb_clf.fit(X_train, y_train)
pred = xgb_clf.predict(X_test)  # NOTE(review): pred is not read again in the visible cells

In [None]:
# Class probabilities for the held-out split.
xgb_clf_probs = xgb_clf.predict_proba(X_test)

# AUC from the column-1 probabilities.
print('AUC: ', roc_auc_score(y_test, xgb_clf_probs[:,1]))

## 4. Import and transform test dataset 

In [None]:
# Load the test file that the final predictions are written for.
new_test = pd.read_csv('exercise_01_test.csv')

In [None]:
# 'x41', 'x45': remove sign 
def transform_new_test(df=new_test, scaler=None):
    """Clean a raw feature frame and return the model-ready design matrix.

    Strips the '$' / '%' symbols from x41 / x45 and converts them to float,
    normalises the spelling of the categorical columns x34, x35, x68 and x93,
    imputes missing values (most frequent category for object columns, median
    for float64 columns), one-hot encodes the object columns and standardises
    the float64 columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns named above. The frame is copied, so
        the caller's object is left unmodified and the function can be re-run
        on the same input.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler (for example the one fitted on the training
        features) to apply to the numerical columns. When None, a new
        StandardScaler is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        One-hot encoded categorical columns followed by the scaled numerical
        columns, with the numerical column names and the row index preserved.

    Notes
    -----
    The imputation values and the one-hot columns are derived from ``df``
    alone. NOTE(review): if ``df`` lacks a category that was present when the
    models were fitted, the returned columns will not match the training
    matrix — verify before predicting.
    """
    # Work on a copy so the caller's frame is not modified in place.
    df = df.copy()

    # regex=False: under regex matching '$' is the end-of-string anchor, so the
    # literal dollar sign would be left in place and the float cast would fail.
    df['x41'] = df['x41'].str.replace('$', '', regex=False).astype(float)
    df['x45'] = df['x45'].str.replace('%', '', regex=False).astype(float)

    # 'x34', 'x35', 'x68', 'x93': fix some categorical values
    df['x34'] = df['x34'].str.upper()

    df['x35'] = (
        df['x35'].replace(['wed', 'wednesday'], 'Wednesday')
                 .replace(['thur', 'thurday'], 'Thursday')
                 .replace(['fri', 'friday'], 'Friday')
                 .str.capitalize()
    )

    df['x68'] = (
        df['x68'].replace('July', 'Jul')
                 .replace('sept.', 'Sep')
                 .replace('Dev', 'Dec')
                 .replace('January', 'Jan')
                 .str.capitalize()
    )

    df['x93'] = df['x93'].str.capitalize()

    # Imputation with most frequent category for NA values in categorical features.
    # The filled column is assigned back as a whole; chained indexing
    # (df[col][mask] = ...) can write to a temporary copy instead of df.
    cat_cols = df.select_dtypes("object").columns
    for col in cat_cols:
        df[col] = df[col].fillna(df[col].value_counts().index[0])

    # Separate dataset into categorical and numerical columns.
    # select_dtypes avoids np.object, which was removed in NumPy 1.24.
    onehot_cat_df = pd.get_dummies(df[cat_cols])
    num_df = df.select_dtypes("float64")

    # Imputation with median for NA values in numerical features; the medians
    # come from num_df because median() on a frame that still contains object
    # columns raises TypeError on pandas >= 2.0.
    num_df = num_df.fillna(num_df.median())

    # Standard Scaler (esp. for SVM): reuse the supplied scaler when there is one.
    if scaler is None:
        scaler = StandardScaler().fit(num_df)
    scaled_num_df = pd.DataFrame(
        scaler.transform(num_df),
        columns=num_df.columns,
        index=num_df.index,
    )

    # Concat cleaned categorical and numerical data as one dataframe
    return pd.concat([onehot_cat_df, scaled_num_df], axis=1)

In [None]:
# Build the feature matrix for the test file.
# NOTE(review): the fitted models expect the same columns, in the same order, as X_train —
# confirm that the test file's one-hot encoding yields an identical column set.
X_df_new = transform_new_test(new_test)
X_df_new.head()

## 5. Create two .csv results files 

In [None]:
# Results 1 - SVM
# Column-1 probability for every test row, written one value per line without header or index.
results1 = pd.DataFrame(grid_search_svm.predict_proba(X_df_new)[:,1], index=None)
results1.to_csv("results1.csv", header=False, index=False)

In [None]:
# Results 2 - Random Forest
# Column-1 probability for every test row, written one value per line without header or index.
results2 = pd.DataFrame(grid_search_rf.predict_proba(X_df_new)[:,1], index=None)
results2.to_csv("results2.csv", header=False, index=False)