# 0.0. Imports

In [63]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from IPython.display import HTML

from sklearn import model_selection as ms
from sklearn import neighbors as ng

## 0.1. Helper Functions

In [69]:
def precision_at_k(data, k):
    """Compute precision over the first ``k`` rows of ``data``.

    Rows are ranked in the order they arrive (first row = rank 1), so the
    caller is responsible for sorting ``data`` before calling this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``response`` column; the cumulative sum of that column
        is divided by the rank to obtain the running precision.
    k : int
        Number of top-ranked rows to evaluate, ``1 <= k <= len(data)``.

    Returns
    -------
    tuple
        ``(precision, ranked)`` where ``precision`` is the running precision
        at rank ``k`` and ``ranked`` is a re-indexed copy of ``data`` with the
        extra ``ranking`` and ``precision_at_k`` columns.

    Raises
    ------
    KeyError
        If ``k`` does not correspond to an existing rank.
    """
    # reset_index returns a new frame, so the caller's DataFrame is not modified.
    data = data.reset_index(drop=True)
    data['ranking'] = data.index + 1

    data['precision_at_k'] = data['response'].cumsum() / data['ranking']

    # 'ranking' is 1-based while the index is 0-based: rank k lives at label k - 1.
    return (data.loc[k - 1, 'precision_at_k'], data)


def recall_at_k( data, k ):
    """Compute recall over the first ``k`` rows of ``data``.

    Rows are ranked in the order they arrive (first row = rank 1), so the
    caller is responsible for sorting ``data`` before calling this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``response`` column; the cumulative sum of that column
        is divided by its total to obtain the running recall.
    k : int
        Number of top-ranked rows to evaluate, ``1 <= k <= len(data)``.

    Returns
    -------
    tuple
        ``(recall, ranked)`` where ``recall`` is the running recall at rank
        ``k`` and ``ranked`` is a re-indexed copy of ``data`` with the extra
        ``ranking`` and ``recall_at_k`` columns.

    Raises
    ------
    KeyError
        If ``k`` does not correspond to an existing rank.
    """
    # reset_index returns a new frame, so the caller's DataFrame is not modified.
    data = data.reset_index( drop=True )
    data['ranking'] = data.index + 1

    data['recall_at_k'] = data['response'].cumsum() / data['response'].sum()

    # 'ranking' is 1-based while the index is 0-based: rank k lives at label k - 1.
    return ( data.loc[ k - 1, 'recall_at_k'], data )


def perfomance_metrics_at_k(x_val, y_val, yhat, model_name, percentage_of_base):
    """Summarise precision and recall for the top share of a scored base.

    The validation features are copied, the true responses and the scores
    from column 1 of ``yhat`` are attached, and the rows are sorted by score
    in descending order. ``k`` is ``percentage_of_base`` times the number of
    rows, truncated to an integer, and is handed to ``precision_at_k`` and
    ``recall_at_k``.

    Parameters
    ----------
    x_val : pandas.DataFrame
        Validation features (copied, never modified).
    y_val : array-like or pandas.Series
        Responses assigned to the ``response`` column.
    yhat : array-like with two-dimensional indexing
        Model output; ``yhat[:, 1]`` is used as the score.
    model_name : str
        Label stored in the ``model_name`` column of the result.
    percentage_of_base : float
        Fraction of the rows used to derive ``k``.

    Returns
    -------
    pandas.DataFrame
        One row with ``model_name``, ``Precision@K`` and ``Recall@K``
        (both rounded to 3 decimals).
    """
    scored = x_val.copy()
    scored['response'] = y_val
    scored['score'] = yhat[:, 1].tolist()
    scored = scored.sort_values('score', ascending=False)

    n_rows = scored.shape[0]
    k = int(percentage_of_base * n_rows)

    # Only the scalar metric is needed; the ranked frames the helpers return are discarded.
    precision = precision_at_k(scored, k)[0]
    recall = recall_at_k(scored, k)[0]

    summary = {
        'model_name': [model_name],
        'Precision@K': [round(precision, 3)],
        'Recall@K': [round(recall, 3)],
    }
    return pd.DataFrame(summary, index=[0])
def jupyter_settings():
    """Apply notebook-wide plotting and display defaults.

    Runs two IPython line magics, sets the matplotlib style and a couple of
    rcParams, injects CSS that widens the notebook container, removes the
    pandas row/column display limits and finally calls ``sns.set()``.
    All effects are global; nothing is returned.
    """
    # IPython line magics: this function is only valid inside an IPython/Jupyter kernel.
    %matplotlib inline
    # NOTE(review): %pylab populates the interactive namespace from numpy and matplotlib,
    # which can shadow existing names - consider dropping it if nothing relies on it.
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    # NOTE(review): `display` is not imported in the imports cell; presumably it is
    # provided by the IPython session (or by %pylab) - verify before removing the magic.
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    # None lifts the limit: every column / row of a frame is rendered.
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    # NOTE(review): called last, so seaborn's defaults may override some of the
    # matplotlib settings made above (e.g. font size) - confirm the intended order.
    sns.set()

In [8]:
# Apply the global plotting / display defaults once, before any data is loaded.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2. Loading Data

In [11]:
# Raw training data; the path is relative to the notebook's working directory.
df_raw = pd.read_csv(filepath_or_buffer='../data/raw/train.csv')

# 1.0. Data Description

In [12]:
# Stage copy: the description steps work on df1 and leave df_raw as loaded.
df1 = df_raw.copy(deep=True)

## 1.1. Rename Columns

In [16]:
# New column names, assigned positionally: the order must match the columns of df1.
cols_new = [
    'id',
    'gender',
    'age',
    'driving_license',
    'region_code',
    'previously_insured',
    'vehicle_age',
    'vehicle_damage',
    'annual_premium',
    'policy_sales_channel',
    'vintage',
    'response',
]
df1.columns = cols_new

# 2.0. Feature Engineering

In [37]:
# Stage copy: feature engineering works on df2 and leaves df1 untouched.
df2 = df1.copy(deep=True)

## 2.1. Feature Creation

In [38]:
# The columns are replaced with plain `df2[col] = ...` assignments instead of
# `df2.loc[:, col] = ...`: from pandas 2.0 the `.loc[:, col]` form writes into the
# existing column in place and keeps its original (object) dtype, whereas direct
# assignment swaps in the mapped Series with its own dtype.
# Series.map with a dict turns any value missing from the mapping into NaN.

# gender -> 1 for 'Male', 0 for 'Female'
gender_map = {'Male': 1, 'Female': 0}
df2['gender'] = df2['gender'].map(gender_map)

# vehicle_age -> identifier-friendly labels (used later as dummy-column suffixes)
vehicle_age_map = {'1-2 Year': 'between_1_2_years', '< 1 Year': 'bellow_1_year', '> 2 Years': 'over_2_year'}
df2['vehicle_age'] = df2['vehicle_age'].map(vehicle_age_map)

# vehicle_damage -> 1 for 'Yes', 0 for 'No'
vehicle_damage_map = {'Yes': 1, 'No': 0}
df2['vehicle_damage'] = df2['vehicle_damage'].map(vehicle_damage_map)

# 3.0. Data Filtering

In [39]:
# Stage copy: filtering works on df3 and leaves df2 untouched.
df3 = df2.copy(deep=True)

# 4.0. EDA

In [40]:
# Stage copy: the EDA / preparation steps work on df4 and leave df3 untouched.
df4 = df3.copy(deep=True)

# 5.0. Data Preparation

In [52]:
# Features: everything except the target and the row identifier.
X = df4.drop(['response', 'id'], axis=1).copy()
y = df4['response'].copy()

# 80/20 hold-out split. The seed is fixed so that the split - and every metric
# computed from it - is identical after a kernel restart and full re-run.
x_train, x_validation, y_train, y_validation = ms.train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Training features and target side by side for the preparation steps.
df5 = pd.concat([x_train, y_train], axis=1)

## 5.1. Standardization

## 5.2. Rescaling

## 5.3. Encoding

In [53]:
# One-hot encode vehicle_age on the training frame; the new columns are
# named 'vehicle_age_<category>'.
df5 = pd.get_dummies(
    df5,
    columns=['vehicle_age'],
    prefix=['vehicle_age'],
)

## 5.4. Data Preparation - Validation 

In [54]:
# One-hot encode vehicle_age on the validation features.
x_validation = pd.get_dummies(x_validation, columns=['vehicle_age'], prefix=['vehicle_age'])

# get_dummies only creates columns for categories present in the frame it is given,
# so validation is aligned to the encoded training features: a category missing
# from validation becomes an all-zero column, and one unseen in training is dropped.
# When both frames already share the same columns this is a no-op.
x_validation = x_validation.reindex(columns=df5.columns.drop('response'), fill_value=0)

# 6.0. Feature Selection

In [58]:
# Feature columns fed to the models: the raw features plus the three
# vehicle_age dummy columns.
cols_selected = [
    'gender',
    'age',
    'driving_license',
    'region_code',
    'previously_insured',
    'vehicle_damage',
    'annual_premium',
    'policy_sales_channel',
    'vintage',
    'vehicle_age_bellow_1_year',
    'vehicle_age_between_1_2_years',
    'vehicle_age_over_2_year',
]

# 7.0. Machine Learning Modelling

In [59]:
# Validation inputs: selected features plus the untouched validation target.
y_val = y_validation
x_val = x_validation[cols_selected]

# Training inputs: the same feature columns taken from the prepared training frame.
x_train = df5[cols_selected]

## 7.1. KNN

In [70]:
# Model definition: k-nearest neighbours with the library's default hyperparameters.
model_knn = ng.KNeighborsClassifier()

# Fit on the training split (fit returns the estimator itself).
model_knn.fit(x_train, y_train)

# Per-class probabilities for the validation split.
yhat_knn = model_knn.predict_proba(x_val)

In [71]:
# Precision / recall of the KNN ranking over the top 20% of the validation base.
perfomance_metrics_at_k(
    x_val=x_val,
    y_val=y_val,
    yhat=yhat_knn,
    model_name='KNeighborsClassifier',
    percentage_of_base=0.2,
)

Unnamed: 0,model_name,Precision@K,Recall@K
0,KNeighborsClassifier,0.185,0.302
