In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
pd.set_option('display.max_columns', None)

In [2]:
numerical_df = pd.read_csv('numerical_df') 
categorical_df = pd.read_csv('categorical_df') 

In [3]:
data = pd.concat([numerical_df, categorical_df], axis=1) 
data

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,Year,Month,Day,state,response,coverage,education,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,2763.519279,56274,69.0,32,5,0,1,384.811147,2011,2,24,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Corporate Auto,3,Offer1,Agent,Two-Door Car,Medsize
1,6979.535903,0,94.0,13,42,0,8,1131.464935,2011,1,31,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Personal Auto,3,Offer3,Agent,Four-Door Car,Medsize
2,12887.431650,48767,108.0,18,38,0,2,566.472247,2011,2,19,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Personal Auto,3,Offer1,Agent,Two-Door Car,Medsize
3,7645.861827,0,106.0,18,65,0,7,529.881344,2011,1,20,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Corporate Auto,2,Offer1,Call Center,SUV,Medsize
4,2813.692575,43836,73.0,12,44,0,1,138.130879,2011,2,3,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Personal Auto,1,Offer1,Agent,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,6279.177892,71941,73.0,18,89,0,2,198.234764,2011,2,10,California,No,Basic,Bachelor,Employed,M,Urban,Married,Personal Auto,1,Offer2,Web,Four-Door Car,Medsize
9130,3096.511217,21604,79.0,14,28,0,1,379.200000,2011,2,12,California,Yes,Extended,College,Employed,F,Suburban,Divorced,Corporate Auto,3,Offer1,Branch,Four-Door Car,Medsize
9131,8163.890428,0,85.0,9,37,3,2,790.784983,2011,2,6,California,No,Extended,Bachelor,Unemployed,M,Suburban,Single,Corporate Auto,2,Offer1,Branch,Four-Door Car,Medsize
9132,7524.442436,21941,96.0,34,3,0,3,691.200000,2011,2,3,California,No,Extended,College,Employed,M,Suburban,Married,Personal Auto,2,Offer3,Branch,Four-Door Car,Large


# 1.Concatenate Numerical and Categorical dataframes into one dataframe called data. Split into X=features y=target.

In [4]:
data['policy'] = data['policy'].astype(str)

y = data['total_claim_amount']
X = data.drop(['total_claim_amount'], axis=1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2.Separate X_train and X_test into numerical and categorical (X_train_cat , X_train_num , X_test_cat , X_test_num)

In [5]:
X_train_num = X_train.select_dtypes(include = np.number)
X_test_num  = X_test.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)
X_test_cat  = X_test.select_dtypes(include = object)

# 3.Use X_train_num to fit scalers. Transform BOTH X_train_num and X_test_num.

In [6]:
from sklearn.preprocessing import MinMaxScaler
transformer = MinMaxScaler().fit(X_train_num)
X_train_normalized = transformer.transform(X_train_num)
X_train_norm = pd.DataFrame(X_train_normalized, columns=X_train_num.columns)
X_test_normalized = transformer.transform(X_test_num)
X_test_norm = pd.DataFrame(X_test_normalized, columns=X_test_num.columns)

# 4.Encode the categorical variables X_train_cat and X_test_cat (See the hint below for encoding categorical data

### 4.1 We first encode data that we can't order with OneHotEncoder

In [7]:
from sklearn.preprocessing import OneHotEncoder

columns_to_encode = ['state', 'location_code', 'employmentstatus', 'marital_status', 'policy', 'policy_type', 'renew_offer_type',
                     'sales_channel', 'vehicle_class', 'response', 'gender']

X_train_1hot = X_train_cat[columns_to_encode]

encoder = OneHotEncoder(drop='first').fit(X_train_1hot)

cols = encoder.get_feature_names_out(X_train_1hot.columns)

X_train_1encode = pd.DataFrame(encoder.transform(X_train_1hot).toarray(), columns=cols)

X_train_1encode

Unnamed: 0,state_California,state_Nevada,state_Oregon,state_Washington,location_code_Suburban,location_code_Urban,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,marital_status_Married,marital_status_Single,policy_2,policy_3,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury,vehicle_class_SUV,vehicle_class_Two-Door Car,response_Yes,gender_M
0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7303,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
7304,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7305,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


### 4.2 Then we concatenate the encoded categorical datas with the ones we'll encode with another method

In [8]:
X_train_ordinal = X_train_cat.drop(columns=columns_to_encode)
X_train_ordinal.reset_index(drop=True, inplace=True)
X_train_ordinal

Unnamed: 0,coverage,education,vehicle_size
0,Basic,Bachelor,Medsize
1,Extended,College,Medsize
2,Basic,College,Small
3,Basic,Bachelor,Medsize
4,Basic,Bachelor,Medsize
...,...,...,...
7302,Basic,College,Small
7303,Basic,College,Medsize
7304,Extended,High School or Below,Medsize
7305,Extended,Bachelor,Medsize


In [9]:
X_train_encode = pd.concat([X_train_ordinal, X_train_1encode, ], axis=1)
X_train_encode

Unnamed: 0,coverage,education,vehicle_size,state_California,state_Nevada,state_Oregon,state_Washington,location_code_Suburban,location_code_Urban,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,marital_status_Married,marital_status_Single,policy_2,policy_3,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury,vehicle_class_SUV,vehicle_class_Two-Door Car,response_Yes,gender_M
0,Basic,Bachelor,Medsize,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,Extended,College,Medsize,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Basic,College,Small,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Basic,Bachelor,Medsize,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Basic,Bachelor,Medsize,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,Basic,College,Small,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7303,Basic,College,Medsize,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
7304,Extended,High School or Below,Medsize,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7305,Extended,Bachelor,Medsize,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


### 4.3 Then we ordinal encode the relevant categorical datas

In [10]:
ordinal_columns = ['coverage', 'vehicle_size', 'education']
for col in ordinal_columns:
    print(X_train_encode[col].unique())

['Basic' 'Extended' 'Premium']
['Medsize' 'Small' 'Large']
['Bachelor' 'College' 'High School or Below' 'Master or Doctor']


In [11]:
X_train_encode['coverage'] = X_train_encode['coverage'].map({'Basic' : 0, 'Extended' : 0.5, 'Premium' : 1})

In [12]:
X_train_encode['education'] = X_train_encode['education'].map({'High School or Below' : 0, 'College' : 0.3333, 'Bachelor' : 0.6666, 'Master or Doctor' : 1})

In [13]:
X_train_encode['vehicle_size'] = X_train_encode['vehicle_size'].map({'Small' : 0, 'Medsize' : 0.5, 'Large' : 1})
X_train_encode

Unnamed: 0,coverage,education,vehicle_size,state_California,state_Nevada,state_Oregon,state_Washington,location_code_Suburban,location_code_Urban,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,marital_status_Married,marital_status_Single,policy_2,policy_3,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury,vehicle_class_SUV,vehicle_class_Two-Door Car,response_Yes,gender_M
0,0.0,0.6666,0.5,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.5,0.3333,0.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.3333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.6666,0.5,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.6666,0.5,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,0.0,0.3333,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7303,0.0,0.3333,0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
7304,0.5,0.0000,0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7305,0.5,0.6666,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


### 4.4 Finally we apply the same steps to the test model

In [14]:
X_test_1hot = X_test_cat[columns_to_encode]
cols = encoder.get_feature_names_out(input_features=X_test_1hot.columns) 
X_test_1encode = pd.DataFrame(encoder.transform(X_test_1hot).toarray(),columns=cols) 
X_test_1encode

Unnamed: 0,state_California,state_Nevada,state_Oregon,state_Washington,location_code_Suburban,location_code_Urban,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,marital_status_Married,marital_status_Single,policy_2,policy_3,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury,vehicle_class_SUV,vehicle_class_Two-Door Car,response_Yes,gender_M
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1823,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1824,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1825,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
X_test_ordinal = X_test_cat.drop(columns=columns_to_encode)
X_test_ordinal.reset_index(drop=True, inplace=True)
X_test_ordinal

Unnamed: 0,coverage,education,vehicle_size
0,Basic,High School or Below,Medsize
1,Basic,High School or Below,Medsize
2,Extended,College,Small
3,Extended,College,Medsize
4,Basic,High School or Below,Medsize
...,...,...,...
1822,Basic,College,Medsize
1823,Basic,High School or Below,Large
1824,Basic,Bachelor,Medsize
1825,Basic,High School or Below,Small


In [16]:
X_test_encode = pd.concat([X_test_ordinal, X_test_1encode, ], axis=1)
X_test_encode

Unnamed: 0,coverage,education,vehicle_size,state_California,state_Nevada,state_Oregon,state_Washington,location_code_Suburban,location_code_Urban,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,marital_status_Married,marital_status_Single,policy_2,policy_3,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury,vehicle_class_SUV,vehicle_class_Two-Door Car,response_Yes,gender_M
0,Basic,High School or Below,Medsize,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,Basic,High School or Below,Medsize,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Extended,College,Small,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,Extended,College,Medsize,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Basic,High School or Below,Medsize,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822,Basic,College,Medsize,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1823,Basic,High School or Below,Large,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1824,Basic,Bachelor,Medsize,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1825,Basic,High School or Below,Small,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
X_test_encode['coverage'] = X_test_encode['coverage'].map({'Basic' : 0, 'Extended' : 0.5, 'Premium' : 1})
X_test_encode['education'] = X_test_encode['education'].map({'High School or Below' : 0, 'College' : 0.3333, 'Bachelor' : 0.6666, 'Master or Doctor' : 1})
X_test_encode['vehicle_size'] = X_test_encode['vehicle_size'].map({'Small' : 0, 'Medsize' : 0.5, 'Large' : 1})
X_test_encode

Unnamed: 0,coverage,education,vehicle_size,state_California,state_Nevada,state_Oregon,state_Washington,location_code_Suburban,location_code_Urban,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,marital_status_Married,marital_status_Single,policy_2,policy_3,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury,vehicle_class_SUV,vehicle_class_Two-Door Car,response_Yes,gender_M
0,0.0,0.0000,0.5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0000,0.5,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.5,0.3333,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.5,0.3333,0.5,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0000,0.5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822,0.0,0.3333,0.5,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1823,0.0,0.0000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1824,0.0,0.6666,0.5,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1825,0.0,0.0000,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# 5.Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.



In [18]:
X_train_encode.dtypes

coverage                          float64
education                         float64
vehicle_size                      float64
state_California                  float64
state_Nevada                      float64
state_Oregon                      float64
state_Washington                  float64
location_code_Suburban            float64
location_code_Urban               float64
employmentstatus_Employed         float64
employmentstatus_Medical Leave    float64
employmentstatus_Retired          float64
employmentstatus_Unemployed       float64
marital_status_Married            float64
marital_status_Single             float64
policy_2                          float64
policy_3                          float64
policy_type_Personal Auto         float64
policy_type_Special Auto          float64
renew_offer_type_Offer2           float64
renew_offer_type_Offer3           float64
renew_offer_type_Offer4           float64
sales_channel_Branch              float64
sales_channel_Call Center         

In [19]:
X_test_encode.dtypes

coverage                          float64
education                         float64
vehicle_size                      float64
state_California                  float64
state_Nevada                      float64
state_Oregon                      float64
state_Washington                  float64
location_code_Suburban            float64
location_code_Urban               float64
employmentstatus_Employed         float64
employmentstatus_Medical Leave    float64
employmentstatus_Retired          float64
employmentstatus_Unemployed       float64
marital_status_Married            float64
marital_status_Single             float64
policy_2                          float64
policy_3                          float64
policy_type_Personal Auto         float64
policy_type_Special Auto          float64
renew_offer_type_Offer2           float64
renew_offer_type_Offer3           float64
renew_offer_type_Offer4           float64
sales_channel_Branch              float64
sales_channel_Call Center         

# 6.Try a simple linear regression with all the data to see whether we are getting good results.

In [20]:
X_train_transformed = pd.concat([X_train_norm, X_train_encode], axis=1)
X_train_transformed

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,Year,Month,Day,coverage,education,vehicle_size,state_California,state_Nevada,state_Oregon,state_Washington,location_code_Suburban,location_code_Urban,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,marital_status_Married,marital_status_Single,policy_2,policy_3,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_Offer2,renew_offer_type_Offer3,renew_offer_type_Offer4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Luxury,vehicle_class_SUV,vehicle_class_Two-Door Car,response_Yes,gender_M
0,0.197865,0.511537,0.000000,0.800000,0.323232,0.2,1.000,0.0,0.0,0.300000,0.0,0.6666,0.5,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.064064,0.000000,0.165138,0.857143,0.252525,0.0,0.000,0.0,1.0,0.566667,0.5,0.3333,0.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.069945,0.811124,0.119266,0.200000,0.676768,0.0,0.000,0.0,1.0,0.900000,0.0,0.3333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.223316,0.722777,0.027523,0.057143,0.666667,0.0,1.000,0.0,0.0,0.233333,0.0,0.6666,0.5,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.049096,0.547474,0.045872,0.085714,0.868687,0.0,0.000,0.0,0.0,0.566667,0.0,0.6666,0.5,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,0.374669,0.879737,0.000000,0.885714,0.636364,0.0,0.125,0.0,0.0,0.933333,0.0,0.3333,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7303,0.248175,0.225243,0.110092,0.485714,0.646465,0.0,0.250,0.0,1.0,0.133333,0.0,0.3333,0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
7304,0.488969,0.000000,0.623853,0.371429,0.040404,0.0,0.750,0.0,0.0,0.866667,0.5,0.0000,0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7305,0.877860,0.000000,0.715596,0.142857,0.565657,0.0,0.125,0.0,1.0,0.866667,0.5,0.6666,0.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [21]:
X_train_transformed.isna().sum()

customer_lifetime_value           0
income                            0
monthly_premium_auto              0
months_since_last_claim           0
months_since_policy_inception     0
number_of_open_complaints         0
number_of_policies                0
Year                              0
Month                             0
Day                               0
coverage                          0
education                         0
vehicle_size                      0
state_California                  0
state_Nevada                      0
state_Oregon                      0
state_Washington                  0
location_code_Suburban            0
location_code_Urban               0
employmentstatus_Employed         0
employmentstatus_Medical Leave    0
employmentstatus_Retired          0
employmentstatus_Unemployed       0
marital_status_Married            0
marital_status_Single             0
policy_2                          0
policy_3                          0
policy_type_Personal Auto   

In [22]:
from sklearn import linear_model
lm = linear_model.LinearRegression()
lm.fit(X_train_transformed,y_train)

In [23]:
X_test_transformed = np.concatenate([X_test_norm, X_test_encode], axis=1)

In [24]:
from sklearn.metrics import r2_score
predictions = lm.predict(X_train_transformed)
r2_score(y_train, predictions)

0.7102523370647722

In [25]:
predictions = lm.predict(X_train_transformed)
predictions_test = lm.predict(X_test_transformed )

compare= pd.DataFrame({'Actual': y_test , 'Predicted': predictions_test})
compare.head()



Unnamed: 0,Actual,Predicted
708,218.598065,198.792771
47,447.79344,432.694058
3995,451.2,401.041539
1513,355.641958,244.123287
3686,470.097411,385.45801


# 7.Define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.

In [47]:
data = pd.concat([numerical_df, categorical_df], axis=1) 
data['policy'] = data['policy'].astype(str)
data.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,Year,Month,Day,state,response,coverage,education,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,2763.519279,56274,69.0,32,5,0,1,384.811147,2011,2,24,Washington,No,Basic,Bachelor,Employed,F,Suburban,Married,Corporate Auto,3,Offer1,Agent,Two-Door Car,Medsize
1,6979.535903,0,94.0,13,42,0,8,1131.464935,2011,1,31,Arizona,No,Extended,Bachelor,Unemployed,F,Suburban,Single,Personal Auto,3,Offer3,Agent,Four-Door Car,Medsize
2,12887.43165,48767,108.0,18,38,0,2,566.472247,2011,2,19,Nevada,No,Premium,Bachelor,Employed,F,Suburban,Married,Personal Auto,3,Offer1,Agent,Two-Door Car,Medsize
3,7645.861827,0,106.0,18,65,0,7,529.881344,2011,1,20,California,No,Basic,Bachelor,Unemployed,M,Suburban,Married,Corporate Auto,2,Offer1,Call Center,SUV,Medsize
4,2813.692575,43836,73.0,12,44,0,1,138.130879,2011,2,3,Washington,No,Basic,Bachelor,Employed,M,Rural,Single,Personal Auto,1,Offer1,Agent,Four-Door Car,Medsize


In [43]:
def prediction (X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    X_train_num = X_train.select_dtypes(include = np.number)
    X_test_num  = X_test.select_dtypes(include = np.number)
    
    X_train_cat = X_train.select_dtypes(include = object)
    X_test_cat  = X_test.select_dtypes(include = object)
    
    transformer = MinMaxScaler().fit(X_train_num)
    
    X_train_normalized = transformer.transform(X_train_num)
    X_train_norm = pd.DataFrame(X_train_normalized, columns=X_train_num.columns)
    
    X_test_normalized = transformer.transform(X_test_num)
    X_test_norm = pd.DataFrame(X_test_normalized, columns=X_test_num.columns)
    
    encoder = OneHotEncoder(drop='first').fit(X_train_cat)
    
    cols = encoder.get_feature_names_out(input_features=X_train_cat.columns) 
    X_train_encode = pd.DataFrame(encoder.transform(X_train_cat).toarray(),columns=cols) 
    
    cols = encoder.get_feature_names_out(input_features=X_test_cat.columns) 
    X_test_encode = pd.DataFrame(encoder.transform(X_test_cat).toarray(),columns=cols) 
    
    X_train_transformed = np.concatenate([X_train_norm, X_train_encode], axis=1)
    X_test_transformed = np.concatenate([X_test_norm, X_test_encode], axis=1)

    if model=='linear':
        LR=linear_model.LinearRegression().fit(X_train_transformed,y_train)
        predictions = LR.predict(X_train_transformed)
        print ('The r2 score is: ', r2_score(y_train,predictions))
    elif model=='kneighbors':
        scores = []
        k_val=[]
        for k in range(2,25):
            model = KNeighborsRegressor(n_neighbors=k,metric='manhattan',weights='distance')
            model.fit(X_train_transformed, y_train)
            scores.append(model.score(X_test_transformed, y_test))
            k_val.append(k)
        k_val=k_val[scores.index(max(scores))]
        print ('The optimum value of k is: ', k_val)
        print ('The accuracy score is: ', max(scores))  



# 8.Use the function to check LinearRegressor and KNeighborsRegressor.

In [44]:
y = data['total_claim_amount']
X = data.drop(['total_claim_amount'], axis=1)

prediction(X,y,'linear')

The r2 score is:  0.7115666654517692


In [46]:
from sklearn.neighbors import KNeighborsRegressor

prediction(X,y,'kneighbors')

The optimum value of k is:  13
The accuracy score is:  0.6033908941823378


# 9. You can check also the MLPRegressor for this task!

In [52]:
from sklearn.neural_network import MLPRegressor

MLP = MLPRegressor().fit(X_train_transformed, y_train)
MLP.score(X_test_transformed,y_test)



0.7382268173489306

The result is slightly better than the linear regression