In [168]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats as stats
from sklearn import linear_model
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

In [169]:
# To make run all you ask in one cell, not only the last required
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [170]:
# data=pd.read_csv('C:/Student/IRONHACK/Week1/D3/Lessons/merged_clean_reduced.csv')
data=pd.read_csv('C:/Student/IRONHACK/Week1/D4_Regression/Labs/lab-customer-analysis-round-5/files_for_lab/csv_files/marketing_customer_analysis.csv')
data.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


### DATA PROCESSING

#### x,y split

In [171]:
# identify numerical variables
numerical = data.select_dtypes(np.number)
categorical = data.select_dtypes(np.object)
#separate the features from the labels
y_num = numerical['Total Claim Amount']
X_num = numerical.drop(['Total Claim Amount'], axis=1)

#### Normalize (numerical)

In [172]:
# does not deal well with getting rid of outliers
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer = MinMaxScaler().fit(X_num)
x_normalized = MinMaxtransformer.transform(X_num)
print(x_normalized.shape)
x_normalized = pd.DataFrame(x_normalized,columns=X_num.columns)
x_normalized.head()

(9134, 7)


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
0,0.010629,0.562847,0.033755,0.914286,0.050505,0.0,0.0
1,0.062406,0.0,0.139241,0.371429,0.424242,0.0,0.875
2,0.13496,0.487763,0.198312,0.514286,0.383838,0.0,0.125
3,0.070589,0.0,0.189873,0.514286,0.656566,0.0,0.75
4,0.011245,0.438443,0.050633,0.342857,0.444444,0.0,0.0


#### One Hot/Label Encoding (categorical).

In [192]:
X_cat = data.select_dtypes(include = np.object)
# X_cat.columns
X_cat_reduced = X_cat[['Coverage', 'Education', 'Gender']]
X_cat_reduced.head()

Unnamed: 0,Coverage,Education,Gender
0,Basic,Bachelor,F
1,Extended,Bachelor,F
2,Premium,Bachelor,F
3,Basic,Bachelor,M
4,Basic,Bachelor,M


In [204]:
#one hot encoding 
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder().fit(X_cat_reduced)
print(encoder.categories_)
encoded = encoder.transform(X_cat_reduced).toarray()

enc = [columname for sublist in encoder.categories_ for columname in sublist]
onehot_encoded = pd.DataFrame(encoded,columns=enc)
# flatten
onehot_encoded.head()

[array(['Basic', 'Extended', 'Premium'], dtype=object), array(['Bachelor', 'College', 'Doctor', 'High School or Below', 'Master'],
      dtype=object), array(['F', 'M'], dtype=object)]


Unnamed: 0,Basic,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,F,M
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [175]:
#because one of the variables can be deduced from the others, no point in keeping all these columns around
onehot_encoded = onehot_encoded.drop(['Basic'],axis=1)
onehot_encoded.head()

Unnamed: 0,Extended,Premium
0,0.0,0.0
1,1.0,0.0
2,0.0,1.0
3,0.0,0.0
4,0.0,0.0


In [191]:
#label encoding keeps just one column and makes it numerical, but watch out: you may be introducing unintended semantics
# turn table into a single column (array)
# [] - creates array, [[]] - creates df

from sklearn.preprocessing import LabelEncoder
label_cat= X_cat_reduced['Coverage']
label_encoded = LabelEncoder().fit(label_cat).transform(label_cat) # ordered wrt value counts

label_encoded = pd.DataFrame(label_encoded,columns=X_cat_reduced.columns)
# print(label_encoded)
label_encoded.head()

# Concatenate first and then normalise

Unnamed: 0,Coverage
0,0
1,1
2,2
3,0
4,0


#### Concat dataframes

In [177]:
X_num.shape
onehot_encoded.shape
X = pd.concat([X_num, onehot_encoded],axis=1)
X.head(2)

(9134, 7)

(9134, 2)

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,"(Extended,)","(Premium,)"
0,2763.519279,56274,69,32,5,0,1,0.0,0.0
1,6979.535903,0,94,13,42,0,8,1.0,0.0


In [208]:
# wont work...
X.rename(columns={('Extended',): 'Extended Coverage', ('Premium',):'Premium Coverage'}, inplace=True)
X.head(2)

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Extended Coverage,Premium Coverage
0,2763.519279,56274,69,32,5,0,1,0.0,0.0
1,6979.535903,0,94,13,42,0,8,1.0,0.0


### LINEAR REGRESSION

#### Train, test, split

In [179]:
y = numerical['Total Claim Amount']
y.head()

0     384.811147
1    1131.464935
2     566.472247
3     529.881344
4     138.130879
Name: Total Claim Amount, dtype: float64

In [180]:
# train test split is the way ML generates its claim to fame: we build the model on a portion of the data but we then validate it in another "fresh" portion
# our model has no opportunity to "cheat": it must accurately guess the values in the "fresh" dataset that it never saw before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [181]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7307, 9)
(1827, 9)
(7307,)
(1827,)


In [182]:
X_train.head(2)
X_test.head(2)

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,"(Extended,)","(Premium,)"
7706,3265.156348,25820,82,10,69,0,1,1.0,0.0
779,11318.13083,79270,95,28,61,3,2,1.0,0.0


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,"(Extended,)","(Premium,)"
7175,5112.570311,56587,65,4,88,1,6,0.0,0.0
7255,4882.286714,21236,123,3,12,0,1,1.0,0.0


#### Apply linear regression.

In [183]:
#we train/fit our model like yesterday
lm = linear_model.LinearRegression()
model = lm.fit(X_train,y_train)

In [184]:
model.predict(X_test)

array([223.47538908, 650.6744376 , 316.9487838 , ..., 132.89899858,
       699.81840833, 335.0692084 ])

In [185]:
from sklearn.metrics import r2_score

predictions = lm.predict(X_test)
r2_score(y_test, predictions)

0.4743117109746039

### Model Validation

#### R2 / MSE / RMSE / MAE

In [186]:
from sklearn.metrics import mean_squared_error, r2_score

In [187]:
mse = mean_squared_error(y_test, predictions)
print(mse)

41872.21981739902


In [188]:
rmse = math.sqrt(mse)
print(rmse)

204.62702611678404


In [189]:
r2 = r2_score(y_test, predictions)
r2

0.4743117109746039