In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import pickle as pkl

import warnings
warnings.filterwarnings('ignore') 

In [2]:
mkt_data = pd.read_csv('marketing_customer_analysis.csv')
mkt_data

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.431650,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,23405.987980,No,Basic,Bachelor,2/10/11,Employed,M,71941,...,89,0,2,Personal Auto,Personal L1,Offer2,Web,198.234764,Four-Door Car,Medsize
9130,PK87824,California,3096.511217,Yes,Extended,College,2/12/11,Employed,F,21604,...,28,0,1,Corporate Auto,Corporate L3,Offer1,Branch,379.200000,Four-Door Car,Medsize
9131,TD14365,California,8163.890428,No,Extended,Bachelor,2/6/11,Unemployed,M,0,...,37,3,2,Corporate Auto,Corporate L2,Offer1,Branch,790.784983,Four-Door Car,Medsize
9132,UP19263,California,7524.442436,No,Extended,College,2/3/11,Employed,M,21941,...,3,0,3,Personal Auto,Personal L2,Offer3,Branch,691.200000,Four-Door Car,Large


In [3]:
#X-y split.
y = mkt_data['Total Claim Amount']
X = mkt_data.drop(['Total Claim Amount'], axis=1)

In [4]:
X_num = X.select_dtypes(include = np.number)
X_cat = X.select_dtypes(include = np.object)
cat_data = pd.get_dummies(X, drop_first=True)
cat_data

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Customer_AA11235,Customer_AA16582,Customer_AA30683,...,Sales Channel_Branch,Sales Channel_Call Center,Sales Channel_Web,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Medsize,Vehicle Size_Small
0,2763.519279,56274,69,32,5,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,6979.535903,0,94,13,42,0,8,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,12887.431650,48767,108,18,38,0,2,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,7645.861827,0,106,18,65,0,7,0,0,0,...,0,1,0,0,0,1,0,0,1,0
4,2813.692575,43836,73,12,44,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,23405.987980,71941,73,18,89,0,2,0,0,0,...,0,0,1,0,0,0,0,0,1,0
9130,3096.511217,21604,79,14,28,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
9131,8163.890428,0,85,9,37,3,2,0,0,0,...,1,0,0,0,0,0,0,0,1,0
9132,7524.442436,21941,96,34,3,0,3,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [5]:
#One Hot/Label Encoding (categorical).
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_cat)

OneHotEncoder(drop='first')

In [6]:
# after the .fit()
{'Male': [1,0],
'Female': [0,0],
'U': [0,1]}

{'Male': [1, 0], 'Female': [0, 0], 'U': [0, 1]}

In [7]:
encoded = encoder.transform(X_cat).toarray()
encoded #.shape # 

array([[0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [8]:
encoder.categories_

[array(['AA10041', 'AA11235', 'AA16582', ..., 'ZZ89380', 'ZZ91716',
        'ZZ97035'], dtype=object),
 array(['Arizona', 'California', 'Nevada', 'Oregon', 'Washington'],
       dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Basic', 'Extended', 'Premium'], dtype=object),
 array(['Bachelor', 'College', 'Doctor', 'High School or Below', 'Master'],
       dtype=object),
 array(['1/1/11', '1/10/11', '1/11/11', '1/12/11', '1/13/11', '1/14/11',
        '1/15/11', '1/16/11', '1/17/11', '1/18/11', '1/19/11', '1/2/11',
        '1/20/11', '1/21/11', '1/22/11', '1/23/11', '1/24/11', '1/25/11',
        '1/26/11', '1/27/11', '1/28/11', '1/29/11', '1/3/11', '1/30/11',
        '1/31/11', '1/4/11', '1/5/11', '1/6/11', '1/7/11', '1/8/11',
        '1/9/11', '2/1/11', '2/10/11', '2/11/11', '2/12/11', '2/13/11',
        '2/14/11', '2/15/11', '2/16/11', '2/17/11', '2/18/11', '2/19/11',
        '2/2/11', '2/20/11', '2/21/11', '2/22/11', '2/23/11', '2/24/11',
        '2/25/11', '2/26/11', '2/27

In [9]:
#Concat DataFrames
X = np.concatenate([X_num, encoded], axis=1)
X.shape

(9134, 9241)

In [10]:
# Normalize (numerical).
transformer = Normalizer()
transformer.fit(X_num)

with open('transformer.pkl', 'wb') as file:
    pkl.dump(transformer, file)

with open('transformer.pkl', 'rb') as file:
    transformer = pkl.load(file)

x_normalized = transformer.transform(X_num)
#pd.DataFrame(x_normalized)
x_normalized.shape

(9134, 7)

In [11]:
#looking for relationships
mkt_data.groupby('Gender', as_index=False).agg({'Total Claim Amount': 'mean'})

Unnamed: 0,Gender,Total Claim Amount
0,F,412.856483
1,M,456.184439


In [12]:
### Linear Regression
    # Train-test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
 # Apply linear regression.
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [14]:
### Model Validation
predictions  = model.predict(X_test)
predictions.shape

(2741,)

In [18]:
# Description:
  # R2.
  # MSE.
  # RMSE.
  # MAE.
r2_score(y_test, predictions), mean_squared_error(y_test, predictions, squared=False), mean_squared_error(y_test, predictions)

(0.7301149187323629, 148.4314049254336, 22031.881968138037)