In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50
## Install xlrd package to load Excel files
# conda install openpyxl
## conda install xlrd

## 0. importing the data

In [2]:
# import the data
df = pd.read_csv('C:/Users/dengd/Documents/GitHub/IronDuo/Class_Materials/Case_Studies/Customer_Analysis_Case_Study/Data/Data_Marketing_Customer_Analysis_Round3.csv')

# delet one column
df.drop(columns=['effective_to_date'], inplace = True)
df

Unnamed: 0,region,customer_lifetime_value,response,coverage,education,month,employment_status,gender,income,location_code,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,central,4809,no,basic,college,feb,employed,m,48029,suburban,...,52,0,9,corporate auto,corporate l3,offer3,agent,292,four-door car,medsize
1,west region,2228,no,basic,college,jan,unemployed,f,92260,suburban,...,26,0,1,personal auto,personal l3,offer4,call center,744,four-door car,medsize
2,east,14947,no,basic,bachelor,feb,employed,m,22139,suburban,...,31,0,2,personal auto,personal l3,offer3,call center,480,suv,medsize
3,north west,22332,yes,extended,college,jan,employed,m,49078,suburban,...,3,0,2,corporate auto,corporate l3,offer2,branch,484,four-door car,medsize
4,north west,9025,no,premium,bachelor,jan,medical leave,f,23675,suburban,...,31,0,7,personal auto,personal l2,offer1,branch,707,four-door car,medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10684,central,15563,no,premium,bachelor,jan,unemployed,f,61541,suburban,...,40,0,7,personal auto,personal l1,offer3,web,1214,luxury car,medsize
10685,north west,5259,no,basic,college,jan,employed,f,61146,urban,...,68,0,6,personal auto,personal l3,offer2,branch,273,four-door car,medsize
10686,central,23893,no,extended,bachelor,feb,employed,f,39837,rural,...,63,0,2,corporate auto,corporate l3,offer1,web,381,luxury suv,medsize
10687,west region,11971,no,premium,college,feb,employed,f,64195,urban,...,27,4,6,personal auto,personal l1,offer1,branch,618,suv,medsize


### 1. X-y split (y is the target variable, which is the total claim amount)

In [3]:
X = df.drop('total_claim_amount', axis=1)
y = df.total_claim_amount

### 2. Getting numerical and categorical columns

In [4]:
numericalX = X.select_dtypes(include=[np.number])
categoricalX = X.select_dtypes(include=[np.object])
numericalX

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categoricalX = X.select_dtypes(include=[np.object])


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,4809,48029,61,7,52,0,9
1,2228,92260,64,3,26,0,1
2,14947,22139,100,34,31,0,2
3,22332,49078,97,10,3,0,2
4,9025,23675,117,33,31,0,7
...,...,...,...,...,...,...,...
10684,15563,61541,253,12,40,0,7
10685,5259,61146,65,7,68,0,6
10686,23893,39837,201,11,63,0,2
10687,11971,64195,158,0,27,4,6


### 3. Train-test split.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(numericalX , y, test_size=0.3, random_state=100)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
X_train.head(3)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
861,5908,86277,74,19,33,0,3
9053,2397,22283,65,16,93,0,1
9515,5296,96510,73,33,44,0,3


### 4. Standardize the data (after the data split).

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numericalX_s = scaler.fit_transform(X_train)
numericalX_s

array([[-0.30279362,  1.38283862, -0.55997483, ..., -0.55202731,
        -0.4231219 ,  0.00755297],
       [-0.8099625 , -1.19862973, -0.82461546, ...,  1.60171543,
        -0.4231219 , -0.82349639],
       [-0.39119788,  1.79562991, -0.58937935, ..., -0.15717448,
        -0.4231219 ,  0.00755297],
       ...,
       [-0.15299751, -1.45813196, -0.94223352, ..., -1.23404584,
        -0.4231219 , -0.40797171],
       [ 0.10311483,  1.4965549 ,  0.52799221, ...,  0.0581998 ,
        -0.4231219 ,  0.00755297],
       [-0.55009442,  1.05354953, -0.85401998, ...,  0.9555926 ,
         0.67004042,  1.66965171]])

### 5. Apply linear regression.

In [14]:
import statsmodels.api as sm

# Fit a linear regression model using statsmodels
X_train_const = sm.add_constant(X_train) # adding a constant in the model
model = sm.OLS(y_train, X_train_const).fit()

# Print the summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:     total_claim_amount   R-squared:                       0.409
Model:                            OLS   Adj. R-squared:                  0.408
Method:                 Least Squares   F-statistic:                     737.9
Date:                Tue, 31 Jan 2023   Prob (F-statistic):               0.00
Time:                        17:49:47   Log-Likelihood:                -51128.
No. Observations:                7482   AIC:                         1.023e+05
Df Residuals:                    7474   BIC:                         1.023e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [11]:
model=LinearRegression()    # model
model.fit(X_train, y_train)   # model train

LinearRegression()

In [12]:
model.coef_

array([-8.87200400e-04, -1.27983361e-03,  5.47781352e+00,  1.63832508e-01,
       -5.30985751e-02, -2.42012553e+00,  1.34700208e+00])