In [75]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import seaborn.objects as so

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the marketing customer analysis dataset into the working DataFrame `df`.
csv_path = 'files_for_lab/csv_files/marketing_customer_analysis.csv'
df = pd.read_csv(csv_path)

In [173]:
# Show DataFrame info: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9134 entries, 0 to 9133
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer                       9134 non-null   object 
 1   State                          9134 non-null   object 
 2   Customer Lifetime Value        9134 non-null   float64
 3   Response                       9134 non-null   object 
 4   Coverage                       9134 non-null   object 
 5   Education                      9134 non-null   object 
 6   Effective To Date              9134 non-null   object 
 7   EmploymentStatus               9134 non-null   object 
 8   Gender                         9134 non-null   object 
 9   Income                         9134 non-null   int64  
 10  Location Code                  9134 non-null   object 
 11  Marital Status                 9134 non-null   object 
 12  Monthly Premium Auto           9134 non-null   i

In [174]:
# Describe DataFrame: count, mean, std, min, quartiles and max of each numeric column.
df.describe()

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
count,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0,9134.0
mean,8004.940475,37657.380009,93.219291,15.097,48.064594,0.384388,2.96617,434.088794
std,6870.967608,30379.904734,34.407967,10.073257,27.905991,0.910384,2.390182,290.500092
min,1898.007675,0.0,61.0,0.0,0.0,0.0,1.0,0.099007
25%,3994.251794,0.0,68.0,6.0,24.0,0.0,1.0,272.258244
50%,5780.182197,33889.5,83.0,14.0,48.0,0.0,2.0,383.945434
75%,8962.167041,62320.0,109.0,23.0,71.0,0.0,4.0,547.514839
max,83325.38119,99981.0,298.0,35.0,99.0,5.0,9.0,2893.239678


In [176]:
# Normalise the column labels: lower-case them and replace spaces with underscores,
# so every later cell can refer to columns such as 'total_claim_amount'.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]
df.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employmentstatus', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

In [None]:
# Show a plot of the total number of responses.
# The column labels were lower-cased and underscored earlier in the notebook,
# so the column is 'response' (the original 'Response' label no longer exists).

sns.set_style("white")

sns.countplot(x='response', data=df)

plt.show()

In [None]:
# Show a plot of the response rate by the sales channel.
# Two layers are drawn on the same axes: the total number of customers per
# channel, then the same counts split by response.
# Columns are referenced by their normalised (lower-case, underscored) names.

sns.set_style("whitegrid")

# `color` is omitted here: it has no effect once a palette is supplied.
sns.countplot(x='sales_channel', palette="pastel", alpha=.8, data=df)
sns.countplot(data=df, x='sales_channel', hue='response', palette="rocket", alpha=.9)
sns.despine(left=True, bottom=True)

plt.show()

In [None]:
# Show a plot of the response rate by the total claim amount.
# barplot aggregates with the mean by default, so each bar is the average
# total claim amount for one response value.
# Columns are referenced by their normalised (lower-case, underscored) names.

sns.set_style("darkgrid")

sns.barplot(data=df, x='response', y='total_claim_amount', palette="dark", alpha=.8)

plt.show()

In [None]:
# Show a plot of the response rate by income.
# Horizontal bars: one bar per response value, whose length is the mean income
# (barplot's default aggregate).
# Columns are referenced by their normalised (lower-case, underscored) names.

sns.set_theme(style="whitegrid")

sns.barplot(y='response', x='income', palette='rocket', data=df)
sns.despine(left=True, bottom=True)

plt.show()

In [None]:
# Check the data types of the columns (used below to separate numeric from categorical features).
df.dtypes

In [None]:
# Collect the numeric columns into their own DataFrame, df_num.
df_num = df.select_dtypes(include=np.number)
df_num

In [None]:
# Collect the categorical (object-dtype) columns into their own DataFrame, df_cat.
df_cat = df.select_dtypes(include=object)
df_cat

In [None]:
# Use seaborn to construct a distribution plot (histogram + KDE) for each numerical variable.
# set_style has to be *called*: assigning a string to sns.set_style would replace
# the seaborn function itself and break every later sns.set_style(...) call.
sns.set_style('white')
for col in df_num.columns:
    print(col)
    sns.histplot(df_num[col], kde=True)  # histplot is used in place of the deprecated distplot
    plt.show()

In [None]:
# Plain Matplotlib histograms: print each numerical column's name, then draw its histogram.
for name, values in df_num.items():
    print(name)
    plt.hist(values)
    plt.show()

- Do the distributions for different numerical variables look like a normal distribution?

Almost every numerical column has data that follows a normal distribution, or something close to it.
The two exceptions are 'Months Since Last Claim' and 'Months Since Policy Inception'.

- For the numerical variables, check the multicollinearity between the features.

In [None]:
# Correlation matrix of the numerical columns, shown first as a table and then as a heatmap.
corr_matrix = df_num.corr()
display(corr_matrix)

sns.set_theme(style="dark")

# Mask the diagonal and the upper triangle; the heatmap then only draws the
# cells below the diagonal.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, mask=mask, annot=True, ax=ax)
plt.show()

- Drop one of the two features that show a high correlation between them (greater than 0.9). If there is no pair of features that have a high correlation, then do not drop any features.

No highly correlated features were found.

In [162]:
# X-y split: the target is total_claim_amount; the features are every other
# column except the customer identifier and the effective_to_date column.
y = df["total_claim_amount"]
x = df.drop(columns=["customer", "total_claim_amount", "effective_to_date"])

In [163]:
# Standardise the numeric features with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling. Consider splitting first
# and fitting the scaler on the training rows only.
x_num = x.select_dtypes(np.number)

scaler = StandardScaler()
x_num_scaled = pd.DataFrame(
    scaler.fit_transform(x_num),
    columns=x_num.columns,
    # Keep the original row labels so the column-wise concat with the encoded
    # categoricals aligns row by row even if the index is not a default RangeIndex.
    index=x_num.index,
)

In [164]:
# One-hot encode the categorical features, dropping the first category of each
# feature (drop='first').
x_cat = x.select_dtypes('object')

encoder = OneHotEncoder(drop='first')
encoder.fit(x_cat)

# Ask the encoder for its output column names ("<feature>_<category>") instead of
# rebuilding them from categories_: bare category labels can collide when two
# features share a value, and the encoder's own names always match its output order.
cols = list(encoder.get_feature_names_out())

# Build a fresh frame from the dense array rather than assigning new columns
# into the select_dtypes result and then dropping the originals.
x_cat = pd.DataFrame(
    encoder.transform(x_cat).toarray(),
    columns=cols,
    index=x_cat.index,
)

In [166]:
# Join the scaled numeric features and the encoded categorical features side by side
# (axis=1 aligns rows on the index).
x_concat = pd.concat([x_num_scaled, x_cat], axis=1)

In [169]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_concat, y, test_size=0.3, random_state=42)

In [170]:
# Fit a linear regression on the training split, then predict the target for the test split.
lm = LinearRegression().fit(x_train, y_train)
predictions = lm.predict(x_test)

In [172]:
# Evaluate the test-set predictions: R2, MSE, RMSE and MAE.
print("R2_score:", round(r2_score(y_test, predictions), 2))

mse = mean_squared_error(y_test, predictions)
print("MSE:", mse)

# RMSE is the square root of the MSE. It is computed directly because the
# `squared=False` argument of mean_squared_error is deprecated and has been
# removed from recent scikit-learn releases.
rmse = np.sqrt(mse)
print("RMSE:", rmse)

mae = mean_absolute_error(y_test, predictions)
print("MAE:", mae)

R2_score: 0.77
MSE: 19182.508693113658
RMSE: 138.50093390700894
MAE: 94.52272816801205
