In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50
## Install xlrd package to load Excel files
# conda install openpyxl
## conda install xlrd

# Linear Regression (Tuseday)

## 0. importing the data

In [2]:
# import the data
df = pd.read_csv('C:/Users/dengd/Documents/GitHub/IronDuo/Class_Materials/Case_Studies/Customer_Analysis_Case_Study/Data/Data_Marketing_Customer_Analysis_Round3.csv')

# delet one column
df.drop(columns=['effective_to_date'], inplace = True)

### 1. X-y split (y is the target variable, which is the total claim amount)

In [3]:
X = df.drop('total_claim_amount', axis=1)
y = df.total_claim_amount

### 2. Getting numerical and categorical columns

### 3. remove outliers

In [4]:
X['number_of_open_complaints'] = X['number_of_open_complaints'].astype('category')

In [5]:
from scipy.stats import iqr
def remove_outliers(df):
    for c in df.columns:
            pct_75 = np.percentile(df[c], 75)
            pct_25 = np.percentile(df[c], 25)
            upper_bound = pct_75 + 1.5*iqr(df[c])
            lower_bound = pct_25 - 1.5*iqr(df[c])
            condition = (df[c] < upper_bound) & (df[c] > lower_bound)
            df[c] = df[c][condition]  # Filter out the outliers
    return df

In [6]:
numericalX = X.select_dtypes(include=[np.number])
categoricalX = X.select_dtypes(include=[np.object])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categoricalX = X.select_dtypes(include=[np.object])


### 4.concatenate the numerical and catergorical columns

In [7]:
X = pd.concat([pd.get_dummies(categoricalX,drop_first=True),
               remove_outliers(numericalX)],
              axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10689 entries, 0 to 10688
Data columns (total 49 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   region_east                      10689 non-null  uint8  
 1   region_north west                10689 non-null  uint8  
 2   region_west region               10689 non-null  uint8  
 3   response_yes                     10689 non-null  uint8  
 4   coverage_extended                10689 non-null  uint8  
 5   coverage_premium                 10689 non-null  uint8  
 6   education_college                10689 non-null  uint8  
 7   education_doctor                 10689 non-null  uint8  
 8   education_high school or below   10689 non-null  uint8  
 9   education_master                 10689 non-null  uint8  
 10  month_jan                        10689 non-null  uint8  
 11  employment_status_employed       10689 non-null  uint8  
 12  employment_status_

In [8]:
X.isna().any()

region_east                        False
region_north west                  False
region_west region                 False
response_yes                       False
coverage_extended                  False
coverage_premium                   False
education_college                  False
education_doctor                   False
education_high school or below     False
education_master                   False
month_jan                          False
employment_status_employed         False
employment_status_medical leave    False
employment_status_retired          False
employment_status_unemployed       False
gender_m                           False
location_code_suburban             False
location_code_urban                False
marital_status_married             False
marital_status_single              False
policy_type_personal auto          False
policy_type_special auto           False
policy_corporate l2                False
policy_corporate l3                False
policy_personal 

In [None]:
# 

### 4. Train-test split.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.3, random_state= 34)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
X_train.head(3)

Unnamed: 0,region_east,region_north west,region_west region,response_yes,coverage_extended,coverage_premium,education_college,education_doctor,education_high school or below,education_master,...,vehicle_class_suv,vehicle_class_two-door car,vehicle_size_medsize,vehicle_size_small,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_policies
2705,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,4786.0,45515,61.0,10,33,
2209,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,,80340,71.0,32,35,2.0
4004,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,6412.0,58776,83.0,20,50,4.0


### 5. Standardize the data (after the data split).

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

In [11]:
X_train_s

array([[-0.31438793, -0.63065117, -0.72397468, ..., -0.50568408,
        -0.5312763 ,         nan],
       [-0.31438793, -0.63065117, -0.72397468, ...,  1.67628147,
        -0.45975825, -0.33380973],
       [-0.31438793, -0.63065117,  1.38126378, ...,  0.48611845,
         0.07662717,  0.63914766],
       ...,
       [-0.31438793, -0.63065117,  1.38126378, ..., -0.20814332,
        -0.5312763 , -0.82028843],
       [-0.31438793, -0.63065117, -0.72397468, ..., -0.60486433,
        -1.49677005, -0.82028843],
       [-0.31438793, -0.63065117,  1.38126378, ..., -0.20814332,
        -0.17368602,  0.63914766]])

### 6. Apply linear regression.

In [12]:
# import statsmodels.api as sm

# # Fit a linear regression model using statsmodels
# X_train_const = sm.add_constant(X_train) # adding a constant in the model
# model = sm.OLS(y_train, X_train_const).fit()

# # Print the summary of the regression results
# print(model.summary())

In [13]:
import statsmodels.api as sm

# Fit a linear regression model using statsmodels
X_train_const = sm.add_constant(X_train_s) # adding a constant in the model
model_train = sm.OLS(y_train, X_train_const).fit()

MissingDataError: exog contains inf or nans

In [None]:
X_test_const = sm.add_constant(X_test_s) # adding a constant in the model
model_test = sm.OLS(y_test, X_test_const).fit() 

In [None]:
# Print the summary of the regression results
print(model_test.summary())

### 7. model parameters

In [None]:
model=LinearRegression()    # model
model.fit(X_train_s, y_train)   # model train

In [None]:
model.coef_

### 8. Making prediction

In [None]:
y_pred = model.predict(X_test_s)  # model prediction
y_pred_train = model.predict(X_train_s)

# Evaluating Model Performance (Wednesday)

In [None]:
result= pd.DataFrame({"y_test": list(y_test),"y_pred": list(y_pred)})
result

In [None]:
# Make an scatter plot y_pred vs y
# What kind of plot you will get if all the all the predictions are ok?
# A stright line
fig, ax = plt.subplots(1,3,figsize=(14,4))

ax[0].plot(y_pred, y_test, 'o')
ax[0].set_xlabel("y_test")
ax[0].set_ylabel("y_pred")
ax[0].set_title("Test Set -Predicted vs real")

ax[1].hist(y_test-y_pred)
ax[1].set_xlabel("Test y-y_pred")
ax[1].set_title("Test Set Residual histogram")

ax[2].plot(y_pred, y_pred - y_test, "o")
ax[2].set_xlabel("predited")
ax[2].set_ylabel("residuals")
ax[2].set_title("Residuals by Predicted")
ax[2].plot(y_pred,np.zeros(len(y_pred)),linestyle='dashed')

In [None]:
yp_ = y_pred
yt_ = y_test
sns.regplot(yp_,yt_,scatter_kws={"color": "red"}, line_kws={"color": "black"})

### 2. Error metrics

In [None]:
import math
print(mse(y_test,y_pred)) # MSE(Mean squared error)
print(mae(y_test,y_pred)) # MAE (Mean Absolute Error)
print(math.sqrt(mae(y_test,y_pred)))# RMSE(Root Mean Square Error)
print(mse(y_train,y_pred_train)) # Mean Squared Error

In [None]:
# R squared
R2= r2_score(y_test,y_pred)
R2

In [None]:
# adjusted R squared
R2_test=model.score(X_test_s,y_test)
R2_train=model.score(X_train_s,y_train)
Adj_R2= 1 - (1-R2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
Adj_R2

### 3. Feature Importances

In [None]:
features_importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': abs(model.coef_.reshape(len(X_train.columns),))
})
features_importances = features_importances.sort_values(by='Importance', ascending=False)
features_importances

In [None]:
plt.bar(x=features_importances['Attribute'].iloc[:10], height=features_importances['Importance'].iloc[:10], color='#087E8B')
plt.title('Feature importance rankings', size=12)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
sns.set_style('whitegrid')
sns.set_context('notebook')
sns.color_palette("bright")

f, ax = plt.subplots(figsize=(18, 12))
sns.despine(f, left=True, bottom=True)
sns.barplot(x="education", y='total_claim_amount', data=df, ax=ax)

# Model Iteration (Wednesday and Thursday)