# Linear Regression Lab

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50

In [2]:
clean_df = pd.read_csv("Data_Marketing_Customer_Analysis_Round3.csv")

RAND_STATE = 34 # for reproducible shuffling

In [3]:
# Split the columns by dtype: passing np.number to select_dtypes keeps the numerical columns and
# passing object keeps the categorical ones. For now only the numerical variables are used.

numerical_df = clean_df.select_dtypes(include=np.number)
categorical_df = clean_df.select_dtypes(include=object)

### Tuesday (31.01.2023)


#### Activity 1:
X-y split (y is the target variable, which is the total claim amount).

In [4]:
X = numerical_df.drop(['total_claim_amount'], axis = 1)
y = numerical_df.total_claim_amount

#### Activity 2:
Train-test split.

In [5]:
# Shuffle and split the numerical features and the target, seeded with RAND_STATE.
# NOTE(review): an integer test_size is an absolute number of test rows (30 rows here), not a
# 30% fraction; use test_size=0.3 if a 30% hold-out was intended — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 30, random_state = RAND_STATE)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

#### Activity 3:
Standardize the data (after the data split).

In [6]:
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train) # get the parameters using the train set and apply to them

In [7]:
X_test_s = scaler.transform(X_test) # apply the parameters of the train set to the test set

# You don't transform the target feature.

#### Activity 4:
Apply linear regression.

In [8]:
X_train_const = sm.add_constant(X_train_s)

model = sm.OLS(y_train, X_train_const).fit()
y_pred_train = model.predict(X_train_const)

X_test_const = sm.add_constant(X_test_s)
y_pred_test = model.predict(X_test_const)

#### Activity 5:
Model Interpretation.

In [9]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:     total_claim_amount   R-squared:                       0.410
Model:                            OLS   Adj. R-squared:                  0.410
Method:                 Least Squares   F-statistic:                     1058.
Date:                Fri, 03 Feb 2023   Prob (F-statistic):               0.00
Time:                        16:25:30   Log-Likelihood:                -72834.
No. Observations:               10659   AIC:                         1.457e+05
Df Residuals:                   10651   BIC:                         1.457e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        434.2805      2.176    199.587      0.0

### Wednesday (01.02.2023)

#### Model Evaluation - activities:
- MSE
- RMSE
- MAE
- R2
- Adjusted R2
- Feature Importance


In [10]:
# Calculating the MSE - Mean squared error for both the train and test set:

print(mse(y_train, y_pred_train))
print(mse(y_test, y_pred_test))

# You can see that the MSE is bigger for the test set than for the train set, which is expected.

50427.47391684814
80055.97276654038


In [11]:
# Calculating the RMSE - Root mean squared error for both the train and test set:

import math

print(math.sqrt(mse(y_train, y_pred_train)))
print(math.sqrt(mse(y_test, y_pred_test)))

224.56062414601573
282.94164198035674


In [12]:
# Calculating the MAE - Mean Absolute Error for both the train and test set:

print(mae(y_train, y_pred_train))
print(mae(y_test, y_pred_test))

152.43643057559507
189.6765386550149


In [13]:
# Calculating the R2 for both the train and test set:

print(r2_score(y_train, y_pred_train))
print(r2_score(y_test, y_pred_test))

# Again, the "performance" of the model is better on the train set (higher R2) than in the test set.

0.4101208388205483
0.3173289787971604


In [14]:
# Calculating the adjusted R2 of the train set:

print(model.rsquared_adj)

0.40973316121954795


In [15]:
# Build a data frame of feature importances: the absolute value of each fitted coefficient
# (with the intercept "const" dropped) next to the original column names of X_train.
# The coefficients come from the model fitted on the standardized features, so their
# magnitudes are on a comparable scale.

params = pd.DataFrame({"features": X_train.columns, "coef": abs(model.params).drop("const")})

In [16]:
features_importances = params.sort_values(by= "coef", ascending=False)
features_importances

Unnamed: 0,features,coef
x3,monthly_premium_auto,187.300914
x2,income,31.813133
x1,customer_lifetime_value,7.379252
x5,months_since_policy_inception,1.965856
x4,months_since_last_claim,1.260289
x7,number_of_policies,1.1484
x6,number_of_open_complaints,1.005595


#### Model Iteration - activity:

Rerun the model after adding the one-hot encoded categorical variables as well as other numeric categorical variables (e.g. number of open complaints).

In [17]:
# Splitting the dependent and the independent features:

X = clean_df.drop(['total_claim_amount'], axis = 1)
y = clean_df.total_claim_amount

# Numeric columns are kept as they are; object columns are one-hot encoded, dropping the
# first level of each encoded column.
# NOTE(review): a single string prefix is shared by every encoded column, so the source column
# name is not part of the dummy names and identical labels in different columns would produce
# duplicate column names — confirm this is acceptable for this data set.
numerical = X.select_dtypes(exclude = object)
categorical = pd.get_dummies(X.select_dtypes(include=object),prefix="dmy",drop_first=True)

# Recombine both parts column-wise into the feature matrix.
X = pd.concat([numerical,categorical],axis=1)

In [18]:
# Test-train split:
# NOTE(review): an integer test_size is an absolute number of test rows (30 rows here), not a
# 30% fraction; use test_size=0.3 if a 30% hold-out was intended — confirm.
# The seed is hard-coded to 42 here instead of reusing RAND_STATE.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=30, random_state=42)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [19]:
# Standard Scaler to the numerical columns:

scaler = StandardScaler()
X_train[numerical.columns] = scaler.fit_transform(X_train[numerical.columns])
X_test[numerical.columns] = scaler.transform(X_test[numerical.columns])

In [20]:
# Applying linear regression:

X_train_const = sm.add_constant(X_train)

model = sm.OLS(y_train, X_train_const).fit()
y_pred_train = model.predict(X_train_const)

X_test_const = sm.add_constant(X_test)
y_pred_test = model.predict(X_test_const)

In [21]:
# Verifying the model:

print(model.summary()) # much better train R2 than the numeric-only model (0.773 vs 0.410)

                            OLS Regression Results                            
Dep. Variable:     total_claim_amount   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.771
Method:                 Least Squares   F-statistic:                     342.3
Date:                Fri, 03 Feb 2023   Prob (F-statistic):               0.00
Time:                        16:25:30   Log-Likelihood:                -67755.
No. Observations:               10659   AIC:                         1.357e+05
Df Residuals:                   10553   BIC:                         1.365e+05
Df Model:                         105                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [22]:
# R2 for the test set:

print(r2_score(y_test, y_pred_test))

0.8136289341329848


#### Optional activity:
Rerun the model after removing the outliers and compare the results using R2 metric.

In [51]:
from scipy.stats import iqr
def remove_outliers(df, lower_q=0.15, upper_q=0.85, whisker=1.5):
    """Flag the non-outlier cells of a data frame with a quantile-range fence.

    Despite its name, this function does not drop anything: it returns a boolean
    mask with the same shape as ``df``. A cell is ``True`` when its value lies
    inside ``[Q_low - whisker * R, Q_high + whisker * R]`` for its column, where
    ``Q_low`` / ``Q_high`` are the ``lower_q`` / ``upper_q`` column quantiles and
    ``R = Q_high - Q_low``. Use ``mask.all(axis=1)`` to select the rows that have
    no outlier in any column.

    Note that the defaults (0.15 / 0.85) are wider than the classic
    interquartile range (0.25 / 0.75), so fewer values are flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns to inspect.
    lower_q, upper_q : float, default 0.15 and 0.85
        Quantiles (between 0 and 1) that delimit the central range.
    whisker : float, default 1.5
        Multiple of the quantile range added on each side of the central range.

    Returns
    -------
    pandas.DataFrame
        Boolean frame; ``True`` marks values that are NOT outliers. Missing
        values compare as neither too low nor too high, so they are ``True``.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    ValueError
        If the quantiles do not satisfy ``0 <= lower_q < upper_q <= 1``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"remove_outliers expects a pandas DataFrame, got {type(df).__name__}"
        )
    if not 0 <= lower_q < upper_q <= 1:
        raise ValueError("quantiles must satisfy 0 <= lower_q < upper_q <= 1")

    low_quantile = df.quantile(lower_q)
    high_quantile = df.quantile(upper_q)
    spread = high_quantile - low_quantile

    # Comparisons against the per-column fences align on the column labels.
    too_low = df < (low_quantile - whisker * spread)
    too_high = df > (high_quantile + whisker * spread)
    return ~(too_low | too_high)

In [52]:
X = clean_df.drop(['total_claim_amount'], axis = 1)
y = clean_df.total_claim_amount

numerical = X.select_dtypes(exclude = object)
categorical = pd.get_dummies(X.select_dtypes(include=object),prefix="dmy",drop_first=True)

non_outliers = remove_outliers(numerical)

numerical = numerical[non_outliers.all(1)]
categorical = categorical[non_outliers.all(1)]

X = pd.concat([numerical,categorical],axis=1)
y = y[non_outliers.all(1)]

In [53]:
# Train/test splitting:
# NOTE(review): an integer test_size is an absolute number of test rows (30 rows here), not a
# 30% fraction; use test_size=0.3 if a 30% hold-out was intended — confirm.
# The seed is hard-coded to 55 here instead of reusing RAND_STATE.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=30, random_state=55)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [54]:
transformer = StandardScaler()
transformer.fit(X_train[numerical.columns] )

X_test[numerical.columns] = transformer.transform(X_test[numerical.columns])
X_train[numerical.columns] = transformer.transform(X_train[numerical.columns])

In [55]:
# Applying linear regression:

X_train_const = sm.add_constant(X_train)

model = sm.OLS(y_train, X_train_const).fit()
y_pred_train = model.predict(X_train_const)

X_test_const = sm.add_constant(X_test)
y_pred_test = model.predict(X_test_const)

In [56]:
print(model.summary()) # train R2 is slightly lower after removing outliers: 0.757 vs 0.773 with the outliers kept

                            OLS Regression Results                            
Dep. Variable:     total_claim_amount   R-squared:                       0.757
Model:                            OLS   Adj. R-squared:                  0.754
Method:                 Least Squares   F-statistic:                     282.5
Date:                Fri, 03 Feb 2023   Prob (F-statistic):               0.00
Time:                        16:37:18   Log-Likelihood:                -60542.
No. Observations:                9633   AIC:                         1.213e+05
Df Residuals:                    9527   BIC:                         1.221e+05
Df Model:                         105                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [57]:
# R2 of the test set:

print(r2_score(y_test, y_pred_test))

0.7921129522929404
