# Lab | Comparing regression models

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50

import warnings
warnings.filterwarnings("ignore")


In [7]:
clean_df = pd.read_csv("Data_Marketing_Customer_Analysis_Round3.csv")

RAND_STATE = 34

In [8]:
numerical_df = clean_df.select_dtypes(include=np.number)

X = numerical_df.drop(['total_claim_amount'], axis = 1)
y = numerical_df.total_claim_amount

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 30, random_state = RAND_STATE)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [27]:
scaler = StandardScaler()
X_train_s = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_s = scaler.transform(X_test)

## Monday, 20.02.2023
### Activity 1: fit the models LinearRegression, Lasso and Ridge and compare the model performances.

In [11]:
#Applying Linear regression:

model=LinearRegression()
model.fit(X_train_s, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train_s, y_train)}, Test -> {model.score(X_test_s, y_test)}")

LinearRegression: Train -> 0.4101208388205483, Test -> 0.31732897879716015


In [21]:
model.coef_

array([ -7.37925164, -31.81313268, 187.30091363,   1.26028882,
        -1.96585614,  -1.0055951 ,   1.14839999])

In [15]:
#Applying Lasso:

model=Lasso(alpha=0)
model.fit(X_train_s, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train_s, y_train)}, Test -> {model.score(X_test_s, y_test)}")

Lasso: Train -> 0.4101208388205483, Test -> 0.3173289787971606


In [19]:
#Applying Ridge:

model=Ridge(alpha=0)
model.fit(X_train_s, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train_s, y_train)}, Test -> {model.score(X_test_s, y_test)}")

Ridge: Train -> 0.4101208388205483, Test -> 0.3173289787971606


### Activity 2: use feature selection techniques (P-Value, RFE) to select subset of features to train the model with (if necessary).

In [28]:
#Applying RFE:

from sklearn.feature_selection import RFE

lm = LinearRegression()

selector = RFE(lm, n_features_to_select= 3, step = 1, verbose = 1)
selector.fit(X_train_s, y_train)

kept_features = selector.get_support(indices = True)
kept_features = list(X_train_s.iloc[:,kept_features].columns)

X_train_s = selector.transform(X_train_s)
X_test_s = selector.transform(X_test_s)

X_train_s = pd.DataFrame(X_train_s, columns=kept_features)
X_test_s = pd.DataFrame(X_test_s, columns=kept_features)

print("Final selected features: ")
display(X_train_s)

Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Final selected features: 


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto
0,1.212836,-0.900892,1.533803
1,-0.804404,-1.112906,-0.847931
2,1.634978,-0.637175,1.911395
3,-0.585338,0.938261,0.226754
4,-0.644211,0.951419,-0.092747
...,...,...,...
10654,-0.197938,-0.302545,0.168663
10655,0.452428,-0.192361,0.546255
10656,-0.784198,1.879915,-0.818886
10657,-0.672267,-1.154477,-0.150838


### Optional: refit the models with the selected features.

In [29]:
numerical_df = clean_df[["customer_lifetime_value", "income", "monthly_premium_auto", "total_claim_amount"]]

X = numerical_df.drop(['total_claim_amount'], axis = 1)
y = numerical_df.total_claim_amount

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 30, random_state = RAND_STATE)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [30]:
scaler = StandardScaler()
X_train_s = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_s = scaler.transform(X_test)

In [31]:
model=LinearRegression()
model.fit(X_train_s, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train_s, y_train)}, Test -> {model.score(X_test_s, y_test)}")

LinearRegression: Train -> 0.41002725773733406, Test -> 0.31372976126729446
