# Linear Regression Lab

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50

In [2]:
clean_df = pd.read_csv("Data_Marketing_Customer_Analysis_Round3.csv")

RAND_STATE = 34 # for reproducible shuffling

In [3]:
# Using the np.number and object methods to segregate the numerical data types and categorical data types
# respectively, since for now we will only use the numerical variables.

numerical_df = clean_df.select_dtypes(include=np.number)
categorical_df = clean_df.select_dtypes(include=object)

### Tuesday (31.01.2023)


#### Activity 1:
X-y split (y is the target variable, which is the total claim amount).

In [4]:
X = numerical_df.drop(['total_claim_amount'], axis = 1)
y = numerical_df.total_claim_amount

#### Activity 2:
Train-test split.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 30, random_state = RAND_STATE)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

#### Activity 3:
Standardize the data (after the data split).

In [7]:
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train) # get the parameters using the train set and apply to them

In [8]:
X_test_s = scaler.transform(X_test) # apply the parameters of the train set to the test set

# You don't transform the target feature.

#### Activity 4:
Apply linear regression.

In [9]:
X_train_const = sm.add_constant(X_train_s)

model = sm.OLS(y_train, X_train_const).fit()
predictions_train = model.predict(X_train_const)

X_test_const = sm.add_constant(X_test_s)
predictions_test = model.predict(X_test_const)


#### Activity 5:
Model Interpretation.

In [10]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:     total_claim_amount   R-squared:                       0.410
Model:                            OLS   Adj. R-squared:                  0.410
Method:                 Least Squares   F-statistic:                     1058.
Date:                Wed, 01 Feb 2023   Prob (F-statistic):               0.00
Time:                        09:40:53   Log-Likelihood:                -72834.
No. Observations:               10659   AIC:                         1.457e+05
Df Residuals:                   10651   BIC:                         1.457e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        434.2805      2.176    199.587      0.0