In [None]:
# work in progress

# last updated: 2022-05-18

# Linear Regression Python Cookbook: Examples

---

## 0. Prepare the Workspace 
***use for both methods***

In [1]:
# import data analysis packages
import numpy as np
import pandas as pd

# import functions from sci-kit learn 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split , KFold 
from sklearn import metrics

# import functions from statsmodels
import statsmodels.formula.api as smf

# import data visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# allow graphs to display in the notebook inline
%matplotlib inline

In [5]:
# load data

# load the 'advertising' sample dataset from "An Introduction to Statistical Learning" resources 
df = pd.read_csv('https://www.statlearning.com/s/Advertising.csv', index_col=0) 

# glimpse the first 10 rows
df.head(10)

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
6,8.7,48.9,75.0,7.2
7,57.5,32.8,23.5,11.8
8,120.2,19.6,11.6,13.2
9,8.6,2.1,1.0,4.8
10,199.8,2.6,21.2,10.6


In [6]:
# edit the DataFrame to contain one or more features and a single target

# sample data is ready to use

## ***Method A: statsmodels***

In [7]:
# step 1A.1 - create a fitted model

lm = smf.ols(formula='sales ~ TV', data=df).fit()

In [9]:
# step 1A.2 - print the coefficients

lm.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.612
Model:,OLS,Adj. R-squared:,0.61
Method:,Least Squares,F-statistic:,312.1
Date:,"Mon, 16 May 2022",Prob (F-statistic):,1.47e-42
Time:,13:39:41,Log-Likelihood:,-519.05
No. Observations:,200,AIC:,1042.0
Df Residuals:,198,BIC:,1049.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.0326,0.458,15.360,0.000,6.130,7.935
TV,0.0475,0.003,17.668,0.000,0.042,0.053

0,1,2,3
Omnibus:,0.531,Durbin-Watson:,1.935
Prob(Omnibus):,0.767,Jarque-Bera (JB):,0.669
Skew:,-0.089,Prob(JB):,0.716
Kurtosis:,2.779,Cond. No.,338.0


In [10]:
# step 1A.3 - predict for a new observation

# first, you must create a DataFrame because the Statsmodels formula interface expects it
X_new = pd.DataFrame({'TV': [50]}) 

# then, use the new DataFrame to predict for a new observation
lm.predict(X_new)

0    9.409426
dtype: float64

## ***Method B: scikit-learn*** 

###  1B. Create X and y

In [11]:
# step 1B.1 - create a feature matrix

X = df[['TV', 'radio','newspaper']]

In [12]:
# step 1B.2 - create a response vector

y = df['sales']

### 2B. Instantiate and Fit

In [13]:
# step 2B.1 - instantiate linear regression model

lr = LinearRegression()

In [14]:
# step 2B.2 - fit linear regression model

lr.fit(X,y)

LinearRegression()

### 3B. Print the Cooefficients


In [15]:
# step 3B.1 - print the y-intercept

print(lr.intercept_)

2.9388893694594067


In [16]:
# step 3B.2 - print the regression coefficient

print (lr.coef_)

[ 0.04576465  0.18853002 -0.00103749]


In [None]:
# step 3B.3 - predict for a single value

# lr.predict([[100]])

In [20]:
# step 3B.4 - Predict the response values

lr.predict(X)

array([20.52397441, 12.33785482, 12.30767078, 17.59782951, 13.18867186,
       12.47834763, 11.72975995, 12.12295317,  3.72734086, 12.55084872,
        7.0322992 , 17.28512918, 10.57712073,  8.82630048, 18.43436638,
       20.81929952, 12.82365674, 23.22495716,  9.95168206, 14.16607293,
       18.10076728, 14.7405382 ,  6.4891503 , 16.5459329 ,  8.14651887,
       15.6100386 , 14.98951429, 17.05167344, 19.41053803,  9.14402389,
       21.6339338 , 11.3460929 ,  7.63888314, 18.86426829,  7.57483051,
       17.00682618, 23.40590052, 15.62347779,  9.90868103, 20.44761039,
       16.37766467, 17.2959832 , 21.59580326, 13.96385684,  8.88787996,
       15.16152314,  8.87338673, 21.7226299 , 16.26362018,  8.1681656 ,
       12.63121132,  9.33981296, 20.66297563, 19.94469957, 20.37443008,
       21.2926106 ,  8.52771254, 12.77458802, 21.89805198, 18.13348698,
        5.74215558, 22.89067208, 16.78426073, 13.21069202, 16.97773556,
        7.84904532,  9.01603163, 12.0370073 , 18.97657924, 21.10

## 4. Calculate Loss Functions

>Comparing these metrics:
>- **Mean Absolute Error (MAE)** represents the average error.
>- **Mean Squared Error (MSE)** punishes larger errors
>- **Root Mean Squared Error (RMSE)** punishes larger errors and is interpretable in the "y" units.<br><br>These are all are ***loss functions***, because we want to ***minimize*** them.

In [18]:
# step 4.1 - calculate the mean absolute error

#Calculate MAE using sklearn

# metrics.mean_absolute_error(y_true, y_pred)
metrics.mean_absolute_error(y, lr.predict(X))

1.2520112296870687

In [21]:
# step 4.2 - calculate the mean squared error

#Calculate MSE using sklearn

# metrics.mean_squared_error(y_true, y_pred)
metrics.mean_squared_error(y, lr.predict(X))

2.784126314510936

In [22]:
# step 4.3 - calculate the root mean squared error

#Calculate RMSE using NumPy

# np.sqrt(metrics.mean_squared_error(y_true, y_pred))
np.sqrt(metrics.mean_squared_error(y, lr.predict(X)))

1.6685701407225697

## 5. Calculate R<sup>2</sup> 
>Use R<sup>2</sup> to find out how well our model performs versus the mean


In [23]:
# step 4.4 - use statsmodels to calculate the R-squared value for the model

# R-squared value for the model with one feature
lm.rsquared

0.611875050850071

In [24]:
# R-squared value for the model with two features

lm = smf.ols(formula='sales ~ TV + radio', data=df).fit()
lm.rsquared

0.8971942610828956

In [25]:
# R-squared value for the model with three features

lm = smf.ols(formula='sales ~ TV + radio + newspaper', data=df).fit()
lm.rsquared

0.8972106381789522

---

---