In [None]:
pip install ISLP

# Simple Linear Regression

In [2]:
# Pull the 'Boston' dataset through ISLP's loader and preview its first rows.
from ISLP import load_data

boston_data = load_data("Boston")
boston_data.head()


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [3]:
# Build the design matrix for the single predictor `lstat`.
from ISLP.models import ModelSpec as MS

design = MS(["lstat"])
# fit_transform returns the model matrix; the printed rows show an
# `intercept` column of ones next to `lstat`.
X = design.fit_transform(boston_data)
print(X.head(4))

   intercept  lstat
0        1.0   4.98
1        1.0   9.14
2        1.0   4.03
3        1.0   2.94


In [4]:
# Response (dependent) variable for the regressions: the `medv` column.
y = boston_data["medv"]
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: medv, dtype: float64

In [5]:
# Fit an ordinary least squares regression of y on the design matrix X.
import statsmodels.api as sm
from ISLP.models import summarize

model = sm.OLS(endog=y, exog=X)  # model specification only; nothing is estimated yet
result = model.fit()             # estimate the coefficients

# Compact coefficient table (coef, std err, t, p-value).
summarize(result)


Unnamed: 0,coef,std err,t,P>|t|
intercept,34.5538,0.563,61.415,0.0
lstat,-0.95,0.039,-24.528,0.0


In [6]:
# Full statsmodels report for the simple regression: fit statistics,
# coefficient table with confidence bounds, and residual diagnostics.
result.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.544
Model:,OLS,Adj. R-squared:,0.543
Method:,Least Squares,F-statistic:,601.6
Date:,"Thu, 01 Feb 2024",Prob (F-statistic):,5.08e-88
Time:,20:59:58,Log-Likelihood:,-1641.5
No. Observations:,506,AIC:,3287.0
Df Residuals:,504,BIC:,3295.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,34.5538,0.563,61.415,0.000,33.448,35.659
lstat,-0.9500,0.039,-24.528,0.000,-1.026,-0.874

0,1,2,3
Omnibus:,137.043,Durbin-Watson:,0.892
Prob(Omnibus):,0.0,Jarque-Bera (JB):,291.373
Skew:,1.453,Prob(JB):,5.36e-64
Kurtosis:,5.319,Cond. No.,29.7


# Multiple Linear Regression

In [8]:
# Three-predictor design matrix (lstat, age, rm).
# NOTE: this rebinds X, replacing the single-predictor matrix built earlier.
X = (
    MS(["lstat", "age", "rm"])
    .fit_transform(boston_data)
)
model2 = sm.OLS(endog=y, exog=X)
result2 = model2.fit()
summarize(result2)

Unnamed: 0,coef,std err,t,P>|t|
intercept,-1.1753,3.182,-0.369,0.712
lstat,-0.6685,0.054,-12.298,0.0
age,0.0091,0.011,0.811,0.418
rm,5.0191,0.454,11.048,0.0


# Interaction Terms

In [10]:
# A tuple inside the term list requests an interaction term; it shows up as
# `lstat:age` in the coefficient table.
X = (
    MS(["lstat", "age", ("lstat", "age")])
    .fit_transform(boston_data)
)
model3 = sm.OLS(endog=y, exog=X)
result3 = model3.fit()
summarize(result3)

Unnamed: 0,coef,std err,t,P>|t|
intercept,36.0885,1.47,24.553,0.0
lstat,-1.3921,0.167,-8.313,0.0
age,-0.0007,0.02,-0.036,0.971
lstat:age,0.0042,0.002,2.244,0.025


In [11]:
# Full statsmodels report for the model with the lstat:age interaction.
result3.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.556
Model:,OLS,Adj. R-squared:,0.553
Method:,Least Squares,F-statistic:,209.3
Date:,"Thu, 01 Feb 2024",Prob (F-statistic):,4.86e-88
Time:,21:14:25,Log-Likelihood:,-1635.0
No. Observations:,506,AIC:,3278.0
Df Residuals:,502,BIC:,3295.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,36.0885,1.470,24.553,0.000,33.201,38.976
lstat,-1.3921,0.167,-8.313,0.000,-1.721,-1.063
age,-0.0007,0.020,-0.036,0.971,-0.040,0.038
lstat:age,0.0042,0.002,2.244,0.025,0.001,0.008

0,1,2,3
Omnibus:,135.601,Durbin-Watson:,0.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,296.955
Skew:,1.417,Prob(JB):,3.2900000000000003e-65
Kurtosis:,5.461,Cond. No.,6880.0


# Qualitative Predictors

In [12]:
# Load the 'Carseats' dataset and preview its first rows.
carseat_data = load_data("Carseats")
carseat_data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [13]:
# Every column except the response `Sales` is used as a predictor.
all_predictors = carseat_data.columns.drop("Sales").tolist()
print(all_predictors)

['CompPrice', 'Income', 'Advertising', 'Population', 'Price', 'ShelveLoc', 'Age', 'Education', 'Urban', 'US']


In [14]:
# Regress Sales on all remaining Carseats columns.
# NOTE: X and y are rebound here, so they no longer hold the Boston data;
# re-running an earlier Boston cell after this one would use the wrong inputs.
y = carseat_data["Sales"]
X = MS(all_predictors).fit_transform(carseat_data)
model4 = sm.OLS(endog=y, exog=X)
result4 = model4.fit()
summarize(result4)


Unnamed: 0,coef,std err,t,P>|t|
intercept,5.6606,0.603,9.38,0.0
CompPrice,0.0928,0.004,22.378,0.0
Income,0.0158,0.002,8.565,0.0
Advertising,0.1231,0.011,11.066,0.0
Population,0.0002,0.0,0.561,0.575
Price,-0.0954,0.003,-35.7,0.0
ShelveLoc[Good],4.8502,0.153,31.678,0.0
ShelveLoc[Medium],1.9567,0.126,15.516,0.0
Age,-0.046,0.003,-14.472,0.0
Education,-0.0211,0.02,-1.07,0.285
