# Linear Regression

## Simple Linear Regression

In [1]:
import statsmodels.api as sm
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt

# simulation settings: sample size, true intercept and true slope
n, beta_0, beta_1 = 100, 5, 2

# seed NumPy's global generator so the scipy draws below are reproducible
np.random.seed(1)

# x is uniform on [0, 10); y follows the true line plus standard-normal noise
x = ss.uniform.rvs(size=n) * 10
y = np.add(beta_0 + beta_1 * x, ss.norm.rvs(loc=0, scale=1, size=n))


In [2]:
# design matrix: a column of ones (the intercept term) placed in front of x
X = sm.add_constant(x, prepend=True)

# ordinary least squares of y on [1, x]
mod = sm.OLS(endog=y, exog=X)
est = mod.fit()

# last expression of the cell: renders the regression summary tables
est.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.977
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,4115.0
Date:,"Wed, 15 Apr 2020",Prob (F-statistic):,7.47e-82
Time:,15:21:02,Log-Likelihood:,-130.72
No. Observations:,100,AIC:,265.4
Df Residuals:,98,BIC:,270.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.2370,0.174,30.041,0.000,4.891,5.583
x1,1.9685,0.031,64.151,0.000,1.908,2.029

0,1,2,3
Omnibus:,2.308,Durbin-Watson:,2.206
Prob(Omnibus):,0.315,Jarque-Bera (JB):,1.753
Skew:,-0.189,Prob(JB):,0.416
Kurtosis:,3.528,Cond. No.,11.2


## scikit-learn for linear regression

In [3]:
# re-seed NumPy's global generator so this cell is reproducible on its own
np.random.seed(1)

# simulation settings: sample size, true intercept and the two true slopes
n = 500
beta_0, beta_1, beta_2 = 5, 2, -1

# two independent predictors, each uniform on [0, 10)
x_1 = ss.uniform.rvs(size=n) * 10
x_2 = ss.uniform.rvs(size=n) * 10

# response: true linear combination plus standard-normal noise
y = np.add(
    beta_0 + beta_1 * x_1 + beta_2 * x_2,
    ss.norm.rvs(loc=0, scale=1, size=n),
)

# (n, 2) feature matrix whose columns are x_1 and x_2, in that order
X = np.column_stack((x_1, x_2))

In [8]:
from sklearn.linear_model import LinearRegression

# least-squares fit with an intercept; fit() returns the estimator itself
lm = LinearRegression(fit_intercept=True).fit(X, y)

# report the estimates next to the names used in the simulation;
# coef_ follows the column order of X, i.e. (x_1, x_2)
print(
    "Estimated beta_0: %.2f" % lm.intercept_,
    "Estimated beta_1: %.2f" % lm.coef_[0],
    "Estimated beta_2: %.2f" % lm.coef_[1],
    sep="\n",
)

Estimated beta_0: 5.15
Estimated beta_1: 2.00
Estimated beta_2: -1.02


In [10]:
# single query point: x_1 = 2, x_2 = 4 (same column order as X)
X_0 = np.array([2, 4])

# predict() takes a 2-D (n_samples, n_features) array, so add a leading axis
lm.predict(X_0[np.newaxis, :])

array([5.07289561])

In [12]:
# in-sample R-squared: scored on the same (X, y) the model was fitted on
lm.score(X=X, y=y)

0.9798997316600129

## Assessing Model Accuracy

In [16]:
from sklearn.model_selection import train_test_split

# 50/50 split into training and test sets; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=1
)

# refit on the training half only
# (this rebinds `lm`, replacing the model that was fitted on all of X, y)
lm = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# R-squared on the held-out test half
lm.score(X=X_test, y=y_test)

0.9794930834681773