In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import random
from scipy.stats import chi2_contingency
import statsmodels.formula.api as smf
from sympy import solve, symbols
%matplotlib inline

# Simple Linear Regression model

Simple linear regression is an approach for predicting a **quantitative response** using a **single feature** (or "predictor" or "input variable"). It takes the following form: <p>
$$Y= \beta_0 +\beta_1 * X+\epsilon$$

What does each term represent?
- $Y$ is the response
- $X$ is the feature
- $\beta_0$ is the intercept
- $\beta_1$ is the coefficient for X
- $\epsilon$ is noise.

Together, $\beta_0$ and $\beta_1$ are called the **model coefficients**. <p>
And because $E(\epsilon)=0$, we have
$$E(Y)=\beta_0 +\beta_1 * E(X)$$

# Prediction Equation
$$\hat{y}=b_0+b_1x$$
- $\hat{y}$: predicted value
- $b_0$: estimated value of $\beta_0$
- $b_1$: estimated value of $\beta_1$

Generally speaking, model coefficients are estimated using the **least squares criterion**, which means we find the line that (mathematically) minimizes the **sum of squared residuals** (or "sum of squared errors", **SSE**):
$$SSE=\sum_{i=1}^n(y_i-\hat{y}_i)^2$$<p>
that is $$SSE=\sum_{i=1}^n(y_i-b_0-b_1x_i)^2$$<p>
$$b_1=\frac{S_{XY}}{S_{XX}}, b_0=\bar{y}-b_1\bar{x}$$
where $$S_{XX}=\sum_{i=1}^n(x_i-\bar{x})^2,~ S_{XY}=\sum_{i=1}^n[(y_i-\bar{y})(x_i-\bar{x})]$$

We need to define a function to compute **SSE**.

**Variance of model:** how much the position of the fitted line varies under repeated sampling. </p>
**Bias of model:** how far the model is, on average, from capturing the true relationship.<p>
The linear regression model is a **low-variance**, **high-bias** model. </p>
A closely related concept is **confidence intervals** for the model coefficients.

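For $\beta_0$, CI is $[b_0-t^*\cdot SE_{b_0},\ b_0+t^*\cdot SE_{b_0}]$</p>
For $\beta_1$, CI is $[b_1-t^*\cdot SE_{b_1},\ b_1+t^*\cdot SE_{b_1}]$<p>
(SE: Standard Error of the estimate; $t^*$: critical value of the $t$ distribution with $n-2$ degrees of freedom for the chosen confidence level)<p>
If we know the CI for $\beta_0$ is [a,b], i.e. $b_0-t^*\cdot SE_{b_0}=a$, $b_0+t^*\cdot SE_{b_0}=b$, we can deduce that $b_0=\frac{a+b}{2}$ and that the margin of error is $t^*\cdot SE_{b_0}=\frac{b-a}{2}$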

In [3]:
# Training data: 25 paired observations of the feature X and the response Y.
X=[1,1,1,2,2,3,4,5,6,6,7,8,9,1,1,1,2,4,5,5,20,16,15,15,6]
Y=[50,60,60,55,55,60,70,65,70,85,85,85,50,55,40,50,70,60,70,75,100,90,85,90,60]
# Feature values at which predictions are requested.
predict_x=[10,5,11,12,13,25,15]
newhours=[12,23,11,5]

data1=pd.DataFrame({"X":X,"Y":Y})
# Ordinary least squares fit of Y on X; the formula interface adds the intercept term.
mm=smf.ols(formula='Y ~ X', data=data1).fit()

# New data must use the same feature name ("X") as the one used when the model was built.
test_X=pd.DataFrame({"X":predict_x})
test_new=pd.DataFrame({"X":newhours})

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(test_X)
# Look the coefficients up by label: integer keys on a label-indexed Series are
# deprecated in pandas. "%s" keeps the original space-separated str() output.
print("%s %s" % (mm.params['Intercept'], mm.params['X']))
print(mm.predict(test_X))
print(mm.predict(test_new))

    X
0  10
1   5
2  11
3  12
4  13
5  25
6  15
54.1928745503 2.32998723454
[  77.4927469    65.84281072   79.82273413   82.15272136   84.4827086
  112.44255541   89.14268307]
[  82.15272136  107.78258094   79.82273413   65.84281072]


In [5]:
# conf_int(alpha) yields one row per coefficient with the lower/upper bounds of
# the (1 - alpha) confidence interval.
print(mm.conf_int(0.01)) # alpha=0.01 , hence confidence level is 0.99
print(mm.conf_int()) # 95% confidence level (default alpha)
print(mm.conf_int(0.05)) # 95% confidence level
print(type(mm.conf_int(0.05)))
# Select the row by label with .loc; the .ix indexer was removed in pandas 1.0.
print(mm.conf_int(0.01).loc['Intercept']) # 99% confidence interval for beta0

                   0          1
Intercept  46.242656  62.143093
X           1.317693   3.342282
                   0          1
Intercept  48.334551  60.051198
X           1.584052   3.075923
                   0          1
Intercept  48.334551  60.051198
X           1.584052   3.075923
<class 'pandas.core.frame.DataFrame'>
0    46.242656
1    62.143093
Name: Intercept, dtype: float64


In [6]:
# R-squared (coefficient of determination) of the fitted model `mm`; shown as the cell output.
mm.rsquared

0.64480108478847975

In [None]:
def RMSE(x,y,lm):
    """Residual standard error of a fitted simple linear regression.

    Computes ``sqrt(SSE / (n - 2))`` where ``SSE`` is the sum of squared
    differences between ``y`` and the line ``params[0] + params[1] * x``.
    The ``n - 2`` denominator accounts for the two estimated coefficients,
    so this is the residual standard error rather than a plain RMSE.

    Parameters
    ----------
    x : sequence of numbers
        Feature values (list, NumPy array or pandas Series).
    y : sequence of numbers
        Observed responses, same length as ``x``.
    lm : object
        Anything exposing ``params`` whose first two entries are the
        intercept and the slope, in that order.

    Returns
    -------
    numpy.float64
        The residual standard error. The result is not meaningful for fewer
        than three observations, because the denominator ``n - 2`` is then
        zero or negative.
    """
    # Coerce to float arrays so plain lists work and integer inputs never hit
    # truncating division on Python 2.
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    # Converting params to an array makes the lookup positional; integer keys
    # on a label-indexed pandas Series are deprecated.
    coefs = np.asarray(lm.params, dtype=float)
    intercept, slope = coefs[0], coefs[1]
    y_predicted = intercept + slope * x_arr
    sse = ((y_arr - y_predicted) ** 2).sum()
    return np.sqrt(sse / (len(x_arr) - 2))