# Regression modeling with statsmodels

In [None]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std

%matplotlib inline

## Basic Example

In [None]:
# Load the bundled Spector dataset, then append a constant column to the
# regressors (prepend=False puts it last) so the fit below has an intercept.
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(
    spector_data.exog,
    prepend=False,
)

In [None]:
# Build the OLS model from the dataset's response (endog) and regressors
# (exog), fit it, and print the summary.
mod = sm.OLS(endog=spector_data.endog, exog=spector_data.exog)
res = mod.fit()
print(res.summary())

## More detailed example

In [None]:
# Create artificial data for a quadratic model.
# Seed the global NumPy RNG so the noise vector is reproducible.
np.random.seed(9876789)

# A single sample-size constant drives both the grid and the noise vector,
# so changing it cannot leave x and e with mismatched lengths.
nsample = 100
x = np.linspace(0, 10, nsample)

# Regressors: x and x**2 (the intercept column is added in a later step).
X = np.column_stack((x, x**2))

# Coefficients in the order (intercept, x, x**2); three entries because the
# design matrix gains a leading constant column before beta is applied.
beta = np.array([1, 0.1, 10])

# Standard-normal noise, one draw per sample.
e = np.random.normal(size=nsample)

In [None]:
# The model has an intercept term, so add a column of ones to the design
# matrix before applying the coefficients.
X = sm.add_constant(X)

# Response = linear predictor + noise.
y = X.dot(beta) + e

In [None]:
# Fit OLS on the simulated data and print the regression summary.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Report the estimated coefficients and the coefficient of determination.
for label, value in (
    ('Parameters: ', results.params),
    ('R2: ', results.rsquared),
):
    print(label, value)

In [None]:
# Simulate data where y depends non-linearly on x; the model stays linear in
# its parameters because the non-linear terms are columns of X.
nsample = 50
sig = 0.5  # scale applied to the standard-normal noise
x = np.linspace(0, 20, nsample)

# Design matrix columns: x, sin(x), (x - 5)^2, and a constant.
X = np.column_stack([
    x,
    np.sin(x),
    (x - 5) ** 2,
    np.ones(nsample),
])
beta = [0.5, 0.5, -0.02, 5.]

# Noise-free signal, then the observed response with scaled noise added.
y_true = X.dot(beta)
y = y_true + np.random.normal(size=nsample) * sig

In [None]:
# Fit OLS on the non-linear design matrix and print the regression summary.
res = sm.OLS(endog=y, exog=X).fit()
print(res.summary())

In [None]:
# Report the coefficient estimates, their standard errors, and the in-sample
# predictions from the fitted model.
for label, value in (
    ('Parameters: ', res.params),
    ('Standard errors: ', res.bse),
    ('Predicted values: ', res.predict()),
):
    print(label, value)

In [None]:
# Compare the true relationship with the OLS fit, including the lower and
# upper prediction bounds returned by wls_prediction_std.
prstd, iv_l, iv_u = wls_prediction_std(res)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label="data")
ax.plot(x, y_true, 'b-', label="True")
ax.plot(x, res.fittedvalues, 'r--.', label="OLS")

# Upper bound first, then lower, both as unlabeled dashed red lines.
for bound in (iv_u, iv_l):
    ax.plot(x, bound, 'r--')

# Trailing semicolon suppresses the notebook's echo of the Legend object.
ax.legend(loc='best');