In [None]:
# Imports

from sklearn.datasets import make_regression

import statsmodels.api  as sm

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

## Make some Noise!

In [None]:
# Generate a reproducible toy regression problem: one feature, noisy target
X, y = make_regression(
    n_samples=1000,
    n_features=1,
    noise=12,
    random_state=42,
)

# Turn the target into a single-column 2-D array so it can sit next to X
y = y.reshape(-1, 1)

In [None]:
# Create a DataFrame for easier manipulation.
# The columns are built straight from the flattened arrays, so this cell does
# not rely on numpy being imported under the name `np`.
data = pd.DataFrame({'X': X.ravel(), 'y': y.ravel()})

In [None]:
# Visualize

# Turn on xkcd styling *before* the figure is created: the style only affects
# things drawn after the call, so a figure built earlier would miss it.
plt.xkcd()

fig, ax = plt.subplots(figsize=(10,10))

ax.set_title('Simple Regression')
# Scatter of the data plus seaborn's own fitted regression line (in red)
sns.regplot(x='X', y='y', data=data, ax=ax, line_kws={"color": "red"});

## [Statsmodels](https://www.statsmodels.org/stable/index.html) First

`statsmodels` tends to be used more for inferential work.

[Endogenous? Exogenous?](https://www.statsmodels.org/stable/endog_exog.html)

In [None]:
# Statsmodels First

# OLS in statsmodels does not include an intercept unless one is supplied,
# so a constant column is added to the predictor.
exog = sm.add_constant(data['X'])

# The target (dependent variable)
endog = data['y']

# Describe the model; nothing is estimated until .fit() is called
lin_reg_model = sm.OLS(endog=endog, exog=exog)

In [None]:
# Fit the OLS model; `results` holds the fitted estimates that the following
# cells summarize and read parameters from.
results  = lin_reg_model.fit()

In [None]:
# Display the full regression summary table for the fitted model
results.summary()

### Whoa. That's a lot of information!

No worries! [Here's](https://medium.com/swlh/interpreting-linear-regression-through-statsmodels-summary-4796d359035a) a good breakdown of how to interpret it.

Remember the equation for a line?

$y =  \beta_1x_1 + \beta_0$

Based on our summary, our line of best fit is

$y = 16.6x_1 + 0.05$

### Double check things

In [None]:
# `params` is a label-indexed Series (the constant column comes first because
# add_constant prepends it), so positional access goes through .iloc; plain
# `[0]` on a labelled Series is deprecated in pandas.
print(f'The intercept: {results.params.iloc[0]}')

In [None]:
# The slope is the second entry of the label-indexed `params` Series; use
# .iloc for positional access, since plain `[1]` on a labelled Series is
# deprecated in pandas.
print(f'The beta_1 value: {results.params.iloc[1]}')

In [None]:
# Rebuild the fitted line by hand: y_hat = beta_1 * X + beta_0.
# Parameters are read positionally with .iloc (intercept first, slope second)
# because plain integer `[]` access on a labelled Series is deprecated in pandas.
data['prediction'] = data['X']*results.params.iloc[1] + results.params.iloc[0]

In [None]:
# Residual = observed target minus the hand-computed prediction
data['residuals'] = data['y'].sub(data['prediction'])

In [None]:
# Peek at the first rows, now including the `prediction` and `residuals` columns
data.head()

### Metrics!

Mean Squared Error:

$MSE = \dfrac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$

Root Mean Squared Error (in the same units as the target):

$RMSE = \sqrt{\dfrac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$


In [None]:
# First, by hand

# Follow the MSE formula literally: sum the squared residuals, divide by n.
# Series.sum() keeps the reduction vectorized instead of looping over the
# values in Python with the builtin sum().
MSE = (data['residuals'] ** 2).sum() / len(data)

MSE

In [None]:
# RMSE is the square root of the hand-computed MSE
RMSE = MSE ** 0.5

RMSE

In [None]:
# Of course there's a library for that!

MSE_p = mean_squared_error(data['y'], data['prediction'])
# RMSE is the square root of the MSE. Taking the root here, rather than
# passing `squared=False`, keeps the cell working on scikit-learn releases
# where that keyword is deprecated or no longer accepted.
RMSE_p = MSE_p ** 0.5

print(f'Mean Squared Error = {MSE_p:.2f} \nRoot Mean Squared Error = {RMSE_p:.2f}')

## [scikit-learn](https://scikit-learn.org/stable/)

`sklearn` is set up for predictions!

In [None]:
# Build the inputs from the frame: a one-column DataFrame of features
# and a Series for the target
X = data[['X']]
y = data['y']

# Instantiate the algorithm
lr = LinearRegression()

# Fit the estimator to the data, creating the model
lr.fit(X, y)

In [None]:
# Coefficient: the fitted slope for the single feature X
lr.coef_

In [None]:
# Intercept: the fitted constant term of the scikit-learn model

lr.intercept_

In [None]:
# Predictions!

# In-sample predictions from the fitted scikit-learn model
data['y_pred'] = lr.predict(X)

# Difference between the observed target and the scikit-learn prediction
data['lr_diff'] = data['y'].sub(data['y_pred'])

In [None]:
# Peek at the first rows, now including the `y_pred` and `lr_diff` columns
data.head()

In [None]:
# Score the fitted model on the same data it was trained on
# (for LinearRegression, `score` reports the coefficient of determination, R^2)
lr.score(X,y)

In [None]:
MSE_lr = mean_squared_error(y, data['y_pred'])
# RMSE is the square root of the MSE. Taking the root here, rather than
# passing `squared=False`, keeps the cell working on scikit-learn releases
# where that keyword is deprecated or no longer accepted.
RMSE_lr = MSE_lr ** 0.5

print(f'Mean Squared Error = {MSE_lr:.2f} \nRoot Mean Squared Error = {RMSE_lr:.2f}')

In [None]:
# Visualize

# Turn on xkcd styling before the figure is created so this cell is styled
# correctly even when run on its own; the style only affects things drawn
# after the call.
plt.xkcd()

fig, ax = plt.subplots(figsize=(10,10))

ax.set_title('Simple Regression')
# Scatter of the data plus seaborn's own fitted regression line (in red)
sns.regplot(x='X', y='y', data=data, ax=ax, line_kws={"color": "red"})
# Overlay the scikit-learn predictions as green plus markers. Drawing through
# `ax` keeps the cell on the explicit Axes interface, and the trailing
# semicolon suppresses the Line2D repr in the cell output.
ax.plot(data['X'], data['y_pred'], 'g+', markersize=12);

In [None]:
# New data?
# One unseen observation: X_new is shaped as a single row with a single
# feature (it is passed to lr.predict below); y_new is the value plotted
# as its 'Actual' target.
X_new = [[2]]
y_new = [60]

In [None]:
# The model was fitted on a DataFrame, so give the new observation the same
# column names before predicting; passing the bare list makes scikit-learn
# warn that X does not have valid feature names.
new_pred = lr.predict(pd.DataFrame(X_new, columns=X.columns))

In [None]:
# Turn on xkcd styling before the figure is created so this cell is styled
# correctly even when run on its own; the style only affects things drawn
# after the call.
plt.xkcd()

fig, ax = plt.subplots(figsize=(10,10))

ax.set_title('Simple Regression')
# Scatter of the data plus seaborn's own fitted regression line (in red)
sns.regplot(x='X', y='y', data=data, ax=ax, line_kws={"color": "red"})
# Overlay the new observation: its actual target and the model's prediction.
# Everything is drawn through `ax` to stay on the explicit Axes interface.
ax.plot(X_new, y_new, 'mx', markersize=20, label='Actual')
ax.plot(X_new, new_pred, 'y*', markersize=20, label='Predicted')
ax.legend();