# SLR Practice

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor

import sklearn.metrics as metrics

import statsmodels.api as sm

from statsmodels.stats.stattools import durbin_watson


### Get the data

In [None]:
# Load the training set from the local data folder.
df = pd.read_csv(filepath_or_buffer='./data/train.csv')

I always fix my column names first. 

In [None]:
# Normalise the headers: trim surrounding whitespace, then lowercase each label.
df.columns = [
    raw_label.strip().lower()
    for raw_label in df.columns
]

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print a per-column overview (non-null counts and dtypes) to spot missing data.
df.info()

I'll create an SLR using one feature and `saleprice` as the target.

In [None]:
# Feature matrix (kept 2-D as a one-column DataFrame) and the target vector
X = df.loc[:, ['grlivarea']]
y = df['saleprice']

### Assumption: Linear Relationship between feature and target

In [None]:
# Assumption #1. Check to see if there is a linear relationship

plt.scatter(X['grlivarea'], y)
# Label the figure so it can be read on its own.
plt.xlabel('grlivarea')
plt.ylabel('saleprice')
# The trailing semicolon hides the text repr of the last expression.
plt.title('saleprice vs. grlivarea');

In [None]:
# another method: scatter plus a fitted regression line

# Pass the single feature as a 1-D Series. regplot expects 1-D vectors, and the
# one-column DataFrame `X` relies on being squeezed implicitly.
sns.regplot(x=X['grlivarea'], y=y);

In [None]:
# Square-root rule: floor(sqrt(n)) bins for n rows, then plot the feature's distribution
bins = int(len(X) ** 0.5)
plt.hist(x=X['grlivarea'], bins=bins);

In [None]:
# Same histogram for the target, reusing the bin count computed earlier
plt.hist(x=y, bins=bins);

What does this tell us?

In [None]:
# statsmodels first: add_constant supplies the intercept column for the design matrix
sm_lr = sm.OLS(endog=y, exog=sm.add_constant(X))

In [None]:
# Summary: fit the OLS model and display its regression results table.
# `sm_lr` itself remains the unfitted model; the fit result is not stored.
sm_lr.fit().summary()

### How do we compare this?

Baseline!

There's a [class](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html) for it!

In [None]:
# Instantiate the baseline: a regressor configured with the 'mean' strategy,
# i.e. it ignores the features and predicts the mean of the training target.
dummy_lr = DummyRegressor(strategy='mean')

In [None]:
# Fit the baseline on the same feature/target pair as the real model
dummy_lr.fit(X=X, y=y)

In [None]:
# Baseline score on the training data (the number to beat)
dummy_lr.score(X=X, y=y)

Well that's a relief. We're at least better than that.

### Error assumptions

#### Normally Distributed with a mean of 0

In [None]:
# Instantiate a scikit-learn linear regression with its default settings
sk_lr = LinearRegression()

In [None]:
# Fit the linear model on the single feature and the target
sk_lr.fit(X=X, y=y)

In [None]:
# Score the fitted model on the training data
sk_lr.score(X=X, y=y)

In [None]:
# In-sample predictions from the fitted model
preds = sk_lr.predict(X=X)

In [None]:
# calculate residuals (actual minus predicted)

# `y - preds` keeps the name of `y`, so without the rename any plot that labels
# an axis from the Series name would show the target's column name on a residual axis.
resids = (y - preds).rename('residual')

In [None]:
# Residual distribution, using the same bin count as the earlier histograms
plt.hist(x=resids, bins=bins);

In [None]:
# Residual histogram with a kernel density estimate overlaid
sns.displot(data=resids, bins=bins, kde=True);

In [None]:
# Q-Q plot of the residuals with a fitted reference line

sm.qqplot(data=resids, line='r');

#### Thoughts?

#### Heteroskedasticity

In [None]:
# resid plot: residuals from regressing saleprice on grlivarea

# Pass the feature as a 1-D Series rather than the one-column DataFrame `X`.
sns.residplot(x=X['grlivarea'], y=y)
# The vertical axis shows residuals, not raw sale prices, so label it accordingly.
plt.ylabel('residual');

In [None]:
# scatter of residuals against predicted values
plt.scatter(preds, resids)
# Label both axes; the trailing semicolon hides the text repr of the last call.
plt.xlabel('predicted saleprice')
plt.ylabel('residual');

In [None]:
# Residuals against predictions with a fitted trend line
sns.regplot(x=preds, y=resids)
# Set the labels explicitly: the predictions array carries no name, and the
# y label would otherwise come from whatever name the residual Series has.
plt.xlabel('predicted saleprice')
plt.ylabel('residual');

In [None]:
# Predicted values on the x-axis against the actual target on the y-axis
sns.regplot(x=preds,y=y);

#### Thoughts?

#### No autocorrelation in residuals

Hello, [Durbin-Watson!](https://en.wikipedia.org/wiki/Durbin%E2%80%93Watson_statistic)

In [None]:
# Durbin-Watson check on the residuals: print the reading guide, the statistic,
# then the finding and the resulting verdict on the assumption.
print(
    '\nPerforming Durbin-Watson Test',
    'Values of 1.5 < d < 2.5 generally show that there is no autocorrelation in the data',
    '0 to 2< is positive autocorrelation',
    '>2 to 4 is negative autocorrelation',
    '-------------------------------------',
    sep='\n',
)

durbinWatson = durbin_watson(resids)
print('Durbin-Watson:', durbinWatson)

# Finding: which side of the 1.5-2.5 band (if any) the statistic falls on.
if durbinWatson < 1.5:
    print('Signs of positive autocorrelation', '\n')
elif durbinWatson > 2.5:
    print('Signs of negative autocorrelation', '\n')
else:
    print('Little to no autocorrelation', '\n')

# Verdict: a flag in either direction means the assumption fails.
if durbinWatson < 1.5 or durbinWatson > 2.5:
    print('Assumption not satisfied')
else:
    print('Assumption satisfied')

The above code was audaciously stolen from [this](https://jeffmacaluso.github.io/post/LinearRegressionAssumptions/) excellent article.

#### Thoughts?

## Your turn!

Your goal is to find a feature or an interaction of features that outperforms this model. GO!