<div class="notebook-buttons" style="display:flex; padding-top: 5rem;padding-bottom: 2.5rem;line-height: 2.15;">
    <a href="https://colab.research.google.com/github/magdasalatka/fantastic-features/blob/main/main.ipynb">
        <div id="colab-link" style="display: flex;padding-right: 3.5rem;padding-bottom: 0.625rem;border-bottom: 1px solid #ececed; align-items: center;">
            <img class="call-to-action-img" src="img/colab.svg" width="30" height="30" style="margin-right: 10px;margin-top: auto;margin-bottom: auto;">
            <div class="call-to-action-txt">Run in Google Colab</div>
        </div>
    </a>
    <a href="https://raw.githubusercontent.com/magdasalatka/fantastic-features/main/main.ipynb" download>
        <div id="download-link" style="display: flex;padding-right: 3.5rem;padding-bottom: 0.625rem;border-bottom: 1px solid #ececed; height: auto;align-items: center;">
            <img class="call-to-action-img" src="img/download.svg" width="22" height="30" style="margin-right: 10px;margin-top: auto;margin-bottom: auto;">
            <div class="call-to-action-txt">Download Notebook</div>
        </div>
    </a>
    <a href="https://github.com/magdasalatka/fantastic-features/blob/main/main.ipynb">
        <div id="github-link" style="display: flex;padding-right: 3.5rem;padding-bottom: 0.625rem;border-bottom: 1px solid #ececed; height: auto;align-items: center;">
            <img class="call-to-action-img" src="img/github.svg" width="25" height="30" style="margin-right: 10px;margin-top: auto;margin-bottom: auto;">
            <div class="call-to-action-txt">View on GitHub</div>
        </div>
    </a>
</div>

# Back to the Feature
### Statistical feature engineering

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels as sm
# `import statsmodels` alone does not load its submodules; import the ones the
# notebook reaches through `sm.api` and `sm.graphics.tsaplots` explicitly.
import statsmodels.api
import statsmodels.graphics.tsaplots
from sklearn import datasets, linear_model

# Load the scikit-learn diabetes dataset as a (features, target) pair.
X, y = datasets.load_diabetes(return_X_y=True)

In [None]:
# Shape of the feature matrix X.
X.shape

In [None]:
# Shape of the target array y.
y.shape

In [None]:
# Sequential 80/20 split: the leading 80% of rows are used for fitting,
# the trailing 20% are held out.
thershold = int(0.8 * len(X))
X_train, y_train = X[:thershold], y[:thershold]
X_test, y_test = X[thershold:], y[thershold:]

In [None]:
# Fit ordinary least squares on the training split.
lm = linear_model.LinearRegression()
# `fit` returns the estimator itself, so `model` and `lm` are the same fitted object.
model = lm.fit(X=X_train, y=y_train)
# In-sample predictions; the diagnostic cells below are built on these.
fitted = model.predict(X=X_train)

### Regression results

In [None]:
# Statsmodels OLS on the *training* split, so the summary table describes the
# same fit as `model` (which was trained on X_train / y_train) and matches the
# residual diagnostics that follow.
# `add_constant` prepends a column of ones so the regression has an intercept.
X2 = sm.api.add_constant(X_train)
est = sm.api.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

In [None]:
# VALUES : Actual vs predicted
# In-sample predictions on the x-axis against the observed training targets.
fig, ax = plt.subplots()
ax.scatter(x=fitted, y=y_train)
ax.set_xlabel(xlabel="Predicted")
ax.set_ylabel(ylabel="Observed")

In [None]:
# Coefficients


In [None]:
# How does the data fit the model?
# One panel per feature: the training scatter plus the line given by the model
# intercept and that feature's coefficient alone.

fig, axs = plt.subplots(5,2, figsize=(15,20))

for i, x in enumerate(X_train.T):
    # Row-major position i in the grid, i.e. row i // 2, column i % 2.
    panel = axs.flat[i]
    panel.scatter(x, y_train)
    panel.set_xlabel("Independent var {}".format(i))
    panel.set_ylabel("Dependent variable")

    # Draw the line across the panel's current x-limits.
    x_vals = np.array(panel.get_xlim())
    y_vals = model.intercept_ + model.coef_[i] * x_vals
    panel.plot(x_vals, y_vals, '--', color="orange")
    panel.grid(True)

# Regression diagnostics

## Assumption 1: 
E[error] = 0

In [None]:
# Residual = observed - fitted (the conventional sign): a positive residual
# means the model under-predicted that observation.
residuals = y_train - fitted
# Sample mean of the residuals, as an estimate of E[error].
print("Expected error estimate: {}".format(np.mean(residuals)))

In [None]:


# Residuals against fitted values, with a horizontal reference line at zero.
fig, ax = plt.subplots()
ax.grid(True)
ax.scatter(fitted, residuals)
ax.set_xlabel("Fitted values")
ax.set_ylabel("Residuals")
ax.hlines(0, xmin=min(fitted), xmax=max(fitted), colors="orange")
ax.set_title('Residuals vs Fitted')

plt.show()

## Assumption 2:
var(error) = constant (homoscedasticity)

In [None]:
# Order the residuals by their fitted value so that each window covers a
# contiguous range of predictions.
sorted_residuals = [res for _, res in sorted(zip(fitted, residuals))]
window = 100 # TODO: Try different windows

# Split into consecutive windows of `window` residuals. The last window absorbs
# any remainder, and there is always at least one window, so inputs shorter
# than `window` (or an exact multiple of it) are handled without special cases.
n_windows = max(len(sorted_residuals) // window, 1)

variances = []
for k in range(n_windows):
    start = k * window
    is_last = k == n_windows - 1
    stop = len(sorted_residuals) if is_last else start + window
    variances.append(np.var(sorted_residuals[start:stop]))

variances

In [None]:
# Plot as above
# The variance windows are defined by rank in the fitted-value ordering, while
# the x-axis shows fitted values. Translate each window's starting rank into the
# fitted value at that rank so the red lines mark the real window boundaries.
sorted_fitted = np.sort(fitted)
window_edges = sorted_fitted[:len(variances) * window:window]

fig, ax = plt.subplots()
ax.grid(True)
ax.scatter(fitted, residuals)
ax.set_xlabel("Fitted values")
ax.set_ylabel("Residuals")
ax.vlines(window_edges, ymin=min(residuals), ymax=max(residuals), colors="red")
ax.set_title('Residuals vs Fitted')

plt.show()


In [None]:
# Bar chart of the per-window residual variances, in window order.
plt.bar(x=range(len(variances)), height=variances)
plt.title(label='Variances')

## Assumption 3:
errors ~ N(0, const), i.e. the errors are normally distributed with zero mean and constant variance

In [None]:
# TODO: try different bins ?
# Density-normalised histogram of the residuals.
plt.hist(residuals, bins=50, density=True, facecolor='g', alpha=0.75)

plt.title('Residuals: empirical distribution')
plt.xlabel('Error')
plt.ylabel('Probability')
plt.grid(True)
plt.show()

In [None]:
# Standard-normal sample; it is not passed to the Q-Q plot below.
test = np.random.normal(loc=0, scale=1, size=1000)

# Rescale the residuals to unit standard deviation before comparing their
# quantiles against the 45-degree reference line.
scaled_residuals = residuals / np.std(residuals)
fig = sm.api.qqplot(scaled_residuals, line='45')

## Assumption 4: 
errors are not correlated


In [None]:
# Check 1: Residuals vs lagged residuals
# Autocorrelation plot of the residuals (statsmodels `plot_acf`); the residuals
# are taken in their training-row order.
fig = sm.graphics.tsaplots.plot_acf(residuals)

In [None]:
# Check 1a (optional): Residuals vs lagged residuals
# One scatter panel per lag (2 to 5): each residual against the one `i` rows earlier.
fig, axs = plt.subplots(2,2, figsize=(12,10))

# The zero reference line spans the full residual range in every panel.
resid_lo, resid_hi = min(residuals), max(residuals)

for panel, i in zip(axs.flat, range(2,6)):
    n_pairs = len(residuals) - i
    panel.scatter(residuals[:n_pairs], residuals[i:])
    panel.set_xlabel("Lagged residuals: {}".format(i))
    panel.set_ylabel("Residuals")
    panel.hlines(0, xmin=resid_lo, xmax=resid_hi, colors="orange")
    panel.grid(True)

In [None]:
# Check 2: Residuals vs independent variables
# One panel per training feature, with a zero reference line across that
# feature's observed range.
fig, axs = plt.subplots(5,2, figsize=(15,20))

for i, x in enumerate(X_train.T):
    # Row-major position i in the grid, i.e. row i // 2, column i % 2.
    panel = axs.flat[i]
    panel.scatter(x, residuals)
    panel.set_xlabel("Independent var {}".format(i))
    panel.set_ylabel("Residuals")
    panel.hlines(0, xmin=min(x), xmax=max(x), colors="orange")
    panel.grid(True)

In [None]:
# Check 2: Correlations
# Pairwise correlation matrix of the training features, drawn as a heat map.
df = pd.DataFrame(X_train)
numeric_columns = df.select_dtypes(['number']).columns
tick_positions = range(len(numeric_columns))

f = plt.figure(figsize=(8, 8))
plt.matshow(df.corr(), fignum=f.number)
plt.xticks(tick_positions, numeric_columns, fontsize=14, rotation=45)
plt.yticks(tick_positions, numeric_columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);