In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from sklearn.datasets import load_diabetes
from math import sqrt

import os
# Echo the kernel's current working directory as the cell output.
os.getcwd()

## Let's start with a simple example

In [None]:
# Load scikit-learn's diabetes dataset; as_frame=True asks for pandas objects
# (the `.frame` attribute is read in a later cell).
diabetes = load_diabetes(as_frame=True)
# Print the description text that ships with the dataset.
print(diabetes.DESCR)

In [None]:
# Take the DataFrame out of the loaded dataset object; the regression that
# follows reads its 'bmi' and 'target' columns.
diabetes_df = diabetes.frame
# Preview the first rows.
diabetes_df.head()

In [None]:
# Ordinary least squares regression of 'target' on 'bmi' (intercept included
# by the formula interface). Specify the model first, then fit it.
bmi_model = smf.ols(formula='target ~ bmi', data=diabetes_df)
results = bmi_model.fit()
results.summary()

In [None]:
# Draw the fitted regression line, then overlay the observed points on the
# same axes so the fit can be judged visually.
fig = sm.graphics.abline_plot(model_results=results)
ax = fig.axes[0]
ax.scatter(diabetes_df['bmi'], diabetes_df['target'])
ax.set(xlabel='bmi', ylabel='target', title='target ~ bmi')
# plt.show has to be *called*; a bare `plt.show` only echoes the function object.
plt.show()

## What if we use simulated data?

In [None]:
# True parameters of the simulated model y = beta_0 + beta_1*x + epsilon.
beta_1 = 5
beta_0 = 2

# Seed NumPy's legacy global generator so the draw below, and every later
# np.random call in the notebook, is reproducible on Restart & Run All.
np.random.seed(5600)

# 50 integer predictor values drawn uniformly from 0..24 (high is exclusive).
x = np.random.randint(low=0, high=25, size=(50,))
print(x)

In [None]:
# Noise term: 50 independent draws from a normal distribution centred at 0
# with standard deviation 5.
epsilon = np.random.normal(loc=0.0, scale=5.0, size=50)
print(epsilon)

In [None]:
# The linear regression equation, built in two steps:
# the deterministic line first, then the random noise added on top.
signal = beta_0 + beta_1 * x
y = signal + epsilon
print(y)

In [None]:
# Collect the simulated predictor and response into one frame so the
# formula interface can refer to them by column name.
df = pd.DataFrame(dict(x=x, y=y))
df.head()

In [None]:
# Scatter plot of the simulated data, with labels so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set(xlabel='x', ylabel='y', title='Simulated data')
plt.show()

In [None]:
# Fit y on x by ordinary least squares using the simulated frame.
sim_model = smf.ols(formula='y ~ x', data=df)
results = sim_model.fit()
results.summary()

In [None]:
# Fitted line for the simulated data with the simulated points overlaid.
fig = sm.graphics.abline_plot(model_results=results)
ax = fig.axes[0]
ax.scatter(df['x'], df['y'])
ax.set(xlabel='x', ylabel='y', title='y ~ x (simulated)')
# plt.show has to be *called*; a bare `plt.show` only echoes the function object.
plt.show()

#### What would happen if we shrank the standard deviation of the noise (the `scale` argument of `np.random.normal` above)?

## Here is an example with some real data

In [None]:
# Directory holding the housing CSV files. Set the DATA_5600_DIR environment
# variable to run this notebook on another machine; the default is the
# original author's location, so existing setups keep working unchanged.
DATA_DIR = os.environ.get('DATA_5600_DIR', '/Users/sharad/Courses/DATA_5600/Data')

inventory_df = pd.read_csv(os.path.join(DATA_DIR, 'Metro_invt_fs_uc_sfrcondo_sm_month.csv'))
inventory_df.head()

In [None]:
# Show only the rows whose StateName is 'UT'.
is_utah = inventory_df['StateName'] == 'UT'
inventory_df[is_utah]

In [None]:
# Keep only the row(s) for the Salt Lake City region.
is_slc_inventory_row = inventory_df['RegionName'] == 'Salt Lake City, UT'
slc_inventory = inventory_df[is_slc_inventory_row]
slc_inventory

In [None]:
# Directory holding the housing CSV files. Set the DATA_5600_DIR environment
# variable to run this notebook on another machine; the default is the
# original author's location, so existing setups keep working unchanged.
DATA_DIR = os.environ.get('DATA_5600_DIR', '/Users/sharad/Courses/DATA_5600/Data')

sale_to_list_df = pd.read_csv(os.path.join(DATA_DIR, 'Metro_mean_sale_to_list_uc_sfrcondo_month.csv'))
sale_to_list_df.head()

In [None]:
# Keep only the row(s) for the Salt Lake City region.
is_slc_stol_row = sale_to_list_df['RegionName'] == 'Salt Lake City, UT'
slc_stol = sale_to_list_df[is_slc_stol_row]
slc_stol

In [None]:
# Flip both frames so that every original column becomes a row.
slc_stol, slc_inventory = slc_stol.T, slc_inventory.T

slc_stol.head(n=10)

In [None]:
# Discard the first five rows of each transposed frame.
# NOTE(review): presumably these are the non-date descriptor fields
# (e.g. RegionName, StateName) -- confirm against the CSV layout.
slc_stol = slc_stol.iloc[5:]
slc_inventory = slc_inventory.iloc[5:]
slc_inventory.head()

In [None]:
# Preview the sale-to-list frame after the leading rows were dropped.
slc_stol.head()

In [None]:
# Give the single data column of each frame a descriptive name.
# The result is reassigned rather than using `inplace=True`: that keyword was
# removed from DataFrame.set_axis in pandas 2.0, and in-place edits on a frame
# derived by slicing can trigger SettingWithCopyWarning.
slc_inventory = slc_inventory.set_axis(['inventory'], axis=1)

slc_stol = slc_stol.set_axis(['sale_to_list_ratio'], axis=1)

slc_inventory.head()

In [None]:
# Move the row labels into an ordinary column and call that column 'month',
# so both frames share a key that can be merged on.
slc_inventory = slc_inventory.reset_index().rename(columns={'index': 'month'})
slc_stol = slc_stol.reset_index().rename(columns={'index': 'month'})

slc_inventory.head()

In [None]:
# Preview the sale-to-list frame now that it has a 'month' column.
slc_stol.head()

In [None]:
# Inner-join the two series on 'month': only months present in both frames
# are kept.
slc_housing = pd.merge(slc_inventory, slc_stol, how='inner', on='month')
slc_housing.head()

In [None]:
# (rows, columns) of the merged frame before missing values are removed.
slc_housing.shape

In [None]:
# Remove every row that has a missing value in any column, then report the
# new (rows, columns) so it can be compared with the shape shown before.
slc_housing = slc_housing.dropna()
slc_housing.shape

In [None]:
# Scatter plot of the two housing series, with labels so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(slc_housing['inventory'], slc_housing['sale_to_list_ratio'])
ax.set(xlabel='inventory', ylabel='sale_to_list_ratio',
       title='Salt Lake City: sale-to-list ratio vs. inventory')
plt.show()

In [None]:
# Inspect column dtypes and non-null counts before converting to numeric.
slc_housing.info()

In [None]:
# Convert both measurement columns to numeric dtype so they can be used in
# arithmetic and regression, then confirm the dtypes.
for numeric_col in ("inventory", "sale_to_list_ratio"):
    slc_housing[numeric_col] = pd.to_numeric(slc_housing[numeric_col])
slc_housing.info()

In [None]:
# Regress the sale-to-list ratio on inventory by ordinary least squares.
housing_model = smf.ols(formula='sale_to_list_ratio ~ inventory', data=slc_housing)
results = housing_model.fit()
results.summary()

In [None]:
# Fitted line for the housing regression with the observed points overlaid;
# axes are labelled so the figure is readable on its own.
fig = sm.graphics.abline_plot(model_results=results)
ax = fig.axes[0]
ax.scatter(slc_housing['inventory'], slc_housing['sale_to_list_ratio'])
ax.set(xlabel='inventory', ylabel='sale_to_list_ratio',
       title='sale_to_list_ratio ~ inventory')
plt.show()

In [None]:
# Standardise each measurement column to a z-score,
# (value - column mean) / column standard deviation,
# and store it next to the original as '<name>_norm'.
for raw_col in ('inventory', 'sale_to_list_ratio'):
    raw_values = slc_housing[raw_col]
    slc_housing[raw_col + '_norm'] = (raw_values - raw_values.mean()) / raw_values.std()

slc_housing.head()

In [None]:
# Scatter plot of the standardised series, with labels so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(slc_housing['inventory_norm'], slc_housing['sale_to_list_ratio_norm'])
ax.set(xlabel='inventory_norm', ylabel='sale_to_list_ratio_norm',
       title='Salt Lake City: standardised sale-to-list ratio vs. inventory')
plt.show()

In [None]:
# Same regression as before, now on the standardised columns.
norm_model = smf.ols(formula='sale_to_list_ratio_norm ~ inventory_norm', data=slc_housing)
results = norm_model.fit()
results.summary()

## Calculate Coefficients by Hand

$$\hat{\beta}_1=\frac{\sum_{i=1}^n (x_i - \bar{x})(y_i-\bar{y})}{\sum_{i=1}^n (x_i - \bar{x})^2}$$


$$\hat{\beta}_0=\bar{y} - \hat{\beta}_1 \bar{x}$$

In [None]:
# Slope estimate from the closed-form formula:
# sum of cross-deviations divided by sum of squared x-deviations.
x_norm = slc_housing['inventory_norm']
y_norm = slc_housing['sale_to_list_ratio_norm']

xbar = x_norm.mean()
ybar = y_norm.mean()

x_dev = x_norm - xbar
y_dev = y_norm - ybar

numerator = (x_dev * y_dev).sum()
denominator = (x_dev ** 2).sum()

# NOTE: this rebinds `beta_1`, which earlier held the true slope of the
# simulated data; re-running the simulation cells afterwards would pick up
# this estimate instead.
beta_1 = numerator / denominator
beta_1

In [None]:
# Intercept estimate: mean of y minus the slope times the mean of x,
# using xbar, ybar and beta_1 from the previous cell.
# NOTE: this rebinds `beta_0`, which earlier held the true intercept of the
# simulated data.
beta_0 = ybar - beta_1*xbar
beta_0

### How would you calculate the SE, confidence interval, and test statistic by hand? (Homework)

## Plot Residuals

In [None]:
# Refit the standardised model so this cell does not depend on which model
# `results` last referred to.
results = smf.ols('sale_to_list_ratio_norm ~ inventory_norm', data=slc_housing).fit()

# Regression diagnostics for the single predictor, drawn onto a figure of the
# requested size.
fig = plt.figure(figsize=(8, 6))
# Bind the returned figure to a name instead of leaving it as the cell's last
# expression; otherwise the notebook typically renders the same figure twice
# (once as the cell output, once by the inline backend).
fig = sm.graphics.plot_regress_exog(results, 'inventory_norm', fig=fig)
plt.show()