# DS-SF-23 | Codealong 07 | Introduction to Regression and Model Fit, Part 2

In [1]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model

# Display settings for the notebook: show at most 10 rows and 10 columns of a
# DataFrame, and keep the HTML table representation switched on.
pd.set_option(
    'display.max_rows', 10,
    'display.notebook_repr_html', True,
    'display.max_columns', 10,
)

# IPython magic: draw matplotlib figures inline, in the notebook's cell output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the plots that follow.
plt.style.use('ggplot')

## Activity | Model's F-statistic

In [2]:
# Load the Zillow CSV (path is relative to the notebook's working directory)
# and use its 'ID' column as the row index.
df = pd.read_csv(
    os.path.join('..', '..', '07', 'datasets', 'zillow-07-start.csv'),
    index_col = 'ID',
)

In [8]:
# Fit an ordinary least-squares regression of SalePrice on IsAStudio using df;
# the fitted results object is kept in `model` for the cells below.
model = smf.ols(
    formula = 'SalePrice ~ IsAStudio',
    data = df,
).fit()

In [9]:
model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.07775
Date:,"Fri, 27 May 2016",Prob (F-statistic):,0.78
Time:,18:02:40,Log-Likelihood:,-1847.4
No. Observations:,986,AIC:,3699.0
Df Residuals:,984,BIC:,3709.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.3811,0.051,27.088,0.000,1.281 1.481
IsAStudio,0.0829,0.297,0.279,0.780,-0.501 0.666

0,1,2,3
Omnibus:,1682.807,Durbin-Watson:,1.488
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1342290.714
Skew:,10.942,Prob(JB):,0.0
Kurtosis:,182.425,Cond. No.,5.92


## Accessing the model's F-value and its p-value

### F-value (with significance level of `5%`)

In [None]:
model.fvalue

### Corresponding p-value

In [None]:
model.f_pvalue

## Part A - Linear Modeling with `sklearn`

In [None]:
# TODO

In [None]:
def linear_modeling_with_sklearn(X, y):
    """Fit a linear regression with sklearn and print its diagnostics.

    Fits ``linear_model.LinearRegression`` (with an intercept) on ``X`` and
    ``y``, then prints, in this order: the per-regressor F-values and
    p-values returned by ``feature_selection.f_regression``, the model's
    R^2 on the same data, the fitted intercept and the fitted coefficients.

    Parameters
    ----------
    X : regressors; passed unchanged to ``fit``, ``score`` and
        ``f_regression`` (the notebook passes a DataFrame of feature columns).
    y : response; passed alongside ``X`` (the notebook passes a Series).

    Returns
    -------
    None. The results are only printed.
    """
    model = linear_model.LinearRegression(fit_intercept = True)
    model.fit(X, y)

    # f_regression returns the pair (F-values, p-values); compute it once
    # instead of once per printed line.
    f_values, p_values = feature_selection.f_regression(X, y)

    # Single-argument print(...) produces the same text under Python 2 and
    # Python 3 (the original print statements are a SyntaxError on Python 3).
    print('F-statistic (performed for each regressor independently)')
    print('- F-value {}'.format(f_values))
    print('- p-value {}'.format(p_values))
    print('R^2 = {}'.format(model.score(X, y)))
    print('Coefficients')
    print('- beta_0 (intercept) = {}'.format(model.intercept_))
    print('- beta_n (n > 0)     = {}'.format(model.coef_))

### SalePrice ~ IsAStudio with `statsmodels`

In [None]:
# OLS fit of SalePrice on IsAStudio over subset_df; the summary is the cell's output.
# NOTE(review): subset_df is not defined in any visible cell - presumably the
# "# TODO" cell at the start of Part A is meant to create it; confirm.
smf.ols(
    formula = 'SalePrice ~ IsAStudio',
    data = subset_df,
).fit().summary()

### SalePrice ~ IsAStudio with `sklearn` (Simple Linear Modeling)

In [None]:
# Select the single regressor with a list of column names so that X is a
# DataFrame (not a Series); y is the SalePrice column.
# NOTE(review): subset_df must already exist - it is not defined in any visible cell.
X = subset_df.loc[:, ['IsAStudio']]
y = subset_df['SalePrice']

linear_modeling_with_sklearn(X, y)

### SalePrice ~ Size + LotSize with `statsmodels`

In [None]:
# OLS fit of SalePrice on Size and LotSize over subset_df; the summary is the cell's output.
# NOTE(review): subset_df must already exist - it is not defined in any visible cell.
smf.ols(
    formula = 'SalePrice ~ Size + LotSize',
    data = subset_df,
).fit().summary()

### SalePrice ~ Size + LotSize with `sklearn` (Multiple Linear Modeling)

In [None]:
# Two regressors (Size and LotSize) as a DataFrame; y is the SalePrice column.
# NOTE(review): subset_df must already exist - it is not defined in any visible cell.
X = subset_df.loc[:, ['Size', 'LotSize']]
y = subset_df['SalePrice']

linear_modeling_with_sklearn(X, y)

# Advertising dataset

In [None]:
# Load the advertising CSV (path relative to the notebook's working directory).
# This rebinds `df`: from here on it no longer refers to the Zillow data.
df = pd.read_csv(
    os.path.join('..', 'datasets', 'advertising.csv'),
)

In [None]:
df

## Plots

### Sales ~ TV

In [None]:
# TODO

### Sales ~ Radio

In [None]:
# TODO

### Sales ~ Newspaper

In [None]:
# TODO

## Simple linear regressions

### Sales ~ TV

In [None]:
# The formula is still the 'TODO' placeholder; going by the section heading,
# this cell is meant to fit Sales ~ TV once the placeholder is replaced.
model_tv = smf.ols(
    formula = 'TODO',
    data = df,
).fit()

In [None]:
model_tv.summary()

### Sales ~ Radio

In [None]:
# The formula is still the 'TODO' placeholder; going by the section heading,
# this cell is meant to fit Sales ~ Radio once the placeholder is replaced.
model_radio = smf.ols(
    formula = 'TODO',
    data = df,
).fit()

In [None]:
model_radio.summary()

### Sales ~ Newspaper

In [None]:
# The formula is still the 'TODO' placeholder; going by the section heading,
# this cell is meant to fit Sales ~ Newspaper once the placeholder is replaced.
model_newspaper = smf.ols(
    formula = 'TODO',
    data = df,
).fit()

In [None]:
model_newspaper.summary()

## Residuals

### Sales ~ TV

In [None]:
# TODO

In [None]:
# TODO

### Sales ~ Radio

In [None]:
# TODO

In [None]:
# TODO

### Sales ~ Newspaper

In [None]:
# TODO

In [None]:
# TODO

### Sales ~ TV + Radio + Newspaper

In [None]:
# TODO

In [None]:
# TODO

### Sales ~ TV + Radio

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

## Part B - Interaction Effects

### Sales ~ TV + Radio + TV * Radio

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

In [None]:
# TODO

## Part C - Binary/Dummy Variables

In [None]:
# Rebind `df` to the Zillow CSV for Part C (path is relative to the notebook's
# working directory), using its 'ID' column as the row index.
df = pd.read_csv(
    os.path.join('..', '..', '07', 'datasets', 'zillow-07-start.csv'),
    index_col = 'ID',
)

In [None]:
# TODO

In [None]:
# TODO

### What's the bathrooms' distribution in the dataset?

In [None]:
# TODO

### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [None]:
# TODO

In [None]:
# TODO

### We can create the binary variables manually

In [None]:
# TODO

In [None]:
df.columns

### But we can also use `get_dummies` from `pandas` (on `BedCount` for the sake of variety)

In [None]:
# TODO

In [None]:
beds_df

In [None]:
# Drop the '.0' suffix from the Bed_1.0 ... Bed_9.0 column labels.
# The rename is applied to beds_df itself (inplace), so nothing is reassigned.
beds_df.rename(
    columns = dict([
        ('Bed_1.0', 'Bed_1'),
        ('Bed_2.0', 'Bed_2'),
        ('Bed_3.0', 'Bed_3'),
        ('Bed_4.0', 'Bed_4'),
        ('Bed_5.0', 'Bed_5'),
        ('Bed_6.0', 'Bed_6'),
        ('Bed_7.0', 'Bed_7'),
        ('Bed_8.0', 'Bed_8'),
        ('Bed_9.0', 'Bed_9'),
    ]),
    inplace = True,
)

In [None]:
beds_df

In [None]:
# Add the columns of beds_df to df, matching rows on the index.
# NOTE(review): re-running this cell joins beds_df onto a df that already
# holds those columns - verify the outcome if the cell is executed twice.
df = df.join(other = [beds_df])

In [None]:
df.columns

### `SalePrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [None]:
# OLS fit of SalePrice on the Bath_2, Bath_3 and Bath_4 columns of df; the summary is the cell's output.
# NOTE(review): no visible cell creates the Bath_* columns - presumably the
# "create the binary variables manually" TODO cell is meant to; confirm.
smf.ols(
    formula = 'SalePrice ~ Bath_2 + Bath_3 + Bath_4',
    data = df,
).fit().summary()

### `SalePrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [None]:
# TODO

### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [None]:
# TODO

### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [None]:
# TODO