# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug  8 20:48:37 2017

@author: amiao
"""

# packages ------
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import scipy.stats as stats

# Display / plotting configuration.
# These calls must come after the imports: `pd` and `plt` do not exist until
# pandas and matplotlib have been imported.
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)
plt.style.use('ggplot')

# Read Data ------------------------------
# Load the dataset from the author's local copy (absolute Windows path).
df = pd.read_csv("C:\\Users\\amiao\\Google Drive\\Study\\GA\\datasets\\dataset-08-zillow.csv")

# Process Data ------------------------------
# Rescale both area columns by 1/1000 so they are expressed in 1,000 sqft.
df['Size'] = df['Size'] / 10 ** 3
df['LotSize'] = df['LotSize'] / 10 ** 3
# Keep only rows that have both the predictor (Size) and the response (SalePrice).
df = df.dropna(subset=['Size', 'SalePrice'])
    
# Run Model ---------------------------------
X = df[['Size']]  # X is a DataFrame (double brackets keep it 2-D)
X = sm.add_constant(X)  # add a constant column so the fit includes an intercept
y = df.SalePrice  # y is a Series
# The array-based OLS class is exposed by statsmodels.api (sm).  The formula
# namespace (smf) provides the lowercase, formula-driven `ols` instead, so
# `smf.OLS` raises AttributeError on current statsmodels releases.
model = sm.OLS(y, X).fit()
model.summary()

# Predict
# Build a design matrix for three Size values (with the same constant column
# the model was fitted on) and score it with the fitted model.
predict_X = sm.add_constant(
    pd.DataFrame({'Size': [1.2, 1.4, 1.6]}, columns=['Size'])
)
predict_y = model.predict(predict_X)

# Model Parameters
# Bare expressions: they display their value in an interactive session and
# are otherwise evaluated and discarded.
model.params                # all fitted coefficients
model.params['Size']        # coefficient on Size only
model.tvalues               # t-values
model.pvalues               # p-values
model.fvalue                # f-value
# alpha = .10 gives the interval between the 5% and 95% bounds; label the
# two output columns accordingly.
model.conf_int(alpha=.10).rename(columns={0: '5%', 1: '95%'})
model.rsquared
model.rsquared_adj

# Using Formulas smf ---------------------------------------
# Two fits through the formula interface; `model` is rebound each time, so
# after this section it refers to the second (no-intercept) fit.
model = smf.ols('SalePrice ~ Size', data=df).fit()      # with intercept
model = smf.ols('SalePrice ~ 0 + Size', data=df).fit()  # without intercept

# dropping Outliers-------------------------------------
def drop_outliers(df, column='SalePrice', whisker=1.5):
    """Remove IQR outliers of `column` from `df`, modifying `df` in place.

    A row is treated as an outlier when its value in `column` is below
    ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR``, where Q1/Q3 are
    the 25th/75th percentiles of `column` and ``IQR = Q3 - Q1``.  Progress
    (row counts and the quartiles) is printed to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.  It is mutated; nothing is returned.
    column : str, default 'SalePrice'
        Name of the numeric column the fences are computed on.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Notes
    -----
    - Rows are removed by index label (``DataFrame.drop``), so if the index
      contains duplicate labels every row sharing a label with an outlier
      is removed as well.
    - Rows whose `column` value is missing are kept: both fence comparisons
      evaluate to False for NaN.
    """
    print('Dropping outliers')
    print('- n (before) =', df.shape[0])

    values = df[column]
    Q1 = values.quantile(.25)
    Q2 = values.quantile(.5)
    Q3 = values.quantile(.75)
    IQR = Q3 - Q1

    print('- Q1         =', Q1, '($M)')
    print('- Q2/Median  =', Q2, '($M)')
    print('- Q3         =', Q3, '($M)')

    # Fences are computed once, before anything is dropped.
    lower_fence = Q1 - whisker * IQR
    upper_fence = Q3 + whisker * IQR
    is_outlier = (values < lower_fence) | (values > upper_fence)

    df.drop(values[is_outlier].index, inplace=True)
    print('- n (after)  =', df.shape[0])
    
# check residual distributions--------------------------------
# seaborn is used by lmplot below, but the file's only other seaborn import
# sits further down (sklearn section); import it here so `sns` is defined
# when this section runs top-to-bottom.
import seaborn as sns

model.resid.plot(kind='hist', bins=250, figsize=(8, 8))   # histogram of residuals
sm.qqplot(model.resid, line='s')                          # Q-Q plot of residuals
sm.graphics.plot_regress_exog(model, 'Size', fig=plt.figure(figsize=(12, 8)))
sns.lmplot(x='Size', y='SalePrice', data=df)              # scatter with fitted line


# Linear Regression Modeling with sklearn
from sklearn import linear_model
import seaborn as sns

# Fit on the same X / y used for the statsmodels array-based model.
# NOTE(review): X still carries the column added by sm.add_constant while
# LinearRegression is constructed with its defaults -- confirm whether the
# extra constant column is intended here.
model_linear = linear_model.LinearRegression()
model_linear.fit(X, y)

# Bare expressions: shown in an interactive session, discarded otherwise.
model_linear.intercept_
model_linear.coef_
model_linear.predict(X)
model_linear.score(X, y)

# one-hot encoding
# Restrict to rows whose Baths value is 1, 2, 3 or 4.
df = df[df['Baths'].isin([1, 2, 3, 4])]
df['Baths'].value_counts(dropna=False).sort_index()

# One indicator column per Baths value, prefixed with 'Bath'.
baths_df = pd.get_dummies(df['Baths'], prefix='Bath')
# Normalise labels such as 'Bath_1.0' to 'Bath_1'; labels not listed in the
# mapping are left unchanged.
baths_df = baths_df.rename(columns={
    'Bath_1.0': 'Bath_1',
    'Bath_2.0': 'Bath_2',
    'Bath_3.0': 'Bath_3',
    'Bath_4.0': 'Bath_4',
})
df = df.join([baths_df])

# Bath_4 is deliberately absent from the formula; only three of the four
# indicators enter the model alongside the intercept.
smf.ols('SalePrice ~ Bath_1 + Bath_2 + Bath_3', data=df).fit().summary()

