In [37]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score

from math import sqrt
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import env
import util
from wrangle_zillow import wrangle_zillow
import explore
import split_scale
import features_zillow
import model_zillow
import evaluate

# Model Notes and Practice

- Regression is a supervised machine learning technique.


- Linear Regression is used to model relationships between one or more independent variables and a continuous target dependent variable.


- Our goal is to find the line of best fit, or in other words, the equation (y-intercept and coefficient(s)) that minimizes the squared errors between the actual (y) and predicted (yhat) target values.

In [7]:
# Load the 'tips' example dataset through pydataset's data() helper.
df = data('tips')

In [8]:
# Peek at a single row to confirm the column names and value formats.
df.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2


In [9]:
# Partition the rows into a training frame and a held-out test frame using the project helper.
# NOTE(review): the split ratio and any random seed are set inside split_scale.split_my_data —
# confirm it is seeded, otherwise the numbers below change on every rerun.
train, test = split_scale.split_my_data(df)

In [10]:
# Sanity check: (rows, columns) of each partition; both should keep every column.
print(train.shape, test.shape)

(170, 7) (74, 7)


In [11]:
# Single feature (party size) and target (tip) for each partition.
# Double brackets keep each selection a one-column DataFrame rather than a Series,
# so the feature matrix is 2-D as sklearn expects.
X_train, y_train = train[['size']], train[['tip']]
X_test, y_test = test[['size']], test[['tip']]

## Create the LR Object using sklearn

In [12]:
# Unfitted estimator with default settings; echoing the name shows its parameters.
lm1 = LinearRegression()
lm1

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Fit/Train the Model

In [None]:
# Estimate the intercept and slope from the training rows only.
# NOTE(review): the saved notebook shows no execution count for this cell; run the
# notebook top to bottom so the cells below read a freshly fitted lm1.
lm1.fit(X_train, y_train)

## Return the intercept and coefficients created by model

In [13]:
# Pull the fitted parameters off the estimator first, then report them.
# Both come back as arrays (not scalars) because the target was a one-column DataFrame.
lm1_y_intercept = lm1.intercept_
lm1_coefficients = lm1.coef_

print('intercept: ', lm1_y_intercept)
print('coefficients: ', lm1_coefficients)

intercept:  [1.3281486]
coefficients:  [[0.62602051]]


## Make Predictions

In [15]:
# In-sample predictions: these are for the training rows, not the held-out test set.
yhat = lm1.predict(X_train)

In [17]:
# predict() returns a NumPy array, not a DataFrame; it is 2-D here (one column)
# because the model was fit with y_train as a one-column DataFrame.

yhat[:5]

array([[2.58018962],
       [2.58018962],
       [2.58018962],
       [2.58018962],
       [3.83223065]])

## Evaluate Sklearn LinearRegression Model Performance

In [36]:
# Score the training-set predictions. MSE is in squared units of the target;
# R^2 is the unitless share of target variance the model accounts for.
mse_lm1 = mean_squared_error(y_train, yhat)
r2_lm1 = r2_score(y_train, yhat)

# end="\n\n" leaves one blank line after each metric, matching a bare print().
print(f"Linear Model Evaluation Metrics:\nMean Squared Error: {mse_lm1:.3}", end="\n\n")
print(f"r2_score: {r2_lm1:.2%}", end="\n\n")
print(f"This means that {r2_lm1:.2%} of the variance in the value of tips can be explained by the size of the party.")

Linear Model Evaluation Metrics:
Mean Squared Error: 1.22

r2_score: 20.51%

This means that 20.51% of the variance in the value of tips can be explained by the size of the party.


In [34]:
# Two equivalent routes to the training R^2:
#   - the estimator's own score(), which takes the features and the target;
#   - r2_score(), which takes the target and the predictions made earlier.
r2_from_estimator = lm1.score(X_train, y_train)
r2_from_predictions = r2_score(y_train, yhat)

print('r2 = ', r2_from_estimator)
print('r2 = ', r2_from_predictions)

r2 =  0.20513559210842236
r2 =  0.20513559210842236


## Let's Check out the Residuals

- A residual is a measure of how far away a point is vertically from the regression line. It is the error between a predicted value and the observed actual value.


- A typical Residual Plot has the residual values on the y-axis and the independent variable on the x-axis.


- The most important assumption of a linear regression model is that the errors are independent and normally distributed. So, what does a good Residual Plot look like?

    - It has a high density of points close to the horizontal line at residual = 0 and a low density of points far from that line.
    
    - It is symmetric above and below the horizontal line at residual = 0.
    
    - If we project all of the residuals onto the y-axis, they should form an approximately normal (bell-shaped) distribution centered on zero.
    
    - We should not see any patterns in the residuals as we move along the x-axis.
    

- If we do not see the characteristics above, it means we have not completely captured the predictive information of the data in our model.


- Finding patterns in our residuals may mean that there is a non-linear relationship.

## Create the OLS Model and Print out Evaluation Summary

- OLS stands for Ordinary Least Squares. 'Least Squares' means that we fit the regression line that minimizes the sum of the squared vertical distances (the residuals) between the observed points and the line.


- The r-squared value returned here is 0.205, which means that about 20.5% of the variance in tip values is explained by the size of the parties at the restaurant.


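- This r-squared matches the value from our sklearn Linear Regression model above, as expected: both models fit the same least-squares line to the same training data, so the intercept and coefficient agree as well.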

In [31]:
# General pattern for the statsmodels formula API (template, not executed):
#   ols_model = ols('y ~ x', data=df).fit()
#   ols_yhat = ols_model.predict(x)

# Regress tip on party size using the training rows only.
tips_model = ols('tip ~ size', data=train).fit()
# Training-set predictions; X_train carries the `size` column the formula refers to.
tips_yhat = tips_model.predict(X_train)

# Bare last expression so the notebook renders the full summary tables.
tips_model.summary()

0,1,2,3
Dep. Variable:,tip,R-squared:,0.205
Model:,OLS,Adj. R-squared:,0.2
Method:,Least Squares,F-statistic:,43.36
Date:,"Mon, 23 Mar 2020",Prob (F-statistic):,5.6e-10
Time:,15:22:53,Log-Likelihood:,-258.07
No. Observations:,170,AIC:,520.1
Df Residuals:,168,BIC:,526.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.3281,0.253,5.249,0.000,0.829,1.828
size,0.6260,0.095,6.585,0.000,0.438,0.814

0,1,2,3
Omnibus:,14.508,Durbin-Watson:,1.931
Prob(Omnibus):,0.001,Jarque-Bera (JB):,15.903
Skew:,0.742,Prob(JB):,0.000352
Kurtosis:,3.212,Cond. No.,8.91
