# Table of Contents
 <p>

In [1]:
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf

from sklearn import linear_model

In [2]:
# Load the 'tips' dataset through seaborn's dataset loader
tips = sns.load_dataset(name='tips')

In [3]:
# Preview the first five rows to see which columns are available
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# statsmodels offers an R-style formula API; a convenient side effect is that
# the categorical columns named in the formula (sex, smoker, day, time) are
# expanded into dummy variables automatically
model = smf.ols(
    formula='tip ~ total_bill + size + sex + smoker + day + time',
    data=tips,
).fit()  # .fit() is chained here, so `model` holds the fitted results

In [5]:
# Show the regression report: fit statistics, per-term coefficients with
# standard errors / p-values / confidence intervals, and residual diagnostics
model.summary()

0,1,2,3
Dep. Variable:,tip,R-squared:,0.47
Model:,OLS,Adj. R-squared:,0.452
Method:,Least Squares,F-statistic:,26.06
Date:,"Sun, 03 Dec 2017",Prob (F-statistic):,1.1999999999999999e-28
Time:,16:16:44,Log-Likelihood:,-347.48
No. Observations:,244,AIC:,713.0
Df Residuals:,235,BIC:,744.4
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5908,0.256,2.310,0.022,0.087,1.095
sex[T.Female],0.0324,0.142,0.229,0.819,-0.247,0.311
smoker[T.No],0.0864,0.147,0.589,0.556,-0.202,0.375
day[T.Fri],0.1623,0.393,0.412,0.680,-0.613,0.937
day[T.Sat],0.0408,0.471,0.087,0.931,-0.886,0.968
day[T.Sun],0.1368,0.472,0.290,0.772,-0.793,1.066
time[T.Dinner],-0.0681,0.445,-0.153,0.878,-0.944,0.808
total_bill,0.0945,0.010,9.841,0.000,0.076,0.113
size,0.1760,0.090,1.966,0.051,-0.000,0.352

0,1,2,3
Omnibus:,27.86,Durbin-Watson:,2.096
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.555
Skew:,0.607,Prob(JB):,3.87e-12
Kurtosis:,4.923,Cond. No.,281.0


In [6]:
# sklearn does not build dummy variables for us, so the categorical columns
# are encoded by hand here.
# drop_first=True omits the first level of each categorical column, which
# reduces multicollinearity among the resulting dummy columns.
tips_dummy = pd.get_dummies(
    tips[['total_bill', 'size', 'sex', 'smoker', 'day', 'time']],
    drop_first=True,
)

In [7]:
# Create a scikit-learn linear regression estimator with default settings;
# it is not fitted until .fit() is called on it
lr = linear_model.LinearRegression()

In [8]:
# Preview the encoded features: the numeric columns plus one 0/1 indicator
# column for each categorical level that was kept
tips_dummy.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


In [9]:
# Fit the linear regression: the dummy-encoded columns are the features and
# the tip column is the target.
# NOTE: fit() returns the estimator itself, so despite its name `predicted`
# refers to the fitted model (the same object as `lr`), not to predictions.
predicted = lr.fit(X=tips_dummy, y=tips['tip'])

In [10]:
# Get the fitted coefficients: one value per column of tips_dummy, in the
# same column order. The intercept is not part of this array.
predicted.coef_

array([ 0.09448701,  0.175992  ,  0.03244094,  0.08640832,  0.1622592 ,
        0.04080082,  0.13677854, -0.0681286 ])