In [1]:
import pandas as pd
from scipy.stats import f_oneway

# Read the rate-by-city observations from disk and preview the first five rows.
rate = pd.read_csv(filepath_or_buffer='data/rate_by_city.csv')
rate.head(n=5)

Unnamed: 0,Rate,City
0,13.75,1
1,13.75,1
2,13.5,1
3,13.5,1
4,13.0,1


In [2]:
# Reshape the long table to wide form: one column per City value and one row
# per within-city observation number (cumcount restarts at 0 inside each city).
rate['city_count'] = rate.groupby('City').cumcount()
rate_pivot = rate.pivot(
    index='city_count',
    columns='City',
    values='Rate',
)
# Replace the raw City labels with 'City_<label>' column names.
rate_pivot.columns = ['City_' + label for label in map(str, rate_pivot.columns.values)]
rate_pivot.head()

Unnamed: 0_level_0,City_1,City_2,City_3,City_4,City_5,City_6
city_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,13.75,14.25,14.0,15.0,14.5,13.5
1,13.75,13.0,14.0,14.0,14.0,12.25
2,13.5,12.75,13.51,13.75,14.0,12.25
3,13.5,12.5,13.5,13.59,13.9,12.0
4,13.0,12.5,13.5,13.25,13.75,12.0


In [3]:
# Perform a one-way ANOVA F-test across the city groups.
# Every column of rate_pivot is passed as one group, so the call does not need
# editing when the set of cities changes. If cities have unequal observation
# counts the pivot pads the shorter columns with NaN, and by default NaN inputs
# make f_oneway return nan, so missing cells are dropped from each group first.
city_groups = [rate_pivot[col].dropna() for col in rate_pivot.columns]
f_oneway(*city_groups)

F_onewayResult(statistic=4.8293848737024, pvalue=0.001174551414504048)

In [4]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Generate an ANOVA table: fit Rate against City as a categorical factor
# (the C(...) wrapper in the formula), then summarise the fit with typ=2.
model = ols(
    formula='Rate ~ C(City)',
    data=rate,
).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(City),10.945667,5.0,4.829385,0.001175
Residual,21.758133,48.0,,


In [5]:
from scipy.stats import linregress

# Read the auto-mpg dataset from disk and preview the first five rows.
# NOTE(review): the car_name values shown in the preview carry a leading tab
# and embedded quotes -- consider cleaning them if that column is ever used.
auto = pd.read_csv(filepath_or_buffer='data/auto-mpg.csv')
auto.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horse_power,weight,acceleration,model_year,car_name
0,18.0,8,307.0,130.0,3504,12.0,70,"\t""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693,11.5,70,"\t""buick skylark 320"""
2,18.0,8,318.0,150.0,3436,11.0,70,"\t""plymouth satellite"""
3,16.0,8,304.0,150.0,3433,12.0,70,"\t""amc rebel sst"""
4,17.0,8,302.0,140.0,3449,10.5,70,"\t""ford torino"""


In [6]:
# Fit a simple least-squares line predicting mpg (y) from acceleration (x)
# in the auto-mpg dataset, then display the five fitted quantities.
slope, intercept, r_value, p_value, std_err = linregress(
    x=auto['acceleration'],
    y=auto['mpg'],
)
(slope, intercept, r_value, p_value, std_err)

(1.1912045293502274,
 4.9697930042539085,
 0.4202889121016507,
 1.8230915350787203e-18,
 0.12923643283101396)

In [7]:
import statsmodels.api as sm

# Simple linear regression with statsmodels: mpg explained by acceleration.

# The intercept has to be supplied explicitly, so add_constant prepends a
# constant column to the single predictor.
X = sm.add_constant(auto['acceleration'])
Y = auto['mpg']

model = sm.OLS(endog=Y, exog=X).fit()
predictions = model.predict(X)

# Print the plain-text regression summary.
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.177
Model:                            OLS   Adj. R-squared:                  0.175
Method:                 Least Squares   F-statistic:                     84.96
Date:                Wed, 10 Jul 2019   Prob (F-statistic):           1.82e-18
Time:                        13:48:47   Log-Likelihood:                -1343.9
No. Observations:                 398   AIC:                             2692.
Df Residuals:                     396   BIC:                             2700.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            4.9698      2.043      2.432   

  return ptp(axis=axis, out=out, **kwargs)


In [8]:
# Multiple linear regression: mpg explained by cylinders, weight and acceleration.

# add_constant supplies the intercept column alongside the three predictors.
X = sm.add_constant(
    auto[['cylinders', 'weight', 'acceleration']]
)
Y = auto['mpg']

model = sm.OLS(endog=Y, exog=X).fit()
predictions = model.predict(X)

# Print the plain-text regression summary.
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.700
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     306.7
Date:                Wed, 10 Jul 2019   Prob (F-statistic):          1.14e-102
Time:                        13:49:46   Log-Likelihood:                -1142.9
No. Observations:                 398   AIC:                             2294.
Df Residuals:                     394   BIC:                             2310.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           42.3811      1.960     21.627   