In [1]:
from   faraway.datasets import cheddar, prostate
import numpy as np
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf

## 1. Prostate data

In [2]:
# Load the prostate data set shipped with the `faraway` package and preview it
pros = prostate.load()
pros.head(5)

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
0,-0.579819,2.7695,50,-1.386294,0,-1.38629,6,0,-0.43078
1,-0.994252,3.3196,58,-1.386294,0,-1.38629,6,0,-0.16252
2,-0.510826,2.6912,74,-1.386294,0,-1.38629,7,20,-0.16252
3,-1.203973,3.2828,58,-1.386294,0,-1.38629,6,0,-0.16252
4,0.751416,3.4324,62,-1.386294,0,-1.38629,6,0,0.37156


In [3]:
# Full model: regress lpsa on every other column of the prostate data
full_predictors = [
    'lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45',
]
full_formula = 'lpsa ~ ' + ' + '.join(full_predictors)
lmod = smf.ols(formula=full_formula, data=pros).fit()
lmod.summary()

0,1,2,3
Dep. Variable:,lpsa,R-squared:,0.655
Model:,OLS,Adj. R-squared:,0.623
Method:,Least Squares,F-statistic:,20.86
Date:,"Wed, 15 Jun 2022",Prob (F-statistic):,2.24e-17
Time:,07:33:22,Log-Likelihood:,-99.476
No. Observations:,97,AIC:,217.0
Df Residuals:,88,BIC:,240.1
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6693,1.296,0.516,0.607,-1.907,3.246
lcavol,0.5870,0.088,6.677,0.000,0.412,0.762
lweight,0.4545,0.170,2.673,0.009,0.117,0.792
age,-0.0196,0.011,-1.758,0.082,-0.042,0.003
lbph,0.1071,0.058,1.832,0.070,-0.009,0.223
svi,0.7662,0.244,3.136,0.002,0.281,1.252
lcp,-0.1055,0.091,-1.159,0.250,-0.286,0.075
gleason,0.0451,0.157,0.287,0.775,-0.268,0.358
pgg45,0.0045,0.004,1.024,0.309,-0.004,0.013

0,1,2,3
Omnibus:,0.235,Durbin-Watson:,1.507
Prob(Omnibus):,0.889,Jarque-Bera (JB):,0.026
Skew:,-0.017,Prob(JB):,0.987
Kurtosis:,3.073,Cond. No.,1280.0


In [4]:
# b) 90% and 95% confidence intervals for the `age` coefficient of the full model.
# Conclusion: the 90% interval excludes 0 while the 95% interval contains it,
# so the p-value for age lies between 0.05 and 0.1.
df_resid = lmod.df_resid  # read from the fit instead of hard-coding 88

# Select the coefficient by label: positional `params[3]` on a string-indexed
# Series is deprecated in pandas and breaks silently if the formula is reordered.
age_coef = lmod.params['age']
age_se = lmod.bse['age']

for ci in [0.9, 0.95]:
    # Central t-interval (-t, +t) that contains probability `ci`
    qt = np.array(sp.stats.t.interval(ci, df_resid))
    ci_age = age_coef + age_se * qt
    print(f'{ci}: {ci_age}')


0.9: [-0.0382102  -0.00106415]
0.95: [-0.04184062  0.00256627]


In [5]:
# c)
ITERS = 4000
fs = []
lmod = smf.ols('lpsa ~ age', pros).fit()
               
for i in range(ITERS):
    pros['y_samp'] = np.random.permutation(pros.lpsa.copy())
    lmod_i = smf.ols('y_samp ~ age', pros).fit()
    fs.append(lmod_i.fvalue)
    
np.mean(fs > lmod.fvalue)

0.09875

In [6]:
# Summary of the age-only model: `lmod` was rebound to 'lpsa ~ age' in the previous cell
lmod.summary()

0,1,2,3
Dep. Variable:,lpsa,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,2.813
Date:,"Wed, 15 Jun 2022",Prob (F-statistic):,0.0968
Time:,07:33:31,Log-Likelihood:,-149.64
No. Observations:,97,AIC:,303.3
Df Residuals:,95,BIC:,308.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7991,1.008,0.793,0.430,-1.202,2.800
age,0.0263,0.016,1.677,0.097,-0.005,0.057

0,1,2,3
Omnibus:,2.538,Durbin-Watson:,0.067
Prob(Omnibus):,0.281,Jarque-Bera (JB):,2.096
Skew:,0.152,Prob(JB):,0.351
Kurtosis:,3.653,Cond. No.,558.0


In [8]:
# d)
lmod_red = smf.ols('lpsa ~ -1 + age + pgg45', pros).fit()
lmod_red.summary()

0,1,2,3
Dep. Variable:,lpsa,R-squared (uncentered):,0.851
Model:,OLS,Adj. R-squared (uncentered):,0.848
Method:,Least Squares,F-statistic:,272.2
Date:,"Wed, 15 Jun 2022",Prob (F-statistic):,4.67e-40
Time:,07:36:35,Log-Likelihood:,-142.64
No. Observations:,97,AIC:,289.3
Df Residuals:,95,BIC:,294.4
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,0.0326,0.002,14.298,0.000,0.028,0.037
pgg45,0.0155,0.004,3.935,0.000,0.008,0.023

0,1,2,3
Omnibus:,5.669,Durbin-Watson:,0.449
Prob(Omnibus):,0.059,Jarque-Bera (JB):,6.127
Skew:,0.329,Prob(JB):,0.0467
Kurtosis:,4.041,Cond. No.,2.77


## 2. Cheddar data

In [2]:
# Load the cheddar data set from the `faraway` package, report its
# dimensions, and preview the first rows
ched = cheddar.load()
print(ched.shape)
ched.head(5)

(30, 4)


Unnamed: 0,taste,Acetic,H2S,Lactic
0,12.3,4.543,3.135,0.86
1,20.9,5.159,5.043,1.53
2,39.0,5.366,5.438,1.57
3,47.9,5.759,7.496,1.81
4,5.6,4.663,3.807,0.99


In [3]:
# a)
lm = smf.ols('taste ~ Acetic + H2S + Lactic', ched).fit()
lm.summary()

0,1,2,3
Dep. Variable:,taste,R-squared:,0.652
Model:,OLS,Adj. R-squared:,0.612
Method:,Least Squares,F-statistic:,16.22
Date:,"Fri, 17 Jun 2022",Prob (F-statistic):,3.81e-06
Time:,11:56:44,Log-Likelihood:,-109.89
No. Observations:,30,AIC:,227.8
Df Residuals:,26,BIC:,233.4
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-28.8768,19.735,-1.463,0.155,-69.444,11.690
Acetic,0.3277,4.460,0.073,0.942,-8.839,9.495
H2S,3.9118,1.248,3.133,0.004,1.346,6.478
Lactic,19.6705,8.629,2.280,0.031,1.933,37.408

0,1,2,3
Omnibus:,1.923,Durbin-Watson:,1.575
Prob(Omnibus):,0.382,Jarque-Bera (JB):,1.143
Skew:,0.474,Prob(JB):,0.565
Kurtosis:,3.13,Cond. No.,92.3


In [4]:
# b)
ched['AceticRaw'] = np.exp(ched.Acetic)
ched['H2SRaw'] = np.exp(ched.H2S)
lm2 = smf.ols('taste ~ AceticRaw + H2SRaw + Lactic', ched).fit()
lm2.summary()

0,1,2,3
Dep. Variable:,taste,R-squared:,0.575
Model:,OLS,Adj. R-squared:,0.526
Method:,Least Squares,F-statistic:,11.75
Date:,"Fri, 17 Jun 2022",Prob (F-statistic):,4.75e-05
Time:,11:58:31,Log-Likelihood:,-112.86
No. Observations:,30,AIC:,233.7
Df Residuals:,26,BIC:,239.3
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-18.9727,11.268,-1.684,0.104,-42.135,4.189
AceticRaw,0.0189,0.016,1.210,0.237,-0.013,0.051
H2SRaw,0.0008,0.000,1.831,0.079,-9.4e-05,0.002
Lactic,25.0074,9.062,2.760,0.010,6.380,43.635

0,1,2,3
Omnibus:,0.802,Durbin-Watson:,1.667
Prob(Omnibus):,0.67,Jarque-Bera (JB):,0.719
Skew:,0.347,Prob(JB):,0.698
Kurtosis:,2.693,Cond. No.,43600.0
