## Statsmodels quickstart

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
data = sm.datasets.get_rdataset("Guerry", "HistData").data

In [5]:
data.head()

Unnamed: 0,dept,Region,Department,Crime_pers,Crime_prop,Literacy,Donations,Infants,Suicides,MainCity,...,Crime_parents,Infanticide,Donation_clergy,Lottery,Desertion,Instruction,Prostitutes,Distance,Area,Pop1831
0,1,E,Ain,28870,15890,37,5098,33120,35039,2:Med,...,71,60,69,41,55,46,13,218.372,5762,346.03
1,2,N,Aisne,26226,5521,51,8901,14572,12831,2:Med,...,4,82,36,38,82,24,327,65.945,7369,513.0
2,3,C,Allier,26747,7925,13,10973,17044,114121,2:Med,...,46,42,76,66,16,85,34,161.927,7340,298.26
3,4,E,Basses-Alpes,12935,7289,46,2733,23018,14238,1:Sm,...,70,12,37,80,32,29,2,351.399,6925,155.9
4,5,E,Hautes-Alpes,17488,8174,69,6962,23076,16171,1:Sm,...,22,23,64,79,35,7,1,320.28,5549,129.1


In [6]:
# Fit OLS regression use ln
# With R style formulas
res = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=data).fit()

In [7]:
res.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.348
Model:,OLS,Adj. R-squared:,0.333
Method:,Least Squares,F-statistic:,22.2
Date:,"Thu, 27 Jun 2024",Prob (F-statistic):,1.9e-08
Time:,18:37:12,Log-Likelihood:,-379.82
No. Observations:,86,AIC:,765.6
Df Residuals:,83,BIC:,773.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,246.4341,35.233,6.995,0.000,176.358,316.510
Literacy,-0.4889,0.128,-3.832,0.000,-0.743,-0.235
np.log(Pop1831),-31.3114,5.977,-5.239,0.000,-43.199,-19.424

0,1,2,3
Omnibus:,3.713,Durbin-Watson:,2.019
Prob(Omnibus):,0.156,Jarque-Bera (JB):,3.394
Skew:,-0.487,Prob(JB):,0.183
Kurtosis:,3.003,Cond. No.,702.0


In [8]:
# Using numpy arrays
nobs = 100
X = np.random.random((nobs, 2))
X = sm.add_constant(X)

In [10]:
beta = [1, .1, .5] # coef
e = np.random.random(nobs) # error
y = np.dot(X, beta) + e

In [11]:
res_2 = sm.OLS(y, X).fit()

In [12]:
res_2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.22
Model:,OLS,Adj. R-squared:,0.204
Method:,Least Squares,F-statistic:,13.67
Date:,"Thu, 27 Jun 2024",Prob (F-statistic):,5.89e-06
Time:,18:41:06,Log-Likelihood:,-24.387
No. Observations:,100,AIC:,54.77
Df Residuals:,97,BIC:,62.59
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.4574,0.091,16.054,0.000,1.277,1.638
x1,0.1682,0.119,1.419,0.159,-0.067,0.404
x2,0.5735,0.112,5.102,0.000,0.350,0.797

0,1,2,3
Omnibus:,83.845,Durbin-Watson:,1.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7.938
Skew:,-0.059,Prob(JB):,0.0189
Kurtosis:,1.625,Cond. No.,5.84
