### statsmodels Examples

In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the tab-separated lecture dataset into a DataFrame and display it.
data_path = "2020-03-03-lecture10-phacking-data.txt"
data = pd.read_csv(data_path, sep="\t")
print(data)

In [None]:
# Option 1: build the regression directly from DataFrame columns.
# add_constant supplies the "const" column selected below alongside X.
data = sm.add_constant(data)

endog = data["y"]
exog = data[["X", "const"]]
model = sm.OLS(endog, exog)
results = model.fit()
print(results.summary())

In [None]:
# Option 2: specify the regression with a formula string.
import statsmodels.formula.api as smf

# NOTE: "X^2" does NOT add a squared term inside a formula -- "^" is not a
# power operator there. Wrapping the expression in I(...) makes the formula
# engine evaluate X**2 as ordinary Python arithmetic, yielding the quadratic
# regressor. The intercept is added by the formula interface automatically.
eq = "y ~ X + I(X**2)"
model = smf.ols(formula=eq, data=data)

results = model.fit()
print(results.summary())

In [None]:
# Example: fitting a non-linear (quadratic) relationship with OLS.

# Simulate a true value x, a noisy observation of it (uniform noise in [0, 1)),
# and a response that is exactly the square of the true value.
n_points = 100
x = np.random.uniform(1, 10, n_points)
noise = np.random.uniform(0, 1, n_points)
obs = x + noise
y = np.square(x)

# Plot the response against the squared observation.
plt.scatter(np.square(obs), y)
plt.show()

# Assemble the regression table: response, observation, its square, constant.
data = pd.DataFrame({"y": y, "obs": obs})
data["obs^2"] = data["obs"] ** 2
data = sm.add_constant(data)

predictors = ["obs", "obs^2", "const"]
model = sm.OLS(data["y"], data[predictors])
results = model.fit()
print(results.summary())

### P-Value False Positives

In [None]:
from scipy.stats import ttest_ind

# Simulate repeated experiments in which BOTH groups are drawn from the same
# normal distribution (mean 130, sd 10). Any t-test that comes out below the
# significance threshold is therefore a false positive.
n_experiments = 100
group_size = 50
alpha = 0.05

pvals = []

for _ in range(n_experiments):
    cs = np.random.normal(loc=130, scale=10, size=group_size)
    non = np.random.normal(loc=130, scale=10, size=group_size)

    # Only the p-value is needed; the t statistic is discarded.
    _t_stat, p = ttest_ind(cs, non)
    pvals.append(p)

# Histogram of p-values. Fixing the range to [0, 1] makes each of the 20 bins
# exactly 0.05 wide, so the first bin edge coincides with the alpha line
# (without an explicit range the bins only span min(pvals)..max(pvals)).
plt.hist(pvals, bins=20, range=(0, 1))
plt.axvline(alpha, color='k', ls='--')
plt.show()

# Cumulative, normalized histogram: the height of the first bar is the
# fraction of experiments with p < alpha, to be compared with the dashed
# horizontal line drawn at alpha.
plt.hist(pvals, bins=20, range=(0, 1), cumulative=True, density=True)
plt.axhline(alpha, color='k', ls='--')
plt.show()