In [None]:
# Simulate regression data: y, x1, x2
# with given corr level between x1 and x2:

import numpy as np
import pandas as pd

def simulate_data(n, corr, seed):
    """Simulate a regression dataset with two correlated predictors.

    The response is generated as ``y = 0 + 1 * x1 + 0 * x2 + noise``: x1 has a
    true effect of 1 and x2 has no effect. x1, the independent component of
    x2, and the noise are standard normal draws.

    Parameters
    ----------
    n : int
        Number of observations to draw.
    corr : float
        Population correlation between x1 and x2; must lie in [-1, 1].
    seed : int
        Seed for the random number generator; the same seed reproduces the
        same dataset.

    Returns
    -------
    pandas.DataFrame
        Frame with columns 'y', 'x1', 'x2' and ``n`` rows.

    Raises
    ------
    ValueError
        If ``corr`` is outside [-1, 1] (the square root below would be NaN).
    """
    if not -1 <= corr <= 1:
        raise ValueError(f"corr must be in [-1, 1], got {corr!r}")
    # Local legacy generator: produces the same draws as np.random.seed(seed)
    # followed by np.random.normal, without reseeding NumPy's global state.
    rng = np.random.RandomState(seed)
    x1 = rng.normal(size=n)
    # Mixing x1 with independent noise keeps x2 at unit variance and gives
    # corr(x1, x2) = corr in the population.
    x2 = corr * x1 + np.sqrt(1 - corr**2) * rng.normal(size=n)
    y = 0 + 1 * x1 + 0 * x2 + rng.normal(size=n)
    # The original ended with a bare `return`, handing callers None.
    return pd.DataFrame({'y': y, 'x1': x1, 'x2': x2})


# Example draw: 1000 observations with corr = 0.99 between x1 and x2.
df = simulate_data(n=1000, corr=0.99, seed=0)

In [38]:
# For each correlation level, fit OLS on B simulated datasets and collect the
# p-values (used below to compute how often x2 is flagged as significant):

import statsmodels.api as sm

# Simulation grid: correlation levels between x1 and x2, observations per
# dataset (n), and number of simulated datasets per correlation level (B).
cors = [-.995, -.99, -.9, -.5, 0, .5, .9, .99, .995]
n = 1_000
B = 1_000

# One DataFrame of p-values per correlation level (B rows each), tagged with
# a "corr" column so the tables can be stacked and grouped afterwards.
results = []
for corr in cors:
    pvalues_ = []
    # The replication index doubles as the seed, so seeds 0..B-1 are reused at
    # every correlation level: runs are reproducible, and the draws are paired
    # across levels rather than independent.
    for i in range(B):
        df = simulate_data(n, corr, i)
        model = sm.OLS(df['y'], sm.add_constant(df[['x1', 'x2']])).fit()
        # Keep the p-values of every model term, not just x2.
        pvalues_.append(model.pvalues.to_dict())
    df_pvalues = pd.DataFrame(pvalues_)
    df_pvalues["corr"] = corr
    results.append(df_pvalues)


In [40]:
# Stack the per-correlation p-value tables and flag p-values below 0.01.
df_pvalues = pd.concat(results).assign(
    signif_x1=lambda d: d["x1"] < 0.01,
    signif_x2=lambda d: d["x2"] < 0.01,
    signif_both=lambda d: d["signif_x1"] & d["signif_x2"],
)
# Share of simulated datasets in which each flag is set, per correlation level.
(
    df_pvalues
    .groupby("corr")[["signif_x1", "signif_x2", "signif_both"]]
    .mean()
)

Unnamed: 0_level_0,signif_x1,signif_x2,signif_both
corr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.995,0.339,0.007,0.004
-0.99,0.697,0.007,0.004
-0.9,1.0,0.007,0.007
-0.5,1.0,0.007,0.007
0.0,1.0,0.007,0.007
0.5,1.0,0.007,0.007
0.9,1.0,0.007,0.007
0.99,0.675,0.007,0.003
0.995,0.32,0.007,0.003
