# Simulations with regressions

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
import sys
sys.path.append('./shapley_compute/')

## High-dimensional linear regression with independent features

In [4]:
from shapley_compute.ensembled_mp import mp_shapley
from shapley_compute.minipatches import minipatch_regression

In [5]:
def normal_linear_model(N, M, sigma2=0.4, s=0.2, seed=123):
    """
    Simulate a sparse Gaussian linear model y = X @ beta + eps.

    N: number of obs
    M: number of features
    sigma2: variance of the Gaussian noise eps (must be >= 0)
    s: sparsity level, i.e. fraction of the M features that get a
       non-zero coefficient (must lie in [0, 1])
    seed: value passed to np.random.seed before sampling, so repeated
          calls return the same data; note this resets NumPy's global
          random state. Pass None to leave the global state untouched.

    Returns (y, X, beta): the response of shape (N,), the design matrix
    of shape (N, M) with i.i.d. N(0, 1) entries, and the coefficient
    vector of shape (M,) whose first int(s*M) entries are drawn from
    N(5, 1) and whose remaining entries are exactly 0.

    Raises ValueError if sigma2 is negative or s is outside [0, 1].
    """
    if sigma2 < 0:
        raise ValueError("sigma2 must be non-negative")
    if not 0 <= s <= 1:
        raise ValueError("s must lie in [0, 1]")

    if seed is not None:
        np.random.seed(seed)

    # Sampling order (X, then beta, then eps) is kept fixed so a given seed
    # always yields the same design matrix and coefficients.
    X = np.random.normal(0, 1, size=(N, M))
    M1 = int(s * M)
    # M1 non-zero coefficients followed by M - M1 zeros.
    beta = np.concatenate([np.random.normal(5, 1, M1), np.zeros(M - M1)])
    # np.random.normal expects a standard deviation, so convert the variance.
    eps = np.random.normal(0, np.sqrt(sigma2), size=N)

    y = X @ beta + eps
    return y, X, beta
    


In [12]:
# Simulation size: 500 observations on 20 features.
N, M = 500, 20
y, X, beta = normal_linear_model(N, M)

In [13]:
# Hold out 20% of the observations for testing. random_state pins the split so
# that re-running this cell on its own yields the same partition instead of
# depending on the current state of NumPy's global random generator.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [14]:
# Check the split sizes: train/test feature matrices and the training targets.
X_train.shape, X_test.shape, y_train.shape

((400, 20), (100, 20), (400,))

In [17]:
# Run the minipatch procedure with an (unfitted) linear regression as the base
# model; `res` is what mp_shapley consumes in the next cell.
# NOTE(review): 0.2 and 1000 are passed positionally - presumably the minipatch
# size ratio and the number of minipatches; confirm against the signature of
# minipatch_regression.

model = LinearRegression()

res = minipatch_regression(X_train, y_train, X_test, model, 0.2, 1000)

In [None]:
# Minipatch-based Shapley estimates for the test points.
# NOTE(review): presumably shaped (n_test, M) - the plot in the next cell
# averages over axis 0 and draws M bars; confirm against mp_shapley.
shap = mp_shapley(X_test, res)

In [None]:
# Mean absolute minipatch Shapley value per feature, one bar per feature index.
plt.barh(np.arange(M), np.mean(np.abs(shap), axis=0))

In [None]:
# Show the true coefficients used to simulate y, for comparison with the
# feature importances plotted above.
beta

In [None]:
# Reference SHAP values from the `shap` package for a plain linear regression.
# The package is imported under an alias so that it does not rebind `shap`,
# which already holds the minipatch Shapley values computed in an earlier cell
# (re-running that cell's plot would otherwise operate on the module).
import shap as shap_lib

model = LinearRegression()
model.fit(X_train, y_train)
# The second argument (X_test) is handed to the explainer as its masker /
# background data.
explainer = shap_lib.Explainer(model.predict, X_test)
# Calculates the SHAP values - It takes some time
shap_values_lr = explainer(X_test)

In [None]:
# Mean absolute SHAP value per feature from the shap package, one bar per
# feature index, for comparison with the minipatch-based plot.
plt.barh(np.arange(M), np.mean(np.abs(shap_values_lr.values), axis=0))