In [3]:
import numpy as np 
import numpy.random as npr

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA
import statsmodels.api as sm
from numpy.linalg import cond

In [4]:
N=2000
D=5 # number of features
mean = np.zeros(D)
corr = 0.9

In [6]:
y_noise = 0.1
# designate the core feature
num_corefea = np.int32(D/2)
true_cause = np.arange(num_corefea).astype(int)

## generate simulated datasets with core and spurious features
The outcome model is the same in training and testing; the outcome only depends on the core feature. 

In the training set, the covariates have high correlation. In the test set, the covariates have low correlation.

In [7]:
# simulate strongly correlated features for training
train_cov = np.ones((D, D)) * corr + np.eye(D) * (1 - corr)
train_x_true = npr.multivariate_normal(mean, train_cov, size=N)
train_x_true = train_x_true * np.concatenate([-1 * np.ones(D//2), np.ones(D - D//2)])  # create both positive and negatively correlated covariates
# train_x_true = np.exp(npr.multivariate_normal(mean, train_cov, size=N)) # exponential of gaussian; no need to be gaussian

In [8]:
# simulate weakly correlated features for testing
test_cov = np.ones((D, D)) * (1 - corr) + np.eye(D) * corr
test_x_true = npr.multivariate_normal(mean, test_cov, size=N)
# test_x_true = np.exp(npr.multivariate_normal(mean, test_cov, size=N))  # exponential of gaussian; no need to be gaussian

In [9]:
# add observation noise to the x
# spurious correlation more often occurs when the signal to noise ratio is lower
x_noise = np.array(list(np.ones(num_corefea)*0.4) + list(np.ones(D-num_corefea)*0.3))

train_x = train_x_true + x_noise * npr.normal(size=[N,D])
test_x = test_x_true + x_noise * npr.normal(size=[N,D])

In [10]:
print("\ntrain X correlation\n", np.corrcoef(train_x.T))
print("\ntest X correlation\n",np.corrcoef(test_x.T))


train X correlation
 [[ 1.          0.77540644 -0.79604528 -0.80342818 -0.79726048]
 [ 0.77540644  1.         -0.79111684 -0.79591483 -0.78772935]
 [-0.79604528 -0.79111684  1.          0.81572877  0.81272448]
 [-0.80342818 -0.79591483  0.81572877  1.          0.82228669]
 [-0.79726048 -0.78772935  0.81272448  0.82228669  1.        ]]

test X correlation
 [[1.         0.10719651 0.13171293 0.10472425 0.11143371]
 [0.10719651 1.         0.07822715 0.11168419 0.09614733]
 [0.13171293 0.07822715 1.         0.0700423  0.10161975]
 [0.10472425 0.11168419 0.0700423  1.         0.09288308]
 [0.11143371 0.09614733 0.10161975 0.09288308 1.        ]]


In [11]:
# generate outcome
# toy model y = x + noise
truecoeff = npr.uniform(size=num_corefea) * 10
train_y = train_x_true[:,true_cause].dot(truecoeff) + y_noise * npr.normal(size=N)
test_y = test_x_true[:,true_cause].dot(truecoeff) + y_noise * npr.normal(size=N)

In [21]:
npr.uniform(size=num_corefea) * 10

array([2.27431474, 5.04523172])

# baseline naive regression on all features

In [12]:
# regularization parameter for ridge regression
alpha = 10

In [24]:
def fitcoef(cov_train, train_y, cov_test=None, test_y=None):
	# linearReg
	print("linearReg")
	reg = LinearRegression()
	reg.fit(cov_train, train_y)
	print("coef", reg.coef_, "intercept", reg.intercept_)
	print("train accuracy", reg.score(cov_train, train_y))
	if cov_test is not None:
		print("test accuracy", reg.score(cov_test, test_y))

	# # linearReg with statsmodels
	# print("linearReg with statsmodels")
	# model = sm.OLS(train_y,sm.add_constant(cov_train, prepend=False))
	# result = model.fit()
	# print(result.summary())

	# ridgeReg
	print("ridgeReg")
	reg = Ridge(alpha=alpha)
	reg.fit(cov_train, train_y)
	print("coef", reg.coef_, "intercept", reg.intercept_)
	print("train accuracy", reg.score(cov_train, train_y))
	if cov_test is not None:
		print("test accuracy", reg.score(cov_test, test_y))

all three features have coefficient different from zeuo

test accuracy degrades much from training accuracy.

In [25]:
print("\n###########################\nall features")

cov_train = np.column_stack([train_x])
cov_test = np.column_stack([test_x])

fitcoef(cov_train, train_y, cov_test, test_y)


###########################
all features
linearReg
coef [ 4.45126266  2.92847886 -1.68428032 -1.6388807  -1.53846371] intercept 0.034717321073559904
train accuracy 0.9441345880837726
test accuracy 0.5942319771837574
ridgeReg
coef [ 4.40654794  2.91488649 -1.69756989 -1.65500736 -1.5544649 ] intercept 0.03411778264715021
train accuracy 0.9441290275123589
test accuracy 0.5874050783760889


next consider oracle, regression only on the core feature

In [26]:
print("\n###########################\nall features")

cov_train = np.column_stack([train_x[:,true_cause]])
cov_test = np.column_stack([test_x[:,true_cause]])

fitcoef(cov_train, train_y, cov_test, test_y)


###########################
all features
linearReg
coef [6.65421435 4.98376463] intercept 0.06655121616127596
train accuracy 0.9157677168589502
test accuracy 0.8463744251321661
ridgeReg
coef [6.62325634 4.98531319] intercept 0.06596618989802233
train accuracy 0.9157609661339334
test accuracy 0.8458971748243646


## causal-rep
now try adjust for pca factor, then learn feature coefficient, construct a prediction function using the learned feature mapping, predict on the test set

In [28]:
# fit pca to high correlated training dataset
pca = PCA(n_components=1)
pca.fit(train_x)
pca.transform(train_x)

array([[-1.08967569],
       [-1.5620393 ],
       [ 0.01461153],
       ...,
       [ 1.71756862],
       [ 0.71424637],
       [-0.22230497]])

In [29]:
# consider features 0,1 (have to consider a subset of features; 
# alternatively one can consider features 0,2
# cannot consider all three due to colinearity issues 
# (a.k.a. violation of overlap))
print("\n###########################\ncore + spurious 1 + pca")
candidate_trainfea = train_x[:,:-1]
candidate_testfea = test_x[:,:-1]
adjust_trainC = pca.transform(train_x)
cov_train = np.column_stack([candidate_trainfea, adjust_trainC])
print("linearReg")
feareg = LinearRegression()
feareg.fit(cov_train, train_y)
print("coef", feareg.coef_, "intercept", feareg.intercept_)
print("train accuracy", feareg.score(cov_train, train_y))


###########################
core + spurious 1 + pca
linearReg
coef [ 2.86393901  1.35455884 -0.12146095 -0.04293697 -3.51470226] intercept -0.15518568073231195
train accuracy 0.9441345880837726


In [18]:
# cond(candidate_trainfea.dot(candidate_trainfea.T))

above, after adjusting for pca factor, the spurious feature 1 returns close to zero coefficient

In [30]:
# construct a prediction model using the learned 
# feature combination of "core + spurious 1"
learned_fea_train = candidate_trainfea.dot(feareg.coef_[:candidate_trainfea.shape[1]])[:,np.newaxis]
predreg = LinearRegression()
predreg.fit(learned_fea_train, train_y)
print("trainfea_coef", predreg.coef_, "intercept", predreg.intercept_)
print("trainfea accuracy", predreg.score(learned_fea_train, train_y))

trainfea_coef [2.64977992] intercept 0.04003524109076689
trainfea accuracy 0.915908607480582


In [31]:
# apply the prediction model on the test data
learned_fea_test = candidate_testfea.dot(feareg.coef_[:candidate_trainfea.shape[1]])[:,np.newaxis]
print("testfea accuracy", predreg.score(learned_fea_test, test_y))

testfea accuracy 0.8717332930238679


above, the test accuracy no longer degrades much from the training accuracy.

also note that the test accuracy is very close to the oracle accuracy.