In [1]:
import numpy as np
from numpy.random import default_rng
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [10]:
def generate_data(sigma_e=0.5, sigma_v=1, N=10000, D=2, K=1, seed=12345,):
    '''
    Params:
        sigma_e - standard deviation for epsilon prob dist.
        sigma_v - standard deviation for v prob dist.
        N - num data pts to be generated
        D - num dimensions v has
        K - num  dimensions gamma has
        seed - seed for random generator
    '''
    
    # Sample from the specified PDFs
    rng = default_rng(seed) # pass seed value to default random number generator
    v = rng.multivariate_normal(np.zeros((D)), np.ones((D,D)) * sigma_v**2) # Use DxD covariance matrix of value sigma_v
    t = rng.multivariate_normal(np.zeros((D)), np.ones((D,D)), N) # Use DxD covariance matrix of ones
    gamma = rng.multivariate_normal(np.zeros((K)), np.ones((K,K))) # Use KxK covariance matrix of ones
    assert(gamma.any()!=0) # Some element of gamma must not == 0, otherwise there's no misspecification
    x = rng.multivariate_normal(np.zeros((K)), np.ones((K,K)), N)
    epsilon = rng.normal(0, sigma_e**2, N) # Must be independent of t and x
        
    # Combine the relevant PDFs in vectorized form, calculating y = v*t_i + gamma*x_i + epsilon_i
    y = np.matmul(v[np.newaxis,:], np.transpose(t)) # v * t
    print("V shape: ", v[np.newaxis,:].shape)
    print("V * t shape: ", y.shape)
    y += np.matmul(gamma[np.newaxis,:], np.transpose(x)) # gamma * x
    y += epsilon # epsilon
    return y, t, x

In [11]:
y, t, x = generate_data()

V shape:  (1, 2)
V * t shape:  (1, 10000)


In [4]:
# Linear Regression Using Incorrect, Misspecified Model
x_train, x_test, y_train, y_test = train_test_split(t, np.transpose(y), 
                                                    test_size=0.2, random_state=0)
reg_incorrect = LinearRegression()
reg_incorrect.fit(x_train, y_train)
y_pred = reg_incorrect.predict(x_test)

print("Mean Abs Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

Mean Abs Error: 0.39073255398835455
Mean Squared Error: 0.2364473442884385
Root Mean Squared Error: 0.48625851590325747


In [5]:
# Linear Regression Using Correct Model
x_train, x_test, y_train, y_test = train_test_split(np.concatenate((x,t), axis=1), 
                                                    np.transpose(y), test_size=0.2, random_state=0)
reg_correct = LinearRegression()
reg_correct.fit(x_train, y_train)
y_pred = reg_correct.predict(x_test)

print("Mean Abs Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test,y_pred))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

Mean Abs Error: 0.20035031065769401
Mean Squared Error: 0.06369706947600555
Root Mean Squared Error: 0.2523827836362963
