In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [22]:
# Sanity check of the einsum signature used later in the notebook:
# 'ij,ji->i' sums a[i, j] * b[j, i] over j, i.e. it yields the diagonal of
# a @ b without materialising the full matrix product.
a = np.arange(1, 13).reshape(3, 4)
b = np.arange(1, 13).reshape(4, 3)
diag_of_product = np.einsum('ij,ji->i', a, b)
print(diag_of_product)

[ 70 184 330]


In [76]:
def generate_sparse_data(N, D, C, return_params=False):
    """Generate synthetic regression data from a sparse 2nd-degree polynomial.

    Each target is ``y[n] = X[n] @ A @ X[n] + X[n] @ B``, where ``A`` is a
    D x D interaction matrix with a zero diagonal and ``B`` is a D x 1 column
    of linear weights.

    Parameters
    ----------
    N : int
        Number of instances (rows of ``X``).
    D : int
        Dimension of each instance (columns of ``X``).
    C : int
        Number of random (row, col) positions of ``A`` to fill with a random
        weight. Draws can repeat or land on the diagonal (which is zeroed
        afterwards), so ``A`` ends up with at most ``C`` non-zero entries.
    return_params : bool, default False
        When True, also return the generating matrices ``A`` and ``B``.

    Returns
    -------
    X : ndarray of shape (N, D)
        Features drawn with ``np.random.rand``.
    y : ndarray of shape (N,)
        One target per instance.
    A, B : ndarray of shape (D, D) and (D, 1)
        Only returned when ``return_params`` is True.
    """
    X = np.random.rand(N, D)

    # Sparse pairwise-interaction weights; self-interactions are removed.
    A = np.zeros((D, D))
    for _ in range(C):
        A[np.random.randint(D), np.random.randint(D)] = np.random.rand()
    np.fill_diagonal(A, 0)

    B = np.random.rand(D, 1)

    # Per-row quadratic form x^T A x: sum over j of (X @ A)[i, j] * X[i, j].
    quadratic = np.einsum('ij,ij->i', X @ A, X)
    # X @ B has shape (N, 1). Adding it directly to the (N,) quadratic term
    # would broadcast to an (N, N) matrix, so flatten it first.
    linear = (X @ B).ravel()
    y = quadratic + linear

    if return_params:
        return X, y, A, B
    return X, y

In [84]:
# Smoke test: fit an ordinary least-squares linear regression (sklearn's
# LinearRegression) on one small draw of the sparse synthetic data and report
# the held-out mean squared error on a 20% test split.
X, y = generate_sparse_data(10, 20, 20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % test_mse)

Mean squared error: 0.45


In [101]:
def avg_prediction_error(N, D, C, nTrials):
    """Average held-out MSE of a linear regression over repeated random trials.

    Every trial draws a fresh dataset with ``generate_sparse_data(N, D, C)``,
    holds out 20% of it as a test set, fits ``LinearRegression`` on the rest
    and records the mean squared error on the held-out part.

    Parameters
    ----------
    N, D, C
        Forwarded unchanged to ``generate_sparse_data``.
    nTrials : int
        Number of independent trials to run.

    Returns
    -------
    The mean of the ``nTrials`` per-trial mean squared errors.
    """
    trial_mse = np.zeros(nTrials)
    for idx in range(nTrials):
        features, targets = generate_sparse_data(N, D, C)
        split = train_test_split(features, targets, test_size=0.2)
        train_features, test_features, train_targets, test_targets = split
        model = LinearRegression().fit(train_features, train_targets)
        predictions = model.predict(test_features)
        trial_mse[idx] = mean_squared_error(test_targets, predictions)
    return trial_mse.mean()

In [102]:
# Experiment: effect of increasing C (the number of random correlation draws
# placed in A) on the average held-out error of a plain linear regression.
# NOTE: each point runs nTrials fits on N x D data, so this cell is slow.
N = 500
D = 500
nTrials = 50
C = [500, 1000, 2000, 4000, 8000, 16000]
errors = []
for c in C:
    errors.append(avg_prediction_error(N, D, c, nTrials))

# Explicit figure/axes with labels so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(C, errors, marker="o")
ax.set_xlabel("C (number of randomly selected correlations)")
ax.set_ylabel("Mean squared error (averaged over %d trials)" % nTrials)
ax.set_title("Linear regression test error vs. C (N=%d, D=%d)" % (N, D))
plt.show()

NameError: name 'plot' is not defined