In [0]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Simulate a dataset for a linear regression

$$ y = x_1 + x_2 + x_3 + x_4 + \epsilon $$

The features $x_1, \dots, x_4$ and the experimental error $\epsilon$ are each drawn independently from a standard normal distribution $\mathcal{N}(0, 1)$.

In [0]:
# Seed NumPy's global random generator so the draws made after this cell are reproducible.
np.random.seed(0)

In [0]:
# Feature matrix: 1,000,000 records with 4 normally distributed features each.
X = np.random.normal(size=(1000000, 4))

In [0]:
# Target: the sum of each record's features plus one standard-normal noise
# term per record. A single vectorized expression replaces the Python-level
# loop over every row; the noise terms are still drawn in row order from
# NumPy's global generator.
y = X.sum(axis=1) + np.random.normal(size=X.shape[0])

## Small test set, large training set

In [0]:
# Hold out 20% of the records for testing; the remaining 80% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Fit a linear regression on the current training split.
# The fitted estimator is bound to the notebook-level name `model`.
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
def estimate_error(X_test, y_test, estimator=None, n_iter=100, seed=0):
  """Bootstrap the RMSE of a fitted regressor on a test set.

  The test set is resampled with replacement ``n_iter`` times (every
  resample has as many records as the test set). The estimator predicts
  each resample and the root-mean-squared error of that resample is
  collected.

  Parameters
  ----------
  X_test : numpy.ndarray
    Feature matrix; its rows are selected with integer index arrays.
  y_test : numpy.ndarray
    Targets, one per row of ``X_test``.
  estimator : object with a ``predict`` method, optional
    Fitted model to evaluate. When omitted, the notebook-level ``model``
    is looked up at call time.
  n_iter : int, optional
    Number of bootstrap resamples (default 100).
  seed : int, optional
    Seed of the private random generator used for resampling (default 0).

  Returns
  -------
  tuple
    ``(mean, std)`` of the per-resample RMSE values.

  Raises
  ------
  ValueError
    If ``n_iter`` is smaller than 1 or the test set has no records.
  """
  if n_iter < 1:
    raise ValueError("n_iter must be at least 1")

  n_samples = X_test.shape[0]
  if n_samples == 0:
    raise ValueError("X_test must contain at least one record")

  if estimator is None:
    # Fall back to the fitted model defined at notebook level.
    estimator = model

  # A private generator keeps the resampling reproducible without
  # reseeding NumPy's global random state as a side effect.
  rng = np.random.RandomState(seed)

  errors = []
  for _ in range(n_iter):
    # Draw row positions directly instead of sampling from a Python list
    # of indices that would be converted to an array on every iteration.
    new_indices = rng.choice(n_samples, size=n_samples, replace=True)

    new_X_test = X_test[new_indices]
    new_y_test = y_test[new_indices]

    new_y_pred = estimator.predict(new_X_test)

    errors.append(np.sqrt(mean_squared_error(new_y_test, new_y_pred)))

  return np.mean(errors), np.std(errors)

In [0]:
# Bootstrapped (mean, std) of the RMSE for the current `model` on the current test split.
estimate_error(X_test,y_test)

(1.0028372852013618, 0.0015058423972133183)

## Large test set, small training set

In [0]:
# Hold out 80% of the records for testing; only the remaining 20% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=42,
)

In [0]:
# Fit a fresh linear regression on the current (smaller) training split.
# This rebinds the notebook-level name `model`.
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
# Bootstrapped (mean, std) of the RMSE for the refitted `model` on the larger test split.
estimate_error(X_test,y_test)

(1.0007244861649207, 0.000753571738923046)