In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Generate synthetic data: a noisy line y = TRUE_SLOPE * x + eps, with eps ~ N(0, 1).
SEED = 42        # fixes the noise draw so the arrays are reproducible on re-run
N_SAMPLES = 100  # single source of truth for the number of rows in X and y
TRUE_SLOPE = 3   # slope of the underlying linear relationship

np.random.seed(SEED)
# Evenly spaced inputs on [0, 10], reshaped into a single feature column.
X = np.linspace(0, 10, N_SAMPLES).reshape(-1, 1)
# One standard-normal noise value per sample, as a column so it lines up with X.
noise = np.random.randn(N_SAMPLES).reshape(-1, 1)
y = TRUE_SLOPE * X + noise

In [3]:
# Show the feature matrix: 100 evenly spaced values on [0, 10] as a single column.
X

array([[ 0.        ],
       [ 0.1010101 ],
       [ 0.2020202 ],
       [ 0.3030303 ],
       [ 0.4040404 ],
       [ 0.50505051],
       [ 0.60606061],
       [ 0.70707071],
       [ 0.80808081],
       [ 0.90909091],
       [ 1.01010101],
       [ 1.11111111],
       [ 1.21212121],
       [ 1.31313131],
       [ 1.41414141],
       [ 1.51515152],
       [ 1.61616162],
       [ 1.71717172],
       [ 1.81818182],
       [ 1.91919192],
       [ 2.02020202],
       [ 2.12121212],
       [ 2.22222222],
       [ 2.32323232],
       [ 2.42424242],
       [ 2.52525253],
       [ 2.62626263],
       [ 2.72727273],
       [ 2.82828283],
       [ 2.92929293],
       [ 3.03030303],
       [ 3.13131313],
       [ 3.23232323],
       [ 3.33333333],
       [ 3.43434343],
       [ 3.53535354],
       [ 3.63636364],
       [ 3.73737374],
       [ 3.83838384],
       [ 3.93939394],
       [ 4.04040404],
       [ 4.14141414],
       [ 4.24242424],
       [ 4.34343434],
       [ 4.44444444],
       [ 4

In [4]:
# Show the targets: 3 * X plus standard-normal noise, one value per row of X.
y

array([[ 0.49671415],
       [ 0.164766  ],
       [ 1.25374914],
       [ 2.43212077],
       [ 0.97796784],
       [ 1.28101456],
       [ 3.39739463],
       [ 2.88864685],
       [ 1.95476804],
       [ 3.26983277],
       [ 2.56688534],
       [ 2.86760358],
       [ 3.87832591],
       [ 2.02611369],
       [ 2.51750641],
       [ 3.98316702],
       [ 3.83565373],
       [ 5.46576248],
       [ 4.54652138],
       [ 4.34527206],
       [ 7.52625483],
       [ 6.13786006],
       [ 6.73419487],
       [ 5.54494878],
       [ 6.72834455],
       [ 7.68668017],
       [ 6.7277943 ],
       [ 8.5575162 ],
       [ 7.88420979],
       [ 8.49618504],
       [ 8.48920248],
       [11.24621758],
       [ 9.68347247],
       [ 8.94228907],
       [11.12557522],
       [ 9.38521696],
       [11.1179545 ],
       [ 9.25245109],
       [10.18696547],
       [12.01504305],
       [12.8596787 ],
       [12.59561071],
       [12.61162444],
       [12.72919933],
       [11.85481134],
       [12

In [5]:
# Append copies of the first 10 samples so the dataset contains exact duplicates.
# NOTE: X and y are rebound to the enlarged arrays, so re-running this cell
# appends another 10 rows each time.
X = np.concatenate([X, X[:10]], axis=0)
y = np.concatenate([y, y[:10]], axis=0)


In [6]:
# Show X after the duplicates were appended. The repeated rows are stacked at the
# end of the array, so the leading rows look the same as before.
X

array([[ 0.        ],
       [ 0.1010101 ],
       [ 0.2020202 ],
       [ 0.3030303 ],
       [ 0.4040404 ],
       [ 0.50505051],
       [ 0.60606061],
       [ 0.70707071],
       [ 0.80808081],
       [ 0.90909091],
       [ 1.01010101],
       [ 1.11111111],
       [ 1.21212121],
       [ 1.31313131],
       [ 1.41414141],
       [ 1.51515152],
       [ 1.61616162],
       [ 1.71717172],
       [ 1.81818182],
       [ 1.91919192],
       [ 2.02020202],
       [ 2.12121212],
       [ 2.22222222],
       [ 2.32323232],
       [ 2.42424242],
       [ 2.52525253],
       [ 2.62626263],
       [ 2.72727273],
       [ 2.82828283],
       [ 2.92929293],
       [ 3.03030303],
       [ 3.13131313],
       [ 3.23232323],
       [ 3.33333333],
       [ 3.43434343],
       [ 3.53535354],
       [ 3.63636364],
       [ 3.73737374],
       [ 3.83838384],
       [ 3.93939394],
       [ 4.04040404],
       [ 4.14141414],
       [ 4.24242424],
       [ 4.34343434],
       [ 4.44444444],
       [ 4

In [7]:
# Show y after the duplicates were appended. The repeated targets are stacked at
# the end of the array, so the leading rows look the same as before.
y

array([[ 0.49671415],
       [ 0.164766  ],
       [ 1.25374914],
       [ 2.43212077],
       [ 0.97796784],
       [ 1.28101456],
       [ 3.39739463],
       [ 2.88864685],
       [ 1.95476804],
       [ 3.26983277],
       [ 2.56688534],
       [ 2.86760358],
       [ 3.87832591],
       [ 2.02611369],
       [ 2.51750641],
       [ 3.98316702],
       [ 3.83565373],
       [ 5.46576248],
       [ 4.54652138],
       [ 4.34527206],
       [ 7.52625483],
       [ 6.13786006],
       [ 6.73419487],
       [ 5.54494878],
       [ 6.72834455],
       [ 7.68668017],
       [ 6.7277943 ],
       [ 8.5575162 ],
       [ 7.88420979],
       [ 8.49618504],
       [ 8.48920248],
       [11.24621758],
       [ 9.68347247],
       [ 8.94228907],
       [11.12557522],
       [ 9.38521696],
       [11.1179545 ],
       [ 9.25245109],
       [10.18696547],
       [12.01504305],
       [12.8596787 ],
       [12.59561071],
       [12.61162444],
       [12.72919933],
       [11.85481134],
       [12

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE: X still contains the appended duplicate rows at this point, so a copy of a
# training sample can also end up in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Ordinary least-squares fit on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [10]:
# Mean squared error on the training and test splits.
# These are plain MSE values: a high training error hints at bias (underfitting) and
# a test error well above the training error hints at variance, but neither number
# is the bias or variance term itself.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_error = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
test_error = mean_squared_error(y_true=y_test, y_pred=y_test_pred)


In [11]:
# Report both numbers as what they are: mean squared errors.
# Training MSE is not the model's bias and test MSE is not its variance; at best the
# training error is a rough proxy for bias and the train/test gap a proxy for variance,
# so the labels name the metric instead of the bias/variance terms.
print("Training error (MSE):", train_error)
print("Testing error (MSE):", test_error)


Training error (bias): 0.9087396281299075
Testing error (variance): 0.4189781106088322


In [12]:
# Drop repeated rows: np.unique returns the distinct rows of X in sorted order along
# with the index of each row's first occurrence, which picks out the matching targets.
X_unique, indices = np.unique(X, return_index=True, axis=0)
y_unique = np.take(y, indices, axis=0)


In [None]:
# Repeated random hold-out evaluation on the de-duplicated data: five independent
# 80/20 splits (random_state 0-4), each scored with a freshly fitted model.
# This is not k-fold: the validation sets are drawn independently and may overlap.
# NOTE: the loop rebinds X_train, y_train and model from the earlier split cells.
cross_val_errors = []
for i in range(5):
    X_train, X_val, y_train, y_val = train_test_split(
        X_unique, y_unique, test_size=0.2, random_state=i
    )
    model = LinearRegression().fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    val_error = mean_squared_error(y_val, y_val_pred)
    cross_val_errors.append(val_error)

print("Cross-validation errors:", cross_val_errors)
print("Average cross-validation error:", np.mean(cross_val_errors))


Cross-validation errors: [0.8943963399542353, 0.711838267559308, 0.9664698336127481, 0.9156270854451775, 1.1754593805313762]
Average cross-validation error: 0.932758181420569
