# Bias-Variance Tradeoff

In [1]:
from seaborn import load_dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load seaborn's 'mpg' example dataset, reading only the columns named in
# `usecols`.
cars = load_dataset(
    'mpg',
    usecols=[
        'mpg', 'cylinders', 'displacement', 'horsepower',
        'weight', 'acceleration', 'model_year',
    ],
)
# Discard every row that has a missing value in any of the kept columns.
cars = cars.dropna()
# Preview the first rows as the cell output.
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70


In [3]:
# Target is the 'mpg' column; the features are every other column of `cars`.
y = cars['mpg']
X = cars.drop(columns='mpg')

In [4]:
# Baseline model: always predict the mean of the training target.
# The train/validation split is repeated with 10 different seeds so that both
# the average error and its spread across splits can be reported.
train_mses = []
val_mses = []
for rs in range(10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=rs)

    # Computed once, from the training split only, and reused for both sets
    # so nothing from the validation split feeds into the prediction.
    baseline = y_train.mean()

    tr_preds = np.full(len(y_train), baseline)
    # mean_squared_error takes (y_true, y_pred), in that order.
    train_mses.append(mean_squared_error(y_train, tr_preds))

    val_preds = np.full(len(y_val), baseline)
    val_mses.append(mean_squared_error(y_val, val_preds))

# Mean and standard deviation of the MSE over the 10 splits.
print(np.mean(train_mses), np.std(train_mses))
print(np.mean(val_mses), np.std(val_mses))

59.82252839094822 2.4737817631407633
63.828271275857276 7.780758423478221


## Make the Model More Complex

In [6]:
# More complex model: linear regression on a degree-3 polynomial expansion of
# the features, evaluated over 10 seeded train/validation splits.
train_mses = []
val_mses = []
for rs in range(10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=rs)

    # Fit the expansion on the training split, then apply it to both splits.
    poly = PolynomialFeatures(degree=3).fit(X_train)
    poly_tr = poly.transform(X_train)
    poly_val = poly.transform(X_val)

    lr = LinearRegression().fit(poly_tr, y_train)
    # mean_squared_error takes (y_true, y_pred), in that order.
    train_mses.append(mean_squared_error(y_train, lr.predict(poly_tr)))
    val_mses.append(mean_squared_error(y_val, lr.predict(poly_val)))

# Mean and standard deviation of the MSE over the 10 splits.
print(np.mean(train_mses), np.std(train_mses))
print(np.mean(val_mses), np.std(val_mses))

8.321460244462207 7.470903663218962
199.42920069914632 461.31811498621653


## Find the Sweet Spot

In [7]:
# Plain linear regression on the raw features, evaluated over 10 seeded
# train/validation splits.
train_mses = []
val_mses = []
for rs in range(10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=rs)

    lr = LinearRegression().fit(X_train, y_train)
    # mean_squared_error takes (y_true, y_pred), in that order.
    train_mses.append(mean_squared_error(y_train, lr.predict(X_train)))
    val_mses.append(mean_squared_error(y_val, lr.predict(X_val)))

# Mean and standard deviation of the MSE over the 10 splits.
print(np.mean(train_mses), np.std(train_mses))
print(np.mean(val_mses), np.std(val_mses))

11.371483613451371 0.5859715309118807
12.668170106939984 1.8246927129692667
