In [None]:
import pandas as pd


# Load the Auto dataset and keep only rows whose 'horsepower' is not the '?'
# placeholder, then convert that column to float.
df = pd.read_csv('~/Downloads/Auto.csv')
# .copy() makes the filtered frame independent of the frame returned by
# read_csv, so the column assignment below does not raise
# SettingWithCopyWarning.
df = df[df['horsepower'] != '?'].copy()
df['horsepower'] = df['horsepower'].astype(float)

In [None]:
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Repeated K-fold cross-validation of a linear model mpg ~ horsepower + year.
# For every seed the rows are shuffled, split into NUM_FOLDS folds, and the
# validation MSE is averaged over the folds; the per-seed averages are
# collected in `mean_mses`.
NUM_FOLDS = 5

seeds = np.arange(42, 142)
mean_mses = []

# Fold label per row position (0, 1, ..., NUM_FOLDS - 1, 0, 1, ...). It does
# not depend on the seed, so it is computed once outside the loop.
fold_ids = np.arange(len(df)) % NUM_FOLDS

for seed in seeds:
    np.random.seed(seed)
    # Shuffle into a new frame instead of overwriting `df`: otherwise the
    # permutations compound across seeds (and across re-runs of this cell),
    # and a given seed no longer determines its own split.
    shuffled = df.iloc[np.random.permutation(len(df))]
    mses = []

    for fold in range(NUM_FOLDS):
        train = shuffled[fold_ids != fold]
        validation = shuffled[fold_ids == fold]

        model = LinearRegression()
        model.fit(train[['horsepower', 'year']], train['mpg'])
        predictions = model.predict(validation[['horsepower', 'year']])
        mse = mean_squared_error(validation['mpg'], predictions)
        mses.append(mse)

    # Average over all folds (`mses`), not just the last fold's `mse`.
    mean_mses.append(np.mean(mses))

print(f'MSE range: {np.amin(mean_mses):.2f}-{np.amax(mean_mses):.2f}')

In [None]:
from plotly.figure_factory import create_distplot


# Distribution plot of the values in `mean_mses` (defined outside this cell),
# shown as a single group labelled 'Mean MSE'.
hist_data = [mean_mses]
group_labels = ['Mean MSE']
fig = create_distplot(hist_data, group_labels)
fig.show()

In [None]:
from sklearn.dummy import DummyRegressor


# Baseline: a DummyRegressor (default strategy) scored with its own K-fold
# split, so the reported number is a fold-averaged MSE rather than the MSE of
# whatever single train/validation split was left over from an earlier loop.
# Relies on `df`, `seeds`, `NUM_FOLDS`, `np` and `mean_squared_error` from
# earlier cells.
baseline_rng = np.random.default_rng(int(seeds[0]))  # local RNG; global state untouched
baseline_df = df.iloc[baseline_rng.permutation(len(df))]
baseline_fold_ids = np.arange(len(baseline_df)) % NUM_FOLDS

baseline_mses = []
for baseline_fold in range(NUM_FOLDS):
    baseline_train = baseline_df[baseline_fold_ids != baseline_fold]
    baseline_validation = baseline_df[baseline_fold_ids == baseline_fold]

    baseline = DummyRegressor()
    baseline.fit(baseline_train[['horsepower', 'year']], baseline_train['mpg'])
    baseline_predictions = baseline.predict(
        baseline_validation[['horsepower', 'year']]
    )
    baseline_mses.append(
        mean_squared_error(baseline_validation['mpg'], baseline_predictions)
    )

baseline_mse = np.mean(baseline_mses)

print(f'Baseline MSE: {baseline_mse:.2f}')