In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler


predictors = ['cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'year']
target = 'mpg'

df = pd.read_csv('https://www.statlearning.com/s/Auto.csv', na_values='?')
print(df)
df = df[predictors + [target]]

df = df.sample(frac=1.)
train = df[:int(len(df) * 0.8)]
test = df[int(len(df) * 0.8):]
print(f'Using {len(train)} samples for training and {len(test)} for testing')

test.loc[np.isnan(df['horsepower']), 'horsepower'] = np.nanmean(test['horsepower'])
train.loc[np.isnan(df['horsepower']), 'horsepower'] = np.nanmean(train['horsepower'])

scaler = StandardScaler()
train[predictors] = scaler.fit_transform(train[predictors])
test[predictors] = scaler.transform(test[predictors])

      mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
0    18.0          8         307.0       130.0    3504          12.0    70   
1    15.0          8         350.0       165.0    3693          11.5    70   
2    18.0          8         318.0       150.0    3436          11.0    70   
3    16.0          8         304.0       150.0    3433          12.0    70   
4    17.0          8         302.0       140.0    3449          10.5    70   
..    ...        ...           ...         ...     ...           ...   ...   
392  27.0          4         140.0        86.0    2790          15.6    82   
393  44.0          4          97.0        52.0    2130          24.6    82   
394  32.0          4         135.0        84.0    2295          11.6    82   
395  28.0          4         120.0        79.0    2625          18.6    82   
396  31.0          4         119.0        82.0    2720          19.4    82   

     origin                       name  
0         1  chevrolet

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[predictors] = scaler.fit_transform(train[predictors])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[predictors] = scaler.transform(test[predictors])


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from typing import List


def fit_and_evaluate(train: pd.DataFrame, test: pd.DataFrame,
                     predictors: List[str], target: str):
    print(predictors)
    model = LinearRegression()
    model.fit(train[predictors], train[target])
    predictions = model.predict(test[predictors])

    return mean_absolute_error(test[target], predictions)

mae = fit_and_evaluate(train, test, predictors, target)
print(mae)

['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
2.6531479524644976


In [8]:
train[predictors]

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year
238,-0.865954,-0.930671,-0.582539,-1.055375,0.165439,0.303785
2,1.426383,1.140555,1.113484,0.514445,-1.620356,-1.589750
382,-0.865954,-0.996573,-0.987560,-1.182253,-0.162564,1.656310
360,0.280215,-0.488181,-0.759736,0.196098,1.513896,1.385805
170,-0.865954,-0.535255,-0.709108,-0.459051,1.113003,-0.237225
...,...,...,...,...,...,...
66,1.426383,1.008750,1.113484,0.786655,-1.438132,-1.048740
77,-0.865954,-0.714133,-0.759736,-0.552479,0.930779,-1.048740
314,-0.865954,-0.535255,-0.455970,-0.138397,0.967224,1.115300
59,-0.865954,-0.940085,-1.316639,-0.848911,2.935243,-1.048740


In [15]:
import numpy as np

def find_worst_predictor(train: pd.DataFrame, test: pd.DataFrame, 
                         predictors: pd.DataFrame, target: pd.DataFrame):
    maes = []
    for predictor in predictors:
        remaining = list(set(predictors) - set([predictor]))
        mae = fit_and_evaluate(train, test, remaining, target)
        maes.append(mae)
    

    return predictors[np.argmin(maes)]

find_worst_predictor(train, test, predictors, target)

['year', 'displacement', 'acceleration', 'horsepower', 'weight']
['year', 'acceleration', 'horsepower', 'weight', 'cylinders']
['year', 'displacement', 'acceleration', 'weight', 'cylinders']
['year', 'displacement', 'acceleration', 'horsepower', 'cylinders']
['year', 'displacement', 'horsepower', 'weight', 'cylinders']
['displacement', 'acceleration', 'horsepower', 'weight', 'cylinders']


'acceleration'

In [16]:
remaining = predictors

while len(remaining) > 1:
    remove = find_worst_predictor(train, test, remaining, target)
    print(remove)
    remaining = [predictor for predictor in remaining if predictor != remove]

['year', 'displacement', 'acceleration', 'horsepower', 'weight']
['year', 'acceleration', 'horsepower', 'weight', 'cylinders']
['year', 'displacement', 'acceleration', 'weight', 'cylinders']
['year', 'displacement', 'acceleration', 'horsepower', 'cylinders']
['year', 'displacement', 'horsepower', 'weight', 'cylinders']
['displacement', 'acceleration', 'horsepower', 'weight', 'cylinders']
acceleration
['year', 'displacement', 'horsepower', 'weight']
['year', 'horsepower', 'cylinders', 'weight']
['year', 'displacement', 'cylinders', 'weight']
['year', 'displacement', 'cylinders', 'horsepower']
['displacement', 'cylinders', 'horsepower', 'weight']
horsepower
['year', 'displacement', 'weight']
['year', 'cylinders', 'weight']
['year', 'displacement', 'cylinders']
['displacement', 'cylinders', 'weight']
displacement
['year', 'weight']
['year', 'cylinders']
['cylinders', 'weight']
cylinders
['year']
['weight']
year
