In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
from sklearn import model_selection

# Seed PyTorch's global random number generator so repeated runs draw the same random values.
torch.manual_seed(1)

<torch._C.Generator at 0x1a4ffbdfa10>

In [2]:
# Column labels supplied by hand, in the order the fields appear in the file.
column_names = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model Year",
    "Origin",
]

# Space-delimited file: runs of spaces after a delimiter are skipped, and
# everything from a tab character to the end of a line is ignored as a comment.
df = pd.read_csv(
    "datasets/auto-mpg.csv",
    names=column_names,
    sep=" ",
    skipinitialspace=True,
    comment="\t",
    quotechar='"',
)

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [4]:
def describe_data(f: pd.DataFrame):
    """Print a short structural summary of a DataFrame.

    Three sections are written to stdout, each followed by a blank line:
    the frame's shape, the number of null values per column, and the
    dtype of each column.

    Args:
        f: The DataFrame to summarize.

    Returns:
        None. The summary is printed, not returned.
    """
    null_counts = f.isna().sum()
    column_dtypes = f.dtypes

    sections = (
        f"Shape: {f.shape}\n",
        f"Missing values:\n{null_counts}\n",
        f"Columns type:\n{column_dtypes}\n",
    )
    for section in sections:
        print(section)

In [5]:
# Summarize shape, per-column null counts and dtypes of the freshly loaded frame.
describe_data(df)

Shape: (398, 8)

Missing values:
MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

Columns type:
MPG             float64
Cylinders         int64
Displacement    float64
Horsepower       object
Weight          float64
Acceleration    float64
Model Year        int64
Origin            int64
dtype: object



There are no missing values, but the column Horsepower has the type object, so it probably contains entries that are not valid numbers.

In [6]:
# Parse Horsepower as numbers: entries that cannot be parsed become NaN
# (errors="coerce"), the rows holding them are dropped, and the index is
# renumbered from 0 so it stays contiguous.
df = (
    df.assign(Horsepower=pd.to_numeric(df["Horsepower"], errors="coerce"))
    .dropna(subset=["Horsepower"])
    .reset_index(drop=True)
)

In [7]:
# Re-run the summary to confirm the Horsepower cleanup: row count, null counts and dtypes.
describe_data(df)

Shape: (392, 8)

Missing values:
MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

Columns type:
MPG             float64
Cylinders         int64
Displacement    float64
Horsepower      float64
Weight          float64
Acceleration    float64
Model Year        int64
Origin            int64
dtype: object



Now there are no NA values and the column types are correct. All columns are numeric; however, the column Origin is categorical (its numbers are category codes, not quantities), so it is one-hot encoded below.

In [8]:
# List the distinct Origin codes to see how many categories need encoding.
df['Origin'].unique()

array([1, 3, 2], dtype=int64)

In [9]:
# One-hot encoding
df = pd.get_dummies(df, columns=['Origin']) 
df = df.rename(columns={'Origin_1': 'USA', 
                   'Origin_2': 'Europe', 
                   'Origin_3': 'Japan'})


In [10]:
# Inspect the last rows to verify the new USA / Europe / Japan indicator columns.
df.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
387,27.0,4,140.0,86.0,2790.0,15.6,82,1,0,0
388,44.0,4,97.0,52.0,2130.0,24.6,82,0,1,0
389,32.0,4,135.0,84.0,2295.0,11.6,82,1,0,0
390,28.0,4,120.0,79.0,2625.0,18.6,82,1,0,0
391,31.0,4,119.0,82.0,2720.0,19.4,82,1,0,0


In [11]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
df_train, df_test = model_selection.train_test_split(
    df,
    train_size=0.8,
    random_state=1,
)

In [12]:
# Work on copies so the unscaled splits stay available.
df_train_norm = df_train.copy()
df_test_norm = df_test.copy()

numeric_columns = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']

# Standardization (z-score) of each numeric feature. The mean and standard
# deviation are computed on the training split only and then applied to both
# splits, so no statistic is derived from the test rows.
# (Min/max scaling is a possible alternative; standardization is used here.)
for column in numeric_columns:
    train_mean = df_train[column].mean()
    train_sd = df_train[column].std()

    df_train_norm[column] = (df_train[column] - train_mean) / train_sd
    df_test_norm[column] = (df_test[column] - train_mean) / train_sd

In [13]:
# Spot-check the standardized training rows (MPG and the indicator columns are left unscaled).
df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,0.019124,0,0,1
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,0.563297,1,0,0
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,-1.069221,1,0,0
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,0.291211,1,0,0
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,-1.341307,1,0,0


In [14]:
# Build float32 tensors: MPG is the regression target, every other column is a feature.
#
# to_numpy(dtype='float32') converts each column to float explicitly instead of
# relying on the dtype pandas infers for the whole frame. A frame that mixes
# float columns with bool indicator columns would otherwise yield an object
# array, which torch.tensor() rejects.
#
# Targets for both splits are read from the *_norm frames for consistency;
# MPG is not among the standardized columns, so its values are unchanged.
y_train = torch.tensor(df_train_norm['MPG'].to_numpy(dtype='float32'))
x_train = torch.tensor(df_train_norm.drop('MPG', axis=1).to_numpy(dtype='float32'))

y_test = torch.tensor(df_test_norm['MPG'].to_numpy(dtype='float32'))
x_test = torch.tensor(df_test_norm.drop('MPG', axis=1).to_numpy(dtype='float32'))

In [15]:
# Pair every feature row with its MPG target so a DataLoader can batch them together.
train_ds, test_ds = (
    TensorDataset(x_train, y_train),
    TensorDataset(x_test, y_test),
)

In [16]:
class Model(nn.Module):
    """Small fully connected regressor: input_size -> 8 -> 4 -> 1.

    ReLU is applied after each of the two hidden layers; the final layer is
    linear and produces one value per input row.
    """

    def __init__(self, input_size):
        """Create the three linear layers.

        Args:
            input_size: Number of features in each input row.
        """
        super().__init__()
        self.l1 = nn.Linear(in_features=input_size, out_features=8)
        self.l2 = nn.Linear(in_features=8, out_features=4)
        self.l3 = nn.Linear(in_features=4, out_features=1)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with a last dimension of size 1.
        """
        hidden = self.l1(x).relu()
        hidden = self.l2(hidden).relu()
        return self.l3(hidden)

In [17]:
# One input unit per feature column of the training matrix.
n_features = x_train.shape[1]
model = Model(n_features)

# Mean squared error between predicted and actual MPG.
loss_fn = nn.MSELoss()

# Plain stochastic gradient descent over every trainable parameter.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [18]:
batch_size = 8
# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=batch_size)
n_batches = len(train_loader)
print(n_batches)

# Train the model
for epoch in range(200):
    running_loss = 0
    # One optimizer update per mini-batch
    for features, targets in train_loader:
        # Targets are 1-D; add a trailing axis so they match the model's (batch, 1) output.
        batch_loss = loss_fn(model(features), targets.unsqueeze(1))
        batch_loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        running_loss += batch_loss.item()

    # Report the mean per-batch loss after the first epoch and then every 20th epoch.
    epoch_number = epoch + 1
    if epoch_number == 1 or epoch_number % 20 == 0:
        print(f'Epoch {epoch_number} Loss '
              f'{running_loss / n_batches:.2f}')

40
Epoch 1 Loss 514.93
Epoch 20 Loss 7.31
Epoch 40 Loss 6.95
Epoch 60 Loss 6.09
Epoch 80 Loss 5.78
Epoch 100 Loss 6.00
Epoch 120 Loss 5.79
Epoch 140 Loss 5.89
Epoch 160 Loss 5.50
Epoch 180 Loss 5.47
Epoch 200 Loss 5.42


In [19]:
# Score the trained model on each full split; no gradients are needed here.
with torch.no_grad():
    # The model returns shape (N, 1); drop the trailing axis to match the 1-D targets.
    pred_train = model(x_train.float()).squeeze(1)
    pred_test = model(x_test.float()).squeeze(1)

    loss_train = loss_fn(pred_train, y_train)
    loss_test = loss_fn(pred_test, y_test)

print(f'Train MSE: {loss_train.item():.4f}')
print(f'Test MSE: {loss_test.item():.4f}')

Train MSE: 5.7865
Test MSE: 9.9112
