In [1]:
!mkdir data
%cd data

!gdown 1qiUDDoYyRLBiKOoYWdFl_5WByHE8Cugu

%cd ..

d:\AIVietNam\2024\aio-2024-hw\module-5\09_11_2024_M05W03\data


A subdirectory or file data already exists.


d:\AIVietNam\2024\aio-2024-hw\module-5\09_11_2024_M05W03


Downloading...
From: https://drive.google.com/uc?id=1qiUDDoYyRLBiKOoYWdFl_5WByHE8Cugu
To: d:\AIVietNam\2024\aio-2024-hw\module-5\09_11_2024_M05W03\data\Auto_MPG_data.csv

  0%|          | 0.00/15.4k [00:00<?, ?B/s]
100%|██████████| 15.4k/15.4k [00:00<00:00, 15.4MB/s]


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
random_state = 42
np.random.seed(random_state)
torch.manual_seed(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_state)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
df = pd.read_csv('./data/Auto_MPG_data.csv')
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
0,18.0,8,307.0,130.0,3504.0,12.0,70,0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,0,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,0,0,1


In [5]:
X = df.drop(columns='MPG').values
y = df['MPG'].values

In [6]:
val_size = 0.2
test_size = 0.125
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=val_size,
    random_state=random_state,
    shuffle=is_shuffle
)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=test_size,
    random_state=random_state,
    shuffle=is_shuffle
)

In [7]:
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_val = normalizer.transform(X_val)
X_test = normalizer.transform(X_test)

X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

y_train = torch.tensor(y_train, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [8]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
batch_size = 32

train_ds = CustomDataset(X_train, y_train)
val_ds = CustomDataset(X_val, y_val)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

In [9]:
class MLP(nn.Module):
    def __init__(
        self,
        input_dims,
        hidden_dims,
        output_dims,
    ):
        super().__init__()
        self.linear1 = nn.Linear(input_dims, hidden_dims)
        self.linear2 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)
    
    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        out = self.output(x)
        return out.squeeze(1)

In [10]:
input_dims = X_train.shape[1]
output_dims = 1
hidden_dims = 64

model = MLP(
    input_dims,
    hidden_dims,
    output_dims,
).to(device)

In [11]:
lr = 1e-2
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr)

In [12]:
def r_squared(y_true, y_pred):
    y_true = torch.Tensor(y_true).to(device)
    y_pred = torch.Tensor(y_pred).to(device)
    mean_true = torch.mean(y_true)
    ss_tot = torch.sum((y_true - mean_true) ** 2)
    ss_res = torch.sum((y_true - y_pred) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2

In [13]:
epochs = 100
train_losses = []
val_losses = []
train_r2 = []
val_r2 = []

In [14]:
for epoch in range(epochs):
    train_loss = 0.0
    train_target = []
    val_target = []
    train_predict = []
    val_predict = []
    model.train()
    print(f'Epoch: {epoch + 1}/{epochs}')
    
    for X_samples, y_samples in train_loader:
        print(f'\tX samples shape: {X_samples.shape}')
        print(f'\ty samples shape: {y_samples.shape}')
        X_samples = X_samples.to(device)
        y_samples = y_samples.to(device)
        optimizer.zero_grad()
        outputs = model(X_samples)
        train_predict += outputs.tolist()
        train_target += y_samples.tolist()
        loss = criterion(outputs, y_samples)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    train_losses.append(train_losses)
    train_r2.append(r_squared(train_target, train_predict))
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_samples, y_samples in val_loader:
            X_samples, y_samples = X_samples.to(device), y_samples.to(device)
            outputs = model(X_samples)
            val_predict += outputs.tolist()
            val_target += y_samples.tolist()
            loss = criterion(outputs, y_samples)
            val_loss += loss.item()
    val_loss /= len(val_loader)
    val_losses.append(val_loss)
    val_r2.append(r_squared(val_target, val_predict))
    print(f'EPOCH {epoch + 1}:\tTraining loss: {train_loss:.3f}\tValidation loss: {val_loss:.3f}')
    
    
        

Epoch: 1/100
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([17, 9])
	y samples shape: torch.Size([17])
EPOCH 1:	Training loss: 397.720	Validation loss: 110.199
Epoch: 2/100
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y samples shape: torch.Size([32])
	X samples shape: torch.Size([32, 9])
	y 

In [15]:
model.eval()
with torch.no_grad():
    y_hat = model(X_test)
    test_set_r2 = r_squared(y_hat, y_test)
    print('Evaluation on test set:')
    print(f'R2: {test_set_r2}')

Evaluation on test set:
R2: 0.48016536235809326


In [16]:
print(train_predict)

[12.760104179382324, 29.621501922607422, 29.750469207763672, 25.45534896850586, 17.92928695678711, 22.367671966552734, 24.992233276367188, 14.797846794128418, 32.23433303833008, 23.009693145751953, 17.61780548095703, 27.208797454833984, 31.5474853515625, 17.567934036254883, 12.801819801330566, 26.704265594482422, 30.745174407958984, 12.584757804870605, 12.41042423248291, 30.391803741455078, 12.688191413879395, 27.644634246826172, 19.007080078125, 14.377476692199707, 14.97791576385498, 21.478553771972656, 12.34703540802002, 12.127272605895996, 26.29348373413086, 13.093685150146484, 38.206844329833984, 28.184402465820312, 21.80198097229004, 25.164573669433594, 25.54029655456543, 16.925352096557617, 21.936176300048828, 19.58197593688965, 15.134223937988281, 15.344589233398438, 20.8765811920166, 22.568498611450195, 27.770063400268555, 29.582202911376953, 31.116260528564453, 26.845027923583984, 13.502397537231445, 15.279560089111328, 29.169450759887695, 27.351341247558594, 23.29266929626465