In [None]:
import pandas as pd
import numpy as np
import typing as ty
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset



In [None]:
# Single seed shared by every random number generator used in this notebook.
SEED = 777

# NumPy's global generator (used by np.random.permutation when shuffling windows).
np.random.seed(SEED)

# PyTorch: seed the default generator, plus every CUDA device when one is present.
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [None]:
# Load the price table from 'nvda.us.txt' (path is relative to the working directory).
df = pd.read_csv('nvda.us.txt')
# Bare last expression so the notebook renders the loaded frame.
df

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def create_sequences(input_data, output_data, window_size, step):
    """Cut a time series into fixed-length windows, each paired with one target.

    Window ``k`` starts at row ``k * step`` and spans ``window_size``
    consecutive rows of ``input_data``; its target is the element of
    ``output_data`` located at the row right after the window. Rows are always
    addressed by position, so pandas objects with a non-default index behave
    exactly like plain arrays.

    Args:
        input_data: Array-like (ndarray, DataFrame, list, ...) whose first
            axis is time.
        output_data: Array-like of targets aligned row-for-row with
            ``input_data``.
        window_size: Number of rows in each window (must be >= 1).
        step: Distance in rows between the starts of consecutive windows
            (must be >= 1).

    Returns:
        Tuple ``(sequences, labels)`` of numpy arrays. ``sequences`` stacks the
        windows along a new leading axis and ``labels`` holds one target per
        window. Both are empty when the series is not longer than
        ``window_size``.

    Raises:
        ValueError: If ``window_size`` or ``step`` is smaller than 1.
    """
    if window_size < 1 or step < 1:
        raise ValueError(
            f'window_size and step must be >= 1, got window_size={window_size}, step={step}')

    # Convert once up front: slicing/indexing ndarrays is purely positional,
    # whereas Series[int] is a label lookup that breaks on non-default indexes.
    inputs = np.asarray(input_data)
    targets = np.asarray(output_data)

    # The upper bound keeps `start + window_size` a valid target position.
    starts = range(0, len(inputs) - window_size, step)
    sequences = [inputs[start:start + window_size] for start in starts]
    labels = [targets[start + window_size] for start in starts]
    return np.array(sequences), np.array(labels)


def prepare_data_loaders(
    raw_data: pd.DataFrame,
    use_feature_names: ty.List[str],
    batch_size: int = 32,
    window_size: int = 10,
    step: int = 15,
    normalization: ty.Optional[str] = None) -> ty.Tuple:
    """Window the price table and wrap it in train/validation/test DataLoaders.

    Args:
        raw_data: Table holding the feature columns and a 'High' column.
        use_feature_names: Columns of ``raw_data`` used as model inputs.
        batch_size: Batch size shared by all three loaders.
        window_size: Number of consecutive rows in each input window.
        step: Distance in rows between the starts of consecutive windows.
        normalization: ``None`` to leave the inputs untouched, ``'minmax'`` or
            ``'std'`` to scale every feature channel with a scaler fitted on
            the training windows only.

    Returns:
        ``(train_loader, val_loader, test_loader, y_scaler)``. Every dataset
        yields triples ``(features, label, standardized label)`` and
        ``y_scaler`` is the StandardScaler fitted on the training labels.
    """
    # Read from the `raw_data` argument (not a notebook-level global) and force
    # a float dtype so the in-place normalization below cannot truncate
    # integer-valued columns.
    features = raw_data[use_feature_names].to_numpy(dtype=np.float64)
    # Next row's 'High' as label; the shift leaves NaN in the final row.
    # NOTE(review): create_sequences takes the label at the row right after each
    # window, so together with this shift the target is the 'High' two rows past
    # the window's last row -- confirm this offset is intended.
    labels = raw_data['High'].shift(-1).to_numpy(dtype=np.float64)

    # Drop the final row so its NaN label can never be attached to a window,
    # and honour the caller's window_size/step instead of fixed values.
    X, y = create_sequences(features[:-1], labels[:-1], window_size=window_size, step=step)

    print(f'Shape of data X: {X.shape}')
    print(f'Shape of data y: {y.shape}')

    # Hold out 10% of the windows, evenly spaced along the series, as test set.
    test_ind = np.linspace(0, len(X)-1, num=int(len(X)*0.1), dtype=int)
    x_test = X[test_ind]
    y_test = y[test_ind]
    remains_ind = np.delete(np.arange(len(X)), test_ind)

    X = X[remains_ind]
    y = y[remains_ind]

    # Shuffle the remaining windows, then split them 80/20 into train/validation.
    shuffled_ind = np.random.permutation(len(X))
    X = X[shuffled_ind]
    y = y[shuffled_ind]
    split_point = int(X.shape[0]*0.8)

    x_train = X[:split_point]
    y_train = y[:split_point]
    x_val = X[split_point:]
    y_val = y[split_point:]

    print(f'Shape of data x_train: {x_train.shape}')
    print(f'Shape of data y_train: {y_train.shape}')
    print(f'Shape of data x_val: {x_val.shape}')
    print(f'Shape of data y_val: {y_val.shape}')
    print(f'Shape of data x_test: {x_test.shape}')
    print(f'Shape of data y_test: {y_test.shape}')

    # Labels are always standardized; the scaler is fitted on the training
    # labels only and reused for validation/test.
    y_scaler = StandardScaler()
    y_train_normalized = y_scaler.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
    y_val_normalized = y_scaler.transform(y_val.reshape(-1, 1)).reshape(-1)
    y_test_normalized = y_scaler.transform(y_test.reshape(-1, 1)).reshape(-1)

    # Features are scaled per channel, again fitting on the training split only.
    if normalization is not None:
        assert normalization in ('minmax', 'std')

        for c in range(x_train.shape[2]):
            scaler = MinMaxScaler() if normalization == 'minmax' else StandardScaler()
            # Flatten (windows, time) into one column for the scaler, then
            # restore the (windows, window_size) layout.
            x_train[:, :, c] = scaler.fit_transform(x_train[:, :, c].reshape(-1, 1)).reshape(-1, window_size)
            x_val[:, :, c] = scaler.transform(x_val[:, :, c].reshape(-1, 1)).reshape(-1, window_size)
            x_test[:, :, c] = scaler.transform(x_test[:, :, c].reshape(-1, 1)).reshape(-1, window_size)

    # Convert to float32 PyTorch tensors.
    x_train = torch.from_numpy(x_train).float()
    y_train = torch.from_numpy(y_train).float()
    y_train_normalized = torch.from_numpy(y_train_normalized).float()

    x_val = torch.from_numpy(x_val).float()
    y_val = torch.from_numpy(y_val).float()
    y_val_normalized = torch.from_numpy(y_val_normalized).float()

    x_test = torch.from_numpy(x_test).float()
    y_test = torch.from_numpy(y_test).float()
    y_test_normalized = torch.from_numpy(y_test_normalized).float()

    # Each sample carries both the raw and the standardized label.
    train_dataset = TensorDataset(x_train, y_train, y_train_normalized)
    val_dataset = TensorDataset(x_val, y_val, y_val_normalized)
    test_dataset = TensorDataset(x_test, y_test, y_test_normalized)

    # Only the training loader reshuffles between epochs.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print(f'Number of samples in training and validation are {len(train_loader.dataset)} and {len(val_loader.dataset)}.')

    return (train_loader, val_loader, test_loader, y_scaler)


In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm.auto import tqdm
from collections import defaultdict



class LSTMModel(nn.Module):
    """Stacked LSTM whose last time-step output feeds a linear read-out layer."""

    def __init__(self, input_dim: int, hidden_dim: int, num_layers: int, output_dim: int) -> None:
        """Create the recurrent encoder and the linear head.

        Args:
            input_dim: Number of features per time step.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
            output_dim: Number of values produced per sequence.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the LSTM over ``x`` and project the final time step through ``fc``."""
        hidden_seq, _state = self.lstm(x)
        # Keep only the output at the last time step of every sequence.
        last_step = hidden_seq[:, -1, :]
        return self.fc(last_step)


def train_one_epoch(
        model: nn.Module,
        loader: DataLoader,
        criterion: ty.Callable,
        optimizer: optim.Optimizer,
        device: str) -> float:
    """Run one optimization pass over ``loader`` and return the mean batch loss.

    Each batch is a triple; the first element is the model input and the third
    element is the target the loss is computed against (the second is unused).

    Args:
        model: Network to train; switched to training mode.
        loader: Source of ``(features, _, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for batch_x, _unused_y, batch_y in loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Clear gradients left over from the previous step before back-propagating.
        optimizer.zero_grad()
        predictions = model(batch_x).squeeze(-1)
        batch_loss = criterion(predictions, batch_y)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(loader)


@torch.no_grad()
def eval_one_epoch(
    model: nn.Module,
    loader: DataLoader,
    criterion: ty.Callable,
    y_scaler: StandardScaler,
    device: str) -> float:
    """Evaluate ``model`` on ``loader`` after undoing the label scaling.

    Each batch is a triple; the first element is the model input and the second
    element is the target compared against the inverse-transformed predictions
    (the third is unused).

    Args:
        model: Network to evaluate; switched to evaluation mode.
        loader: Source of ``(features, labels, _)`` batches.
        criterion: Loss called once on all concatenated predictions and labels.
        y_scaler: Scaler whose ``inverse_transform`` maps model outputs back to
            the label scale.
        device: Device the input features are moved to.

    Returns:
        The loss over the whole loader as a Python float.
    """
    model.eval()
    predictions = []
    targets = []

    for batch_x, batch_y, _unused_scaled_y in loader:
        scaled_pred = model(batch_x.to(device)).squeeze(-1).cpu().numpy()
        # The scaler works on column vectors; flatten back afterwards.
        restored = y_scaler.inverse_transform(scaled_pred.reshape(-1, 1)).reshape(-1)
        predictions.append(torch.from_numpy(restored))
        targets.append(batch_y)

    # A single loss over every sample rather than an average of batch losses.
    overall_loss = criterion(torch.cat(predictions), torch.cat(targets))

    return overall_loss.item()


def run_experiment(
    epochs: int,
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    test_loader: DataLoader,
    y_scaler: StandardScaler,
    device: str,
    checkpoint_path: str = 'best_model.pth') -> ty.Dict[str, ty.List[float]]:
    """Train ``model``, checkpoint the best validation epoch and test it.

    Training uses MSE loss, Adam (lr=1e-3) and a cosine-annealed learning rate
    stepped once per epoch. After every epoch the train and validation losses
    are measured with ``eval_one_epoch``; whenever the validation loss improves
    the weights are saved to ``checkpoint_path``. The test loss is computed
    with the best saved weights.

    Args:
        epochs: Number of training epochs.
        model: Network to train (modified in place).
        train_loader: Training batches.
        val_loader: Validation batches used for checkpoint selection.
        test_loader: Hold-out batches evaluated once at the end.
        y_scaler: Label scaler forwarded to ``eval_one_epoch``.
        device: Device used for training and evaluation.
        checkpoint_path: File the best weights are written to.

    Returns:
        Mapping with per-epoch lists under 'train_losses' and 'val_losses' and
        a single-element list under 'test_loss'.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0)

    metrics = defaultdict(list)
    best_val_loss = float('inf')
    # Tracks whether THIS run wrote a checkpoint, so a stale file left behind
    # by an earlier experiment is never loaded by mistake.
    checkpoint_written = False

    for epoch in tqdm(range(epochs)):
        # Training (the loss it returns is on the scaled labels and is not reported)
        _ = train_one_epoch(model, train_loader, criterion, optimizer, device)
        lr_scheduler.step()

        curr_train_loss = eval_one_epoch(model, train_loader, criterion, y_scaler, device)

        # Validation
        curr_val_loss = eval_one_epoch(model, val_loader, criterion, y_scaler, device)

        # Checkpoint (a NaN validation loss never compares as smaller)
        if curr_val_loss < best_val_loss:
            best_val_loss = curr_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True

        print(f'Epoch {epoch+1}/{epochs}, Train loss: {curr_train_loss:.4f}, Val loss: {curr_val_loss:.4f}, Best Val loss: {best_val_loss:.4f}')

        metrics['train_losses'].append(curr_train_loss)
        metrics['val_losses'].append(curr_val_loss)

    # Testing: restore the best weights, but only if this run produced them.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path))
    else:
        print('No checkpoint was written during this run; testing with the final weights.')

    test_loss = eval_one_epoch(model, test_loader, criterion, y_scaler, device)
    print(f'test_loss : {test_loss}')

    metrics['test_loss'].append(test_loss)

    return metrics


def find_best_scores(training_losses, validation_losses):
    """Return the (training, validation) losses at the best validation epoch.

    The best epoch is the first position holding the minimum of
    ``validation_losses``; the training loss at that same position is reported
    alongside it.

    Args:
        training_losses: List of per-epoch training losses.
        validation_losses: List of per-epoch validation losses.

    Returns:
        Tuple ``(best_training_score, best_validation_score)``.
    """
    lowest_validation = min(validation_losses)
    best_epoch = validation_losses.index(lowest_validation)
    return training_losses[best_epoch], validation_losses[best_epoch]

## Q1
Train the model using 3 different combinations of window size and step

In [None]:
# Q1 configuration: model size, training length and the sweep grid.
USE_FEATURE_NAMES = ['Open', 'High', 'Low', 'Close']
HIDDEN_DIM = 500
NUM_LAYERS = 2
BATCH_SIZE = 32
WINDOW_SIZE = 10
STEP = 15
NUM_EPOCHS = 100
DEVICE = 'cpu'


# (window_size, step) pairs to compare.
EXP_COMBINATIONS = [
    {'window_size': 10, 'step': 30},
    {'window_size': 10, 'step': 10},
    {'window_size': 30, 'step': 15},
    {'window_size': 20, 'step': 30},
    {'window_size': 30, 'step': 30}

]


results_dict = defaultdict(list)

for combination in EXP_COMBINATIONS:
    print('combination :')
    print(combination)
    window_size = combination['window_size']
    step = combination['step']

    # Build a fresh model for every combination: reusing one instance would let
    # each run start from the weights trained on the previous combination and
    # make the comparison meaningless.
    model = LSTMModel(
        input_dim=len(USE_FEATURE_NAMES),
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        output_dim=1).to(DEVICE)

    train_loader, val_loader, test_loader, y_scaler = \
        prepare_data_loaders(df, USE_FEATURE_NAMES, BATCH_SIZE, window_size, step)

    metrics = run_experiment(NUM_EPOCHS, model, train_loader, val_loader, test_loader, y_scaler, DEVICE)

    # Losses at the epoch with the lowest validation loss.
    train_loss, val_loss = find_best_scores(metrics['train_losses'], metrics['val_losses'])

    results_dict['window_size'].append(window_size)
    results_dict['step'].append(step)
    results_dict['train_mse'].append(train_loss)
    results_dict['val_mse'].append(val_loss)
    results_dict['test_mse'].append(metrics['test_loss'][0])


# One row per combination.
results_df = pd.DataFrame(results_dict)
results_df


## Q2-1
Include 'Volume' as an additional input feature in your model.


In [None]:
# Q2-1 configuration. DEVICE is not redefined here; the value assigned in an
# earlier cell is reused.
USE_FEATURE_NAMES = ['Open', 'High', 'Low', 'Close', 'Volume']
HIDDEN_DIM = 500
NUM_LAYERS = 2
BATCH_SIZE = 32
WINDOW_SIZE = 10
STEP = 15
NUM_EPOCHS = 100

# Same pipeline twice: first without 'Volume' (first four columns), then with it.
feature_sets = {
    'four': USE_FEATURE_NAMES[:4],
    'five': USE_FEATURE_NAMES,
}
metrics_by_set = {}

for set_name, feature_names in feature_sets.items():
    # New model sized to the number of input features of this run.
    model = LSTMModel(
        input_dim=len(feature_names),
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        output_dim=1).to(DEVICE)

    train_loader, val_loader, test_loader, y_scaler = prepare_data_loaders(
        df, feature_names, BATCH_SIZE, WINDOW_SIZE, STEP)

    metrics_by_set[set_name] = run_experiment(
        NUM_EPOCHS, model,
        train_loader, val_loader, test_loader, y_scaler, DEVICE)

metrics_four_features = metrics_by_set['four']
metrics_five_features = metrics_by_set['five']


# Learning curves of both runs on a single figure.
for set_name, run_metrics in metrics_by_set.items():
    plt.plot(run_metrics['train_losses'], label=f'train loss w/ {set_name} features')
    plt.plot(run_metrics['val_losses'], label=f'validation loss w/ {set_name} features')

plt.title('Learning curve comparison')
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.show()

print(f'Test loss (w/ four features) : {metrics_four_features["test_loss"]}')
print(f'Test loss (w/ five features) : {metrics_five_features["test_loss"]}')

In [None]:
# Histogram of the 'Volume' column, split into 30 bins.
plt.hist(df['Volume'], bins=30)
plt.grid()
plt.title('Distribution of Volume')
plt.show()

## Q2-2
Explore and report which combination of input features yields the lowest MSE.

In [None]:
# Q2-2 configuration. DEVICE is not redefined here; the value assigned in an
# earlier cell is reused.
USE_FEATURE_NAMES = ['Open', 'High', 'Low', 'Close', 'Volume']
HIDDEN_DIM = 500
NUM_LAYERS = 2
BATCH_SIZE = 32
WINDOW_SIZE = 10
STEP = 10
NUM_EPOCHS = 100


# Feature subsets to compare: all five, each leave-one-out set, each single column.
USE_FEATURE_NAMES_COMBINATIONS = [
    ['Open', 'High', 'Low', 'Close', 'Volume'],
    ['Open', 'High', 'Low', 'Close'],
    ['High', 'Low', 'Close', 'Volume'],
    ['Open', 'Low', 'Close', 'Volume'],
    ['Open', 'High', 'Close', 'Volume'],
    ['Open', 'High', 'Low', 'Volume'],
    ['Open'],
    ['High'],
    ['Low'],
    ['Close'],
    ['Volume']
]

results_dict = defaultdict(list)

for use_feature_names in USE_FEATURE_NAMES_COMBINATIONS:

    # Fresh model whose input size matches the current feature subset.
    model = LSTMModel(
        input_dim=len(use_feature_names),
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        output_dim=1).to(DEVICE)

    train_loader, val_loader, test_loader, y_scaler = prepare_data_loaders(
        df, use_feature_names, BATCH_SIZE, WINDOW_SIZE, STEP)

    metrics = run_experiment(
        NUM_EPOCHS, model,
        train_loader, val_loader, test_loader, y_scaler, DEVICE)

    # Losses at the epoch with the lowest validation loss.
    train_loss, val_loss = find_best_scores(metrics['train_losses'], metrics['val_losses'])

    # One 0/1 indicator column per candidate feature.
    for feature_name in ['Open', 'High', 'Low', 'Close', 'Volume']:
        results_dict[feature_name].append(1 if feature_name in use_feature_names else 0)

    results_dict['train_mse'].append(train_loss)
    results_dict['val_mse'].append(val_loss)
    results_dict['test_mse'].append(metrics['test_loss'][0])


# One row per feature subset.
results_df = pd.DataFrame(results_dict)
results_df


## Q3:
Analyze the performance of the model with and without normalized inputs in Lab 4

In [None]:
# Q3 configuration. DEVICE is not redefined here; the value assigned in an
# earlier cell is reused.
USE_FEATURE_NAMES = ['Open', 'High', 'Low', 'Close', 'Volume']
HIDDEN_DIM = 500
NUM_LAYERS = 2
BATCH_SIZE = 32
WINDOW_SIZE = 10
STEP = 15
NUM_EPOCHS = 100

# Input-normalization modes to compare (None = raw inputs). Every run uses the
# same five features so that the model's input size (len(USE_FEATURE_NAMES))
# always matches the data it is fed and the three runs are comparable.
metrics_by_normalization = {}

for normalization_mode in (None, 'minmax', 'std'):
    # Fresh model for each run.
    model = LSTMModel(
        input_dim=len(USE_FEATURE_NAMES),
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        output_dim=1).to(DEVICE)

    train_loader, val_loader, test_loader, y_scaler = \
        prepare_data_loaders(df, USE_FEATURE_NAMES, BATCH_SIZE, WINDOW_SIZE, STEP, normalization_mode)

    metrics_by_normalization[normalization_mode] = run_experiment(
        NUM_EPOCHS, model,
        train_loader, val_loader, test_loader, y_scaler, DEVICE)

metrics_without_normalization = metrics_by_normalization[None]
metrics_with_minmax = metrics_by_normalization['minmax']
metrics_with_std = metrics_by_normalization['std']


# Learning curves of the three runs on a single figure.
plt.plot(metrics_without_normalization['train_losses'], label='train loss w/o normalization')
plt.plot(metrics_without_normalization['val_losses'], label='validation loss w/o normalization')

plt.plot(metrics_with_minmax['train_losses'], label='train loss w/ Minmax normalization')
plt.plot(metrics_with_minmax['val_losses'], label='validation loss w/ Minmax normalization')

plt.plot(metrics_with_std['train_losses'], label='train loss w/ Z-score (std) normalization')
plt.plot(metrics_with_std['val_losses'], label='validation loss w/ Z-score (std) normalization')

plt.title('Learning curve comparison')
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.show()


print(f'Test loss (w/o normalization) : {metrics_without_normalization["test_loss"]}')
print(f'Test loss (w/ Minmax) : {metrics_with_minmax["test_loss"]}')
print(f'Test loss (w/ Z-score) : {metrics_with_std["test_loss"]}')