In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import tqdm
import time

In [2]:
X_train = pd.read_csv('X_train.csv').values
y_train = pd.read_csv('y_train.csv').values

y_train_row = y_train.reshape(-1)

X_train_np = X_train.reshape(80000, 100, 4)

X_train_tens = torch.from_numpy(X_train_np).float()
y_train_tens = torch.from_numpy(y_train).float()

## Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=3,
    random_state=42,
    n_jobs=-1,
)

rf_model.fit(X_train, y_train_row)

In [17]:
start = time.perf_counter()
y_pred = rf_model.predict(X_train)
stop = time.perf_counter()

rf_time = stop - start
rf_speed = X_train.shape[0] / rf_time
print(f"Speed (predictions/s): {rf_speed:,.0f}")

Speed (predictions/s): 211,420


## XGBoost

In [7]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train_row)

In [16]:
start = time.perf_counter()
y_pred = xgb_model.predict(X_train)
stop = time.perf_counter()

xgb_time = stop - start
xgb_speed = X_train.shape[0] / xgb_time
print(f"Speed (predictions/s): {xgb_speed:,.0f}")

Speed (predictions/s): 742,303


## RNN

In [9]:
device = 'cpu'
batch_size = 128
train_dl = DataLoader(TensorDataset(X_train_tens, y_train_tens), batch_size=batch_size, shuffle=True)

class VolRNN(nn.Module):
    def __init__(self, input_size=1, hidden_size=32, num_layers=3):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.rnn(x)
        last = out[:, -1, :]
        return self.fc(last)

def train_rnn(model, train_dl, criterion, optimizer, epochs=100):
    model.train()
    train_losses = []

    for epoch in range(epochs):
        running_loss = 0

        for xb, yb in tqdm.tqdm(train_dl, desc=f"Epoch {epoch+1}/{epochs}", leave=False):
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / len(train_dl)
        train_losses.append(epoch_loss)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {epoch_loss:.4f}")

    return train_losses

def predict_rnn(model, test_dl):
    model.eval()
    all_logits = []
    all_labels = []

    with torch.no_grad():
        for xb, yb in test_dl:
            xb = xb.to(device)
            logits = model(xb).cpu()
            all_logits.append(logits)
            all_labels.append(yb)

    all_logits = torch.cat(all_logits).squeeze()
    all_labels = torch.cat(all_labels).squeeze()

    probs = torch.sigmoid(all_logits)
    return probs

In [10]:
rnn = VolRNN(4, 32, 3)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=1e-3)
train_rnn(rnn, train_dl, criterion, optimizer, epochs=10)

                                                             

Epoch 1/10 | Loss: 0.6739


                                                             

Epoch 2/10 | Loss: 0.6729


                                                             

Epoch 3/10 | Loss: 0.6729


                                                             

Epoch 4/10 | Loss: 0.6727


                                                             

Epoch 5/10 | Loss: 0.6726


                                                             

Epoch 6/10 | Loss: 0.6726


                                                             

Epoch 7/10 | Loss: 0.6726


                                                             

Epoch 8/10 | Loss: 0.6726


                                                             

Epoch 9/10 | Loss: 0.6725


                                                              

Epoch 10/10 | Loss: 0.6725




[0.6739189477920532,
 0.672911431980133,
 0.672879695224762,
 0.6726763600349426,
 0.6725852390289306,
 0.6725567989349365,
 0.6725973952293396,
 0.6725576732635498,
 0.6725055369377136,
 0.6725175802230835]

In [15]:
start = time.perf_counter()
y_pred = predict_rnn(rnn, train_dl)
stop = time.perf_counter()

rnn_time = stop - start
rnn_speed = X_train.shape[0] / rnn_time
print(f"Speed (predictions/s): {rnn_speed:,.0f}")

Speed (predictions/s): 12,174


## LSTM

In [12]:
class LSTMClassifier(nn.Module):
    def __init__(self, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size = 4,
            hidden_size = hidden_size,
            batch_first = True,
            num_layers = num_layers,
            dropout = dropout
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.fc(last).squeeze(-1)

class SequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
def train_lstm(model, X_train, y_train, n_epochs, learning_rate, optimizer):
    criterion = nn.BCEWithLogitsLoss()
    if optimizer == 'ADAM':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    elif optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    training_set = SequenceDataset(X_train, y_train)
    train_loader = DataLoader(training_set, batch_size=32, shuffle=True, num_workers=0)

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0

        for X_batch, y_batch in tqdm.tqdm(train_loader):
            X_batch = X_batch.to('cpu')
            y_batch = y_batch.to('cpu')

            optimizer.zero_grad()

            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)

            loss.backward()
            optimizer.step()

            total_loss += loss.item() * X_batch.size(0)

        avg_loss = total_loss / len(train_loader.dataset)

        print(f"Epoch {epoch+1}/{n_epochs}, loss = {avg_loss:.4f}")
    
def predict_lstm(model, X):
    model.eval()
    with torch.no_grad():
        logits = model(X)
        probs = torch.nn.sigmoid(logits).cpu().numpy
    return probs

In [13]:
lstm = LSTMClassifier(128, 1, 0)
train_lstm(lstm, X_train_np, y_train, 10, 1e-3, 'ADAM')

  0%|          | 0/2500 [00:00<?, ?it/s]

100%|██████████| 2500/2500 [00:32<00:00, 77.15it/s]


Epoch 1/10, loss = 0.6732


100%|██████████| 2500/2500 [00:31<00:00, 80.21it/s]


Epoch 2/10, loss = 0.6725


100%|██████████| 2500/2500 [00:31<00:00, 80.36it/s]


Epoch 3/10, loss = 0.6724


100%|██████████| 2500/2500 [00:31<00:00, 78.40it/s]


Epoch 4/10, loss = 0.6722


100%|██████████| 2500/2500 [00:34<00:00, 71.56it/s]


Epoch 5/10, loss = 0.6722


100%|██████████| 2500/2500 [00:46<00:00, 53.49it/s]


Epoch 6/10, loss = 0.6722


100%|██████████| 2500/2500 [00:39<00:00, 63.90it/s]


Epoch 7/10, loss = 0.6721


100%|██████████| 2500/2500 [00:33<00:00, 75.42it/s]


Epoch 8/10, loss = 0.6721


100%|██████████| 2500/2500 [00:28<00:00, 88.08it/s]


Epoch 9/10, loss = 0.6720


100%|██████████| 2500/2500 [00:29<00:00, 85.23it/s]

Epoch 10/10, loss = 0.6720





In [14]:
start = time.perf_counter()
y_pred = predict_rnn(rnn, train_dl)
stop = time.perf_counter()

lstm_time = stop - start
lstm_speed = X_train.shape[0] / lstm_time
print(f"Speed (predictions/s): {lstm_speed:,.0f}")

Speed (predictions/s): 12,287


# Results

In [33]:
results = pd.DataFrame(
    [rf_speed, xgb_speed, rnn_speed, lstm_speed],
    index=['Random Forest', 'XGBoost', 'RNN', 'LSTM']
    )
results.columns = ['Speed']
display(results.style.format('{:,.0f}')) 

Unnamed: 0,Speed
Random Forest,211420
XGBoost,742303
RNN,12174
LSTM,12287
