In [1]:
import numpy as np
import math
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import copy

# --- Framework-agnostic Data Generation Code ---
class StockOracle:
    """
    A class to simulate a stock's price using various stochastic processes.
    This version is simplified for generating clean datasets for the ML model.
    """
    def __init__(self, periods=252, x0=100, kappa=0.5, theta=100, sigma=0.5,
                 pump_start_percent=0.4, pump_end_percent=0.6, dump_end_percent=0.7,
                 post_dump_drift_strength=0.1, simulation_type="pump_and_dump", trend_drift=0.5):
        self.periods = periods
        self.x0 = x0
        self.kappa = kappa
        self.theta = theta
        self.sigma = sigma
        self.pump_start_percent = pump_start_percent
        self.pump_end_percent = pump_end_percent
        self.dump_end_percent = dump_end_percent
        self.post_dump_drift_strength = post_dump_drift_strength
        self.simulation_type = simulation_type
        self.trend_drift = trend_drift

        if self.simulation_type == "pump_and_dump":
            self.price_history = self._simulate_pump_dump_data()
        elif self.simulation_type == "mean_reverting":
            self.price_history = self._simulate_mean_reverting_data()
        elif self.simulation_type == "trend":
            self.price_history = self._simulate_trend_data()
        else:
            raise ValueError("Invalid simulation_type provided.")

    def _simulate_pump_dump_data(self):
        s = [self.x0]
        dt = 1.0 / self.periods
        pump_start_idx = int(self.periods * self.pump_start_percent)
        pump_end_idx = int(self.periods * self.pump_end_percent)
        dump_end_idx = int(self.periods * self.dump_end_percent)

        for t in range(1, self.periods):
            s_t_minus_1 = s[t - 1]
            drift = self.kappa * (self.theta - s_t_minus_1) * dt
            volatility = s_t_minus_1 * self.sigma * math.sqrt(dt) * random.gauss(0, 1)
            additional_drift = 0

            if pump_start_idx <= t < pump_end_idx:
                additional_drift = 2.0 * dt * self.x0
            elif pump_end_idx <= t < dump_end_idx:
                additional_drift = -5.0 * dt * self.x0
            elif t >= dump_end_idx:
                additional_drift = -self.post_dump_drift_strength * (s_t_minus_1 - self.x0) * dt

            s_t = (s_t_minus_1 + drift + volatility + additional_drift)
            s.append(max(0, s_t))
        return np.array(s)

    def _simulate_mean_reverting_data(self):
        s = [self.x0]
        dt = 1.0 / self.periods
        for t in range(1, self.periods):
            s_t_minus_1 = s[t - 1]
            drift = self.kappa * (self.theta - s_t_minus_1) * dt
            volatility = s_t_minus_1 * self.sigma * math.sqrt(dt) * random.gauss(0, 1)
            s_t = (s_t_minus_1 + drift + volatility)
            s.append(max(0, s_t))
        return np.array(s)

    def _simulate_trend_data(self):
        s = [self.x0]
        dt = 1.0 / self.periods
        for t in range(1, self.periods):
            s_t_minus_1 = s[t - 1]
            drift = self.trend_drift * dt * self.x0
            volatility = s_t_minus_1 * self.sigma * math.sqrt(dt) * random.gauss(0, 1)
            s_t = (s_t_minus_1 + drift + volatility)
            s.append(max(0, s_t))
        return np.array(s)

def generate_dataset(num_samples=10000, periods=252):
    """
    Generates a synthetic dataset for classification.
    """
    X = []
    y = []

    # Generate Pump-and-Dump samples (Label 1)
    for _ in range(num_samples // 2):
        oracle = StockOracle(
            periods=periods,
            x0=random.uniform(50, 150),
            kappa=random.uniform(0.1, 1.0),
            theta=random.uniform(50, 150),
            sigma=random.uniform(0.1, 1.0),
            pump_start_percent=random.uniform(0.1, 0.4),
            pump_end_percent=random.uniform(0.5, 0.7),
            dump_end_percent=random.uniform(0.7, 0.9),
            simulation_type="pump_and_dump"
        )
        X.append(oracle.price_history)
        y.append(1)

    # Generate "Normal" market samples (Label 0)
    for _ in range(num_samples // 4):
        oracle = StockOracle(
            periods=periods,
            x0=random.uniform(50, 150),
            kappa=random.uniform(0.1, 1.0),
            theta=random.uniform(50, 150),
            sigma=random.uniform(0.1, 1.0),
            simulation_type="mean_reverting"
        )
        X.append(oracle.price_history)
        y.append(0)

    for _ in range(num_samples // 4):
        oracle = StockOracle(
            periods=periods,
            x0=random.uniform(50, 150),
            sigma=random.uniform(0.1, 1.0),
            trend_drift=random.uniform(-0.5, 0.5),
            simulation_type="trend"
        )
        X.append(oracle.price_history)
        y.append(0)

    # Convert to numpy arrays
    X = np.array(X)
    y = np.array(y)

    return X, y

# --- PyTorch Specific Code ---

class TimeseriesDataset(Dataset):
    """
    Custom PyTorch Dataset for time series data.
    """
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class PumpDumpClassifier(nn.Module):
    """
    An LSTM-based classifier for pump-and-dump events.
    """
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate):
        super(PumpDumpClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x shape: (batch_size, sequence_length, input_size)
        out, _ = self.lstm(x)
        # Take the output from the last timestep
        out = out[:, -1, :]
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out


In [2]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Hyperparameters
NUM_SAMPLES = 20_000
PERIODS = 252 # Number of trading days in a year
EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.001

INPUT_SIZE = 1
HIDDEN_SIZE = 64
NUM_LAYERS = 2
DROPOUT_RATE = 0.2

# Early stopping parameters
PATIENCE = 5
best_val_loss = float('inf')
patience_counter = 0

print("Generating synthetic data...")
X, y = generate_dataset(num_samples=NUM_SAMPLES, periods=PERIODS)

# Split the data into training, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42, stratify=y_train_val)

# Create PyTorch datasets and dataloaders
train_dataset = TimeseriesDataset(X_train.reshape(-1, PERIODS, INPUT_SIZE), y_train)
val_dataset = TimeseriesDataset(X_val.reshape(-1, PERIODS, INPUT_SIZE), y_val)
test_dataset = TimeseriesDataset(X_test.reshape(-1, PERIODS, INPUT_SIZE), y_test)

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("\nTraining PyTorch LSTM model with early stopping...")
model = PumpDumpClassifier(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, DROPOUT_RATE).to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):
        sequences = sequences.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(sequences)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for sequences, labels in val_loader:
            sequences = sequences.to(device)
            labels = labels.to(device)
            outputs = model(sequences)
            val_loss += criterion(outputs, labels).item()
        val_loss /= len(val_loader)
        print(f'Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}')

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Save the best model state
        best_model_state = copy.deepcopy(model.state_dict())
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"Early stopping at epoch {epoch+1}. Restoring best model state.")
            model.load_state_dict(best_model_state)
            break


print("\nEvaluating model performance...")
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for sequences, labels in test_loader:
        sequences = sequences.to(device)
        outputs = model(sequences)
        predicted = outputs.round()
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=["Normal", "Pump-and-Dump"]))

Using device: cuda
Generating synthetic data...

Training PyTorch LSTM model with early stopping...
Epoch [1/50], Loss: 0.5364, Validation Loss: 0.6200
Epoch [2/50], Loss: 0.4867, Validation Loss: 0.5946
Epoch [3/50], Loss: 0.3598, Validation Loss: 0.5004
Epoch [4/50], Loss: 0.4804, Validation Loss: 0.4739
Epoch [5/50], Loss: 0.5391, Validation Loss: 0.5286
Epoch [6/50], Loss: 0.5889, Validation Loss: 0.3307
Epoch [7/50], Loss: 0.2214, Validation Loss: 0.2712
Epoch [8/50], Loss: 0.1333, Validation Loss: 0.2538
Epoch [9/50], Loss: 0.2604, Validation Loss: 0.3244
Epoch [10/50], Loss: 0.2901, Validation Loss: 0.2407
Epoch [11/50], Loss: 0.3172, Validation Loss: 0.2214
Epoch [12/50], Loss: 0.1599, Validation Loss: 0.2286
Epoch [13/50], Loss: 0.1105, Validation Loss: 0.2151
Epoch [14/50], Loss: 0.3579, Validation Loss: 0.2128
Epoch [15/50], Loss: 0.3380, Validation Loss: 0.2267
Epoch [16/50], Loss: 0.1100, Validation Loss: 0.2272
Epoch [17/50], Loss: 0.3213, Validation Loss: 0.2164
Epoch [1