## training LSTM

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Adam




In [5]:
# Load data
df = pd.read_csv('percent_change_new.csv', header=None, names=['stock', 'title', 'date', 'percent_change'])

# Clean headlines (remove special characters, lowercasing)
df['title'] = df['title'].str.replace('[^a-zA-Z ]', '').str.lower()

# Encode stock symbols
label_encoder = LabelEncoder()
df['stock'] = label_encoder.fit_transform(df['stock'])

# Tokenize and pad headlines
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['title'])
sequences = tokenizer.texts_to_sequences(df['title'])
headline_data = pad_sequences(sequences, maxlen=200)

# Normalize percentage changes
scaler = MinMaxScaler()
df['percent_change'] = pd.to_numeric(df['percent_change'], errors='coerce')
df['percent_change'].fillna(0, inplace=True)
df['percent_change'] = scaler.fit_transform(df[['percent_change']])

# Split the dataset
X_headline_train, X_headline_test, X_stock_train, X_stock_test, y_train, y_test = train_test_split(
    headline_data, df['stock'].values, df['percent_change'].values, test_size=0.2, random_state=42)

  df['title'] = df['title'].str.replace('[^a-zA-Z ]', '').str.lower()


In [6]:
class StockDataset(Dataset):
    def __init__(self, headlines, stocks, changes):
        self.headlines = headlines
        self.stocks = stocks
        self.changes = changes

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        return {
            'headline': torch.tensor(self.headlines[idx], dtype=torch.long),
            'stock': torch.tensor(self.stocks[idx], dtype=torch.float),
            'change': torch.tensor(self.changes[idx], dtype=torch.float)
        }


In [7]:
class StockPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, lstm_hidden_dim, output_dim):
        super(StockPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(lstm_hidden_dim + 1, 128)  # +1 for the stock encoding
        self.fc2 = nn.Linear(128, output_dim)
        self.relu = nn.ReLU()

    def forward(self, headlines, stocks):
        embedded = self.embedding(headlines)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        combined = torch.cat((lstm_out, stocks.unsqueeze(1)), dim=1)
        x = self.relu(self.fc1(combined))
        x = self.fc2(x)
        return x


In [8]:
# Model parameters
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding
embedding_dim = 50
lstm_hidden_dim = 64
output_dim = 1  # For percentage change

# Model, loss, optimizer
model = StockPredictor(vocab_size, embedding_dim, lstm_hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = Adam(model.parameters())

# DataLoaders
train_dataset = StockDataset(X_headline_train, X_stock_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        # print(batch['headline'].shape, batch['stock'].shape, batch['change'].shape)
        outputs = model(batch['headline'], batch['stock'])
        loss = criterion(outputs.squeeze(), batch['change'])
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Epoch 1, Loss: 0.00031439989106729627
Epoch 2, Loss: 0.0004212296335026622
Epoch 3, Loss: 0.0003216788754798472
Epoch 4, Loss: 0.0031911847181618214
Epoch 5, Loss: 0.0004602802509907633
Epoch 6, Loss: 0.0008812681771814823
Epoch 7, Loss: 0.00015552662080153823
Epoch 8, Loss: 0.00122043676674366
Epoch 9, Loss: 0.0003198861959390342
Epoch 10, Loss: 0.00015673338202759624


In [9]:
torch.save(model.state_dict(), 'stock_prediction_model_new.pth')

In [10]:
# Initialize the model
model = StockPredictor(vocab_size, embedding_dim, lstm_hidden_dim, output_dim)

# Load the saved state dict
model.load_state_dict(torch.load('stock_prediction_model_new.pth'))

# Set the model to evaluation mode
model.eval()


StockPredictor(
  (embedding): Embedding(30521, 50)
  (lstm): LSTM(50, 64, batch_first=True)
  (fc1): Linear(in_features=65, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
)

In [36]:
def preprocess_input(stock_name, headline, tokenizer, label_encoder, max_length):
    # Clean and tokenize headline
    cleaned_headline = headline.lower().replace('[^a-zA-Z ]', '')
    # print(cleaned_headline)
    tokenized_headline = tokenizer.texts_to_sequences([cleaned_headline])
    padded_headline = pad_sequences(tokenized_headline, maxlen=max_length)
    # print(padded_headline.shape)

    # Encode stock name
    encoded_stock = label_encoder.transform([stock_name])
    # print(encoded_stock.shape)

    return torch.tensor(padded_headline, dtype=torch.long), torch.tensor(encoded_stock, dtype=torch.float)

# Example inputs
stock_input = "TSLA"
headline_input = "Tesla bankrupt, stock price falls to $0.00"
print("Input:", stock_input, ", ", headline_input)

# Preprocess inputs
headline_tensor, stock_tensor = preprocess_input(stock_input, headline_input, tokenizer, label_encoder, 200)
# print(headline_tensor.shape, stock_tensor.shape)

# print(stock_tensor.unsqueeze(-1).shape)

with torch.no_grad():
    prediction = model(headline_tensor, stock_tensor).item()

# Assuming you used a scaler for the target variable, you need to inverse transform the prediction
predicted_change = scaler.inverse_transform([[prediction]])

print(f"Predicted Change in Stock Price: {predicted_change[0][0]}%")


Input: TSLA ,  Tesla bankrupt, stock price falls to $0.00
Predicted Change in Stock Price: 1.8659482253284212%


In [39]:
import torch.onnx

dummy_input = (headline_tensor, stock_tensor)
# Export the model
torch.onnx.export(model, dummy_input, "model.onnx", export_params=True, opset_version=10, do_constant_folding=True, input_names=['input'], output_names=['output'])

