<a href="https://colab.research.google.com/github/gz2214/Text2Trade/blob/main/code/Bert_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_from_disk
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import ast
import re
import os
import numpy as np
from sklearn.metrics import accuracy_score
import torch.optim as optim
import torch.nn as nn

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dataset_path_block0 = '/content/drive/MyDrive/Text2Trade/data/block0'
dataset_path_block1 = '/content/drive/MyDrive/Text2Trade/data/block1'
dataset_path_block2 = '/content/drive/MyDrive/Text2Trade/data/block2'
dataset_path_block3 = '/content/drive/MyDrive/Text2Trade/data/block3'
dataset_path_block4 = '/content/drive/MyDrive/Text2Trade/data/block4'

block0 = load_from_disk(dataset_path_block0)
block1 = load_from_disk(dataset_path_block1)
block2 = load_from_disk(dataset_path_block2)
block3 = load_from_disk(dataset_path_block3)
block4 = load_from_disk(dataset_path_block4)

blocks = [block0, block1, block2, block3, block4]
block_names = ['block0', 'block1', 'block2', 'block3', 'block4']

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2 # Binary classification (up/down)
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
#function to split the text into chunks of tokens - ensure each chunk is less than 512 tokens
def chunk_tokens(text, max_tokens=512):
    tokens = tokenizer.tokenize(text) #tokenize text
    chunks = []
    current_chunk = []
    current_length = 0

    for token in tokens:
        current_chunk.append(token) #append token to current chunk
        current_length += 1

        #once the limit is reached, the current chunk is saved, and a new chunk begins
        if current_length >= max_tokens:
            chunks.append(current_chunk)
            current_chunk = []
            current_length = 0

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [7]:
#process a block of data and create df_train and df_test
def process_blocks(block, block_name):
    train_dataset = block['train']
    test_dataset = block['test']

    train_dates, train_titles, train_labels = [], [], []
    test_dates, test_titles, test_labels = [], [], []

    for item in tqdm(train_dataset, desc=f'Processing Train Data ({block_name})'):
        date = item['Date']
        title = item['Title']
        label = item['Label']
        #tokenizes the titles into chunks
        chunks = chunk_tokens(title)

        for chunk in chunks:
            train_dates.append(date)
            train_titles.append(' '.join(chunk))
            train_labels.append(label)

    for item in tqdm(test_dataset, desc=f'Processing Test Data ({block_name})'):
        date = item['Date']
        title = item['Title']
        label = item['Label']
        chunks = chunk_tokens(title)

        for chunk in chunks:
            test_dates.append(date)
            test_titles.append(' '.join(chunk))
            test_labels.append(label)

    df_train = pd.DataFrame({'Date': train_dates, 'Title': train_titles, 'Label': train_labels})
    df_test = pd.DataFrame({'Date': test_dates, 'Title': test_titles, 'Label': test_labels})

    return df_train, df_test

# Process each block
for block, block_name in zip(blocks, block_names):
    df_train, df_test = process_blocks(block, block_name)
    df_train.to_csv(f'/content/drive/MyDrive/Text2Trade/data/{block_name}_train.csv', index=False)
    df_test.to_csv(f'/content/drive/MyDrive/Text2Trade/data/{block_name}_test.csv', index=False)

Processing Train Data (block0): 100%|██████████| 128/128 [00:04<00:00, 30.13it/s]
Processing Test Data (block0): 100%|██████████| 32/32 [00:01<00:00, 28.24it/s]
Processing Train Data (block1): 100%|██████████| 128/128 [00:04<00:00, 30.19it/s]
Processing Test Data (block1): 100%|██████████| 32/32 [00:01<00:00, 30.23it/s]
Processing Train Data (block2): 100%|██████████| 127/127 [00:04<00:00, 30.09it/s]
Processing Test Data (block2): 100%|██████████| 32/32 [00:01<00:00, 30.91it/s]
Processing Train Data (block3): 100%|██████████| 127/127 [00:04<00:00, 30.77it/s]
Processing Test Data (block3): 100%|██████████| 32/32 [00:01<00:00, 30.13it/s]
Processing Train Data (block4): 100%|██████████| 127/127 [00:04<00:00, 30.62it/s]
Processing Test Data (block4): 100%|██████████| 32/32 [00:00<00:00, 32.34it/s]


In [8]:
train_dataset = block0['train']
num_examples_to_view = 5
example_text = train_dataset.select(range(num_examples_to_view))['Title']

# Train model

Train Bert Classifier using data from block 0 - block 3. No validation required in this step.

In [9]:
"""
def load_train_data(blocks, tokenizer, max_tokens=512, batch_size=16, shuffle=True, train_ratio=0.8):
    # Create empty lists to store data from blocks
    titles = []
    labels = []

    # Merge data from blocks 0 to 3
    for block in blocks[:4]:  # Blocks 0 to 3
        # Extract titles and labels
        block_titles = block['train']['Title']
        block_labels = block['train']['Label']

        # Append titles and labels
        titles.extend(block_titles)
        labels.extend(block_labels)

    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)

    # Tokenize and process titles
    input_ids_list = []
    attention_masks_list = []

    for title in titles:
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)

    # Find the maximum sequence length
    max_sequence_length = max(input_ids.shape[1] for input_ids in input_ids_list)

    # Pad input_ids and attention_mask tensors to have the same length
    for i in range(len(input_ids_list)):
        input_ids_list[i] = torch.cat([input_ids_list[i], torch.zeros(1, max_sequence_length - input_ids_list[i].shape[1], dtype=torch.long)], dim=1)
        attention_masks_list[i] = torch.cat([attention_masks_list[i], torch.zeros(1, max_sequence_length - attention_masks_list[i].shape[1], dtype=torch.long)], dim=1)

    # Stack input_ids tensors and attention_mask tensors along dim=0
    input_ids = torch.stack(input_ids_list)
    attention_masks = torch.stack(attention_masks_list)

    # Split data into train and validation sets based on train_ratio
    train_size = int(train_ratio * len(labels))
    val_size = len(labels) - train_size

    train_dataset = TensorDataset(input_ids[:train_size], attention_masks[:train_size], labels[:train_size])
    val_dataset = TensorDataset(input_ids[train_size:], attention_masks[train_size:], labels[train_size:])

    # Create DataLoader for train and validation sets
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, val_dataloader

train_dataloader, val_dataloader = load_train_data(blocks, tokenizer, batch_size=16, shuffle=True)
"""

"\ndef load_train_data(blocks, tokenizer, max_tokens=512, batch_size=16, shuffle=True, train_ratio=0.8):\n    # Create empty lists to store data from blocks\n    titles = []\n    labels = []\n\n    # Merge data from blocks 0 to 3\n    for block in blocks[:4]:  # Blocks 0 to 3\n        # Extract titles and labels\n        block_titles = block['train']['Title']\n        block_labels = block['train']['Label']\n\n        # Append titles and labels\n        titles.extend(block_titles)\n        labels.extend(block_labels)\n\n    # Convert labels to a tensor\n    labels = torch.tensor(labels, dtype=torch.long)\n\n    # Tokenize and process titles\n    input_ids_list = []\n    attention_masks_list = []\n\n    for title in titles:\n        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)\n        input_ids = inputs['input_ids']\n        attention_mask = inputs['attention_mask']\n        input_ids_list.append(input_ids)\n        attention_masks

block0 - block4['train']

In [10]:
def load_train_data(blocks, tokenizer, max_tokens=512, batch_size=16, shuffle=True, train_ratio=0.8):
    # Create empty lists to store data from blocks
    titles = []
    labels = []

    # Merge data from blocks 0 to 3
    for block in blocks[:4]:  # Blocks 0 to 3
        # Extract titles and labels
        block_titles = block['train']['Title']
        block_labels = block['train']['Label']

        # Append titles and labels
        titles.extend(block_titles)
        labels.extend(block_labels)

    # Merge data from block4['train']
    block4_train_titles = blocks[4]['train']['Title']
    block4_train_labels = blocks[4]['train']['Label']

    # Append titles and labels
    titles.extend(block4_train_titles)
    labels.extend(block4_train_labels)

    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)

    # Tokenize and process titles
    input_ids_list = []
    attention_masks_list = []

    for title in titles:
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)

    # Find the maximum sequence length
    max_sequence_length = max(input_ids.shape[1] for input_ids in input_ids_list)

    # Pad input_ids and attention_mask tensors to have the same length
    for i in range(len(input_ids_list)):
        input_ids_list[i] = torch.cat([input_ids_list[i], torch.zeros(1, max_sequence_length - input_ids_list[i].shape[1], dtype=torch.long)], dim=1)
        attention_masks_list[i] = torch.cat([attention_masks_list[i], torch.zeros(1, max_sequence_length - attention_masks_list[i].shape[1], dtype=torch.long)], dim=1)

    # Stack input_ids tensors and attention_mask tensors along dim=0
    input_ids = torch.stack(input_ids_list)
    attention_masks = torch.stack(attention_masks_list)

    # Split data into train and validation sets based on train_ratio
    train_size = int(train_ratio * len(labels))
    val_size = len(labels) - train_size

    train_dataset = TensorDataset(input_ids[:train_size], attention_masks[:train_size], labels[:train_size])
    val_dataset = TensorDataset(input_ids[train_size:], attention_masks[train_size:], labels[train_size:])

    # Create DataLoader for train and validation sets
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, val_dataloader

train_dataloader, val_dataloader = load_train_data(blocks, tokenizer, batch_size=16, shuffle=True)


In [11]:
# Iterate through the dataloader to view the first batch
for batch in train_dataloader:
    input_ids, attention_mask, labels = batch
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Labels:", labels)
    break  # Stop after viewing the first batch

Input IDs: tensor([[[  101,  4551,  2705,  ...,  3778, 14667,   102]],

        [[  101, 16798,  2509,  ...,  2824,  1999,   102]],

        [[  101,  4811,  7659,  ..., 10201,  2125,   102]],

        ...,

        [[  101, 29433,  1005,  ...,  2011,  2482,   102]],

        [[  101, 10474,  7906,  ...,  4487, 26212,   102]],

        [[  101,  2327,  2753,  ...,  2368,  3138,   102]]])
Attention Mask: tensor([[[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        ...,

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]]])
Labels: tensor([0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0])


In [12]:
# Define the optimizer with weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

#we want to leave the last block as "test" after fine tuning.
epochs = 30
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [13]:
for epoch in range(epochs):
  # ========================================
  #               Training
  # ========================================

  print(f'Epoch {epoch + 1}/{epochs}')
  print('Training...')

  avg_train_loss=[]
  total_train_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):
    if step % 40 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

    b_input_ids = batch[0].squeeze(1)
    b_input_mask = batch[1]
    b_labels = batch[2]

    #print("Shape of input_ids:", b_input_ids.shape)

    model.zero_grad()
    outputs = model(input_ids = b_input_ids, attention_mask=b_input_mask)

    #loss = outputs.loss
    loss = nn.CrossEntropyLoss()(outputs.logits, b_labels)
    logits = outputs.logits
    total_train_loss += loss.item()

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

  avg_train_loss = total_train_loss / len(train_dataloader)
  print("Average training loss: {0:.2f}".format(avg_train_loss))

Epoch 1/30
Training...
Average training loss: 0.69
Epoch 2/30
Training...
Average training loss: 0.69
Epoch 3/30
Training...
Average training loss: 0.67
Epoch 4/30
Training...
Average training loss: 0.63
Epoch 5/30
Training...
Average training loss: 0.55
Epoch 6/30
Training...
Average training loss: 0.44
Epoch 7/30
Training...
Average training loss: 0.35
Epoch 8/30
Training...
Average training loss: 0.28
Epoch 9/30
Training...
Average training loss: 0.21
Epoch 10/30
Training...
Average training loss: 0.16
Epoch 11/30
Training...
Average training loss: 0.11
Epoch 12/30
Training...
Average training loss: 0.10
Epoch 13/30
Training...
Average training loss: 0.09
Epoch 14/30
Training...
Average training loss: 0.07
Epoch 15/30
Training...
Average training loss: 0.08
Epoch 16/30
Training...
Average training loss: 0.02
Epoch 17/30
Training...
Average training loss: 0.04
Epoch 18/30
Training...
Average training loss: 0.03
Epoch 19/30
Training...
Average training loss: 0.01
Epoch 20/30
Training.

# Test

In [18]:
def load_data_for_testing(blocks, tokenizer, max_tokens=512, batch_size=16, shuffle=True):
    # Create empty lists to store data
    titles = []
    labels = []
    input_ids_list = []  # Add this line
    attention_masks_list = []  # Add this line

    # Merge data from blocks 0 to 4 for training
    for block in blocks[:5]:  # Blocks 0 to 4
        # Extract titles and labels
        block_titles = block['train']['Title']
        block_labels = block['train']['Label']

        # Append titles and labels
        titles.extend(block_titles)
        labels.extend(block_labels)

    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)

    # Tokenize and process titles
    for title in titles:
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)

    # Find the maximum sequence length
    max_sequence_length = max(input_ids.shape[1] for input_ids in input_ids_list)

    # Pad input_ids and attention_mask tensors to have the same length
    for i in range(len(input_ids_list)):
        input_ids_list[i] = torch.cat([input_ids_list[i], torch.zeros(1, max_sequence_length - input_ids_list[i].shape[1], dtype=torch.long)], dim=1)
        attention_masks_list[i] = torch.cat([attention_masks_list[i], torch.zeros(1, max_sequence_length - attention_masks_list[i].shape[1], dtype=torch.long)], dim=1)

    # Stack input_ids tensors and attention_mask tensors along dim=0
    input_ids = torch.stack(input_ids_list)
    attention_masks = torch.stack(attention_masks_list)

    # Create DataLoader for training data
    train_dataset = TensorDataset(input_ids, attention_masks, labels)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)

    # Extract day-level labels and titles for testing
    test_dates = blocks[4]['test']['Date']
    test_titles = blocks[4]['test']['Title']
    test_labels = blocks[4]['test']['Label']

    # Tokenize and process titles for testing
    test_input_ids_list = []
    test_attention_masks_list = []

    for title in test_titles:
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        test_input_ids_list.append(input_ids)
        test_attention_masks_list.append(attention_mask)

    # Find the maximum sequence length for testing data
    max_test_sequence_length = max(input_ids.shape[1] for input_ids in test_input_ids_list)

    # Pad input_ids and attention_mask tensors for testing data to have the same length
    for i in range(len(test_input_ids_list)):
        test_input_ids_list[i] = torch.cat([test_input_ids_list[i], torch.zeros(1, max_test_sequence_length - test_input_ids_list[i].shape[1], dtype=torch.long)], dim=1)
        test_attention_masks_list[i] = torch.cat([test_attention_masks_list[i], torch.zeros(1, max_test_sequence_length - test_attention_masks_list[i].shape[1], dtype=torch.long)], dim=1)

    # Stack input_ids tensors and attention_mask tensors for testing data along dim=0
    test_input_ids = torch.stack(test_input_ids_list)
    test_attention_masks = torch.stack(test_attention_masks_list)

    # Convert labels for testing data to a tensor
    test_labels = torch.tensor(test_labels, dtype=torch.long)

    # Create DataLoader for testing data
    test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, test_dataloader

train_dataloader, test_dataloader = load_data_for_testing(blocks, tokenizer, batch_size=16, shuffle=True)

In [19]:
from sklearn.metrics import accuracy_score, f1_score, precision_score

Each day has different number of chunks

In [20]:
def calculate_num_chunks_per_day(dataset):
    # Convert the dataset to a DataFrame
    df = pd.DataFrame(dataset)

    # Group by unique "Date" and count the number of chunks (labels) per day
    num_chunks_per_day = df.groupby("Date")["Label"].count().values

    return num_chunks_per_day

block4_test = pd.read_csv("/content/drive/MyDrive/Text2Trade/data/block4_test.csv")  # Replace "path_to_block4_test.csv" with the actual file path
num_chunks_per_day = calculate_num_chunks_per_day(block4_test)
num_chunks_per_day

array([ 3,  3,  9,  3,  3,  3,  3,  9,  3,  3,  3,  3,  9,  3,  4,  3,  4,
        9,  3,  3,  4,  3,  9,  4,  3,  3,  3, 10,  3,  3,  3,  4])

In [21]:
day_predictions = [] #store predictions (logits) for each chunk of text
day_probabilities = [] #store probability of class 1 for each chunk
day_ground_truth_labels = [] #store ground truth labels for each chunk
num_examples_per_day = None

# Set the model to evaluation mode
model.eval()

with torch.no_grad():
    for batch in test_dataloader:
        # Extract inputs for a day (chunks of text)
        day_input_ids = batch[0].squeeze(1)
        day_attention_mask = batch[1]
        day_labels = batch[2]

        # Get model predictions and probabilities
        outputs = model(input_ids=day_input_ids, attention_mask=day_attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)

        # Aggregate predictions for the day
        day_predictions.append(torch.argmax(logits, dim=1))
        day_probabilities.append(probabilities[:, 1])  # Probability of class 1

        # Ground truth labels for the day
        day_ground_truth_labels.append(day_labels)

day_predictions = torch.cat(day_predictions)
print("Shape of day_predictions:", day_predictions.shape)
day_probabilities = torch.cat(day_probabilities)
day_ground_truth_labels = torch.cat(day_ground_truth_labels)

# Perform majority vote for each day
day_final_labels = []
current_index = 0  # Initialize the current index

for num_chunks in num_chunks_per_day:
    day_predictions_i = day_predictions[current_index:current_index + num_chunks]
    day_final_label = 1 if (day_predictions_i.sum() > (num_chunks / 2)) else 0
    day_final_labels.extend([day_final_label] * num_chunks)
    current_index += num_chunks

# Calculate the probability of class 1 for each day
day_probabilities = [day_probabilities[i:i + num_chunks] for i, num_chunks in enumerate(num_chunks_per_day)]
day_avg_probabilities = [torch.mean(prob) for prob in day_probabilities]

# Ensure that day_final_labels and day_ground_truth_labels have the same length
day_final_labels = day_final_labels[:len(day_ground_truth_labels)]

# Calculate evaluation metrics
accuracy = accuracy_score(day_ground_truth_labels, day_final_labels)
f1 = f1_score(day_ground_truth_labels, day_final_labels)
precision = precision_score(day_ground_truth_labels, day_final_labels)

#print("Length of day_ground_truth_labels:", len(day_ground_truth_labels))
#print("Length of day_final_labels:", len(day_final_labels))
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")

Shape of day_predictions: torch.Size([32])
Accuracy: 0.38
F1 Score: 0.41
Precision: 0.39


Day-level aggregated outputs (from majority vote):


In [22]:
for i, label in enumerate(day_final_labels):
    print(f"Day {i + 1} Aggregated Label: {label}")

Day 1 Aggregated Label: 0
Day 2 Aggregated Label: 0
Day 3 Aggregated Label: 0
Day 4 Aggregated Label: 0
Day 5 Aggregated Label: 0
Day 6 Aggregated Label: 0
Day 7 Aggregated Label: 1
Day 8 Aggregated Label: 1
Day 9 Aggregated Label: 1
Day 10 Aggregated Label: 1
Day 11 Aggregated Label: 1
Day 12 Aggregated Label: 1
Day 13 Aggregated Label: 1
Day 14 Aggregated Label: 1
Day 15 Aggregated Label: 1
Day 16 Aggregated Label: 1
Day 17 Aggregated Label: 1
Day 18 Aggregated Label: 1
Day 19 Aggregated Label: 1
Day 20 Aggregated Label: 1
Day 21 Aggregated Label: 1
Day 22 Aggregated Label: 1
Day 23 Aggregated Label: 1
Day 24 Aggregated Label: 1
Day 25 Aggregated Label: 0
Day 26 Aggregated Label: 0
Day 27 Aggregated Label: 0
Day 28 Aggregated Label: 0
Day 29 Aggregated Label: 0
Day 30 Aggregated Label: 0
Day 31 Aggregated Label: 0
Day 32 Aggregated Label: 0


Day-level averaged proabaility (for being 1):

In [23]:
for i, prob in enumerate(day_avg_probabilities):
    print(f"Day {i + 1} Average Probability of 1: {prob:.2f}")

Day 1 Average Probability of 1: 0.36
Day 2 Average Probability of 1: 0.03
Day 3 Average Probability of 1: 0.36
Day 4 Average Probability of 1: 0.01
Day 5 Average Probability of 1: 0.34
Day 6 Average Probability of 1: 0.67
Day 7 Average Probability of 1: 0.75
Day 8 Average Probability of 1: 0.58
Day 9 Average Probability of 1: 0.42
Day 10 Average Probability of 1: 0.67
Day 11 Average Probability of 1: 0.33
Day 12 Average Probability of 1: 0.67
Day 13 Average Probability of 1: 0.77
Day 14 Average Probability of 1: 0.66
Day 15 Average Probability of 1: 0.75
Day 16 Average Probability of 1: 0.67
Day 17 Average Probability of 1: 1.00
Day 18 Average Probability of 1: 0.78
Day 19 Average Probability of 1: 0.99
Day 20 Average Probability of 1: 0.99
Day 21 Average Probability of 1: 0.75
Day 22 Average Probability of 1: 0.68
Day 23 Average Probability of 1: 0.45
Day 24 Average Probability of 1: 0.26
Day 25 Average Probability of 1: 0.33
Day 26 Average Probability of 1: 0.32
Day 27 Average Probab

# NOT IN USE

In [None]:
"""
def load_data(data_path, tokenizer, max_tokens=512, batch_size=16, shuffle=True):
    df = pd.read_csv(data_path)

    # Extract titles, labels, and convert them to lists
    titles = df['Title'].tolist()
    labels = df['Label'].tolist()

    input_ids_list = []
    attention_masks_list = []

    for title in titles:
        #encoding
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)

        # Extract input_ids tensor and attention_mask tensor from the BatchEncoding
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)

    # Find the maximum sequence length
    max_sequence_length = max(input_ids.shape[1] for input_ids in input_ids_list)

    # Pad input_ids and attention_mask tensors to have the same length
    for i in range(len(input_ids_list)):
        input_ids_list[i] = torch.cat([input_ids_list[i], torch.zeros(1, max_sequence_length - input_ids_list[i].shape[1], dtype=torch.long)], dim=1)
        attention_masks_list[i] = torch.cat([attention_masks_list[i], torch.zeros(1, max_sequence_length - attention_masks_list[i].shape[1], dtype=torch.long)], dim=1)

    # Stack input_ids tensors and attention_mask tensors along dim=0
    input_ids = torch.stack(input_ids_list)
    attention_masks = torch.stack(attention_masks_list)

    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)

    # Create a TensorDataset
    dataset = TensorDataset(input_ids, attention_masks, labels)

    # Create a DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader
"""

In [None]:
"""
num_blocks = 4

# Lists to store train and test DataLoaders for each block
train_dataloaders = []
test_dataloaders = []

# Load data for each block
for i in range(num_blocks):
    train_data_path = f'/content/drive/MyDrive/Text2Trade/data/block{i}_train.csv'
    test_data_path = f'/content/drive/MyDrive/Text2Trade/data/block{i}_test.csv'

    train_dataloader = load_data(train_data_path, tokenizer, batch_size=16, shuffle=True)
    test_dataloader = load_data(test_data_path, tokenizer, batch_size=16, shuffle=False)

    train_dataloaders.append(train_dataloader)
    test_dataloaders.append(test_dataloader)
"""

In [None]:
"""
block0_train_dataloader = train_dataloaders[0]

# Iterate through the dataloader to view the first batch
for batch in block0_train_dataloader:
    input_ids, attention_mask, labels = batch
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Labels:", labels)
    break  # Stop after viewing the first batch
"""

In [None]:
"""
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

#we want to leave the last block as "test" after fine tuning.
epochs = 4
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

training_stats = []
block_validation_losses = []
all_validation_losses = []

for i in range(num_blocks):
    # Split the data into training and validation for the current block
    train_dataloader = train_dataloaders[i]
    validation_dataloader = test_dataloaders[i]

    #train your model on the current block's training data
    for epoch_i in range(epochs):
        # ========================================
        #               Training
        # ========================================

        print(f'Epoch {epoch_i + 1}/{epochs}')
        print('Training...')

        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

            b_input_ids = batch[0].squeeze(1)
            b_input_mask = batch[1]
            b_labels = batch[2]

            #print("Shape of input_ids:", b_input_ids.shape)

            model.zero_grad()
            outputs = model(input_ids = b_input_ids,
                            attention_mask=b_input_mask)

            #loss = outputs.loss
            loss = nn.CrossEntropyLoss()(outputs.logits, b_labels)
            logits = outputs.logits
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

        # ========================================
        #               Validation
        # ========================================
        print("Running Validation...")

        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].squeeze(1)
            b_input_mask = batch[1]
            b_labels = batch[2]

            with torch.no_grad():
                outputs = model(b_input_ids,
                                attention_mask=b_input_mask)

            loss = nn.CrossEntropyLoss()(outputs.logits, b_labels)
            logits = outputs.logits

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            total_eval_accuracy += accuracy_score(np.argmax(logits, axis=1), label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))

        # Append the validation loss to the list for this block
        block_validation_losses.append(avg_val_loss)

    # Calculate the average validation loss for this block across all epochs
    avg_validation_loss_block = sum(block_validation_losses) / len(block_validation_losses)
    print("Average Validation Loss for Block {}: {:.2f}".format(i, avg_validation_loss_block))

    # Append the average validation loss for this block to the list for all blocks
    all_validation_losses.append(avg_validation_loss_block)

# Calculate the overall average validation loss across all blocks and epochs
avg_validation_loss = sum(all_validation_losses) / len(all_validation_losses)
print("Average Validation Loss Across Blocks and Epochs: {:.2f}".format(avg_validation_loss))
"""