<a href="https://colab.research.google.com/github/gz2214/Text2Trade/blob/main/code/Bert_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets



In [2]:
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_from_disk
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import ast
import re
import os
import numpy as np
from sklearn.metrics import accuracy_score
import torch.optim as optim
import torch.nn as nn

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
dataset_path_block0 = '/content/drive/MyDrive/Text2Trade/data/block0'
dataset_path_block1 = '/content/drive/MyDrive/Text2Trade/data/block1'
dataset_path_block2 = '/content/drive/MyDrive/Text2Trade/data/block2'
dataset_path_block3 = '/content/drive/MyDrive/Text2Trade/data/block3'
dataset_path_block4 = '/content/drive/MyDrive/Text2Trade/data/block4'

block0 = load_from_disk(dataset_path_block0)
block1 = load_from_disk(dataset_path_block1)
block2 = load_from_disk(dataset_path_block2)
block3 = load_from_disk(dataset_path_block3)
block4 = load_from_disk(dataset_path_block4)

blocks = [block0, block1, block2, block3, block4]
block_names = ['block0', 'block1', 'block2', 'block3', 'block4']

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2 # Binary classification (up/down)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
#function to split the text into chunks of tokens - ensure each chunk is less than 512 tokens
def chunk_tokens(text, max_tokens=512):
    tokens = tokenizer.tokenize(text) #tokenize text
    chunks = []
    current_chunk = []
    current_length = 0

    for token in tokens:
        current_chunk.append(token) #append token to current chunk
        current_length += 1

        #once the limit is reached, the current chunk is saved, and a new chunk begins
        if current_length >= max_tokens:
            chunks.append(current_chunk)
            current_chunk = []
            current_length = 0

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [7]:
#process a block of data and create df_train and df_test
def process_blocks(block, block_name):
    train_dataset = block['train']
    test_dataset = block['test']

    train_dates, train_titles, train_labels = [], [], []
    test_dates, test_titles, test_labels = [], [], []

    for item in tqdm(train_dataset, desc=f'Processing Train Data ({block_name})'):
        date = item['Date']
        title = item['Title']
        label = item['Label']
        #tokenizes the titles into chunks
        chunks = chunk_tokens(title)

        for chunk in chunks:
            train_dates.append(date)
            train_titles.append(' '.join(chunk))
            train_labels.append(label)

    for item in tqdm(test_dataset, desc=f'Processing Test Data ({block_name})'):
        date = item['Date']
        title = item['Title']
        label = item['Label']
        chunks = chunk_tokens(title)

        for chunk in chunks:
            test_dates.append(date)
            test_titles.append(' '.join(chunk))
            test_labels.append(label)

    df_train = pd.DataFrame({'Date': train_dates, 'Title': train_titles, 'Label': train_labels})
    df_test = pd.DataFrame({'Date': test_dates, 'Title': test_titles, 'Label': test_labels})

    return df_train, df_test

# Process each block
for block, block_name in zip(blocks, block_names):
    df_train, df_test = process_blocks(block, block_name)
    df_train.to_csv(f'/content/drive/MyDrive/Text2Trade/data/{block_name}_train.csv', index=False)
    df_test.to_csv(f'/content/drive/MyDrive/Text2Trade/data/{block_name}_test.csv', index=False)

Processing Train Data (block0): 100%|██████████| 128/128 [00:05<00:00, 23.31it/s]
Processing Test Data (block0): 100%|██████████| 32/32 [00:01<00:00, 27.72it/s]
Processing Train Data (block1): 100%|██████████| 128/128 [00:04<00:00, 27.28it/s]
Processing Test Data (block1): 100%|██████████| 32/32 [00:01<00:00, 29.37it/s]
Processing Train Data (block2): 100%|██████████| 127/127 [00:04<00:00, 30.33it/s]
Processing Test Data (block2): 100%|██████████| 32/32 [00:01<00:00, 30.82it/s]
Processing Train Data (block3): 100%|██████████| 127/127 [00:04<00:00, 30.76it/s]
Processing Test Data (block3): 100%|██████████| 32/32 [00:01<00:00, 30.81it/s]
Processing Train Data (block4): 100%|██████████| 127/127 [00:04<00:00, 30.39it/s]
Processing Test Data (block4): 100%|██████████| 32/32 [00:00<00:00, 32.10it/s]


In [8]:
train_dataset = block0['train']
num_examples_to_view = 10
example_text = train_dataset.select(range(num_examples_to_view))['Title']

In [9]:
example_text

 '75 Texas Tech football players have tested positive for COVID-19 since testing began in June Facebook buying REI\'s brand-new HQ for $368M Why Fastly Stock Surged Today Jim Cramer reveals playbook for Snowflake IPO: \'It\'s a great concept\' Here\'s some of the places Boston Sports Club owes money to Amazon to Hire 5,500 New Employees in Chicago Area, Signing Bonuses Offered What\'s Behind Amazon\'s Hiring Spree Ford Ranger Tremor: Meet the Midsize Off-Road Adventure Truck Snowflake Boosts Expected IPO Pricing Range Texas changes how it reports coronavirus positivity rate VA data breach exposes personal information for 46,000 veterans CenturyLink to Get a New Name Home Depot co-founder Arthur Blank on corporate profits, social change Hobby Lobby raises full-time hourly minimum wage to $17 New York City\'s 2nd tallest office building unveiled at ceremony in Manhattan Taco Bell introduces Jalapeno Noir to its menu Why TikTok\'s Oracle deal is a green light for advertisers to keep spend

In [10]:
def load_data(data_path, tokenizer, max_tokens=512, batch_size=16, shuffle=True):
    df = pd.read_csv(data_path)

    # Extract titles, labels, and convert them to lists
    titles = df['Title'].tolist()
    labels = df['Label'].tolist()

    input_ids_list = []
    attention_masks_list = []

    for title in titles:
        #encoding
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)

        # Extract input_ids tensor and attention_mask tensor from the BatchEncoding
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)

    # Find the maximum sequence length
    max_sequence_length = max(input_ids.shape[1] for input_ids in input_ids_list)

    # Pad input_ids and attention_mask tensors to have the same length
    for i in range(len(input_ids_list)):
        input_ids_list[i] = torch.cat([input_ids_list[i], torch.zeros(1, max_sequence_length - input_ids_list[i].shape[1], dtype=torch.long)], dim=1)
        attention_masks_list[i] = torch.cat([attention_masks_list[i], torch.zeros(1, max_sequence_length - attention_masks_list[i].shape[1], dtype=torch.long)], dim=1)

    # Stack input_ids tensors and attention_mask tensors along dim=0
    input_ids = torch.stack(input_ids_list)
    attention_masks = torch.stack(attention_masks_list)

    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)

    # Create a TensorDataset
    dataset = TensorDataset(input_ids, attention_masks, labels)

    # Create a DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader

In [11]:
num_blocks = 4

# Lists to store train and test DataLoaders for each block
train_dataloaders = []
test_dataloaders = []

# Load data for each block
for i in range(num_blocks):
    train_data_path = f'/content/drive/MyDrive/Text2Trade/data/block{i}_train.csv'
    test_data_path = f'/content/drive/MyDrive/Text2Trade/data/block{i}_test.csv'

    train_dataloader = load_data(train_data_path, tokenizer, batch_size=16, shuffle=True)
    test_dataloader = load_data(test_data_path, tokenizer, batch_size=16, shuffle=False)

    train_dataloaders.append(train_dataloader)
    test_dataloaders.append(test_dataloader)

In [12]:
block0_train_dataloader = train_dataloaders[0]

# Iterate through the dataloader to view the first batch
for batch in block0_train_dataloader:
    input_ids, attention_mask, labels = batch
    print("Input IDs:", input_ids)
    print("Attention Mask:", attention_mask)
    print("Labels:", labels)
    break  # Stop after viewing the first batch

Input IDs: tensor([[[  101,  1005, 13255,  ..., 13749,  1029,   102]],

        [[  101,  1001,  1001,  ...,  7661,  7020,   102]],

        [[  101,  5414,  2951,  ...,  1001, 16985,   102]],

        ...,

        [[  101,  3958, 29433,  ...,  3573, 26060,   102]],

        [[  101,  5766,  1001,  ...,  1001,  6819,   102]],

        [[  101,  2017,  1521,  ...,  3384, 12312,   102]]])
Attention Mask: tensor([[[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        ...,

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]]])
Labels: tensor([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1])


In [13]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

#we want to leave the last block as "test" after fine tuning.
epochs = 4
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [16]:
training_stats = []
block_validation_losses = []
all_validation_losses = []

for i in range(num_blocks):
    # Split the data into training and validation for the current block
    train_dataloader = train_dataloaders[i]
    validation_dataloader = test_dataloaders[i]

    #train your model on the current block's training data
    for epoch_i in range(epochs):
        # ========================================
        #               Training
        # ========================================

        print(f'Epoch {epoch_i + 1}/{epochs}')
        print('Training...')

        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

            b_input_ids = batch[0].squeeze(1)
            b_input_mask = batch[1]
            b_labels = batch[2]

            #print("Shape of input_ids:", b_input_ids.shape)

            model.zero_grad()
            outputs = model(input_ids = b_input_ids,
                            attention_mask=b_input_mask)

            #loss = outputs.loss
            loss = nn.CrossEntropyLoss()(outputs.logits, b_labels)
            logits = outputs.logits
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

        # ========================================
        #               Validation
        # ========================================
        print("Running Validation...")

        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].squeeze(1)
            b_input_mask = batch[1]
            b_labels = batch[2]

            with torch.no_grad():
                outputs = model(b_input_ids,
                                attention_mask=b_input_mask)

            loss = nn.CrossEntropyLoss()(outputs.logits, b_labels)
            logits = outputs.logits

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            total_eval_accuracy += accuracy_score(np.argmax(logits, axis=1), label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))

        # Append the validation loss to the list for this block
        block_validation_losses.append(avg_val_loss)

    # Calculate the average validation loss for this block across all epochs
    avg_validation_loss_block = sum(block_validation_losses) / len(block_validation_losses)
    print("Average Validation Loss for Block {}: {:.2f}".format(i, avg_validation_loss_block))

    # Append the average validation loss for this block to the list for all blocks
    all_validation_losses.append(avg_validation_loss_block)

# Calculate the overall average validation loss across all blocks and epochs
avg_validation_loss = sum(all_validation_losses) / len(all_validation_losses)
print("Average Validation Loss Across Blocks and Epochs: {:.2f}".format(avg_validation_loss))

Training...
  Average training loss: 0.67
Running Validation...
  Accuracy: 0.76
  Validation Loss: 0.60
Average Validation Loss for Block 0: 0.60
Training...
  Average training loss: 0.66
Running Validation...
  Accuracy: 0.58
  Validation Loss: 0.68
Average Validation Loss for Block 1: 0.64
Training...
  Average training loss: 0.74
Running Validation...
  Accuracy: 0.61
  Validation Loss: 0.67
Average Validation Loss for Block 2: 0.65
Training...
  Average training loss: 0.73
Running Validation...
  Accuracy: 0.47
  Validation Loss: 0.74
Average Validation Loss for Block 3: 0.67
Training...
  Average training loss: 0.73
Running Validation...
  Accuracy: 0.46
  Validation Loss: 0.75
Average Validation Loss for Block 4: 0.69
Average Validation Loss Across Blocks and Epochs: 0.65


In [21]:
def load_fine_tune_data(blocks, tokenizer, max_tokens=512, batch_size=16, shuffle=True, train_ratio=0.8):
    # Create empty lists to store data from blocks
    titles = []
    labels = []

    # Merge data from blocks 0 to 3
    for block in blocks[:4]:  # Blocks 0 to 3
        # Extract titles and labels
        block_titles = block['train']['Title']
        block_labels = block['train']['Label']

        # Append titles and labels
        titles.extend(block_titles)
        labels.extend(block_labels)

    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)

    # Tokenize and process titles
    input_ids_list = []
    attention_masks_list = []

    for title in titles:
        inputs = tokenizer(title, return_tensors='pt', padding=True, truncation=True, max_length=max_tokens)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)

    # Find the maximum sequence length
    max_sequence_length = max(input_ids.shape[1] for input_ids in input_ids_list)

    # Pad input_ids and attention_mask tensors to have the same length
    for i in range(len(input_ids_list)):
        input_ids_list[i] = torch.cat([input_ids_list[i], torch.zeros(1, max_sequence_length - input_ids_list[i].shape[1], dtype=torch.long)], dim=1)
        attention_masks_list[i] = torch.cat([attention_masks_list[i], torch.zeros(1, max_sequence_length - attention_masks_list[i].shape[1], dtype=torch.long)], dim=1)

    # Stack input_ids tensors and attention_mask tensors along dim=0
    input_ids = torch.stack(input_ids_list)
    attention_masks = torch.stack(attention_masks_list)

    # Split data into train and validation sets based on train_ratio
    train_size = int(train_ratio * len(labels))
    val_size = len(labels) - train_size

    train_dataset = TensorDataset(input_ids[:train_size], attention_masks[:train_size], labels[:train_size])
    val_dataset = TensorDataset(input_ids[train_size:], attention_masks[train_size:], labels[train_size:])

    # Create DataLoader for train and validation sets
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, val_dataloader

train_dataloader, val_dataloader = load_fine_tune_data(blocks, tokenizer, batch_size=16, shuffle=True)

In [22]:
optimizer = AdamW(model.parameters(), lr = 2e-5)
epochs = 4
#total number of training steps
total_steps = len(train_dataloader) * epochs
#create learning rate
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    print(f'Epoch {epoch + 1}/{epochs}')
    print('Training...')

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        #loss = outputs.loss
        loss = nn.CrossEntropyLoss()(outputs.logits, labels)
        logits = outputs.logits

        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    # Calculate average training loss for this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"  Average training loss: {avg_train_loss:.2f}")

    # Validation loop
    print("Running Validation...")
    model.eval()
    total_val_accuracy = 0
    total_val_loss = 0

    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        logits = outputs.logits

        total_val_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        total_val_accuracy += accuracy_score(label_ids, np.argmax(logits, axis=1))

    # Calculate average validation loss and accuracy for this epoch
    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_accuracy = total_val_accuracy / len(val_dataloader)
    print(f"  Validation loss: {avg_val_loss:.2f}")
    print(f"  Validation accuracy: {avg_val_accuracy:.2f}")