In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    return ' '.join([f"{token.text}_{token.pos_}" for token in doc])

# Load and preprocess the data
df = pd.read_csv('Downloads/stock_data.csv')
df['Sentiment'] = (df['Sentiment'] + 1) / 2
df['Processed_Text'] = df['Text'].apply(preprocess_text)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['Processed_Text'], df['Sentiment'], 
                                                    test_size=0.2, random_state=42)

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def encode_data(texts, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_input_ids, train_attention_masks = encode_data(X_train, tokenizer)
test_input_ids, test_attention_masks = encode_data(X_test, tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

batch_size = 32

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [7]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [8]:
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2].float().unsqueeze(1)}  # Add .float() and .unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [9]:
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2].float().unsqueeze(1)}  # Add .float() and .unsqueeze(1)

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        predictions.extend((logits.squeeze() > 0).int().cpu().numpy())
        true_labels.extend(inputs['labels'].cpu().numpy())

    return predictions, true_labels

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    avg_loss = train(model, train_dataloader, optimizer, scheduler, device)
    print(f'Average loss: {avg_loss:.4f}')

    predictions, true_labels = evaluate(model, test_dataloader, device)
    print(classification_report(true_labels, predictions))

Epoch 1/4
Average loss: 0.2377
              precision    recall  f1-score   support

         0.0       1.00      0.01      0.03       427
         1.0       0.63      1.00      0.78       732

    accuracy                           0.64      1159
   macro avg       0.82      0.51      0.40      1159
weighted avg       0.77      0.64      0.50      1159

Epoch 2/4
Average loss: 0.1639
              precision    recall  f1-score   support

         0.0       1.00      0.10      0.18       427
         1.0       0.65      1.00      0.79       732

    accuracy                           0.67      1159
   macro avg       0.83      0.55      0.48      1159
weighted avg       0.78      0.67      0.56      1159

Epoch 3/4
Average loss: 0.1248
              precision    recall  f1-score   support

         0.0       0.95      0.25      0.40       427
         1.0       0.69      0.99      0.82       732

    accuracy                           0.72      1159
   macro avg       0.82      0.62  

In [11]:
def predict_sentiment(text, model, tokenizer, device):
    model.eval()
    processed_text = preprocess_text(text)
    encoded = tokenizer.encode_plus(
        processed_text,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    probability = torch.sigmoid(logits).item()
    sentiment = 'Positive' if probability > 0.5 else 'Negative'
    confidence = probability if sentiment == 'Positive' else 1 - probability

    return sentiment, confidence

In [12]:
torch.save(model.state_dict(), 'bert_stock_sentiment_model.pth')

NameError: name 'torch' is not defined

  from .autonotebook import tqdm as notebook_tqdm
  model.load_state_dict(torch.load('bert_stock_sentiment_model.pth', map_location=device))
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Good sentiment!
Sentiment output: 1
