In [25]:
MAX_LEN = 512
BERT_MODEL = "bert-base-uncased"

In [26]:
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.nn import CrossEntropyLoss

import transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import re


In [27]:
class BERTModel(nn.Module):
    """
        BERT model with a dropout and linear layer with 2 outputs
    """
    def __init__(self, bert_model, num_classes=2, dropout_rate=0.3):
        super(BERTModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(bert_model)
        self.drop = nn.Dropout(dropout_rate)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)


    def forward(self, ids, mask, token_type_ids):
        _, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output = self.drop(pooled_output)
        return self.out(output)

In [28]:
def remove_tags(text):
    if isinstance(text, list):
        return [remove_tags(t) for t in text]
    else:
        text = text.lower()
        # Remove hashtags
        text = re.sub(r'#\w+', '', text)
        # Remove '<user>'
        text = re.sub(r'<user>', '', text)
        # Remove '<url>'
        text = re.sub(r'<url>', '', text)
        # remove number
        text = re.sub(r'\d+', '', text)
        return text

In [29]:
def train(data, model, optimizer, device):
    """
        Train the model for one epoch
    """
    model.train()
    running_loss = 0.0

    for batch_idx, d in enumerate(data):
        mask = d['mask'].to(device, dtype=torch.long)
        ids = d['ids'].to(device, dtype=torch.long)
        token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
        targets = d['targets'].to(device, dtype=torch.long)
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)

        loss = torch.nn.CrossEntropyLoss(outputs, targets) # Calculate loss
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0 and batch_idx !=0:
            temp = f'Batch index = {batch_idx}\tRunning Loss = {running_loss/10}'
            print(temp)
            running_loss = 0.0

In [30]:
pos_path = 'data/twitter-datasets/train_pos.txt'
neg_path = 'data/twitter-datasets/train_neg.txt'

with open(pos_path, 'r') as f:
    pos_tweets = f.readlines()
with open(neg_path, 'r') as f:
    neg_tweets = f.readlines()

# TODO: preprocess data
pos_tweets = remove_tags(pos_tweets)
neg_tweets = remove_tags(neg_tweets)


pos_labels = [1 for _ in range(len(pos_tweets))]
neg_labels = [0 for _ in range(len(neg_tweets))]
labels = pos_labels + neg_labels
tweets = pos_tweets + neg_tweets

train_tweets, val_tweets, train_labels, val_labels = train_test_split(tweets, labels, test_size=0.1)

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
train_encodings = tokenizer(train_tweets, truncation=True, padding=True, max_length=MAX_LEN)
val_encodings = tokenizer(val_tweets, truncation=True, padding=True, max_length=MAX_LEN)

class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)



AttributeError: 'list' object has no attribute 'lower'

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 5  # Number of training epochs


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
criterion = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * num_epochs)


# Function to calculate accuracy
def calc_accuracy(preds, labels):
    _, predictions = torch.max(preds, dim=1)
    correct = (predictions == labels).float()
    acc = correct.sum() / len(correct)
    return acc

all_labels = []
all_predictions = []

# Training Loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    model.train()
    total_train_loss = 0
    total_train_acc = 0

    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = criterion(outputs.logits, batch['labels'])
        acc = calc_accuracy(outputs.logits, batch['labels'])
        print(f"Step {step+1}/{len(train_loader)} - Loss: {loss.item()}, Accuracy: {acc.item()}")
        total_train_loss += loss.item()
        total_train_acc += acc.item()

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_train_loss / len(train_loader)
    avg_train_acc = total_train_acc / len(train_loader)
    print(f"Epoch {epoch+1} - Train loss: {avg_train_loss}, Accuracy: {avg_train_acc}")

    # Validation Loop
    model.eval()
    total_val_accuracy = 0
    total_val_loss = 0

    for batch in val_loader:
        with torch.no_grad():
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = criterion(outputs.logits, batch['labels'])
            acc = calc_accuracy(outputs.logits, batch['labels'])

            total_val_loss += loss.item()
            total_val_accuracy += acc.item()

            # for overall metrics calculation
            preds = outputs.logits.argmax(dim=1).cpu().numpy()
            labels = batch['labels'].cpu().numpy()
            all_predictions.extend(preds)
            all_labels.extend(labels)

    avg_val_loss = total_val_loss / len(val_loader)
    avg_val_accuracy = total_val_accuracy / len(val_loader)
    print(f"Epoch {epoch+1} - Validation loss: {avg_val_loss}, Accuracy: {avg_val_accuracy}")

# After all epochs, calculate overall metrics
precision = precision_score(all_labels, all_predictions, average='macro')
recall = recall_score(all_labels, all_predictions, average='macro')
f1 = f1_score(all_labels, all_predictions, average='macro')

print("Training complete!")
print(f"Overall Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Step 17505/22500 - Loss: 0.15061628818511963, Accuracy: 0.875
Step 17506/22500 - Loss: 0.33853819966316223, Accuracy: 0.875
Step 17507/22500 - Loss: 0.027762144804000854, Accuracy: 1.0
Step 17508/22500 - Loss: 0.08470586687326431, Accuracy: 1.0
Step 17509/22500 - Loss: 0.3344103693962097, Accuracy: 0.875
Step 17510/22500 - Loss: 0.08903975039720535, Accuracy: 1.0
Step 17511/22500 - Loss: 0.11428086459636688, Accuracy: 1.0
Step 17512/22500 - Loss: 0.16558657586574554, Accuracy: 0.875
Step 17513/22500 - Loss: 0.0924130231142044, Accuracy: 1.0
Step 17514/22500 - Loss: 0.2806130647659302, Accuracy: 0.875
Step 17515/22500 - Loss: 0.0034428469371050596, Accuracy: 1.0
Step 17516/22500 - Loss: 0.1406453400850296, Accuracy: 0.875
Step 17517/22500 - Loss: 0.4121395945549011, Accuracy: 0.875
Step 17518/22500 - Loss: 0.2875213325023651, Accuracy: 0.875
Step 17519/22500 - Loss: 0.11572179943323135, Accuracy: 1.0
Step 17520/22500 - Loss: 0.05712210014462471, 

In [None]:
torch.save(model, 'bert.pth')

In [None]:
## predict
model = torch.load('bert.pth')

## use the test set
test_path = folder_path + 'data/twitter-datasets/test_data.txt'
with open(test_path, 'r') as f:
    test_tweets = f.readlines()

test_encodings = tokenizer(test_tweets, truncation=True, padding=True, max_length=MAX_LEN)
test_dataset = TweetDataset(test_encodings, [0 for _ in range(len(test_tweets))])
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


model.eval()
predictions = []
for batch in test_loader:
    with torch.no_grad():
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = outputs.logits.argmax(dim=1).cpu().numpy()
        predictions.extend(preds)

In [None]:
predictions = np.array(predictions)
predictions[predictions == 0] = -1

In [None]:
print(predictions)

[-1 -1 -1 ... -1  1 -1]


In [None]:
import pandas as pd

submission = pd.DataFrame({'Id':range(1, len(predictions) + 1),'Prediction': predictions})
submission.to_csv('submission.csv', index=False)