In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm



In [2]:
# Загрузка данных
df = pd.read_csv('Constraint_Train.csv')



In [3]:
# Загрузка предварительно обученного BERT токенизатора и модели
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


In [4]:
# Токенизация текстов
sentences = []
attention_masks = []
for text in tqdm(df.tweet):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    sentences.append(encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])



100%|██████████| 6420/6420 [00:05<00:00, 1258.02it/s]


In [5]:
# Преобразование списков в тензоры PyTorch
sentences_tensor = torch.cat(sentences, dim=0)
attention_masks_tensor = torch.cat(attention_masks, dim=0)

labels = (df.label == 'real').astype(int).to_list()
labels_tensor = torch.tensor(labels)



In [6]:
# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(sentences_tensor, labels_tensor, test_size=0.33)



In [7]:
# Подготовка данных для PyTorch
train_dataset = TensorDataset(X_train, y_train, attention_masks_tensor[:len(X_train)])
test_dataset = TensorDataset(X_test, y_test, attention_masks_tensor[len(X_train):])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)



In [8]:
# Определение модели CNN с использованием BERT
class CNN(nn.Module):
    def __init__(self, output_size, bert_model):
        super(CNN, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.5)
        self.label = nn.Linear(768, output_size)
    
    def forward(self, input_sentences, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_sentences, attention_mask=attention_mask)[1]
        bert_output = self.dropout(bert_output)
        logits = self.label(bert_output)
        return logits



In [9]:
# Определение устройства
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)



cuda


In [10]:
# Определение модели, функции потерь и оптимизатора
model = CNN(output_size=2, bert_model=bert_model).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [11]:

# Ранняя остановка
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss):
        if val_loss < (self.best_loss - self.min_delta):
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

early_stopping = EarlyStopping(patience=5, min_delta=0.01)



In [12]:
# Обучение модели с использованием ранней остановки
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch_idx, (data, target, attention_mask) in enumerate(train_loader):
        data, target, attention_mask = data.to(device), target.to(device), attention_mask.to(device)
        optimizer.zero_grad()
        output = model(data, attention_mask)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    
    # Валидация
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for data, target, attention_mask in test_loader:
            data, target, attention_mask = data.to(device), target.to(device), attention_mask.to(device)
            output = model(data, attention_mask)
            loss = criterion(output, target)
            val_loss += loss.item()
    val_loss /= len(test_loader)
    
    # Ранняя остановка
    if early_stopping(val_loss):
        print(f"Early stopping at epoch {epoch}")
        break



In [13]:
# Оценка модели
model.eval()
with torch.no_grad():
    y_pred = []
    y_true = []
    for data, target, attention_mask in test_loader:
        data, target, attention_mask = data.to(device), target.to(device), attention_mask.to(device)
        output = model(data, attention_mask)
        _, predicted = torch.max(output, 1)
        y_pred.extend(predicted.cpu().numpy())
        y_true.extend(target.cpu().numpy())


In [14]:
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score: {f1}")

F1 Score: 0.7797790407989869
