In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [3]:
dataset = load_dataset("imdb")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

args = TrainingArguments(
    output_dir="./bert_sentiment",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    save_strategy="no",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"].shuffle(seed=42).select(range(2000)),
    eval_dataset=tokenized["test"].select(range(1000)),
    compute_metrics=compute_metrics
)

# Обучение
trainer.train()
bert_results = trainer.evaluate()
print("BERT results:", bert_results)



Step,Training Loss
10,0.4502
20,0.2782
30,0.2484
40,0.1218
50,0.1569
60,0.2623
70,0.3578
80,0.2628
90,0.2116
100,0.1901


BERT results: {'eval_loss': 0.27451398968696594, 'eval_accuracy': 0.896, 'eval_f1': 0.0, 'eval_runtime': 18.091, 'eval_samples_per_second': 55.276, 'eval_steps_per_second': 3.482, 'epoch': 1.0}


In [13]:
class IMDbDataset(Dataset):
    def __init__(self, texts, labels, vectorizer):
        self.X = torch.tensor(vectorizer.transform(texts).toarray(), dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Пример: 1000 примеров для обучения
texts = dataset["train"]["text"][:1000]
labels = dataset["train"]["label"][:1000]
texts_test = dataset["test"]["text"][:200]
labels_test = dataset["test"]["label"][:200]

vectorizer = CountVectorizer(max_features=2000)
vectorizer.fit(texts)

train_data = IMDbDataset(texts, labels, vectorizer)
test_data = IMDbDataset(texts_test, labels_test, vectorizer)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

In [14]:
class LSTMSentiment(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.unsqueeze(1)  # add sequence dimension
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n.squeeze(0))

model = LSTMSentiment(2000, 128, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Обучение LSTM
for epoch in range(1):
    model.train()
    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(x_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

# Оценка
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        output = model(x_batch)
        preds = torch.argmax(output, dim=1)
        all_preds.extend(preds.tolist())
        all_labels.extend(y_batch.tolist())

from sklearn.metrics import accuracy_score, f1_score
print("LSTM Accuracy:", accuracy_score(all_labels, all_preds))
print("LSTM F1:", f1_score(all_labels, all_preds))


LSTM Accuracy: 1.0
LSTM F1: 0.0


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
