In [46]:
import os
import glob
import pandas as pd
from tqdm import tqdm

import torch

In [47]:
# Read review texts together with the ratings encoded in their file names
def _read_reviews(directory):
    """Return ``(texts, ratings)`` for every ``*.txt`` file in ``directory``."""
    texts, ratings = [], []
    # glob returns paths in arbitrary, OS-dependent order; sorting makes the
    # resulting row order identical on every run and every machine.
    for filepath in tqdm(sorted(glob.glob(os.path.join(directory, '*.txt')))):
        with open(filepath, 'r', encoding='utf-8') as file:
            texts.append(file.read())
        # The rating is the part of the file name between '_' and the first '.'
        ratings.append(int(os.path.basename(filepath).split('_')[1].split('.')[0]))
    return texts, ratings


def load_data(pos_dir, neg_dir):
    """Load reviews from two directories into a single DataFrame.

    Every ``*.txt`` file in ``pos_dir`` and then in ``neg_dir`` is read as
    UTF-8, and its rating is parsed from the file name.

    Args:
        pos_dir: Directory holding the positive reviews.
        neg_dir: Directory holding the negative reviews.

    Returns:
        pandas.DataFrame with a ``text`` column (file contents) and a
        ``rating`` column (int). Rows from ``pos_dir`` come first, and rows
        within each directory are ordered by sorted file path.
    """
    pos_texts, pos_ratings = _read_reviews(pos_dir)
    neg_texts, neg_ratings = _read_reviews(neg_dir)
    return pd.DataFrame({
        'text': pos_texts + neg_texts,
        'rating': pos_ratings + neg_ratings,
    })


In [48]:
# Dataset directories: each split has a 'pos' and a 'neg' sub-directory
train_dir, test_dir = 'train', 'test'
pos_dir_train, neg_dir_train = (os.path.join(train_dir, label) for label in ('pos', 'neg'))
pos_dir_test, neg_dir_test = (os.path.join(test_dir, label) for label in ('pos', 'neg'))

# Load both splits and show their sizes plus the first training rows
train_dataset = load_data(pos_dir_train, neg_dir_train)
test_dataset = load_data(pos_dir_test, neg_dir_test)
print(train_dataset.shape, test_dataset.shape)
train_dataset.head()

100%|████████████████████████████████████████████████████████████████████████████| 12500/12500 [02:13<00:00, 93.89it/s]
100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [01:59<00:00, 104.44it/s]
100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [01:59<00:00, 104.85it/s]
100%|███████████████████████████████████████████████████████████████████████████| 12500/12500 [01:47<00:00, 115.84it/s]


(25000, 2) (25000, 2)


Unnamed: 0,text,rating
0,Bromwell High is a cartoon comedy. It ran at t...,9
1,Homelessness (or Houselessness as George Carli...,8
2,Brilliant over-acting by Lesley Ann Warren. Be...,10
3,This is easily the most underrated film inn th...,7
4,This is not the typical Mel Brooks film. It wa...,8


In [49]:
# Save the data to temporary CSV files for torchtext
# NOTE(review): nothing in the visible part of the notebook uses torchtext — confirm this round-trip is still needed
train_dataset.to_csv('train.csv', index=False)
test_dataset.to_csv('test.csv', index=False)

In [50]:
# Read both splits from their CSV files and preview the training frame
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.head()

Unnamed: 0,text,rating
0,Bromwell High is a cartoon comedy. It ran at t...,9
1,Homelessness (or Houselessness as George Carli...,8
2,Brilliant over-acting by Lesley Ann Warren. Be...,10
3,This is easily the most underrated film inn th...,7
4,This is not the typical Mel Brooks film. It wa...,8


In [51]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [52]:
# Convert the text and rating columns of the training frame to plain Python lists
texts = train_df["text"].tolist()
ratings = train_df["rating"].tolist()

In [53]:
# Show the distinct rating values present in the training data
set(ratings)

{1, 2, 3, 4, 7, 8, 9, 10}

In [54]:
# Hold out 20% of the training reviews for validation.
# random_state pins the shuffle so that the split, and every metric computed
# on the validation part, is the same after a kernel restart.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, ratings, test_size=0.2, random_state=42
)

In [55]:
# Load the tokenizer used to tokenize the review texts
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [69]:
# Tokenize the training and validation texts with truncation enabled and max_length=512
# NOTE(review): padding=True presumably pads every review to the longest one in its split — confirm memory use is acceptable
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

In [70]:
# Inspect which fields the tokenizer produced
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [84]:
# PyTorch Dataset that pairs tokenizer output with the matching labels
class MovieReviewDataset(torch.utils.data.Dataset):
    """Dataset serving one tokenized review and its label per index.

    ``encodings`` is a mapping whose values are indexed by sample position;
    ``labels`` is indexed by the same position and defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the labels
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, label stored under 'labels'."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [85]:
# Wrap the encodings and labels of each split in a MovieReviewDataset
# NOTE(review): this rebinds train_dataset, which earlier held the DataFrame returned by load_data
train_dataset = MovieReviewDataset(train_encodings, train_labels)
val_dataset = MovieReviewDataset(val_encodings, val_labels)

In [86]:
# Model training: start from the pretrained DistilBERT checkpoint
# NOTE(review): num_labels=11 looks chosen so that ratings 1..10 can be used directly as class indices (index 0 unused) — confirm
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=11)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
# Training configuration
training_args = TrainingArguments(
    # output locations
    output_dir='./results',
    logging_dir='./logs',
    # schedule and regularisation
    num_train_epochs=2,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate on the validation dataset every 500 steps
    eval_strategy='steps',
    eval_steps=500,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

In [88]:
# Run training with the trainer configured for this notebook
trainer.train()

Step,Training Loss,Validation Loss
500,1.8801,1.495725
1000,1.5352,1.429707
1500,1.4812,1.39953
2000,1.4324,1.339643
2500,1.4032,1.441392
3000,1.1999,1.336743
3500,1.1761,1.311511
4000,1.1394,1.278639
4500,1.1336,1.273514
5000,1.1129,1.256234


TrainOutput(global_step=5000, training_loss=1.349397900390625, metrics={'train_runtime': 29377.0551, 'train_samples_per_second': 1.362, 'train_steps_per_second': 0.17, 'total_flos': 5299546398720000.0, 'train_loss': 1.349397900390625, 'epoch': 2.0})

In [89]:
# Save the model and its tokenizer into the same directory
model.save_pretrained('./movie_review_model')
tokenizer.save_pretrained('./movie_review_model')

('./movie_review_model\\tokenizer_config.json',
 './movie_review_model\\special_tokens_map.json',
 './movie_review_model\\vocab.txt',
 './movie_review_model\\added_tokens.json')

In [90]:
# Reload the model and tokenizer from the saved directory
model = DistilBertForSequenceClassification.from_pretrained('./movie_review_model')
tokenizer = DistilBertTokenizer.from_pretrained('./movie_review_model')

def predict_rating(review_text):
    """Predict the rating class of a single review.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        review_text: Review text; tokenized with truncation and max_length=512.

    Returns:
        int: Index of the class with the highest logit.
    """
    # Make sure dropout is disabled even if the model was left in training mode
    model.eval()

    inputs = tokenizer(review_text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Keep the inputs on the model's device so the call also works after the
    # model has been moved to a GPU.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    return torch.argmax(logits, dim=1).item()

## Example: predicting a rating

In [91]:
# Run the predictor on one short sample review and print the result
new_review = "This film was amazing. I am so inspired" 
predicted_rating = predict_rating(new_review)
print(f'Предсказанный рейтинг: {predicted_rating}')

Предсказанный рейтинг: 10


## Accuracy on validation set

In [92]:
# Predict a rating for every validation text, one predict_rating call per text
predictions = [predict_rating(x) for x in val_texts]

In [93]:
def accuracy(preds, labels, tolerance=0):
    """Share of predictions that lie within ``tolerance`` of their label.

    Args:
        preds: Sequence of predicted numeric values.
        labels: Sequence of true numeric values, same length as ``preds``.
        tolerance: Largest absolute difference still counted as a hit.
            The default 0 requires an exact match.

    Returns:
        float: Number of hits divided by the number of labels.

    Raises:
        ValueError: If the lengths differ or the collections are empty.
    """
    if len(preds) != len(labels):
        raise ValueError("lengths of preds and labels collections should be same")
    if len(labels) == 0:
        # Without this guard the division below fails with ZeroDivisionError
        raise ValueError("cannot compute accuracy of empty collections")
    hits = sum(abs(label - pred) <= tolerance for pred, label in zip(preds, labels))
    return hits / len(labels)

In [94]:
# Report accuracy for exact matches and for predictions off by at most 1 or 2
for tol in (0, 1, 2):
    print(f"Accuracy with tolerance {tol}:", accuracy(predictions, val_labels, tolerance=tol))

Accuracy with tolerance 0: 0.514
Accuracy with tolerance 1: 0.7802
Accuracy with tolerance 2: 0.9


## Binary sentiment (positive vs. negative)

In [95]:
# Collapse ratings to two classes: 1 for values above 5, 0 otherwise
bin_preds = [1 if rating > 5 else 0 for rating in predictions]
bin_labels = [1 if rating > 5 else 0 for rating in val_labels]

In [96]:
# Exact-match accuracy (default tolerance) on the binarized predictions and labels
print("Accuracy on binary classification:", accuracy(bin_preds, bin_labels))

Accuracy on binary classification: 0.9364
