In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import mean_absolute_error
from datasets import Dataset


In [None]:
# Load the prepared train/test splits; the saved 'Unnamed: 0' column becomes the index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in ('data/imdb_train.csv', 'data/imdb_test.csv')
)

In [None]:
class MovieReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a single review each time an item is read.

    Parameters:
    - texts: indexable sequence of review strings
    - labels: indexable sequence of numeric ratings, aligned with ``texts``
    - tokenizer: callable used to encode a text; when ``None`` (default) the
      notebook-level ``tokenizer`` variable is looked up at access time, which
      preserves the original behaviour
    - max_length (int): length every encoded sequence is padded/truncated to

    Raises:
    - ValueError: if ``texts`` and ``labels`` differ in length
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=128):
        # Catch misaligned inputs here instead of as an IndexError mid-training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and a float 'labels' tensor."""
        # Prefer the injected tokenizer; otherwise fall back to the global one,
        # resolved lazily so the dataset can be built before the tokenizer exists.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float target: the model is used as a regressor.
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

In [None]:
# Review text is the model input, the 'label' column is the target.
X_train, y_train = train_data['text'], train_data['label']
X_test, y_test = test_data['text'], test_data['label']

In [None]:
# Label-encode the targets: the encoder is fitted on the rating values 1-4 and 7-10.
# NOTE(review): no later cell visible here reads y_train_encoded / y_test_encoded;
# the datasets below are built from the raw labels.
ratings = [*range(1, 5), *range(7, 11)]

label_encoder = LabelEncoder().fit(ratings)

y_train_encoded, y_test_encoded = (
    label_encoder.transform(targets) for targets in (y_train, y_test)
)


In [None]:
# Build the datasets from plain Python lists of texts and raw labels
train_dataset = MovieReviewDataset(
    texts=X_train.tolist(),
    labels=y_train.tolist(),
)
test_dataset = MovieReviewDataset(
    texts=X_test.tolist(),
    labels=y_test.tolist(),
)

In [None]:
# Batched tokenization for Hugging Face `datasets.Dataset` objects.
# NOTE(review): `tokenizer` is created in a later cell of this notebook, so the
# mapping below can only run once that cell has been executed.
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with max-length padding and truncation."""
    return tokenizer(examples['text'], padding="max_length", truncation=True)


# `.map` / `.set_format` exist on `datasets.Dataset` only. MovieReviewDataset
# defines neither, and it already tokenizes inside __getitem__, so such
# datasets are left untouched instead of failing with an AttributeError.
if isinstance(train_dataset, Dataset):
    # Apply the tokenization, then expose only the model inputs as torch tensors
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

if isinstance(test_dataset, Dataset):
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Base checkpoint shared by the tokenizer and the model.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=1 is used here to set the model up for regression.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Drop columns the model does not need (the original text) and return the
# remaining columns as torch tensors.
# `remove_columns` / `set_format` exist on `datasets.Dataset` only, so other
# dataset types (e.g. MovieReviewDataset) are skipped rather than raising an
# AttributeError. The 'text' check keeps the cell safe to re-run.
if isinstance(train_dataset, Dataset):
    if 'text' in train_dataset.column_names:
        train_dataset = train_dataset.remove_columns(['text'])
    # set_format('torch') only changes the output container to tensors;
    # it does not cast the labels to any particular dtype.
    train_dataset.set_format('torch')

if isinstance(test_dataset, Dataset):
    if 'text' in test_dataset.column_names:
        test_dataset = test_dataset.remove_columns(['text'])
    test_dataset.set_format('torch')


In [None]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='maybe_worked',
    logging_dir='maybe_worked_logs',
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation settings
    warmup_steps=500,
    weight_decay=0.01,
    # Log every 10 steps; evaluate and checkpoint once per epoch
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)



In [None]:
def _mae_metric(eval_pred):
    """Return the mean absolute error between the labels and the flattened predictions."""
    flat_predictions = eval_pred.predictions.flatten()
    return {'mae': mean_absolute_error(eval_pred.label_ids, flat_predictions)}


# Create the Trainer; the tokenizer is passed along as well
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=_mae_metric,
)

In [None]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Mae
1,1.2639,6.038891,1.620651
2,1.2134,4.454584,1.39204
3,0.8107,4.276865,1.360022


TrainOutput(global_step=4689, training_loss=1.111080771495244, metrics={'train_runtime': 830.0073, 'train_samples_per_second': 90.361, 'train_steps_per_second': 5.649, 'total_flos': 2483719430400000.0, 'train_loss': 1.111080771495244, 'epoch': 3.0})

In [None]:
# Evaluate on the test data
test_dataset = MovieReviewDataset(
    texts=test_data['text'].tolist(),
    labels=test_data['label'].tolist(),
)
trainer.evaluate(eval_dataset=test_dataset)


{'eval_loss': 4.1326470375061035,
 'eval_mae': 1.3495981693267822,
 'eval_runtime': 64.8171,
 'eval_samples_per_second': 385.701,
 'eval_steps_per_second': 24.114,
 'epoch': 3.0}

### Pipeline для получения оценок

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
def predict(text):
    """
    Predict the rating of a single review with the fine-tuned regression model.

    Parameters:
    - text (str): raw review text

    Returns:
    - int: predicted rating, rounded to the nearest integer and clamped to 1..10
    """
    # Feed the raw text. Do not pass it through `split_long_text`: that helper
    # wraps chunks in literal double quotes and joins them with ' +\n' for
    # pasting into source code, which would put characters into the model input
    # that the training texts did not go through.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)

    # Move the tensors to the model's device, dropping 'token_type_ids' if the
    # tokenizer produced them.
    inputs = {
        key: value.to(device)
        for key, value in inputs.items()
        if key != 'token_type_ids'
    }

    # Make sure dropout is disabled regardless of what ran before this call.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # The model has a single output, so logits hold one scalar: the raw prediction.
    score = round(outputs.logits.item())
    # A regressor can overshoot; keep the result on the 1..10 rating scale.
    return max(1, min(10, score))

In [None]:
def split_long_text(text, max_length=100):
    """
    Split a long string into several shorter parts for pasting into source code.

    Words are packed greedily into lines of at most ``max_length`` characters.
    A single word longer than ``max_length`` is kept whole on its own line.

    Parameters:
    - text (str): source text
    - max_length (int): maximum length of one line

    Returns:
    - str: the lines, each wrapped in double quotes and joined with ' +\\n'
      (an empty string when ``text`` contains no words)
    """
    lines = []
    current_line = []
    current_length = 0  # len(' '.join(current_line)), maintained incrementally

    for word in text.split():
        # Length of the line if this word were appended (plus one separating space).
        candidate_length = current_length + len(word) + (1 if current_line else 0)
        if candidate_length > max_length:
            # Only flush a non-empty line: an over-long first word must not
            # produce a spurious empty "" entry.
            if current_line:
                lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length = candidate_length

    if current_line:
        lines.append(' '.join(current_line))

    return ' +\n'.join([f'"{line}"' for line in lines])

In [None]:
# Original note: "so that it is not run by accident".
# NOTE(review): nothing in this cell guards against re-running it, and the recorded
# output lists paths under 'maybe_final_project/' while the code writes to
# "final_project" - the output looks stale; confirm which directory is intended.
# Save the resulting model and tokenizer
trainer.save_model("final_project")
tokenizer.save_pretrained("final_project")

('maybe_final_project/tokenizer_config.json',
 'maybe_final_project/special_tokens_map.json',
 'maybe_final_project/vocab.txt',
 'maybe_final_project/added_tokens.json',
 'maybe_final_project/tokenizer.json')

In [None]:
# Sanity check of predict() on one long, negative review passed as a single
# string built from adjacent literals; the recorded output of this cell is
# "Predicted Rating: 1".
predicted_rating = predict(
    "The first time I watched this show it was OK. There were some funny moments and I laughed a couple of times but "
    "this show is getting worse and worse. Carly and Sam's web show is NOT the least bit funny. They play a stupid video "
    "from the internet, scream at the camera and make some very bad jokes. And then the laugh track goes off?! One problem "
    "with the show is that none of the main characters are funny. Carly is not funny. Miranda Cosgrove's acting is lackluster "
    "at best. Her acting in this show is nothing like her acting from Drake And Josh. Her friend Sam is very rude and crude "
    "and the show is written in a way that makes her look like some kind of hillbilly. I mean they make jokes about her mom "
    "driving a rusty old truck, her mom smashing an old TV with a bat, and then there's the jokes about Sam failing in school, "
    "getting detention all the time and running from cops. None of that is funny at all. Then there's Freddy who is a computer "
    "geek. He isn't too funny unless his Mom is treating him like a baby. The show's only somewhat funny full time character "
    "is Carly's brother Spencer. He makes some funny jokes and does some pretty funny things like pretending to drive a spaceship "
    "while making spaceship noises, knocking over a girl scouts' cookie table for revenge as they did the same thing to him. "
    "His material is the only thing worth laughing at. Aside from the characters other things make the show bad too. Like the fact "
    "that a couple of kids doing a local web show from a Seattle apartment is a worldwide hit and got them a free trip to Tokyo? "
    "Another thing is that how can a 26 year old single guy with no real job can pay for a 2 level apartment in downtown Seattle "
    "and raise his 13 year old sister and pay for a room full of camera and sound equipment including a remote controlled projector "
    "and a green screen and an HD camera? This sounds like it was written by a 10 year old. The worst thing is that the show contains "
    "some pretty questionable content. There are a couple of times when Carly(remember a 13 year old girl) appears on her internet web "
    "show in a bikini top. WTF? Then I saw an episode where Freddy tells Carly and Sam that he 'slept in JUST his socks the night before.' "
    "I mean WTF? Then there's an episode where Carly's rival Nevel blackmails her by taking her website rights and agrees to give her the "
    "website back in exchange for a kiss. Creepy! And I just saw an episode where Carly meets a boy who just moved into their apartment building "
    "and he has some kind of back injury and he takes off his shirt and Carly stands there drooling over him. I can't believe Nick even lets them "
    "show that kind of stuff and I can't believe that this was created by the same guy responsible for Drake and Josh. This show is not appropriate "
    "for kids under the age of 12 and that's even questionable. iCarly is just another addition to the long list of awful Nick programming."
)

# Show the rounded rating returned by predict()
print(f"Predicted Rating: {predicted_rating}")


Predicted Rating: 1
