<a href="https://colab.research.google.com/github/filnels/Homeworks/blob/main/Classification_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
import pandas as pd
import os

# Download the IMDB 50k movie-reviews dataset via kagglehub; the returned value
# is printed below as the local path to the dataset files.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("Path to dataset files:", path)



# Build the CSV location from the download path, load it into a DataFrame
# and print the first rows as a quick sanity check.
csv_file_path = os.path.join(path, "IMDB Dataset.csv")
data = pd.read_csv(csv_file_path)
print(data.head())


Path to dataset files: /root/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
import random
import spacy
from spacy.util import minibatch, compounding
from spacy.training.example import Example
from pathlib import Path
import random
from sklearn.metrics import precision_recall_fscore_support

# Data loading
def load_training_data(
    csv_file_path: str = "IMDB Dataset.csv",
    split: float = 0.8,
    limit: int = 0,
    seed: "int | None" = None,
) -> tuple:
    """Load the reviews CSV and split it into shuffled train/test lists.

    The CSV must contain a ``review`` column (text) and a ``sentiment`` column
    whose values are compared against ``"positive"`` / ``"negative"``.

    Args:
        csv_file_path: Path to the CSV file.
        split: Fraction of the examples placed in the training list; must be
            within ``[0, 1]``.
        limit: When greater than 0, at most this many rows are sampled from the
            file (with a fixed ``random_state`` of 42). Values larger than the
            dataset are clamped to the dataset size.
        seed: Optional seed for the shuffle. ``None`` (default) keeps the
            previous behavior of shuffling with the global ``random`` state.

    Returns:
        ``(train_data, test_data)``: two lists of
        ``(text, {"cats": {"pos": bool, "neg": bool}})`` tuples.

    Raises:
        ValueError: If ``split`` is outside ``[0, 1]``.
    """
    if not 0.0 <= split <= 1.0:
        raise ValueError(f"split must be within [0, 1], got {split!r}")

    data = pd.read_csv(csv_file_path)
    if limit > 0:
        # DataFrame.sample raises when n exceeds the number of rows, so clamp it.
        data = data.sample(n=min(limit, len(data)), random_state=42)

    # Iterate the two columns directly; iterrows() builds a Series per row and
    # is far slower on tens of thousands of reviews.
    reviews = [
        (text, {"cats": {"pos": sentiment == "positive", "neg": sentiment == "negative"}})
        for text, sentiment in zip(data["review"], data["sentiment"])
    ]

    # A dedicated generator makes the split reproducible without touching the
    # global random state; without a seed, fall back to the module-level one.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(reviews)

    split_index = int(len(reviews) * split)
    train_data = reviews[:split_index]
    test_data = reviews[split_index:]
    return train_data, test_data

# Model training
def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 10,
    model_output_dir: str = "./model_output"
) -> None:
    """Train a blank English spaCy text classifier and save it to disk.

    A ``textcat`` component with the labels ``"pos"`` and ``"neg"`` is added to
    a blank ``"en"`` pipeline and updated for ``iterations`` passes over the
    shuffled training examples. The ``textcat`` loss is printed after every
    pass; ``evaluate_model`` is called on ``test_data`` after every fifth pass
    and after the last one.

    Args:
        training_data: List of ``(text, {"cats": {...}})`` tuples to train on.
        test_data: List in the same format, passed to ``evaluate_model``.
        iterations: Number of passes over the training data.
        model_output_dir: Directory the trained pipeline is written to; it is
            created (including parents) when missing.

    Raises:
        ValueError: If ``training_data`` is empty.
    """
    if not training_data:
        # Without examples no batch is ever processed and the loss lookup below
        # would fail with an opaque KeyError.
        raise ValueError("training_data is empty; nothing to train on")

    nlp = spacy.blank("en")
    # Bind `textcat` on both paths so the add_label calls never hit an
    # unbound name.
    if "textcat" in nlp.pipe_names:
        textcat = nlp.get_pipe("textcat")
    else:
        textcat = nlp.add_pipe("textcat", last=True)
    textcat.add_label("pos")
    textcat.add_label("neg")

    train_examples = [
        Example.from_dict(nlp.make_doc(text), labels) for text, labels in training_data
    ]

    # spaCy v3 renamed begin_training() to initialize(); giving it the real
    # examples lets the component set itself up from actual data.
    optimizer = nlp.initialize(lambda: train_examples)

    print("Начало обучения")
    for i in range(iterations):
        random.shuffle(train_examples)
        losses = {}
        # Batch size grows from 4 towards 32 within each pass.
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Pass the optimizer explicitly instead of relying on the
            # pipeline's implicit default.
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)

        print(f"Итерация {i+1} - Потери: {losses['textcat']}")

        # Evaluate periodically and always after the final pass, so the saved
        # model is evaluated even when `iterations` is not a multiple of 5.
        if (i + 1) % 5 == 0 or i + 1 == iterations:
            evaluate_model(nlp, test_data)

    output_dir = Path(model_output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print(f"Модель сохранена в: {output_dir}")

# Model evaluation with several metrics
def evaluate_model(nlp, test_data):
    """Evaluate a text classifier on labelled examples and print the metrics.

    Each example's gold label is ``"pos"`` when its ``cats["pos"]`` equals 1.0
    and ``"neg"`` otherwise; the predicted label is the category with the
    highest score in ``doc.cats``. Precision, recall and F-score are computed
    for the ``"pos"`` class with ``precision_recall_fscore_support``.

    Args:
        nlp: Pipeline exposing ``pipe(texts)`` that yields docs with a
            ``cats`` mapping (e.g. a spaCy ``Language`` with ``textcat``).
        test_data: List of ``(text, {"cats": {"pos": ..., "neg": ...}})``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``fscore``, or ``None`` when ``test_data`` is empty.
    """
    if not test_data:
        # Avoid dividing by zero below when there is nothing to evaluate.
        print("Тестовая выборка пуста — оценка пропущена.")
        return None

    texts = [text for text, _ in test_data]
    true_labels = [
        "pos" if labels["cats"]["pos"] == 1.0 else "neg" for _, labels in test_data
    ]
    # nlp.pipe processes the texts as a stream instead of one call per text.
    pred_labels = [max(doc.cats, key=doc.cats.get) for doc in nlp.pipe(texts)]

    precision, recall, fscore, _ = precision_recall_fscore_support(
        true_labels, pred_labels, labels=["pos", "neg"], average="binary", pos_label="pos"
    )
    correct = sum(true == pred for true, pred in zip(true_labels, pred_labels))
    accuracy = correct / len(true_labels)

    # The original line printed "Точность" for both accuracy and precision;
    # name every metric unambiguously.
    print(
        f"Accuracy (доля верных ответов): {accuracy:.4f}, "
        f"Precision (точность): {precision:.4f}, "
        f"Recall (полнота): {recall:.4f}, "
        f"F-мера: {fscore:.4f}"
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "fscore": fscore,
    }

# Example usage
# Locate the CSV inside the directory kagglehub reported for the download
# instead of hard-coding an absolute, machine-specific cache path.
csv_file_path = os.path.join(path, "IMDB Dataset.csv")

# Seed the random number generators so the shuffles and the weight
# initialization are repeatable on Restart & Run All.
spacy.util.fix_random_seed(42)

# limit=50000 uses the full dataset; lower it (e.g. 5000) for a quick check.
train_data, test_data = load_training_data(csv_file_path, split=0.8, limit=50000)
train_model(train_data, test_data, iterations=10)


KeyboardInterrupt: 

In [None]:
# Overall the model gives good results: the loss decreases steadily, and training could be stopped at around iteration 7.

In [None]:
def test_model(input_data_list: list):
    """
    Тестирует модель на списке текстов и выводит предсказания и вероятность для каждого текста.
    """
    # Загружаем сохраненную модель
    loaded_model = spacy.load("model_output")

    results = []
    for input_data in input_data_list:
        parsed_text = loaded_model(input_data)
        # Определяем возвращаемое предсказание
        if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
            prediction = "Положительный отзыв"
            score = parsed_text.cats["pos"]
        else:
            prediction = "Негативный отзыв"
            score = parsed_text.cats["neg"]
        results.append((input_data, prediction, score))

    for review, prediction, score in results:
        print(f"Текст обзора: {review}\nПредсказание: {prediction}\nScore: {score:.3f}\n")

# Example usage
# Four hand-written sample reviews run through the saved model; the
# multi-line strings keep their leading newlines and indentation as part of
# the text passed to the model.
test_reviews = [
    """
   Well I absolutely loved this. A sweet, earnest parable about parenthood, purpose and perseverance.
   I find, as I get older, that I deeply appreciate stories that are kind and uncynical.
   The world needs more of them, especially now. I'm glad this one exists in the world,
   and my kids loved it too - though I think they'll experience an entirely different movie
   if they revisit this when they're older. This one got me, and I was fighting tears on more than a few occasions. One of my favorites this year.
    """,
    "This movie was a total waste of time.",
    """A movie made for iPad kids to watch on 2x speed while they try to get
     a Victory Royale fighting in Slurpy Swamp.""",
    "It was okay, not the best but not the worst either."
]
# Prints the text, predicted label and score for each review.
test_model(input_data_list=test_reviews)


Текст обзора: 
   Well I absolutely loved this. A sweet, earnest parable about parenthood, purpose and perseverance. 
   I find, as I get older, that I deeply appreciate stories that are kind and uncynical. 
   The world needs more of them, especially now. I'm glad this one exists in the world, 
   and my kids loved it too - though I think they'll experience an entirely different movie 
   if they revisit this when they're older. This one got me, and I was fighting tears on more than a few occasions. One of my favorites this year.
    
Предсказание: Положительный отзыв
Score: 1.000

Текст обзора: This movie was a total waste of time.
Предсказание: Негативный отзыв
Score: 1.000

Текст обзора: A movie made for iPad kids to watch on 2x speed while they try to get
     a Victory Royale fighting in Slurpy Swamp.
Предсказание: Положительный отзыв
Score: 1.000

Текст обзора: It was okay, not the best but not the worst either.
Предсказание: Негативный отзыв
Score: 1.000



In [None]:
# Class balance of the training split: a review counts as positive when its
# "pos" category equals 1.0; everything else is counted as negative.
positive_count = sum(label["cats"]["pos"] == 1.0 for _, label in train_data)
negative_count = len(train_data) - positive_count
print(f"Положительных отзывов: {positive_count}, Отрицательных отзывов: {negative_count}")


Положительных отзывов: 20015, Отрицательных отзывов: 19985
