In [1]:
import os

import pandas as pd
import joblib
from tqdm import tqdm

from natasha import (
    Segmenter,
    NewsEmbedding,
    NewsMorphTagger,
    MorphVocab,
    Doc
)

from razdel import tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# Natasha pipeline objects, built once at module level so that preprocess()
# can reuse them for every document instead of re-creating them per call.
morph_vocab = MorphVocab()        # passed to token.lemmatize() in preprocess()
segmenter = Segmenter()           # passed to doc.segment() in preprocess()

# The morphology tagger is constructed on top of the news embedding.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

def preprocess(text: str) -> str:
    """Lemmatize ``text`` with Natasha and return the lemmas joined by spaces.

    (Original author's note: works on Python 3.11+.)

    Args:
        text: Raw input text. Any non-``str`` value (e.g. ``None`` or a NaN
            coming from a pandas column) is treated as missing.

    Returns:
        A single string with one lemma per token, separated by single spaces;
        an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # Segment and morphologically tag the document using the module-level
    # pipeline objects; tagging must precede lemmatization.
    document = Doc(text)
    document.segment(segmenter)
    document.tag_morph(morph_tagger)

    # Normalize every token and collect its lemma in the same pass.
    lemmas = []
    for tok in document.tokens:
        tok.lemmatize(morph_vocab)
        lemmas.append(tok.lemma)

    return " ".join(lemmas)


In [4]:
# Lemmatize train.csv chunk by chunk and stream the result to train_clean.csv.
#
# Each processed chunk is appended to disk immediately instead of being kept
# in a list and concatenated at the end, so peak memory is bounded by a single
# chunk rather than by (roughly) twice the whole dataset. This cell therefore
# no longer leaves a full `train` DataFrame in memory; downstream code reloads
# train_clean.csv from disk.
#
# Output goes to a temporary file that is renamed only after the last chunk
# succeeds, so an interrupted run never leaves a truncated train_clean.csv.
clean_path = "train_clean.csv"
tmp_path = clean_path + ".tmp"

df_iter = pd.read_csv("train.csv", chunksize=20000)

print("=== НАЧИНАЮ ПРЕДОБРАБОТКУ (Natasha) ===")
for chunk_no, chunk in enumerate(tqdm(df_iter, desc="Processing chunks")):
    chunk["clean_text"] = chunk["text"].apply(preprocess)

    # The first chunk creates/overwrites the file and writes the header;
    # every later chunk is appended without repeating the header.
    is_first = chunk_no == 0
    chunk.to_csv(
        tmp_path,
        mode="w" if is_first else "a",
        header=is_first,
        index=False,
    )

# Publish the finished file in a single step.
os.replace(tmp_path, clean_path)

print("\n=== ПРЕДОБРАБОТКА ЗАВЕРШЕНА ===")
print("Сохранено: train_clean.csv")

=== НАЧИНАЮ ПРЕДОБРАБОТКУ (Natasha) ===


Processing chunks: 0it [00:02, ?it/s]


MemoryError: Unable to allocate 1.06 MiB for an array with shape (7, 40, 330, 3) and data type float32

In [None]:
# Reload the lemmatized dataset from disk so this stage does not depend on
# in-memory state left behind by the preprocessing stage.
train = pd.read_csv("train_clean.csv")

# Features are the lemmatized texts; targets are the labels.
X, y = train["clean_text"], train["label"]

# Hold out 10% for validation, stratified on the labels, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
print("\n=== ВЕКТОРИЗАЦИЯ TF-IDF ===")

# Replace missing texts with empty strings and force str dtype on both splits
# before handing them to the vectorizer.
X_train, X_val = (
    texts.fillna("").astype(str) for texts in (X_train, X_val)
)

# Unigrams + bigrams, vocabulary capped at 100k features.
# min_df=2 reduces noise; max_df=0.9 is the upper document-frequency cutoff.
tfidf = TfidfVectorizer(
    max_features=100000, ngram_range=(1, 2), min_df=2, max_df=0.9
)

# Fit the vocabulary on the training split only, then apply the fitted
# vectorizer to the validation split.
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec = tfidf.transform(X_val)

In [None]:
print("\n=== ОБУЧЕНИЕ МОДЕЛИ LinearSVC ===")

# Train a linear SVM on the TF-IDF features.
# random_state is pinned (same seed as the train/validation split) so that
# re-running the notebook yields the same fitted model: per the scikit-learn
# docs, LinearSVC uses a pseudo-random generator while fitting, so an unseeded
# model can differ slightly between runs on identical data.
model = LinearSVC(random_state=42)
model.fit(X_train_vec, y_train)

In [None]:
# Predict on the held-out validation split and score it with macro-averaged F1.
val_pred = model.predict(X_val_vec)
score = f1_score(
    y_true=y_val,
    y_pred=val_pred,
    average="macro",
)

print("\n=== ОБУЧЕНИЕ ЗАВЕРШЕНО ===")
print("Macro-F1:", score)

In [None]:
# Persist the fitted classifier and the fitted vectorizer; both are needed
# to turn new text into predictions later.
joblib.dump(model, "sentiment_model.pkl")
joblib.dump(tfidf, "tfidf.pkl")

# Single print call; sep="\n" puts each message on its own line.
print(
    "\n=== ФАЙЛЫ СОХРАНЕНЫ ===",
    "sentiment_model.pkl — обученная модель",
    "tfidf.pkl — TF-IDF векторизатор",
    sep="\n",
)