In [5]:
# -*- coding: utf-8 -*-
# Pipeline: The Messenger (Real vs Fake) — CatBoostClassifier
# Требования: python>=3.9, pandas, numpy, scikit-learn, catboost

import os, re, pathlib, warnings
from glob import glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")

In [6]:
# ----------------------------
# Константы
# ----------------------------
DATA_DIR = "data"                          # корневая папка с train/, test/, train.csv
TRAIN_DIR = os.path.join(DATA_DIR, "train")
TEST_DIR  = os.path.join(DATA_DIR, "test")
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
RANDOM_STATE = 42

In [7]:
# ----------------------------
# Утилиты
# ----------------------------
_digit_re   = re.compile(r"\d")
_punct_re   = re.compile(r"[^\w\s]")
_upper_re   = re.compile(r"[A-Z]")
_sent_split = re.compile(r"[.!?]+")

STOPWORDS = set(ENGLISH_STOP_WORDS)

In [9]:
def read_text(path: str) -> str:
    # безопасное чтение с нормализацией пробелов
    try:
        txt = pathlib.Path(path).read_text(encoding="utf-8", errors="ignore")
    except Exception:
        txt = pathlib.Path(path).read_text(encoding="latin-1", errors="ignore")
    return " ".join(txt.split())

def extract_id_from_article_path(article_path: str) -> int:
    # ожидаем article_0000 -> 0
    m = re.search(r"article_(\d+)", article_path)
    if not m:
        raise ValueError(f"Не удалось извлечь id из {article_path}")
    return int(m.group(1))

def text_stats(text: str, prefix: str) -> dict:
    # Отдельные фичи на ТЕКСТ (без разностей)
    if not text:
        return {
            f"{prefix}_len_chars": 0,
            f"{prefix}_len_words": 0,
            f"{prefix}_mean_word_len": 0.0,
            f"{prefix}_uniq_words": 0,
            f"{prefix}_ttr": 0.0,
            f"{prefix}_digit_ratio": 0.0,
            f"{prefix}_punct_ratio": 0.0,
            f"{prefix}_upper_ratio": 0.0,
            f"{prefix}_stopword_ratio": 0.0,
            f"{prefix}_sentences": 0,
        }

    words = text.split()
    n_chars = len(text)
    n_words = len(words)
    uniq = len(set(w.lower() for w in words))
    avg_wlen = np.mean([len(w) for w in words]) if n_words else 0.0
    digits = len(_digit_re.findall(text))
    puncts = len(_punct_re.findall(text))
    uppers = len(_upper_re.findall(text))
    sents  = len([s for s in _sent_split.split(text) if s.strip()])

    stop_cnt = sum(1 for w in (w.lower() for w in words) if w in STOPWORDS)

    return {
        f"{prefix}_len_chars": n_chars,
        f"{prefix}_len_words": n_words,
        f"{prefix}_mean_word_len": float(avg_wlen),
        f"{prefix}_uniq_words": uniq,
        f"{prefix}_ttr": float(uniq / n_words) if n_words else 0.0,  # type-token ratio
        f"{prefix}_digit_ratio": float(digits / n_chars) if n_chars else 0.0,
        f"{prefix}_punct_ratio": float(puncts / n_chars) if n_chars else 0.0,
        f"{prefix}_upper_ratio": float(uppers / n_chars) if n_chars else 0.0,
        f"{prefix}_stopword_ratio": float(stop_cnt / n_words) if n_words else 0.0,
        f"{prefix}_sentences": sents,
    }

def build_split_df(root_dir: str) -> pd.DataFrame:
    """
    Читает пары статей: .../article_xxxx/file_1.txt, file_2.txt
    Возвращает DataFrame: id, text1, text2 + числовые фичи t1_*, t2_*
    """
    rows = []
    articles = sorted(glob(os.path.join(root_dir, "article_*")))
    if not articles:
        raise FileNotFoundError(f"Не найдены папки article_* в {root_dir}")

    for art_dir in articles:
        id_ = extract_id_from_article_path(art_dir)
        f1 = os.path.join(art_dir, "file_1.txt")
        f2 = os.path.join(art_dir, "file_2.txt")
        if not os.path.exists(f1) or not os.path.exists(f2):
            raise FileNotFoundError(f"Отсутствуют file_1.txt/file_2.txt в {art_dir}")

        t1 = read_text(f1)
        t2 = read_text(f2)

        feats = {}
        feats.update(text_stats(t1, "t1"))
        feats.update(text_stats(t2, "t2"))

        rows.append({
            "id": id_,
            "text1": t1,
            "text2": t2,
            **feats
        })
    df = pd.DataFrame(rows).sort_values("id").reset_index(drop=True)
    return df

In [10]:
# ----------------------------
# Загрузка данных
# ----------------------------
train_df_pairs = build_split_df(TRAIN_DIR)                   # id, text1, text2, t1_*, t2_*
train_gt = pd.read_csv(TRAIN_CSV)                            # id, real_text_id ∈ {1,2}

In [11]:
# Проверим соответствие id
merged = train_df_pairs.merge(train_gt, on="id", how="inner")
assert len(merged) == len(train_df_pairs), "Несовпадение id между парами и train.csv"

In [12]:
# Целевая переменная: 1 => file_1 реал, 0 => file_2 реал
merged["target"] = (merged["real_text_id"] == 1).astype(int)

In [14]:
# Финальные признаки: две текстовые + набор числовых по каждому тексту
text_cols = ["text1", "text2"]
num_cols = [c for c in merged.columns if c.startswith("t1_") or c.startswith("t2_")]
feature_cols = text_cols + num_cols

X = merged[feature_cols].copy()
y = merged["target"].astype(int).copy()

In [15]:
# ----------------------------
# Валидация
# ----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

In [27]:
# ----------------------------
# Модель и поиск гиперпараметров (6-fold CV)
# ----------------------------
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=RANDOM_STATE)

base_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=RANDOM_STATE,
    verbose=100,
    text_features=text_cols,
    use_best_model=True,
    early_stopping_rounds=50
    # включим текстовую обработку по умолчанию
)

param_dist = {
    "depth":            [4, 6, 8, 10],
    "learning_rate":    [0.03, 0.05, 0.1],
    "l2_leaf_reg":      [1, 3, 5, 7, 10],
    "iterations":       [500, 1000, 2000],
    "bagging_temperature": [0.0, 0.25, 0.5, 1.0],
    "random_strength":  [0.0, 0.5, 1.0, 2.0],
    "border_count":     [32, 64, 128],
    # Можно добавить текстовые настройки при необходимости:
    # "text_processing": [...]
}

search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=20,                 # при необходимости увеличить/уменьшить
    scoring="roc_auc",
    cv=cv,
    refit=True,
    n_jobs=-1,
    verbose=100,
    random_state=RANDOM_STATE
)


In [29]:
# ВАЖНО: передаем, какие признаки — текстовые
search.fit(
    X_train, y_train,
    text_features=text_cols,
    verbose=100,
    eval_set=(X_val, y_val)
)

best_model = search.best_estimator_
print("=== Лучшие параметры ===")
print(search.best_params_)
print("=== Лучший CV AUC ===")
print(search.best_score_)

Fitting 6 folds for each of 20 candidates, totalling 120 fits
[CV 2/6; 1/20] START bagging_temperature=1.0, border_count=64, depth=4, iterations=1000, l2_leaf_reg=1, learning_rate=0.1, random_strength=1.0
[CV 1/6; 2/20] START bagging_temperature=0.0, border_count=64, depth=4, iterations=2000, l2_leaf_reg=3, learning_rate=0.1, random_strength=0.0
[CV 1/6; 1/20] START bagging_temperature=1.0, border_count=64, depth=4, iterations=1000, l2_leaf_reg=1, learning_rate=0.1, random_strength=1.0
[CV 2/6; 2/20] START bagging_temperature=0.0, border_count=64, depth=4, iterations=2000, l2_leaf_reg=3, learning_rate=0.1, random_strength=0.0
[CV 6/6; 1/20] START bagging_temperature=1.0, border_count=64, depth=4, iterations=1000, l2_leaf_reg=1, learning_rate=0.1, random_strength=1.0
[CV 5/6; 1/20] START bagging_temperature=1.0, border_count=64, depth=4, iterations=1000, l2_leaf_reg=1, learning_rate=0.1, random_strength=1.0
[CV 4/6; 1/20] START bagging_temperature=1.0, border_count=64, depth=4, iteratio

In [31]:
# ----------------------------
# Метрики на hold-out X_val
# ----------------------------
val_proba = best_model.predict_proba(X_val)[:, 1]
val_pred  = (val_proba >= 0.5).astype(int)

print("\n=== Метрики на валидации ===")
print("AUC:     ", roc_auc_score(y_val, val_proba))
print("Accuracy:", accuracy_score(y_val, val_pred))
print(classification_report(y_val, val_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_val, val_pred))


=== Метрики на валидации ===
AUC:      0.8555555555555556
Accuracy: 0.7368421052631579
              precision    recall  f1-score   support

           0     0.8571    0.6000    0.7059        10
           1     0.6667    0.8889    0.7619         9

    accuracy                         0.7368        19
   macro avg     0.7619    0.7444    0.7339        19
weighted avg     0.7669    0.7368    0.7324        19

Confusion matrix:
 [[6 4]
 [1 8]]


In [32]:
# ----------------------------
# Обучение на всём train (X, y) с лучшими параметрами
# ----------------------------
final_model = CatBoostClassifier(
    **search.best_params_,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=RANDOM_STATE,
    verbose=100
)
final_model.fit(X[feature_cols], y, text_features=text_cols)

0:	total: 71.5ms	remaining: 2m 22s
100:	total: 6.73s	remaining: 2m 6s
200:	total: 13.8s	remaining: 2m 3s
300:	total: 20.7s	remaining: 1m 57s
400:	total: 27.6s	remaining: 1m 50s
500:	total: 34.6s	remaining: 1m 43s
600:	total: 41.4s	remaining: 1m 36s
700:	total: 48.5s	remaining: 1m 29s
800:	total: 55.6s	remaining: 1m 23s
900:	total: 1m 6s	remaining: 1m 21s
1000:	total: 1m 14s	remaining: 1m 14s
1100:	total: 1m 21s	remaining: 1m 6s
1200:	total: 1m 28s	remaining: 59s
1300:	total: 1m 36s	remaining: 51.6s
1400:	total: 1m 42s	remaining: 44s
1500:	total: 1m 50s	remaining: 36.6s
1600:	total: 1m 57s	remaining: 29.3s
1700:	total: 2m 5s	remaining: 22.1s
1800:	total: 2m 12s	remaining: 14.7s
1900:	total: 2m 20s	remaining: 7.3s
1999:	total: 2m 27s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1794dc2d0>

In [34]:
# ----------------------------
# Инференс на test и сохранение submission
# ----------------------------
test_df_pairs = build_split_df(TEST_DIR)
X_test = test_df_pairs[feature_cols].copy()

test_proba = final_model.predict_proba(X_test)[:, 1]
# правило: >=0.5 -> file_1.txt реал => real_text_id=1, иначе 2
test_pred_real_id = np.where(test_proba >= 0.5, 1, 2).astype(int)

submission = pd.DataFrame({
    "id": test_df_pairs["id"],
    "real_text_id": test_pred_real_id
}).sort_values("id")

out_path = "submission2.csv"
submission.to_csv(out_path, index=False)
print(f"\nSaved: {out_path}")


Saved: submission2.csv
