In [10]:
import pandas as pd
from pathlib import Path

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, make_scorer, recall_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from ydata_profiling import ProfileReport

import numpy as np

import string, unicodedata
import textstat
import nltk
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
train_dir = Path("data/train")
train_df = pd.read_csv("data/train.csv")

In [3]:
def extract_features(text: str):
    text = text.replace("\n", " ").strip()
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    features = {}
    features['char_count'] = len(text)
    features['word_count'] = len(words)
    features['sentence_count'] = len(sentences)
    features['avg_word_length'] = np.mean([len(w) for w in words]) if words else 0

    try:
        features['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    except:
        features['flesch_reading_ease'] = 0

    try:
        features['gunning_fog'] = textstat.gunning_fog(text)
    except:
        features['gunning_fog'] = 0

    non_space_chars = [c for c in text if c != ' ']
    if non_space_chars:
        latin_chars = [c for c in non_space_chars if 'LATIN' in unicodedata.name(c, '')]
        features['latin_ratio'] = len(latin_chars) / len(non_space_chars)
    else:
        features['latin_ratio'] = 0

    # сохраним сам текст для text_features CatBoost
    features['text'] = text
    return features

In [4]:
features = []

for _, row in train_df.iterrows():
    article_id = f"article_{int(row['id']):04d}"
    real = row["real_text_id"]

    path = train_dir / article_id
    with open(path / "file_1.txt", encoding="utf-8") as f1:
        text1 = f1.read()
    with open(path / "file_2.txt", encoding="utf-8") as f2:
        text2 = f2.read()

    feats1 = extract_features(text1)
    feats2 = extract_features(text2)

    # ❗ Берём разности только по новым числовым признакам
    diff_feats = {f"diff_{k}": feats1[k] - feats2[k]
                  for k in feats1 if k != "text"}

    label = 1 if real == 1 else 0

    features.append({
        "id": article_id,
        **diff_feats,
        "text1": feats1["text"],
        "text2": feats2["text"],
        "target": label
    })

train_features = pd.DataFrame(features)

In [5]:
train_features['text1'] = train_features['text1'].str.replace('\n', ' ', regex=False)
train_features['text1'] = train_features['text1'].str.replace('\'s', ' ', regex=False)
train_features['text2'] = train_features['text2'].str.replace('\n', ' ', regex=False)
train_features['text2'] = train_features['text2'].str.replace('\'s', ' ', regex=False)

In [398]:
train_features.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:07<00:00,  1.17it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [391]:
import numpy as np

def drop_high_corr_features(df, threshold=0.95, exclude=[]):
    """
    Удаляет признаки с корреляцией выше threshold.
    
    df — DataFrame с фичами (только числовыми!)
    threshold — допустимый уровень корреляции
    exclude — список колонок, которые не будут удалены даже при высокой корреляции
    
    Возвращает: DataFrame без лишних фич, список удалённых фичей
    """
    corr_matrix = df.corr().abs()

    # Берём верхний треугольник корр. матрицы (без диагонали)
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [
        column for column in upper.columns
        if any(upper[column] > threshold) and column not in exclude
    ]
    
    df_clean = df.drop(columns=to_drop)
    return df_clean, to_drop


In [392]:
# Выбираем только числовые фичи
num_cols = train_features.select_dtypes(include=['number']).drop(columns=['target']).columns

In [394]:
df_cleaned, dropped = drop_high_corr_features(train_features[num_cols], threshold=0.95)

In [397]:
columns_to_keep = list(df_cleaned.columns) + ['text1', 'text2', 'target']
train_features = train_features[columns_to_keep]

In [6]:
train_features

Unnamed: 0,id,diff_char_count,diff_word_count,diff_sentence_count,diff_avg_word_length,diff_flesch_reading_ease,diff_gunning_fog,diff_latin_ratio,text1,text2,target
0,article_0000,178,-1,-1,0.556939,-21.894056,6.561949,0.002829,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,1
1,article_0001,2188,344,4,0.144488,-22.669973,8.948607,0.030779,China The goal of this project involves achiev...,The project aims to achieve an accuracy level ...,0
2,article_0002,338,38,0,0.646617,-17.976952,2.358644,-0.002100,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,1
3,article_0003,-95,42,2,-1.065560,24.651213,-7.148654,-0.023819,China The study suggests that multiple star sy...,The importance for understanding how stars evo...,0
4,article_0004,-676,-100,-1,-1.207857,57.016667,-12.584314,-0.010335,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,0
...,...,...,...,...,...,...,...,...,...,...,...
90,article_0090,135,26,-1,-0.015825,-2.412090,1.820656,-0.001172,A main focus of modern cosmology is to underst...,A key focus of modern cosmology is to understa...,0
91,article_0091,-52,-18,-2,0.184149,-12.555238,2.273418,-0.003498,"APEX, as its name suggests, serves as a guide ...","APEX, as its name suggests, serves as a guide ...",1
92,article_0092,7797,956,6,1.885054,-29.285896,2.186683,-0.148069,FORS1 and FORS2 are early instruments of the V...,FORS1 and FORS2 are early instruments of the V...,0
93,article_0093,46,2,0,0.182351,-5.143867,1.229297,0.002128,The observations of the Pluto-Charon system an...,The observations of the Pluto-Charon binary an...,0


In [11]:
text_features = ['text1','text2']
num_features = X.select_dtypes(exclude='object').columns.tolist()

In [12]:
num_features

['diff_char_count',
 'diff_word_count',
 'diff_sentence_count',
 'diff_avg_word_length',
 'diff_flesch_reading_ease',
 'diff_gunning_fog',
 'diff_latin_ratio']

In [8]:
X = train_features.drop(columns=['target'], axis=1)
y = train_features['target']

In [9]:
X.head(2)

Unnamed: 0,id,diff_char_count,diff_word_count,diff_sentence_count,diff_avg_word_length,diff_flesch_reading_ease,diff_gunning_fog,diff_latin_ratio,text1,text2
0,article_0000,178,-1,-1,0.556939,-21.894056,6.561949,0.002829,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...
1,article_0001,2188,344,4,0.144488,-22.669973,8.948607,0.030779,China The goal of this project involves achiev...,The project aims to achieve an accuracy level ...


In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
svm_ct = ColumnTransformer(
	transformers=[
		("txt1", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95), text_features[0]),
		("txt2", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95), text_features[1]),
		("num", MinMaxScaler(), num_features),
		],
		sparse_threshold=0.0, # оставляем разреженный формат для LinearSVC
)
svm_pipe = Pipeline([
	("prep", svm_ct),
	("HGBC", HistGradientBoostingClassifier(random_state=42))
])

svm_pipe.fit(X_train, y_train)



0,1,2
,steps,"[('prep', ...), ('HGBC', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('txt1', ...), ('txt2', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.0
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,max_iter,100
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [19]:
y_pred = svm_pipe.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.8421052631578947
              precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       1.00      0.75      0.86        12

    accuracy                           0.84        19
   macro avg       0.85      0.88      0.84        19
weighted avg       0.89      0.84      0.84        19



In [24]:
X.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:00<00:00, 111.10it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
# Оставим признаки с корреляцией < 0.95
import numpy as np

def drop_high_corr_features(df, threshold=0.95):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop), to_drop

df_cleaned, dropped_features = drop_high_corr_features(df[numeric_features])
print("❌ Dropped due to high correlation:", dropped_features)


In [439]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
model = CatBoostClassifier(
    auto_class_weights='Balanced',
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_seed=42,
    text_features=text_features,
    use_best_model=True,
    early_stopping_rounds=100,
    verbose=100
)

In [108]:
param_grid = {
    'iterations': [100],
	'depth': [4, 6,],
	'learning_rate': [0.01, 0.5],
	'l2_leaf_reg': [3, 7],
	'bagging_temperature': [0.5, 1],
	'random_strength': [0.5, 1],
	'border_count': [64, 128, 256],
	'max_ctr_complexity': [1, 4],
	'leaf_estimation_iterations': [1, 32]
}
search = RandomizedSearchCV(
    model,
    param_distributions=param_grid,
    scoring='accuracy',
    n_iter=20,
    cv=5,
    n_jobs=-1,
    random_state=42
)
search.fit(X_train, y_train, eval_set=(X_val, y_val))
print("Best params:", search.best_params_)

0:	learn: 0.8747277	test: 0.6130653	best: 0.6130653 (0)	total: 159ms	remaining: 15.7s
0:	learn: 0.8905724	test: 0.7384615	best: 0.7384615 (0)	total: 158ms	remaining: 15.7s
0:	learn: 0.8197168	test: 0.6817420	best: 0.6817420 (0)	total: 198ms	remaining: 19.6s
0:	learn: 0.8804113	test: 0.8412162	best: 0.8412162 (0)	total: 232ms	remaining: 22.9s
0:	learn: 0.8785403	test: 0.6817420	best: 0.6817420 (0)	total: 280ms	remaining: 27.8s
0:	learn: 0.8905724	test: 0.7384615	best: 0.7384615 (0)	total: 424ms	remaining: 42s
0:	learn: 0.8970588	test: 0.6817420	best: 0.6817420 (0)	total: 465ms	remaining: 46s
0:	learn: 0.9373638	test: 0.6130653	best: 0.6130653 (0)	total: 602ms	remaining: 59.6s
99:	learn: 1.0000000	test: 0.7839196	best: 0.8291457 (8)	total: 11.9s	remaining: 0us

bestTest = 0.8291457302
bestIteration = 8

Shrink model to first 9 iterations.
99:	learn: 1.0000000	test: 0.6820513	best: 0.8512821 (9)	total: 11.9s	remaining: 0us

bestTest = 0.8512820508
bestIteration = 9

Shrink model to first 

KeyboardInterrupt: 

In [429]:
best_model = CatBoostClassifier(
    # iterations=1000,
    # learning_rate=0.5,
    # depth=2,
    # leaf_estimation_iterations=5,
    # max_ctr_complexity=1,
    l2_leaf_reg=10,
    random_strength=1,
    bagging_temperature=0.5,
    # border_count=256,
    auto_class_weights='Balanced',
    loss_function='Logloss',
    text_features=text_features,
    num_trees=877,
    random_seed=42,
    early_stopping_rounds=40,
    verbose=100,
)
best_model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.6833327	test: 0.6873242	best: 0.6873242 (0)	total: 99.6ms	remaining: 1m 27s
100:	learn: 0.2675152	test: 0.4603643	best: 0.4569781 (97)	total: 5.9s	remaining: 45.3s
200:	learn: 0.0968146	test: 0.3932540	best: 0.3897300 (183)	total: 12.2s	remaining: 41.1s
300:	learn: 0.0425080	test: 0.3619311	best: 0.3619311 (300)	total: 18.6s	remaining: 35.5s
400:	learn: 0.0279877	test: 0.3434637	best: 0.3426433 (397)	total: 25.2s	remaining: 29.9s
500:	learn: 0.0202452	test: 0.3365457	best: 0.3361673 (485)	total: 32.9s	remaining: 24.7s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.3354169111
bestIteration = 535

Shrink model to first 536 iterations.


<catboost.core.CatBoostClassifier at 0x31f849a50>

In [443]:
feat_imp = pd.DataFrame({
	'features': best_model.feature_names_,
	'importance': best_model.feature_importances_
})
feat_imp

Unnamed: 0,features,importance
0,diff_length,12.044174
1,diff_uppercase_ratio,0.51081
2,diff_avg_word_length,4.497461
3,len_ratio,22.283515
4,jaccard_words,0.336583
5,punct_ratio,4.885103
6,text1,27.132836
7,text2,28.309518


In [440]:
y_proba = best_model.predict_proba(X_val)[:, 1]
threshold = 0.5
y_pred = (y_proba >= threshold).astype(int)

print(f"ROC AUC: {roc_auc_score(y_val, y_proba):.4f}")
print(f"Порог: {threshold}")
print(classification_report(y_val, y_pred))
print(accuracy_score(y_val, y_pred))

ROC AUC: 0.9286
Порог: 0.5
              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       0.92      0.92      0.92        12

    accuracy                           0.89        19
   macro avg       0.89      0.89      0.89        19
weighted avg       0.89      0.89      0.89        19

0.8947368421052632


In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report, 
    precision_score, recall_score, f1_score, confusion_matrix
)
import numpy as np
import pandas as pd

# === Настройки ===
N_SPLITS = 5
SEED = 42

# Для хранения результатов
metrics = {
    "fold": [],
    "accuracy": [],
    "roc_auc": [],
    "f1_macro": [],
    "precision_macro": [],
    "recall_macro": []
}

# Инициализируем модель
def create_model():
    return CatBoostClassifier(
        l2_leaf_reg=10,
        random_strength=1,
        bagging_temperature=0.5,
        auto_class_weights='Balanced',
        loss_function='Logloss',
        text_features=text_features,  # укажи список индексов или имён колонок
        num_trees=877,
        random_seed=SEED,
        early_stopping_rounds=40,
        verbose=0
    )

# === Кросс-валидация ===
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{N_SPLITS}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = create_model()
    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]

    # Метрики
    acc = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)
    f1 = f1_score(y_val, y_pred, average='macro')
    prec = precision_score(y_val, y_pred, average='macro')
    rec = recall_score(y_val, y_pred, average='macro')

    metrics["fold"].append(fold + 1)
    metrics["accuracy"].append(acc)
    metrics["roc_auc"].append(auc)
    metrics["f1_macro"].append(f1)
    metrics["precision_macro"].append(prec)
    metrics["recall_macro"].append(rec)

    print(f"ROC AUC: {auc:.4f} | F1: {f1:.4f} | Acc: {acc:.4f}")

# === Результаты ===
results = pd.DataFrame(metrics)
print("\nСредние метрики по кросс-валидации:")
print(results.mean(numeric_only=True))

In [45]:
# ====== CONFIG ======
import os, re, gc, math, json, string, numpy as np, pandas as pd
from pathlib import Path
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity

from lightgbm import LGBMClassifier

SEED = 42
MODEL_NAME = "microsoft/deberta-v3-base"  # можно '...-base' при наличии GPU
MAX_LEN = 512
BATCH_SIZE = 4

train_dir = Path("data/train")
test_dir  = Path("data/test")
train_csv = Path("data/train.csv")

rng = np.random.RandomState(SEED)

# ====== UTILS ======
def read_pair_texts(base_dir: Path, article_id: str):
    p = base_dir / article_id
    with open(p/"file_1.txt", encoding="utf-8") as f1:
        t1 = f1.read().replace("\n", " ").strip()
    with open(p/"file_2.txt", encoding="utf-8") as f2:
        t2 = f2.read().replace("\n", " ").strip()
    return t1, t2

def numeric_features(t1: str, t2: str):
    # компактный набор некоррелирующих признаков
    len1, len2 = len(t1), len(t2)
    words1, words2 = set(t1.lower().split()), set(t2.lower().split())
    punct1 = sum(1 for c in t1 if c in string.punctuation)
    punct2 = sum(1 for c in t2 if c in string.punctuation)
    cap1 = sum(c.isupper() for c in t1)/(len1+1e-5)
    cap2 = sum(c.isupper() for c in t2)/(len2+1e-5)
    avgw1 = (sum(len(w) for w in t1.split())/(len(t1.split())+1e-5))
    avgw2 = (sum(len(w) for w in t2.split())/(len(t2.split())+1e-5))

    return {
        "diff_length":           len1 - len2,
        "diff_uppercase_ratio":  cap1 - cap2,
        "diff_avg_word_length":  avgw1 - avgw2,
        "len_ratio":             len1/(len2+1e-5),
        "jaccard_words":         len(words1 & words2)/max(len(words1 | words2), 1),
        "punct_ratio":           punct1/(punct2+1e-5),
    }

@torch.no_grad()
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state  # [B, L, H]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_emb = (token_embeddings * input_mask_expanded).sum(1)
    sum_mask = input_mask_expanded.sum(1).clamp(min=1e-9)
    return (sum_emb / sum_mask)  # [B, H]

@torch.no_grad()
def encode_texts(texts, tokenizer, model, device, max_len=512, batch_size=16):
    embs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            padding=True, truncation=True, max_length=max_len,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        out = model(**enc)
        pooled = mean_pooling(out, enc["attention_mask"])  # [B, H]
        embs.append(pooled.cpu().float().numpy())
    return np.vstack(embs)  # [N, H]

def build_matrix(h1, h2, num_df):
    # h1,h2: np.array [N, H], num_df: pd.DataFrame [N, K]
    abs_diff = np.abs(h1 - h2)               # [N, H]
    cos_sim  = cosine_similarity(h1, h2)     # [N, N], возьмём диагональ как sim для пары
    cos_diag = np.diag(cos_sim).reshape(-1, 1)
    X = np.hstack([h1, h2, abs_diff, cos_diag, num_df.values.astype(np.float32)])
    return X

# ====== LOAD TRAIN ======
df = pd.read_csv(train_csv)
# цель: 1 если file_1 real, иначе 0
df["target"] = (df["real_text_id"] == 1).astype(int)
df["article_id"] = df["id"].apply(lambda x: f"article_{int(x):04d}")

# читаем тексты и считаем числовые признаки
t1_list, t2_list, num_rows = [], [], []
for _, r in tqdm(df.iterrows(), total=len(df), desc="Read train"):
    t1, t2 = read_pair_texts(train_dir, r["article_id"])
    t1_list.append(t1); t2_list.append(t2)
    num_rows.append(numeric_features(t1, t2))
num_df = pd.DataFrame(num_rows)

# ====== EMBEDDINGS (DeBERTa) ======
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model     = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

h1 = encode_texts(t1_list, tokenizer, model, device, MAX_LEN, BATCH_SIZE)  # [N, H]
h2 = encode_texts(t2_list, tokenizer, model, device, MAX_LEN, BATCH_SIZE)  # [N, H]

Read train: 100%|██████████| 95/95 [00:00<00:00, 1932.77it/s]


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [47]:
# финальная матрица признаков для LGBM
X_all = build_matrix(h1, h2, num_df)       # [N, 3H + 1 + K]
y_all = df["target"].values.astype(int)

# ====== TRAIN/VAL SPLIT & TRAIN LGBM ======
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.2, random_state=SEED, stratify=y_all
)


In [48]:
lgbm = LGBMClassifier(
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    verbose=-1,
    metric='binary_logloss',
    boosting_type='gbdt',
    n_jobs=-1,
    verbos=-1,
    objective="binary",
    random_state=SEED
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc"
)



0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.01
,n_estimators,3000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [49]:
# ====== METRICS ======
val_proba = lgbm.predict_proba(X_val)[:, 1]
val_pred  = (val_proba >= 0.5).astype(int)      # 1 => file_1 real, 0 => file_2 real
print("Accuracy:", accuracy_score(y_val, val_pred))
print("ROC AUC :", roc_auc_score(y_val, val_proba))
print(classification_report(y_val, val_pred, digits=4))

Accuracy: 0.8421052631578947
ROC AUC : 0.9555555555555555
              precision    recall  f1-score   support

           0     0.8889    0.8000    0.8421        10
           1     0.8000    0.8889    0.8421         9

    accuracy                         0.8421        19
   macro avg     0.8444    0.8444    0.8421        19
weighted avg     0.8468    0.8421    0.8421        19





In [52]:
test_ids = sorted([p.name for p in test_dir.iterdir() if p.is_dir()])
sub_rows = []

# соберём тексты для батчевого кодирования
t1_test, t2_test, num_rows_test, id_nums = [], [], [], []
for art in tqdm(test_ids, desc="Read test"):
    t1, t2 = read_pair_texts(test_dir, art)
    t1_test.append(t1); t2_test.append(t2)
    num_rows_test.append(numeric_features(t1, t2))
    id_nums.append(int(art.replace("article_", "")))

num_df_test = pd.DataFrame(num_rows_test)



Read test: 100%|██████████| 1068/1068 [00:00<00:00, 2456.15it/s]


In [53]:
# эмбеддинги
h1_t = encode_texts(t1_test, tokenizer, model, device, MAX_LEN, BATCH_SIZE)
h2_t = encode_texts(t2_test, tokenizer, model, device, MAX_LEN, BATCH_SIZE)


KeyboardInterrupt: 

In [None]:
# фичи и предсказания
X_test = build_matrix(h1_t, h2_t, num_df_test)
test_proba = lgbm.predict_proba(X_test)[:, 1]
test_pred  = (test_proba >= 0.5).astype(int)    # 1 -> file_1 real

# маппинг к формату соревнования: {1,2}
real_col = np.where(test_pred == 1, 1, 2)

submission = pd.DataFrame({"id": id_nums, "real": real_col}).sort_values("id")
submission.to_csv("submission_lgbm_deberta.csv", index=False)
print("Saved submission_lgbm_deberta.csv:", submission.shape)

In [446]:
from pathlib import Path
import pandas as pd
def build_row_features(text1: str, text2: str):
    f1 = extract_features(text1)
    f2 = extract_features(text2)

    len1, len2 = f1["length"], f2["length"]
    words1, words2 = set(f1["text"].lower().split()), set(f2["text"].lower().split())
    punct1, punct2 = f1["num_punct"], f2["num_punct"]

    return {
        "diff_length":             f1["length"] - f2["length"],
        "diff_uppercase_ratio":    f1["uppercase_ratio"] - f2["uppercase_ratio"],
        "diff_avg_word_length":    f1["avg_word_length"] - f2["avg_word_length"],
        "len_ratio":               len1 / (len2 + 1e-5),
        "jaccard_words":           len(words1 & words2) / max(len(words1 | words2), 1),
        "punct_ratio":             punct1 / (punct2 + 1e-5),
        "text1":                   f1["text"],
        "text2":                   f2["text"],
    }
# --- генерация сабмита ---
test_dir = Path("data/test")
test_ids = sorted([p.name for p in test_dir.iterdir() if p.is_dir()])

rows = []
for article_id in test_ids:
    with open(test_dir / article_id / "file_1.txt", encoding="utf-8") as f1:
        t1 = f1.read()
    with open(test_dir / article_id / "file_2.txt", encoding="utf-8") as f2:
        t2 = f2.read()

    feats = build_row_features(t1, t2)
    X_sample = pd.DataFrame([feats])[[
        "diff_length", "diff_uppercase_ratio", "diff_avg_word_length",
        "len_ratio", "jaccard_words", "punct_ratio", "text1", "text2"
    ]]

    # CatBoost: можно predict_proba
    proba = best_model.predict_proba(X_sample)[0, 1]
    pred_real = 1 if proba > 0.5 else 2     # 1 -> file_1 real, иначе 2

    numeric_id = int(article_id.replace("article_", ""))
    rows.append({"id": numeric_id, "real": pred_real})

submission = pd.DataFrame(rows).sort_values("id")
submission.to_csv("submission1.csv", index=False)
print("Saved submission.csv:", submission.shape)

Saved submission.csv: (1068, 2)


In [437]:
print(X_train.columns)


Index(['diff_length', 'diff_uppercase_ratio', 'diff_avg_word_length',
       'len_ratio', 'jaccard_words', 'punct_ratio', 'text1', 'text2'],
      dtype='object')


# Метод опорных векторов

In [312]:
X = train_features[text_features + num_features]
y = train_features['target']

In [411]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC

# Text preprocessing
svm_ct = ColumnTransformer(
	transformers=[
		("txt1", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95), text_features[0]),
		("txt2", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95), text_features[1]),
		("num", MinMaxScaler(), num_features),
		],
		sparse_threshold=1.0, # оставляем разреженный формат для LinearSVC
)
svm_pipe = Pipeline([
	("prep", svm_ct),
	("clf", LinearSVC(C=1.0, class_weight="balanced", random_state=42, max_iter=10000))
])

In [412]:
svm_grid = {
	"clf__C": [0.25, 0.5, 1.0, 2.0]
}

svm_gs = GridSearchCV(
	svm_pipe, svm_grid, cv=5, n_jobs=-1, scoring="roc_auc"
)

svm_gs.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'clf__C': [0.25, 0.5, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('txt1', ...), ('txt2', ...), ...]"
,remainder,'drop'
,sparse_threshold,1.0
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,2.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [413]:
y_pred = svm_gs.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.7894736842105263
              precision    recall  f1-score   support

           0       0.64      1.00      0.78         7
           1       1.00      0.67      0.80        12

    accuracy                           0.79        19
   macro avg       0.82      0.83      0.79        19
weighted avg       0.87      0.79      0.79        19



In [317]:
from pathlib import Path
import pandas as pd

# Путь к тестовому набору
test_dir = Path("data/test")
test_ids = [p.name for p in test_dir.iterdir() if p.is_dir()]
test_ids.sort()

# Финальный список строк сабмита
test_rows = []

# Признаки, как использовались при обучении
text_features = ["text1", "text2"]
num_features = ["len_diff"]

# Используем модель
best_model = svm_gs.best_estimator_

# Проход по всем тестовым примерам
for article_id in test_ids:
    # Чтение текстов
    with open(test_dir / article_id / "file_1.txt", encoding="utf-8") as f1:
        text1 = f1.read()
    with open(test_dir / article_id / "file_2.txt", encoding="utf-8") as f2:
        text2 = f2.read()

    # Формируем DataFrame с нужными признаками (text1, text2, len_diff)
    sample = pd.DataFrame([{
        "text1": text1,
        "text2": text2,
        "len_diff": len(text1) - len(text2)
    }])

    # Получаем raw margin из decision_function (т.к. LinearSVC)
    margin = best_model.decision_function(sample)[0]

    # Преобразуем в метку: если > 0 — значит model думает, что file_1 (== text1) реален
    pred_real = 1 if margin > 0 else 2

    # Извлекаем id в формате числа
    numeric_id = int(article_id.replace("article_", ""))

    test_rows.append({
        "id": numeric_id,
        "real": pred_real
    })

# Сохраняем результат
submission = pd.DataFrame(test_rows)
submission = submission.sort_values("id")
submission.to_csv("submission.csv", index=False)


# XGBoost

In [414]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score


In [416]:
xgb_ct = ColumnTransformer([
    ("txt1", TfidfVectorizer(ngram_range=(1, 2), min_df=2), 'text1'),
    ("txt2", TfidfVectorizer(ngram_range=(1, 2), min_df=2), 'text2'),
    ("num", MinMaxScaler(), num_features)
])
xgb_pipe = Pipeline([
    ("prep", xgb_ct),
    ("clf", XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        max_depth=6,
        learning_rate=0.1,
        n_estimators=1000,
        random_state=42
    ))
])
xgb_pipe.fit(X_train, y_train)
y_pred = xgb_pipe.predict(X_val)

print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6842105263157895
              precision    recall  f1-score   support

           0       0.56      0.71      0.62         7
           1       0.80      0.67      0.73        12

    accuracy                           0.68        19
   macro avg       0.68      0.69      0.68        19
weighted avg       0.71      0.68      0.69        19



# Multinomial Naive Bayes

In [417]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [418]:
nb_ct = ColumnTransformer(
	transformers=[
		("txt1", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95), text_features[0]),
		("txt2", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95), text_features[1]),
		],
		sparse_threshold=1.0, # оставляем разреженный формат для LinearSVC
)

nb_pipe = Pipeline([
    ("prep", nb_ct),
    ("clf", MultinomialNB())
])
nb_pipe.fit(X_train, y_train)
y_pred = nb_pipe.predict(X_val)

print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy: 0.8421052631578947
              precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       1.00      0.75      0.86        12

    accuracy                           0.84        19
   macro avg       0.85      0.88      0.84        19
weighted avg       0.89      0.84      0.84        19

