In [11]:
# !pip install datasets razdel torch pandas tqdm

import re
import random
import numpy as np
import pandas as pd

from datasets import load_dataset
from razdel import sentenize
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# фиксируем сиды для воспроизводимости
SEED = 13
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7b97741699d0>

In [12]:
# Берём датасет русских новостей из Telegram
telegram = load_dataset("ScoutieAutoML/russian-news-telegram-dataset", split="train")
texts = [t["text"] for t in telegram if t["text"]]

print("Всего строк:", len(texts))


Всего строк: 97151


In [14]:
# Готовим пары: убираем пробелы - получаем входную строку,
# ставим метку 1, если после символа был пробел, иначе 0
def clean_text(s):
    if not isinstance(s, str):
        return None
    s = s.strip().replace("\uFEFF", "")
    s = re.sub(r"\s+", " ", s)
    return s if len(s) >= 5 else None


def make_pair(with_spaces):
    s = re.sub(r"\s+", " ", with_spaces.strip())
    if " " not in s:
        return None
    chars, labels = [], []
    for i, ch in enumerate(s):
        if ch == " ":
            continue
        chars.append(ch)
        labels.append(1 if i + 1 < len(s) and s[i + 1] == " " else 0)
    return "".join(chars), labels


rows, MAX_LEN = [], 256
for txt in tqdm(texts, desc="Формируем пары"):
    txt = clean_text(txt)
    if not txt:
        continue
    for sent in (x.text for x in sentenize(txt)):
        pair = make_pair(sent)
        if not pair:
            continue
        ns, lab = pair
        if 5 <= len(ns) <= MAX_LEN:
            rows.append((ns, sent, lab))

data = pd.DataFrame(rows, columns=["no_spaces", "with_spaces", "labels"])
data = data.drop_duplicates(subset=["no_spaces", "with_spaces"]).reset_index(drop=True)
print("Примеров:", len(data))



Формируем пары:   0%|          | 0/97151 [00:00<?, ?it/s]

Примеров: 374150


In [15]:
# Сплит на трейн-вал
val_frac = 0.1
perm = np.random.permutation(len(data))
split = int(len(data) * (1 - val_frac))

train_df = data.iloc[perm[:split]].reset_index(drop=True)
val_df = data.iloc[perm[split:]].reset_index(drop=True)

print(f"train={len(train_df)} val={len(val_df)}")


train=336735 val=37415


In [16]:
# Делаем словарь символов
def build_vocab(strings):
    chars = set()
    for s in strings:
        chars.update(s)
    itos = ["<pad>", "<unk>"] + sorted(chars)
    stoi = {c: i for i, c in enumerate(itos)}
    return stoi, itos


stoi, itos = build_vocab(train_df["no_spaces"])
PAD, UNK = stoi["<pad>"], stoi["<unk>"]

print("Размер словаря:", len(stoi))


Размер словаря: 1497


In [17]:
# Датасет и даталоадер
class SegmDataset(Dataset):
    def __init__(self, df):
        self.ns = df["no_spaces"].tolist()
        self.labels = df["labels"].tolist()

    def __len__(self):
        return len(self.ns)

    def __getitem__(self, i):
        ids = [stoi.get(ch, UNK) for ch in self.ns[i]]
        return ids, self.labels[i], self.ns[i]


def collate(batch):
    maxlen = max(len(x[0]) for x in batch)
    ids_pad, y_pad, mask, texts = [], [], [], []
    for ids, y, s in batch:
        pad = maxlen - len(ids)
        ids_pad.append(ids + [PAD] * pad)
        y_pad.append(y + [0] * pad)
        mask.append([1] * len(ids) + [0] * pad)
        texts.append(s)
    return (
        torch.tensor(ids_pad),
        torch.tensor(y_pad, dtype=torch.float32),
        torch.tensor(mask, dtype=torch.float32),
        texts,
    )


train_dl = DataLoader(SegmDataset(train_df), batch_size=256, shuffle=True, collate_fn=collate)
val_dl = DataLoader(SegmDataset(val_df), batch_size=512, shuffle=False, collate_fn=collate)


In [18]:
class BiLSTMSegm(nn.Module):
    def __init__(self, vocab, emb=96, hid=128, layers=2, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(vocab, emb, padding_idx=PAD)
        self.lstm = nn.LSTM(
            emb, hid, num_layers=layers, batch_first=True,
            bidirectional=True, dropout=dropout if layers > 1 else 0.0
        )
        self.lin = nn.Linear(hid * 2, 1)

    def forward(self, x):
        e = self.emb(x)
        o, _ = self.lstm(e)
        return self.lin(o).squeeze(-1)


device = "cuda" if torch.cuda.is_available() else "cpu"
model = BiLSTMSegm(len(stoi)).to(device)


In [19]:
def estimate_pos_weight(df):
    ones, total = 0, 0
    for labs in df["labels"]:
        ones += sum(labs)
        total += len(labs)
    p = ones / (total + 1e-9)
    return torch.tensor([(1 - p) / p])


pos_weight = estimate_pos_weight(train_df).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

opt = torch.optim.AdamW(model.parameters(), lr=2e-3)
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=6)


In [20]:
def evaluate(dl, thr=0.5):
    model.eval()
    tp = fp = fn = 0
    with torch.no_grad():
        for x, y, m, _ in dl:
            x, y, m = x.to(device), y.to(device), m.to(device)
            prob = torch.sigmoid(model(x))
            y_pred = prob >= thr
            y_true = y > 0.5
            mm = m > 0.5
            tp += ((y_pred & y_true) & mm).sum().item()
            fp += ((y_pred & (~y_true)) & mm).sum().item()
            fn += (((~y_pred) & y_true) & mm).sum().item()
    prec = tp / (tp + fp + 1e-9)
    rec = tp / (tp + fn + 1e-9)
    f1 = 2 * prec * rec / (prec + rec + 1e-9)
    return f1, prec, rec


In [21]:
EPOCHS = 5
best_f1, best_thr = 0.0, 0.5
thr_grid = [0.3, 0.4, 0.45, 0.5, 0.55, 0.6, 0.7]

for ep in range(1, EPOCHS + 1):
    model.train()
    epoch_loss = 0
    for x, y, m, _ in tqdm(train_dl, desc=f"epoch {ep}/{EPOCHS}"):
        x, y, m = x.to(device), y.to(device), m.to(device)
        prob = model(x)
        loss = (criterion(prob, y) * m).sum() / (m.sum() + 1e-9)
        opt.zero_grad()
        loss.backward()
        opt.step()
        epoch_loss += loss.item()
    sched.step()
    f1s = [(evaluate(val_dl, thr)[0], thr) for thr in thr_grid]
    f1, thr = max(f1s)
    if f1 > best_f1:
        best_f1, best_thr = f1, thr
    print(f"loss={epoch_loss/len(train_dl):.4f} valF1={f1:.4f} thr={thr} bestF1={best_f1:.4f} thr*={best_thr}")

print("Лучший порог:", best_thr)


epoch 1/5:   0%|          | 0/1316 [00:00<?, ?it/s]

loss=0.0518 valF1=0.9761 thr=0.7 bestF1=0.9761 thr*=0.7


epoch 2/5:   0%|          | 0/1316 [00:00<?, ?it/s]

loss=0.0168 valF1=0.9830 thr=0.7 bestF1=0.9830 thr*=0.7


epoch 3/5:   0%|          | 0/1316 [00:00<?, ?it/s]

loss=0.0124 valF1=0.9865 thr=0.7 bestF1=0.9865 thr*=0.7


epoch 4/5:   0%|          | 0/1316 [00:00<?, ?it/s]

loss=0.0100 valF1=0.9880 thr=0.7 bestF1=0.9880 thr*=0.7


epoch 5/5:   0%|          | 0/1316 [00:00<?, ?it/s]

loss=0.0083 valF1=0.9893 thr=0.7 bestF1=0.9893 thr*=0.7
Лучший порог: 0.7


In [22]:
def predict_positions_for_text(text_no_spaces, thr=best_thr):
    ids = torch.tensor([[stoi.get(ch, UNK) for ch in text_no_spaces]], device=device)
    with torch.no_grad():
        prob = torch.sigmoid(model(ids)).cpu().numpy()[0]
    pred = (prob >= thr).astype(int)
    return [i + 1 for i, v in enumerate(pred) if v == 1]


def restore_by_indices(text, positions):
    out, ps = [], set(positions)
    for i, ch in enumerate(text):
        out.append(ch)
        if (i + 1) in ps:
            out.append(" ")
    return "".join(out).strip()


# пример
sample = "куплюайфон14про"
pos = predict_positions_for_text(sample)
print("Исходник:", sample)
print("Пробелы:", pos)
print("Восстановленный:", restore_by_indices(sample, pos))


Исходник: куплюайфон14про
Пробелы: [2, 5, 10, 12]
Восстановленный: ку плю айфон 14 про


In [23]:
path = "/kaggle/input/test-data/dataset_1937770_3.txt"

tmp = pd.read_fwf(path, header=0, names=["raw"])
task_data = tmp["raw"].str.split(",", n=1, expand=True)
task_data.columns = ["id", "text_no_spaces"]
task_data["id"] = task_data["id"].astype(int)

task_data["predicted_positions"] = task_data["text_no_spaces"].apply(
    lambda s: str(predict_positions_for_text(str(s)))
)

submission = task_data[["id", "predicted_positions"]]
submission.to_csv("submission.csv", index=False)
print("Сабмишен готов:", submission.shape)
submission.head()


Сабмишен готов: (1005, 2)


Unnamed: 0,id,predicted_positions
0,0,"[2, 5, 10, 12]"
1,1,"[1, 6, 7, 18]"
2,2,"[1, 4, 12, 13, 20, 21, 29]"
3,3,"[5, 10, 18]"
4,4,"[5, 10, 15]"
