In [2]:
# Use the %pip magic rather than `!pip`: it installs into the environment of
# the running kernel instead of whichever `pip` the shell finds on PATH.
%pip install -q --upgrade datasets fsspec
%pip install -q pytorch-lightning torchmetrics jsonargparse pyphen
%pip install -q bert-score
# --no-deps: install only these two packages without pulling in their
# declared dependencies.
%pip install -q --no-deps lens-metric textstat

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/503.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m501.8/503.6 kB[0m [31m22.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine =

In [3]:
import torch

# On-disk locations of the two saved dataset copies (raw and cleaned).
DATASET_PATH = "/content/drive/MyDrive/NLP_Project/wikilarge_dataset"
DATASET_PATH_CLEAN = "/content/drive/MyDrive/NLP_Project/wikilarge_dataset_clean"

# Default batch size shared by the metric helpers defined further down.
BATCH_SIZE = 256

# Ask PyTorch for the "high" float32 matmul precision setting.
torch.set_float32_matmul_precision("high")

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [4]:
from datasets import load_from_disk

# Load the two dataset copies previously saved to disk. Later cells index
# both objects with ["train"] and read its "source" / "target" columns.
ds = load_from_disk(DATASET_PATH)
# Cleaned counterpart, loaded from the *_clean path.
ds_clean = load_from_disk(DATASET_PATH_CLEAN)

Metrics

In [5]:
import time
from bert_score import BERTScorer

# Build the BERTScore scorer once so every later call reuses the same model.
# Configured for English with baseline rescaling enabled, placed on DEVICE.
bertscorer = BERTScorer(lang="en", rescale_with_baseline=True, device=DEVICE)

from lens import download_model, LENS_SALSA

# Fetch the "davidheineman/lens-salsa" checkpoint and construct the
# LENS-SALSA metric from the path that download_model returns.
lens_salsa_path = download_model("davidheineman/lens-salsa")
lens_salsa = LENS_SALSA(lens_salsa_path)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

hparams.yaml:   0%|          | 0.00/774 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [6]:
import warnings, logging

# Raise the level of noisy third-party loggers so only errors get through.
# The "Some weights of RobertaModel were not initialized ..." notice is
# emitted through the `transformers.modeling_utils` logger, not through the
# `warnings` module, so it has to be silenced here rather than with a
# warnings filter. Run this cell before the models are constructed for it to
# have any effect on their load-time messages.
for name in [
    "pytorch_lightning", "lightning.pytorch",
    "lens", "torch",
    "transformers.modeling_utils",
]:
    logging.getLogger(name).setLevel(logging.ERROR)

# Also ignore the same texts should a library route them through `warnings`.
# `message` is a regular expression matched against the START of the warning
# text, so a plain prefix is enough; a trailing "*" would be a regex
# quantifier on the preceding character, not a glob wildcard.
warnings.filterwarnings("ignore", message=r"Some weights of RobertaModel were not initialized")
warnings.filterwarnings("ignore", message=r".*Empty candidate sentence detected")

In [None]:
from tqdm import tqdm
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import textstat

def _fkgl_one(t: str) -> float:
    try:
        return float(textstat.flesch_kincaid_grade(t))
    except Exception:
        return float("nan")

def corpus_fkgl_parallel(texts, n_jobs=8, chunksize=1000, show_tqdm=True):
    """Average per-text FKGL over a corpus, scoring texts in worker processes.

    Args:
        texts: Iterable of texts; it is materialized into a list, so
            generators are accepted too.
        n_jobs: Number of worker processes handed to ProcessPoolExecutor.
        chunksize: Number of texts sent to a worker per task.
        show_tqdm: Whether to wrap the result stream in a tqdm progress bar.

    Returns:
        The mean of all non-NaN per-text scores as a float, or NaN when
        ``texts`` is empty or no text could be scored.
    """
    # Materialize once: the progress bar needs len(), and this also lets
    # callers pass any iterable.
    texts = list(texts)
    if not texts:
        # Nothing to score: skip starting a process pool altogether.
        return float("nan")

    with ProcessPoolExecutor(max_workers=n_jobs) as ex:
        it = ex.map(_fkgl_one, texts, chunksize=chunksize)
        if show_tqdm:
            it = tqdm(it, total=len(texts), desc="FKGL (parallel)", unit="ex")
        # Consume inside the `with` so the workers are still alive.
        vals = list(it)

    # Same result as np.nanmean, but without its RuntimeWarning when every
    # entry is NaN.
    scores = np.asarray(vals, dtype=float)
    valid = scores[~np.isnan(scores)]
    return float(valid.mean()) if valid.size else float("nan")

def bertscore_mean(tgts, srcs, bs=256):
    """Mean BERTScore F1 (times 100) of ``tgts`` scored against ``srcs``.

    The pairs are sent to the module-level ``bertscorer`` in slices of ``bs``
    items, with ``tgts`` as the first argument of ``score`` and ``srcs`` as
    the second.

    Args:
        tgts: Sequence of target texts (must support len() and slicing).
        srcs: Sequence of source texts, aligned with ``tgts``.
        bs: Slice size, also forwarded as the scorer's ``batch_size``.

    Returns:
        The average F1 over all pairs multiplied by 100, or NaN when there
        are no pairs.

    Raises:
        ValueError: If ``tgts`` and ``srcs`` differ in length.
    """
    n_pairs = len(tgts)
    if n_pairs != len(srcs):
        # Otherwise a longer `srcs` would be silently truncated by the loop.
        raise ValueError(
            f"tgts and srcs must have the same length, got {n_pairs} and {len(srcs)}"
        )
    if n_pairs == 0:
        # Avoid dividing by zero below; an empty corpus has no mean.
        return float("nan")

    f_vals = []
    for i in tqdm(range(0, n_pairs, bs), desc="BERTScore", unit="batch"):
        with torch.inference_mode():
            # Only the F1 tensor is needed; precision and recall are dropped.
            _, _, f1 = bertscorer.score(
                tgts[i:i+bs], srcs[i:i+bs], batch_size=bs, verbose=False
            )
        f_vals.extend(f1.detach().cpu().tolist())
    return float(sum(f_vals) / len(f_vals)) * 100.0

def eval_dataset(dset, batch_size=BATCH_SIZE, fkgl_jobs=8, lens_batch_size=128):
    """Compute FKGL, LENS-SALSA and BERTScore for one dataset split.

    Args:
        dset: Mapping-like object exposing "source" and "target" columns.
        batch_size: Batch size forwarded to ``bertscore_mean``.
        fkgl_jobs: Number of worker processes for the FKGL pass.
        lens_batch_size: Batch size forwarded to ``lens_salsa.score``.

    Returns:
        A dict with keys "FKGL" (mean grade level of the targets),
        "BERTScore" (value returned by ``bertscore_mean``) and "LENS"
        (mean of the scores returned by ``lens_salsa.score``).
    """
    srcs = list(dset["source"])
    tgts = list(dset["target"])

    print("Processing FKGL (parallel CPU)...")
    t0 = time.time()
    # Readability is measured on the target side only.
    fkgl = corpus_fkgl_parallel(tgts, n_jobs=fkgl_jobs, chunksize=1000, show_tqdm=True)
    print(f"FKGL took {time.time() - t0:.1f} seconds\n")

    print("Processing LENSalsa...")
    t0 = time.time()
    # NOTE(review): devices=[0] is hard-coded regardless of DEVICE; confirm
    # how the LENS API should be called on a CPU-only runtime.
    lens_scores, _ = lens_salsa.score(srcs, tgts, batch_size=lens_batch_size, devices=[0])
    lens_mean = float(sum(lens_scores) / len(lens_scores))
    print(f"LENSalsa took {time.time() - t0:.1f} seconds\n")

    print("Processing BERTScore...")
    t0 = time.time()
    # Honor the caller's batch_size instead of the module-level BATCH_SIZE.
    bert_f1_mean = bertscore_mean(tgts, srcs, bs=batch_size)
    print(f"BERTScore took {time.time() - t0:.1f} seconds\n")

    return {
        "FKGL": fkgl,
        "BERTScore": bert_f1_mean,
        "LENS": lens_mean,
    }

# Score the "train" split of the original dataset with all three metrics.
print("== Original WikiLarge ==\n")
res_orig  = eval_dataset(ds["train"])
# Repeat on the cleaned copy; both result dicts are compared in a later cell.
print("\n== WikiLarge-Clean: ==\n")
res_clean = eval_dataset(ds_clean["train"])

In [18]:
# Fixed-width comparison table: one row per metric, Delta = Clean - Original.
print(f"{'Metric':<10}{'Original':>12}{'Clean':>12}{'Delta':>12}")
for k in ("FKGL", "BERTScore", "LENS"):
    a = res_orig[k]
    b = res_clean[k]
    print(f"{k:<10}{a:>12.2f}{b:>12.2f}{(b-a):>12.2f}")

Metric        Original       Clean       Delta
FKGL              9.24        8.83       -0.40
BERTScore        46.27       52.18        5.90
LENS             49.13       54.44        5.31


In [17]:
import string
import re, unicodedata

_ws_re = re.compile(r"\s+")
def _norm(s: str, lower: bool) -> str:
    """
    basic, language-agnostic: strip, NFKC, collapse spaces; optional lower
    """
    s = unicodedata.normalize("NFKC", s.strip())
    s = _ws_re.sub(" ", s)
    return s.lower() if lower else s

def _norm_loose(x):
    x = _norm(x, True)
    return x.rstrip(".")
# Count the train pairs whose source and target are equal after loose
# normalization. Each True comparison adds 1 to the sum.
same_loose = sum(
    _norm_loose(s) == _norm_loose(t)
    for s, t in zip(ds['train']['source'], ds['train']['target'])
)
print("Original Identical (loose):", same_loose)

# Same count for the cleaned dataset; `same_loose` is rebound to this result.
same_loose = sum(
    _norm_loose(s) == _norm_loose(t)
    for s, t in zip(ds_clean['train']['source'], ds_clean['train']['target'])
)
print("Clean Identical (loose):", same_loose)

Original Identical (loose): 1214
Clean Identical (loose): 0
