In [1]:
import torch
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from tqdm.auto import tqdm

import spacy
from synth_data import *
from helpers import *

import json
import random

# ESL Data

First, we will load in the ESL data

In [2]:
# Read the Lang-8 dump as JSON Lines: one JSON document per non-blank line.
records = []
n_malformed = 0  # lines that were not valid JSON and were therefore skipped
with open("lang-8_data.dat", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Best-effort load: skip the malformed line but keep count, and let
            # anything else (e.g. KeyboardInterrupt) propagate normally.
            n_malformed += 1

print(f"Loaded {len(records)} records; skipped {n_malformed} malformed lines")


Due to the unstructured nature of the data, we will add column names and variables in order to work with the data better.

In [3]:
# Flatten each record into one row per (learner sentence, correction list) pair.
# Positions 0-3 of a record hold the journal-level fields, 4 the sentences and
# 5 the per-sentence correction lists.
rows = []

for rec in records:
    shared_fields = {
        "journal_id": rec[0],
        "sentence_id": rec[1],
        "learning_language": rec[2],
        "native_language": rec[3],
    }
    sentences, corrections = rec[4], rec[5]

    for sent, corr_list in zip(sentences, corrections):
        rows.append({**shared_fields, "sentence": sent, "corrections": corr_list})

Next, we will save this as a dataframe 

In [4]:
# Turn the list of per-sentence dicts in `rows` into a DataFrame (one column per dict key)
# and preview the first rows.
df = pd.DataFrame(rows)
df.head()

KeyboardInterrupt: 

We now filter the data to entries whose learning language is English, so that only English sentences remain.

In [None]:
# Keep only the rows whose learning language is English and renumber them from 0.
is_english = df["learning_language"] == "English"
df = df.loc[is_english].reset_index(drop=True)

In [None]:
# Preview the first two rows. The rendered table header already lists the column
# names; a bare `df.columns` line before it would never be displayed, because a
# notebook cell only echoes its last expression.
df.head(2)

Unnamed: 0,journal_id,sentence_id,learning_language,native_language,sentence,corrections
0,728457,216037,English,Japanese,About winter,[]
1,728457,216037,English,Japanese,This is my second post.,[]


This code filters for sentences whose only errors are comma errors. It is commented out because it takes hours to run locally.

In [None]:
# # Take the first correction as our reference target
# df["first_corr"] = df["corrections"].apply(
#     lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
# )


# # Label whether the only edits between sentence and correction are comma edits
# df["comma_only_error"] = df.apply(
#     lambda row: comma_only_edit(row["sentence"], row["first_corr"]),
#     axis=1
# )

# df[["sentence", "first_corr", "comma_only_error"]].head()


KeyboardInterrupt: 

In [None]:
# # Only keep rows where we actually have a correction
# df_clean = df[df["first_corr"].notnull()].copy()

# # Our label: 1 = pure comma error, 0 = not pure comma error
# df_clean["label"] = df_clean["comma_only_error"].astype(int)

# df_clean[["sentence", "first_corr", "label"]].head()

If you already have the csv file, then start running the code here

In [5]:
# # Save to CSV
# df_clean.to_csv("df_clean.csv", index=False)
# Load the labelled sentences from the CSV written by the (commented-out) save above.
df_clean = pd.read_csv('df_clean.csv')

In [6]:
# Number of rows per value of the `label` column.
df_clean["label"].value_counts()


label
0    1163569
1       3015
Name: count, dtype: int64

In [30]:
# Keep every label-1 row and a fixed-size random sample of label-0 rows, so the
# classes are far less skewed than in the full corpus.
df_pos = df_clean[df_clean["label"] == 1]
df_neg = df_clean[df_clean["label"] == 0].sample(n=10000, random_state=42)

# Combine, shuffle once with a fixed seed, and renumber the rows from 0.
df_balanced = (
    pd.concat([df_pos, df_neg])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Balanced dataset size:", len(df_balanced))
print(df_balanced["label"].value_counts())

Balanced dataset size: 13015
label
0    10000
1     3015
Name: count, dtype: int64


# Wikipedia Synthetic Data

First, we will download the data off of kaggle

In [33]:
# kagglehub is imported in this cell, so it is only required when the download is run.
import kagglehub

# Download latest version
# `path` receives whatever dataset_download returns; it is printed below.
path = kagglehub.dataset_download("mikeortman/wikipedia-sentences")

print("Path to dataset files:", path)

Path to dataset files: /Users/eddiecavallin/.cache/kagglehub/datasets/mikeortman/wikipedia-sentences/versions/3


Next, randomly select 200,000 lines to construct the dataset

In [34]:
def sample_random_lines(path, k=200_000):
    """
    Uniformly sample up to k non-blank lines from a very large text file
    using reservoir sampling, reading the file once.

    Args:
        path: Path of a UTF-8 text file.
        k: Maximum number of lines to return.

    Returns:
        A list of at most k stripped, non-blank lines. If the file has no more
        than k non-blank lines, all of them are returned in file order.

    Draws come from the global `random` module, so call `random.seed(...)`
    beforehand for a reproducible sample.
    """
    reservoir = []
    seen = 0  # non-blank lines encountered so far
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Count only the lines that are actually candidates. Counting the
            # skipped blank lines too would make the replacement probability
            # too small and bias the sample towards earlier lines.
            seen += 1

            if len(reservoir) < k:
                reservoir.append(line)
            else:
                # Keep the new line with probability k / seen, evicting a
                # uniformly chosen slot.
                j = random.randint(1, seen)
                if j <= k:
                    reservoir[j - 1] = line

    return reservoir


import os

# Prefer a copy of the corpus in the working directory. If there is none, fall
# back to the directory reported by the kagglehub download (`path`), so a fresh
# run does not depend on the file having been copied by hand.
# NOTE(review): assumes the downloaded dataset contains "wikisent2.txt" — confirm.
wiki_path = "wikisent2.txt"
if not os.path.exists(wiki_path):
    wiki_path = os.path.join(path, "wikisent2.txt")

# Fixed seed so the same sentences are drawn on every run.
random.seed(42)
clean_sentences = sample_random_lines(wiki_path, k=200_000)

len(clean_sentences), clean_sentences[:10]

(200000,
 ['Brian C. McGing is a papyrologist and ancient historian, who specializes in the Hellenistic period.',
  'Ford gained national attention when Miley Cyrus brought them as her date to The Foundation for AIDS Research (AMFAR) gala in 2015.',
  'Its specific name "limbatus" is from the Latin meaning "black-edged" and refers to the colored markings of this species.',
  'Skarbino is a village in Kardzhali Municipality, Kardzhali Province, southern Bulgaria.',
  'Dasai Chowdhary is an Indian politician.',
  'Berdusco also played internationally for Canada and scored one of its most memorable goals in a friendly against Brazil in 1994.',
  'AppleDouble leaves the data fork in its original format, allowing it to be edited by normal Unix utilities.',
  'Kronenbourg 1664 is now produced in the UK by Heineken after being bought from Scottish & Newcastle.',
  "In J. R. R. Tolkien's legendarium, the Battle of the Morannon or Battle of the Black Gate is a fictional event that took place at

Next, we load in spacy, and then use our helper functions to build the synthetic dataset

In [35]:
# Load spaCy's "en_core_web_sm" pipeline into the notebook-level name `nlp`.
nlp = spacy.load("en_core_web_sm")
# build_synthetic_comma_dataset is not defined in this notebook; it arrives via one of the star imports.
# NOTE(review): `nlp` is not passed in, so presumably the builder uses its own pipeline — confirm.
df_synthetic = build_synthetic_comma_dataset(clean_sentences, max_per_type=3000)
df_synthetic.head()

Synthetic examples per type:
  comma splices: 3000
  comma deletions: 3000
  comma insertions: 3000
Total rows in df_syn: 16505


Unnamed: 0,sentence,label,error_type,synthetic_source
0,Song is professor of law and political science...,0,orig,delete
1,"He played 18 seasons and 346 matches in the, N...",1,comma_inserted,insert
2,Behind the scenes Imbruglia quit the serial.,0,orig,insert
3,The club currently has many teams within the o...,1,comma_deleted,delete
4,"On June 29 1995, the drinking water supply in ...",1,comma_deleted,delete


In [36]:
# Generate the ESL-like synthetic examples, then drop rows that repeat the same
# sentence / label / error_type combination and renumber from 0.
rows = build_synthetic_esl_like(clean_sentences, n_per_type=3000, seed=42)

dedup_keys = ["sentence", "label", "error_type"]
df_syn_esl = pd.DataFrame(rows)
df_syn_esl = df_syn_esl.drop_duplicates(subset=dedup_keys)
df_syn_esl = df_syn_esl.reset_index(drop=True)

# Class balance and the five most frequent error types
label_counts = df_syn_esl["label"].value_counts()
top_error_types = df_syn_esl["error_type"].value_counts().head()
label_counts, top_error_types

KeyboardInterrupt: 

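The next cell saves the synthetic ESL-like dataset to a CSV file. If you already have the built CSV file, skip it and continue from the train/validation/test split cell, which reads `df_syn_esl.csv`.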

In [6]:
# df_synthetic.to_csv("df_synthetic.csv")
# Cache the ESL-like synthetic dataset. index=False keeps the row index out of
# the file, so reading it back does not add a stray "Unnamed: 0" column (and it
# matches how df_clean.csv is written).
df_syn_esl.to_csv("df_syn_esl.csv", index=False)

In [5]:
# Make sure the labels are integers, then show the class balance. (A leading
# `df_synthetic.head()` would not be displayed: only a cell's last expression is.)
df_synthetic["label"] = df_synthetic["label"].astype(int)
df_synthetic["label"].value_counts()

label
1    9000
0    7505
Name: count, dtype: int64

Sort into train, test, and validation sets

In [37]:
#df_synthetic = pd.read_csv('df_synthetic.csv')
df_synthetic = pd.read_csv('df_syn_esl.csv')
# Required columns: 'sentence' (str) and 'label' (0/1)
df_synthetic["label"] = df_synthetic["label"].astype(int)

X, y = df_synthetic["sentence"], df_synthetic["label"]

# First hold out 30% of the rows, stratified by label ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
# ... then cut the held-out part in half: validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

train_df = pd.DataFrame({"sentence": X_train, "label": y_train})
val_df = pd.DataFrame({"sentence": X_val, "label": y_val})
test_df = pd.DataFrame({"sentence": X_test, "label": y_test})

for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
    print(f"{split_name} size:", len(split_df))
train_df.head()

Train size: 15145
Val size: 3246
Test size: 3246


Unnamed: 0,sentence,label
376,It has become a popular girls' name and was th...,0
2655,Wildboyz is an American spin-off television se...,1
9148,"However, they have some functionality, as wate...",0
20397,Wanbi is a settlement in South Australia.,0
1533,The success of The Grapes of Wrath allowed Smi...,1


# CFG code

In [None]:
import spacy
import nltk
from nltk import CFG, ChartParser
import helpers  # imported as a module so attributes can be assigned on it below

# spaCy model
nlp = spacy.load("en_core_web_sm")

# Make sure helpers uses this nlp object
helpers.nlp = nlp

Now we will define the grammar for the cfg

In [None]:
# Context-free grammar, written as text for CFG.fromstring. Its terminals are
# the quoted tag names at the bottom ('PRON', 'DET', ...), not words, so it is
# meant to be applied to a sequence of coarse POS tags. The '#' lines inside the
# triple-quoted string are part of the string literal itself.
grammar_str = r"""
S -> MAIN_CLAUSE PUNCT
S -> MAIN_CLAUSE CONJ MAIN_CLAUSE PUNCT
S -> SUB_CLAUSE COMMA MAIN_CLAUSE PUNCT
S -> MAIN_CLAUSE COMMA SUB_CLAUSE PUNCT

# Main clauses
MAIN_CLAUSE -> NP VP
MAIN_CLAUSE -> VP 
MAIN_CLAUSE -> NP 

# Subordinate clauses: SCONJ + clause
SUB_CLAUSE -> SCONJ MAIN_CLAUSE
SUB_CLAUSE -> SCONJ NP VP

# Noun phrases
NP -> PRON
NP -> DET NBAR
NP -> NBAR
NP -> NP PP 
NP -> NP COMMA NP  

# N-bar (nominal core)
NBAR -> N
NBAR -> ADJ NBAR 
NBAR -> N NBAR
NBAR -> N PP 

# Verb phrases
VP -> V
VP -> V NP
VP -> V PP
VP -> V NP PP
VP -> V ADV
VP -> V NP ADV
VP -> V PP ADV
VP -> V NP PP ADV

VP -> AUX VP 
VP -> ADV VP  
VP -> VP ADV 
VP -> VP PP  
# Prepositional phrase
PP -> P NP

# POS tag terminals (coarse tags from your helpers.coarse_pos)
PRON -> 'PRON'
DET  -> 'DET'
N    -> 'N'
V    -> 'V'
AUX  -> 'AUX'
P    -> 'P'
CONJ -> 'CONJ'
SCONJ -> 'SCONJ'
PUNCT -> 'PUNCT'
COMMA -> 'COMMA'
ADV -> 'ADV'
ADJ -> 'ADJ'
"""

# Build the grammar object from the text above and a chart parser over it.
grammar = CFG.fromstring(grammar_str)
s_parser = ChartParser(grammar)

# For clause-level parsing, we treat MAIN_CLAUSE as the start symbol
# (same productions as `grammar`, only the start nonterminal differs).
clause_nt = nltk.Nonterminal('MAIN_CLAUSE')
clause_grammar = CFG(clause_nt, grammar.productions())
clause_parser = ChartParser(clause_grammar)

# Plug into helpers so is_cfg_clause / is_cfg_comma_splice can use these
helpers.s_parser = s_parser
helpers.clause_parser = clause_parser

This test shows how the current grammar is only partially sufficient for the comma-error detection task

In [None]:
# The same two clauses joined three ways: comma only, comma + "and", "and" only.
examples = (
    "I went home, I slept.",
    "I went home, and I slept.",
    "I went home and I slept.",
)
for s in examples:
    verdict = helpers.is_cfg_comma_splice(s)
    print(s, "=>", verdict)

I went home, I slept. => True
I went home, and I slept. => False
I went home and I slept. => False


Now we will use our CFG rules to predict on the synthetic dataset

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

def cfg_predict_label(sentence: str) -> int:
    """Return 1 if helpers.is_cfg_comma_splice flags `sentence`, otherwise 0.

    Any exception raised by the helper is scored as 0 so one bad sentence
    cannot abort the evaluation. Each such failure is also counted in
    `cfg_predict_label.n_errors`, so a run where the check mostly fails is
    distinguishable from one where it genuinely predicts 0.
    """
    try:
        return 1 if helpers.is_cfg_comma_splice(sentence) else 0
    except Exception:
        cfg_predict_label.n_errors = getattr(cfg_predict_label, "n_errors", 0) + 1
        return 0


cfg_predict_label.n_errors = 0  # start a fresh count for this evaluation pass
cfg_true = test_df["label"].to_numpy()
cfg_pred = test_df["sentence"].apply(cfg_predict_label).to_numpy()

print("CFG baseline on synthetic test set")
print(f"Sentences where the CFG check raised (scored as 0): {cfg_predict_label.n_errors} / {len(test_df)}")
print(classification_report(cfg_true, cfg_pred, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(cfg_true, cfg_pred))

CFG baseline on synthetic test set
              precision    recall  f1-score   support

           0     0.4581    0.9991    0.6281      1126
           1     0.9500    0.0141    0.0277      1350

    accuracy                         0.4620      2476
   macro avg     0.7040    0.5066    0.3279      2476
weighted avg     0.7263    0.4620    0.3008      2476

Confusion Matrix:
[[1125    1]
 [1331   19]]


Basically just guessing at this point:

In [41]:
def cfg_predict_label(sentence: str) -> int:
    """Return 1 if helpers.is_cfg_comma_splice flags `sentence`, otherwise 0.

    Fails closed: any exception raised by the helper is scored as 0. Each such
    failure is also counted in `cfg_predict_label.n_errors`, so an all-zero
    prediction column caused by failures is visible in the report.
    """
    try:
        return 1 if helpers.is_cfg_comma_splice(sentence) else 0
    except Exception:
        cfg_predict_label.n_errors = getattr(cfg_predict_label, "n_errors", 0) + 1
        return 0  # fail closed


cfg_predict_label.n_errors = 0  # start a fresh count for this evaluation pass
cfg_true = test_df["label"].to_numpy()
cfg_pred = test_df["sentence"].apply(cfg_predict_label).to_numpy()

# test_df is the held-out split of the synthetic ESL-like data (read from
# df_syn_esl.csv), not Lang-8, so the header names it accordingly.
print("CFG baseline on synthetic ESL-like test set")
print(f"Sentences where the CFG check raised (scored as 0): {cfg_predict_label.n_errors} / {len(test_df)}")
print(classification_report(cfg_true, cfg_pred, digits=4, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(cfg_true, cfg_pred))

CFG baseline on Lang-8 test set
              precision    recall  f1-score   support

           0     0.4889    1.0000    0.6567      1587
           1     0.0000    0.0000    0.0000      1659

    accuracy                         0.4889      3246
   macro avg     0.2445    0.5000    0.3284      3246
weighted avg     0.2390    0.4889    0.3211      3246

Confusion Matrix:
[[1587    0]
 [1659    0]]


# DistilBERT training

In [39]:
from datasets import Dataset, DatasetDict
# Convert each pandas split into a Hugging Face Dataset, discarding the old row index.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_df in (train_df, val_df, test_df)
)

raw_datasets = DatasetDict(
    {"train": train_ds, "validation": val_ds, "test": test_ds}
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 15145
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3246
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3246
    })
})

Now we tokenize the datasets in order to preprocess the inputs to the encoder

In [40]:
# DistilBERT tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_batch(batch):
    """Tokenize the batch's "sentence" entries, padded/truncated to 128 tokens."""
    sentences = batch["sentence"]
    return tokenizer(sentences, padding="max_length", truncation=True, max_length=128)


tokenized_datasets = raw_datasets.map(tokenize_batch, batched=True)

# Drop every column that is not in the keep-list
cols_to_keep = ["input_ids", "attention_mask", "label"]
extra_cols = [
    name
    for name in tokenized_datasets["train"].column_names
    if name not in cols_to_keep
]
tokenized_datasets = tokenized_datasets.remove_columns(extra_cols)

tokenized_datasets.set_format("torch")
tokenized_datasets



Map:   0%|          | 0/15145 [00:00<?, ? examples/s]

Map:   0%|          | 0/3246 [00:00<?, ? examples/s]

Map:   0%|          | 0/3246 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 15145
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 3246
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 3246
    })
})

Load the model

In [10]:
# Model: sequence-classification model loaded from `model_name`, with two output labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Device preference: 'mps' (M1/M2) first, then CUDA, otherwise CPU
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

model.to(device)
device

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='mps')

In [11]:
# DataLoaders: the training loader is shuffled; validation and test are not.
train_dataset, val_dataset, test_dataset = (
    tokenized_datasets[split] for split in ("train", "validation", "test")
)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=32) for dataset in (val_dataset, test_dataset)
)

In [12]:
# Optimizer & scheduler
epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)

# One scheduler step per batch; the first 10% of all steps are warmup.
num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)



Now we will train the model over 3 epochs:

In [13]:
for epoch in range(epochs):
    epoch_no = epoch + 1

    # ---- training pass ----
    model.train()
    total_loss = 0.0

    loop = tqdm(train_loader, desc=f"Epoch {epoch_no}/{epochs}")
    for batch in loop:
        inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "label")
        }
        loss = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["label"],
        ).loss

        # Clear old gradients, backpropagate, then advance optimizer and LR schedule.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        total_loss += step_loss
        loop.set_postfix(loss=step_loss)

    avg_train_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch_no} avg training loss: {avg_train_loss:.4f}")

    # ---- validation pass (no gradients) ----
    model.eval()
    n_correct, n_seen, val_loss = 0, 0, 0.0

    with torch.no_grad():
        for batch in val_loader:
            labels = batch["label"].to(device)
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=labels,
            )

            val_loss += outputs.loss.item()
            preds = outputs.logits.argmax(dim=-1)
            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = n_correct / n_seen
    print(f"Epoch {epoch_no} val loss: {avg_val_loss:.4f}, val acc: {val_acc:.4f}\n")

Epoch 1/3:   0%|          | 0/947 [00:00<?, ?it/s]


Epoch 1 avg training loss: 0.3994
Epoch 1 val loss: 0.2710, val acc: 0.8706



Epoch 2/3:   0%|          | 0/947 [00:00<?, ?it/s]


Epoch 2 avg training loss: 0.2292
Epoch 2 val loss: 0.2741, val acc: 0.8749



Epoch 3/3:   0%|          | 0/947 [00:00<?, ?it/s]


Epoch 3 avg training loss: 0.1634
Epoch 3 val loss: 0.3081, val acc: 0.8715



Next, we evaluate on the synthetic dataset

In [14]:
# Predict every batch of the synthetic test split without tracking gradients.
model.eval()
all_labels, all_preds = [], []

with torch.no_grad():
    for batch in test_loader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        batch_preds = logits.argmax(dim=-1)

        all_labels.extend(batch["label"].cpu().numpy())
        all_preds.extend(batch_preds.cpu().numpy())

all_labels, all_preds = np.array(all_labels), np.array(all_preds)

print("Synthetic test set performance:")
print(classification_report(all_labels, all_preds, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

Synthetic test set performance:
              precision    recall  f1-score   support

           0     0.8870    0.8702    0.8785      1587
           1     0.8780    0.8939    0.8859      1659

    accuracy                         0.8823      3246
   macro avg     0.8825    0.8821    0.8822      3246
weighted avg     0.8824    0.8823    0.8823      3246

Confusion Matrix:
[[1381  206]
 [ 176 1483]]


And now we will see which types of comma errors it is best at predicting

In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# test_df still carries the row labels it had in df_synthetic; move them into an
# explicit 'orig_idx' column while renumbering the rows.
results_df = test_df.reset_index().rename(columns={"index": "orig_idx"})

# Add the DistilBERT evaluation's true/predicted labels, plus the error_type
# looked up in df_synthetic through the original row labels.
results_df = results_df.assign(
    true=all_labels,
    pred=all_preds,
    error_type=df_synthetic.loc[results_df["orig_idx"], "error_type"].to_numpy(),
)

results_df.head()

Unnamed: 0,orig_idx,sentence,label,true,pred,error_type
0,10085,"On 28 May 2014 Louis Bontes, Joram van Klavere...",0,0,1,orig
1,17606,Community events continue to be held in the bu...,0,0,0,orig
2,6907,"For the first time ever, the national prelimin...",0,0,0,orig
3,8686,In cultures where it is not normal it may be c...,1,1,1,missing_intro_comma
4,4752,A larger ion that has two onium ion subgroups ...,0,0,0,orig


In [18]:
print("Accuracy by synthetic error_type:\n")

# Pad names to the longest error_type so the accuracy column lines up; a fixed
# width of 15 is shorter than names such as "unnecessary_subj_verb_comma".
name_width = max([15, *(len(str(name)) for name in results_df["error_type"].unique())])

for etype, group in results_df.groupby("error_type"):
    acc = (group["pred"] == group["true"]).mean()
    n = len(group)
    print(f"{etype:{name_width}s}  accuracy = {acc:.4f}   (n={n})")

Accuracy by synthetic error_type:

comma_splice     accuracy = 0.9429   (n=420)
missing_comma_before_conj  accuracy = 0.7018   (n=399)
missing_intro_comma  accuracy = 0.9298   (n=413)
orig             accuracy = 0.8702   (n=1587)
unnecessary_subj_verb_comma  accuracy = 0.9906   (n=427)


# Test on ESL data

Now we will test on the ESL data

In [22]:
from sklearn.model_selection import train_test_split

X, y = df_balanced["sentence"], df_balanced["label"]

# Hold out 30% of the rows (stratified by label) ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# ... and halve the held-out part into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

for split_name, split_X in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} size:", len(split_X))

Train size: 9110
Validation size: 1952
Test size: 1953


In [23]:
# Start from df_clean (the Lang-8 data) with integer labels
df_clean["label"] = df_clean["label"].astype(int)

# Two columns only, rows with a missing sentence removed, and every sentence
# forced to str because the tokenizer wants strings.
lang8_df = (
    df_clean[["sentence", "label"]]
    .dropna(subset=["sentence"])
    .assign(sentence=lambda frame: frame["sentence"].astype(str))
)

print("Lang-8 size after cleaning:", len(lang8_df))
print(lang8_df.head())

Lang-8 size after cleaning: 1166583
                                            sentence  label
0  I will appreciate it if you correct my sentences.      0
1  It's been getting colder these days here in Ja...      0
2  The summer weather in Japan is not agreeable t...      0
3  So, as the winter is coming, I'm getting to fe...      0
4                    It is the very exciting season.      0


In [24]:
# Optional: subsample for evaluation, e.g. 50k sentences.
# The size is capped at the number of available rows, because DataFrame.sample
# raises when asked for more rows than exist (sampling without replacement).
n_eval = min(50_000, len(lang8_df))
lang8_df = lang8_df.sample(n=n_eval, random_state=42)
print("Using subset for evaluation:", len(lang8_df))

Using subset for evaluation: 50000


In [42]:
from datasets import Dataset

lang8_ds = Dataset.from_pandas(lang8_df.reset_index(drop=True))

# Reuse tokenize_batch from the cell that creates `tokenizer` rather than
# redefining it here, so the Lang-8 data is always tokenized exactly like the
# training data.
lang8_tokenized = lang8_ds.map(tokenize_batch, batched=True)

# Keep only the tensors the evaluation loop reads
cols_to_keep = ["input_ids", "attention_mask", "label"]
lang8_tokenized = lang8_tokenized.remove_columns(
    [c for c in lang8_tokenized.column_names if c not in cols_to_keep]
)
lang8_tokenized.set_format("torch")

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Now we will perform the test:

In [14]:
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

lang8_loader = DataLoader(lang8_tokenized, batch_size=32)

# Run the synthetic-trained model over the Lang-8 batches, gradients disabled.
model.eval()
all_labels, all_preds = [], []

with torch.no_grad():
    for batch in lang8_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)

        batch_preds = model(ids, attention_mask=mask).logits.argmax(dim=-1)

        all_labels.extend(batch["label"].cpu().numpy())
        all_preds.extend(batch_preds.cpu().numpy())

all_labels, all_preds = np.array(all_labels), np.array(all_preds)

print("Lang-8 evaluation (synthetic-trained model):")
print(classification_report(all_labels, all_preds, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

NameError: name 'lang8_tokenized' is not defined