In [6]:
import torch
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from tqdm.auto import tqdm

import spacy
from synth_data import *
from helpers import *

# Wikipedia Synthetic Data

In [3]:
import kagglehub

# Download latest version
# dataset_download returns the local path of the downloaded dataset files; the value
# is printed so the reader can see where the files ended up.
# NOTE(review): `path` is not referenced by the later cells visible here (the sampling
# cell opens a hard-coded relative filename) — confirm the file is copied/linked as needed.
path = kagglehub.dataset_download("mikeortman/wikipedia-sentences")

print("Path to dataset files:", path)

Path to dataset files: /Users/eddiecavallin/.cache/kagglehub/datasets/mikeortman/wikipedia-sentences/versions/3


In [7]:
import random

def sample_random_lines(path, k=200_000):
    """
    Uniformly sample up to k non-blank lines from a (possibly very large) text file.

    The file is streamed once and only k lines are held in memory (reservoir
    sampling). Lines are stripped of surrounding whitespace; lines that are
    empty after stripping are ignored entirely.

    Parameters
    ----------
    path : str or path-like
        UTF-8 text file to read.
    k : int
        Maximum number of lines to return.

    Returns
    -------
    list[str]
        At most k stripped lines. If the file holds fewer than k non-blank
        lines, all of them are returned in file order.

    Notes
    -----
    Draws from the global `random` module state; call `random.seed(...)`
    beforehand for a reproducible sample.
    """
    reservoir = []
    seen = 0  # number of NON-BLANK lines read so far
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            seen += 1

            if len(reservoir) < k:
                reservoir.append(line)
            else:
                # The n-th candidate must replace a slot with probability k/n, where n
                # counts only candidate lines. Counting skipped blank lines (as a raw
                # file line number would) lowers that probability and biases the
                # sample toward earlier lines.
                j = random.randint(1, seen)
                if j <= k:
                    reservoir[j - 1] = line

    return reservoir


# ---- Draw the Wikipedia sample ----

# Seed the global RNG first so the reservoir sample is the same on every run.
random.seed(42)

wiki_path = "wikisent2.txt"   # change to your actual filename
clean_sentences = sample_random_lines(path=wiki_path, k=200_000)

# Show how many sentences were drawn plus a short preview.
(len(clean_sentences), clean_sentences[:10])

(200000,
 ['Brian C. McGing is a papyrologist and ancient historian, who specializes in the Hellenistic period.',
  'Ford gained national attention when Miley Cyrus brought them as her date to The Foundation for AIDS Research (AMFAR) gala in 2015.',
  'Its specific name "limbatus" is from the Latin meaning "black-edged" and refers to the colored markings of this species.',
  'Skarbino is a village in Kardzhali Municipality, Kardzhali Province, southern Bulgaria.',
  'Dasai Chowdhary is an Indian politician.',
  'Berdusco also played internationally for Canada and scored one of its most memorable goals in a friendly against Brazil in 1994.',
  'AppleDouble leaves the data fork in its original format, allowing it to be edited by normal Unix utilities.',
  'Kronenbourg 1664 is now produced in the UK by Heineken after being bought from Scottish & Newcastle.',
  "In J. R. R. Tolkien's legendarium, the Battle of the Morannon or Battle of the Black Gate is a fictional event that took place at

In [8]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not passed to the builder below; presumably the star-imported
# helpers load or look up their own pipeline — TODO confirm.
nlp = spacy.load("en_core_web_sm")
# Build the synthetic comma-error DataFrame from the sampled sentences
# (max_per_type=3000; the builder's own log reports the per-type counts).
df_synthetic = build_synthetic_comma_dataset(clean_sentences, max_per_type=3000)
df_synthetic.head()

Synthetic examples per type:
  comma splices: 3000
  comma deletions: 3000
  comma insertions: 3000
Total rows in df_syn: 16505


Unnamed: 0,sentence,label,error_type,synthetic_source
0,Song is professor of law and political science...,0,orig,delete
1,"He played 18 seasons and 346 matches in the, N...",1,comma_inserted,insert
2,Behind the scenes Imbruglia quit the serial.,0,orig,insert
3,The club currently has many teams within the o...,1,comma_deleted,delete
4,"On June 29 1995, the drinking water supply in ...",1,comma_deleted,delete


In [9]:
# Persist the synthetic dataset. index=False keeps the row index (not used as data
# anywhere in this notebook) out of the file, so reloading the CSV does not produce
# a spurious "Unnamed: 0" column.
df_synthetic.to_csv("df_synthetic.csv", index=False)

In [10]:
# Make sure labels are plain ints, then show the class balance.
# (A bare `df_synthetic.head()` in the middle of a cell is discarded — only the last
# expression is displayed — so it has been dropped.)
df_synthetic["label"] = df_synthetic["label"].astype(int)
df_synthetic["label"].value_counts()

label
1    9000
0    7505
Name: count, dtype: int64

In [11]:
# Expected schema for df_synthetic: 'sentence' (str) and 'label' (0/1)
df_synthetic["label"] = df_synthetic["label"].astype(int)

X, y = df_synthetic["sentence"], df_synthetic["label"]

# First cut: 70% train, 30% held out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, stratify=y, test_size=0.30, random_state=42
)

# Second cut: halve the held-out part into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.50, random_state=42
)

# One two-column frame per split.
train_df, val_df, test_df = (
    pd.DataFrame({"sentence": texts, "label": targets})
    for texts, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))
train_df.head()

Train size: 11553
Val size: 2476
Test size: 2476


Unnamed: 0,sentence,label
16200,Once the sea all around Lesbos rose in such hi...,1
10190,"147 Squadron, often referred to as the Flying ...",0
16286,The Administrator is nominated by the Presiden...,1
11438,"Warner W. Holzinger of the 2nd Platoon, Troop ...",1
5815,"Their lord was the monarch ,, they were admini...",1


In [13]:
# Convert each pandas split into a Hugging Face Dataset (index dropped so it does
# not become an extra column).
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df, test_df)
)

raw_datasets = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 11553
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2476
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2476
    })
})

In [14]:
# DistilBERT tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_batch(batch):
    """Tokenize the 'sentence' field of a batch, padded/truncated to 128 tokens.

    Uses the module-level `tokenizer`, looked up when the function is called.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(batch["sentence"], **encode_options)


tokenized_datasets = raw_datasets.map(tokenize_batch, batched=True)

# Keep only what we need: drop every column the model does not consume.
cols_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_datasets = tokenized_datasets.remove_columns(
    [name for name in tokenized_datasets["train"].column_names if name not in cols_to_keep]
)

tokenized_datasets.set_format("torch")
tokenized_datasets



Map:   0%|          | 0/11553 [00:00<?, ? examples/s]

Map:   0%|          | 0/2476 [00:00<?, ? examples/s]

Map:   0%|          | 0/2476 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 11553
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2476
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2476
    })
})

In [15]:
# Model
# Seed torch's global RNG before building the model. The classification head is
# freshly initialised (see the "newly initialized" warning below), and dropout and
# the shuffled training DataLoader also draw on torch's RNG, so without a seed every
# run trains a different model and the reported metrics cannot be reproduced.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Device: prefer Apple-silicon 'mps', then CUDA, otherwise fall back to CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
device

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='mps')

In [16]:
# DataLoaders
train_dataset, val_dataset, test_dataset = (
    tokenized_datasets[split] for split in ("train", "validation", "test")
)

# Only the training loader shuffles; evaluation batches keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [17]:
# Optimizer & scheduler
epochs = 3

# Use PyTorch's own AdamW: the AdamW class shipped in `transformers.optimization` is
# deprecated (it emits a FutureWarning pointing to torch.optim.AdamW) and is absent
# from newer transformers releases. eps and weight_decay are set explicitly to the
# transformers defaults so the optimisation settings stay what they were
# (torch's defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# One scheduler step per optimizer step: linear warm-up over the first 10% of
# steps, then linear decay to zero.
num_training_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps,
)



In [18]:
for epoch in range(epochs):
    # ---------------- training pass ----------------
    model.train()
    total_loss = 0.0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in loop:
        # Move the three tensors the model consumes onto the compute device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
        )

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Clear stale gradients, back-propagate, then advance optimizer and LR schedule.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    avg_train_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} avg training loss: {avg_train_loss:.4f}")

    # ---------------- validation pass ----------------
    model.eval()
    correct, total, val_loss = 0, 0, 0.0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ("input_ids", "attention_mask", "label")
            )

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()
            preds = logits.argmax(dim=-1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # Loss is averaged per batch, accuracy per example.
    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / total
    print(f"Epoch {epoch+1} val loss: {avg_val_loss:.4f}, val acc: {val_acc:.4f}\n")

Epoch 1/3:   0%|          | 0/723 [00:00<?, ?it/s]


Epoch 1 avg training loss: 0.4474
Epoch 1 val loss: 0.3306, val acc: 0.8518



Epoch 2/3:   0%|          | 0/723 [00:00<?, ?it/s]


Epoch 2 avg training loss: 0.2671
Epoch 2 val loss: 0.3314, val acc: 0.8506



Epoch 3/3:   0%|          | 0/723 [00:00<?, ?it/s]


Epoch 3 avg training loss: 0.1844
Epoch 3 val loss: 0.3670, val acc: 0.8534



In [19]:
# Score the fine-tuned model on the held-out synthetic test split.
model.eval()
all_labels, all_preds = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # No labels are passed here: only the logits are needed for prediction.
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=-1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

all_labels, all_preds = np.array(all_labels), np.array(all_preds)

print("Synthetic test set performance:")
print(classification_report(all_labels, all_preds, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

Synthetic test set performance:
              precision    recall  f1-score   support

           0     0.8406    0.8570    0.8487      1126
           1     0.8788    0.8644    0.8715      1350

    accuracy                         0.8611      2476
   macro avg     0.8597    0.8607    0.8601      2476
weighted avg     0.8614    0.8611    0.8612      2476

Confusion Matrix:
[[ 965  161]
 [ 183 1167]]


In [None]:
# NOTE(review): this rebinds test_df / val_df, replacing the synthetic splits built
# earlier in the notebook — confirm that is intended.
test_df = pd.read_csv("test.csv")
val_df  = pd.read_csv("validation.csv")

# For each frame: parse the raw corrections, then flag rows via comma_change_row
# (both helpers come from the star imports at the top).
for _frame in (test_df, val_df):
    _frame["corr_list"] = _frame["corrections"].apply(parse_corrections)
    _frame["comma_candidate"] = _frame.apply(
        lambda row: comma_change_row(row["sentence"], row["corr_list"]), axis=1
    )

# Keep only the flagged rows.
test_comma = test_df[test_df["comma_candidate"]]
val_comma  = val_df[val_df["comma_candidate"]]

# Project Gutenberg Data

Here we construct the Gutenberg dataset.

We will not define our own grammar, as this would be quite complicated. Instead, we import a pretrained parser to do part-of-speech tagging; we chose the spaCy NLP package for this. As a group, we are aware that spaCy has the capability of … (TODO: this sentence is unfinished — state which spaCy capability is meant.)

In [None]:
import spacy

# Make sure you've done: python -m spacy download en_core_web_sm
# NOTE(review): the same pipeline is already loaded into `nlp` in the synthetic-data
# section above; this reload looks redundant when the notebook is run top to bottom.
nlp = spacy.load("en_core_web_sm")



In [None]:
import nltk
from nltk import CFG, ChartParser
import helpers

# Toy context-free grammar over coarse tag symbols ('PRON', 'DET', 'N', 'V', 'AUX',
# 'P', 'CONJ', 'PUNCT', 'COMMA', 'ADV'): the terminals are tag names, not words, so
# input must be converted to a tag sequence before parsing.
# S covers: a single clause; two clauses joined by a conjunction (with or without a
# comma); and an introductory element followed by a comma and a clause. There is no
# rule of the form CLAUSE COMMA CLAUSE.
grammar_str = r"""
S -> CLAUSE PUNCT
S -> CLAUSE CONJ CLAUSE PUNCT
S -> CLAUSE COMMA CONJ CLAUSE PUNCT
S -> INTRO COMMA CLAUSE PUNCT

INTRO -> ADV
INTRO -> ADV ADV
INTRO -> PP

CLAUSE -> NP VP
CLAUSE -> NP_LIST VP
CLAUSE -> NP VP_LIST

NP -> PRON
NP -> DET N
NP -> N
NP -> NP_LIST
NP -> NP APPOS

NP_LIST -> NP COMMA NP
NP_LIST -> NP COMMA NP_LIST

APPOS -> COMMA NP COMMA

VP -> V
VP -> V NP
VP -> V ADV
VP -> V NP ADV
VP -> AUX V
VP -> AUX V NP
VP -> AUX V ADV
VP -> AUX V NP ADV
VP -> V NP PP
VP -> AUX V NP PP

VP_LIST -> VP COMMA VP
VP_LIST -> VP COMMA VP_LIST

PP -> P NP

PRON -> 'PRON'
DET  -> 'DET'
N    -> 'N'
V    -> 'V'
AUX  -> 'AUX'
P    -> 'P'
CONJ -> 'CONJ'
PUNCT -> 'PUNCT'
COMMA -> 'COMMA'
ADV -> 'ADV'
"""

# Sentence-level parser: start symbol is S (the first rule's left-hand side).
grammar = CFG.fromstring(grammar_str)
s_parser = ChartParser(grammar)

# Clause-level parser: same productions, but with CLAUSE as the start symbol.
clause_nt = nltk.Nonterminal('CLAUSE')
clause_grammar = CFG(clause_nt, grammar.productions())
clause_parser = ChartParser(clause_grammar)

# Install both parsers as attributes on the helpers module so that
# parses_as_sentence / parses_as_clause / is_cfg_comma_splice use this grammar.
# This mutates module-level state in `helpers`, so this cell must run before those helpers.
helpers.s_parser = s_parser
helpers.clause_parser = clause_parser

In [None]:
from helpers import is_cfg_comma_splice

def cfg_predict_label(sentence: str) -> int:
    """
    Binary prediction from the CFG rule: 1 when `is_cfg_comma_splice` flags the
    sentence, otherwise 0.

    Any exception raised while checking (or while converting the result to int)
    is treated as "not flagged" and yields 0. The rule does not exactly match
    the 'comma-only edit' label, but it can still be scored against that label
    as a baseline.
    """
    try:
        verdict = is_cfg_comma_splice(sentence)
        label = int(verdict)
    except Exception:
        label = 0
    return label

In [None]:
# Hand-picked sentences for a quick sanity check of the CFG comma-splice rule.
examples = [
    "I went home, I slept.",
    "I went home, and I slept.",
    "I went home and I slept.",
    "Every person needs to know a bit about math, so they can manage daily life.",
    "Every person needs to know a bit about math.",
]

for sentence in examples:
    verdict = is_cfg_comma_splice(sentence)
    print(sentence, "=>", verdict)

In [None]:
import json

# Read the Lang-8 dump: one JSON document per line.
records = []
skipped_lines = 0  # lines that are not valid JSON

with open("lang-8_data.dat", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Skipping malformed lines is deliberate (best effort), but only JSON
            # parse errors are tolerated: a bare `except` would also swallow
            # KeyboardInterrupt and genuine bugs. The count makes the loss visible.
            skipped_lines += 1

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s); loaded {len(records)} records.")


In [None]:
# Flatten each record into one row per (learner sentence, corrections) pair.
# Records are indexed positionally: 0=journal id, 1=sentence id, 2=learning language,
# 3=native language, 4=learner sentences, 5=corrections.
rows = []

for rec in records:
    shared_fields = {
        "journal_id": rec[0],
        "sentence_id": rec[1],
        "learning_language": rec[2],
        "native_language": rec[3],
    }
    learner_sents, corrections = rec[4], rec[5]

    # zip pairs sentences with corrections and stops at the shorter of the two.
    rows.extend(
        {**shared_fields, "sentence": sent, "corrections": corr_list}
        for sent, corr_list in zip(learner_sents, corrections)
    )

In [None]:
import pandas as pd
# Build the frame in one shot from the list of row dicts (one row per learner sentence).
df = pd.DataFrame(rows)
df.head()

In [None]:
# Keep only rows whose learning language is English, renumbering the index from 0.
df = df.loc[df["learning_language"] == "English"].reset_index(drop=True)

In [None]:
# # Take the first correction as our reference target
# df["first_corr"] = df["corrections"].apply(
#     lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
# )


# # Label whether the only edits between sentence and correction are comma edits
# df["comma_only_error"] = df.apply(
#     lambda row: comma_only_edit(row["sentence"], row["first_corr"]),
#     axis=1
# )

# df[["sentence", "first_corr", "comma_only_error"]].head()

In [None]:
# # Only keep rows where we actually have a correction
# df_clean = df[df["first_corr"].notnull()].copy()

# # Our label: 1 = pure comma error, 0 = not pure comma error
# df_clean["label"] = df_clean["comma_only_error"].astype(int)

# df_clean[["sentence", "first_corr", "label"]].head()

In [20]:
# # Save to CSV
# df_clean.to_csv("df_clean.csv", index=False)
# The labelling cells above are commented out; the labelled frame is loaded from the
# CSV written by an earlier run instead.
df_clean = pd.read_csv('df_clean.csv')

In [21]:
# Class balance of the Lang-8 labels (the output below shows heavy imbalance).
df_clean["label"].value_counts()


label
0    1163569
1       3015
Name: count, dtype: int64

In [22]:
# Down-sample the negatives: keep every positive (label 1) and a fixed-size random
# subset of the negatives (label 0). Despite the name, df_balanced is only *less*
# imbalanced — see the counts printed below.
df_pos = df_clean[df_clean["label"] == 1]
df_neg_all = df_clean[df_clean["label"] == 0]
# min() guards against asking for more negatives than exist, which would raise.
df_neg = df_neg_all.sample(n=min(10000, len(df_neg_all)), random_state=42)

# Concatenate, shuffle once and renumber. (The original ran this concat+shuffle twice
# and discarded the first result; the seed is fixed, so one pass gives the same frame.)
df_balanced = (
    pd.concat([df_pos, df_neg])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Balanced dataset size:", len(df_balanced))
print(df_balanced["label"].value_counts())

Balanced dataset size: 13015
label
0    10000
1     3015
Name: count, dtype: int64


In [23]:
from sklearn.model_selection import train_test_split

# NOTE: this rebinds X / y and the X_*/y_* splits that earlier held the synthetic data.
X, y = df_balanced["sentence"], df_balanced["label"]

# Train / temp split: 70% / 30%, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, stratify=y, test_size=0.30, random_state=42
)

# Validation / Test split: halve the held-out 30%
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.50, random_state=42
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))

Train size: 9110
Validation size: 1952
Test size: 1953


In [25]:
# Start from df_clean (the Lang-8 data) with integer labels
df_clean["label"] = df_clean["label"].astype(int)

# Keep the two needed columns, drop rows with a missing sentence, and force the
# text column to str (the tokenizer wants strings).
lang8_df = (
    df_clean[["sentence", "label"]]
    .dropna(subset=["sentence"])
    .assign(sentence=lambda frame: frame["sentence"].astype(str))
)

print("Lang-8 size after cleaning:", len(lang8_df))
print(lang8_df.head())

Lang-8 size after cleaning: 1166583
                                            sentence  label
0  I will appreciate it if you correct my sentences.      0
1  It's been getting colder these days here in Ja...      0
2  The summer weather in Japan is not agreeable t...      0
3  So, as the winter is coming, I'm getting to fe...      0
4                    It is the very exciting season.      0


In [26]:
# Optional: subsample for evaluation, e.g. 50k sentences
# NOTE(review): this rebinds lang8_df to the sample, so the cleaned full frame is no
# longer reachable under this name.
lang8_df = lang8_df.sample(n=50_000, random_state=42)
print("Using subset for evaluation:", len(lang8_df))

Using subset for evaluation: 50000


In [27]:
from datasets import Dataset

# Wrap the Lang-8 evaluation subset as a Hugging Face Dataset.
lang8_ds = Dataset.from_pandas(lang8_df.reset_index(drop=True))

# Reuse tokenize_batch from the tokenizer cell earlier in the notebook rather than
# redefining an identical copy here: one definition keeps the padding/truncation
# settings in sync between training and evaluation. This cell already depends on
# that earlier cell for `tokenizer`.
lang8_tokenized = lang8_ds.map(tokenize_batch, batched=True)

# Drop every column the model does not consume, then return torch tensors.
cols_to_keep = ["input_ids", "attention_mask", "label"]
lang8_tokenized = lang8_tokenized.remove_columns(
    [c for c in lang8_tokenized.column_names if c not in cols_to_keep]
)
lang8_tokenized.set_format("torch")

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [28]:
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Score the synthetic-trained classifier on the Lang-8 subset.
# NOTE(review): `model` must still be the fine-tuned DistilBERT here; a later cell
# rebinds `model` to a scikit-learn Pipeline.
lang8_loader = DataLoader(lang8_tokenized, batch_size=32)

model.eval()
all_labels, all_preds = [], []

with torch.no_grad():
    for batch in lang8_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        logits = model(input_ids=ids, attention_mask=mask).logits
        preds = logits.argmax(dim=-1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

all_labels, all_preds = np.array(all_labels), np.array(all_preds)

print("Lang-8 evaluation (synthetic-trained model):")
print(classification_report(all_labels, all_preds, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

Lang-8 evaluation (synthetic-trained model):
              precision    recall  f1-score   support

           0     0.9982    0.8043    0.8908     49871
           1     0.0058    0.4419    0.0115       129

    accuracy                         0.8033     50000
   macro avg     0.5020    0.6231    0.4511     50000
weighted avg     0.9956    0.8033    0.8885     50000

Confusion Matrix:
[[40109  9762]
 [   72    57]]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# TF-IDF + logistic-regression baseline.
# NOTE(review): this rebinds `model`, which until now held the fine-tuned DistilBERT
# classifier; the DistilBERT evaluation cells above cannot be re-run after this
# without retraining.
model = Pipeline(steps=[
    # unigrams + bigrams, lower-cased, vocabulary capped at 50k features
    ("tfidf", TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=50000)),
    # class_weight="balanced" helps with the label imbalance
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced")),
])

model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the validation split, compute both summaries, then print them.
y_val_pred = model.predict(X_val)

val_report = classification_report(y_val, y_val_pred, digits=4)
val_confusion = confusion_matrix(y_val, y_val_pred)

print("Validation Performance")
print(val_report)
print("Confusion Matrix:")
print(val_confusion)

In [None]:
# Same report as for validation, now on the held-out test split.
y_test_pred = model.predict(X_test)

test_report = classification_report(y_test, y_test_pred, digits=4)
test_confusion = confusion_matrix(y_test, y_test_pred)

print("Test Performance")
print(test_report)
print("Confusion Matrix:")
print(test_confusion)

In [None]:
import pandas as pd

# Pair each split's sentences with integer labels in a two-column frame.
train_df, val_df, test_df = (
    pd.DataFrame({"sentence": texts, "label": targets.astype(int)})
    for texts, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

(train_df.head(), val_df.head(), test_df.head())

In [None]:
from datasets import Dataset, DatasetDict

# Convert the Lang-8 splits to Hugging Face Datasets (index dropped so it does not
# become an extra column) and bundle them under the conventional split names.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df, test_df)
)

raw_datasets = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

raw_datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Start again from the pretrained checkpoint for the Lang-8 fine-tuning run.
# This rebinds `model` and `tokenizer`, replacing whatever those names held before.
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Two output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

In [None]:
# tokenize_batch is already defined in the first tokenizer cell of this notebook; an
# identical third copy used to be redefined here. The function looks up the global
# `tokenizer` when it is called, so it uses the tokenizer re-created in the cell above.
tokenized_datasets = raw_datasets.map(tokenize_batch, batched=True)

# Keep only the model inputs and the label, then return torch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(
    [col for col in tokenized_datasets["train"].column_names if col not in ("input_ids", "attention_mask", "label")]
)
tokenized_datasets.set_format("torch")

tokenized_datasets

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Metric objects are loaded once and reused by compute_metrics.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy and binary F1.

    Predictions are the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy_result = accuracy_metric.compute(predictions=preds, references=labels)
    f1_result = f1_metric.compute(predictions=preds, references=labels, average="binary")
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


batch_size = 16

# Evaluate and checkpoint once per epoch; reload the checkpoint with the best F1 at the end.
training_args = TrainingArguments(
    output_dir="comma_error_distilbert",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the test split, which was not passed to the Trainer as train or eval data.
test_results = trainer.evaluate(tokenized_datasets["test"])
test_results

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Apply the CFG rule to every test sentence to get baseline predictions.
y_test_cfg = X_test.apply(cfg_predict_label).to_numpy()

cfg_report = classification_report(y_test, y_test_cfg, digits=4)
cfg_confusion = confusion_matrix(y_test, y_test_cfg)

print("CFG baseline on test set")
print(cfg_report)
print("Confusion Matrix:")
print(cfg_confusion)