In [2]:
from datasets import load_dataset
import pandas as pd
import ast
import helpers
from helpers import *
import nltk
from nltk import CFG, ChartParser


In [2]:
# ds = load_dataset("jhu-clsp/jfleg")
# ds.save_to_disk("JFLEG")
# ds["test"].to_csv("test.csv")
# ds["validation"].to_csv("validation.csv")

In [5]:
def _load_jfleg_split(csv_path):
    """Read one split from CSV and attach the two derived columns.

    ``corr_list`` is ``parse_corrections`` applied to the ``corrections``
    column; ``comma_candidate`` is ``comma_change_row(sentence, corr_list)``
    evaluated row by row. Both helpers are not defined in this notebook
    (presumably they come from ``from helpers import *`` -- confirm).
    """
    frame = pd.read_csv(csv_path)
    frame["corr_list"] = frame["corrections"].apply(parse_corrections)
    frame["comma_candidate"] = frame.apply(
        lambda row: comma_change_row(row["sentence"], row["corr_list"]), axis=1
    )
    return frame


# The CSVs are the ones the commented-out export cell writes from the JFLEG dataset.
test_df = _load_jfleg_split("test.csv")
val_df = _load_jfleg_split("validation.csv")

# Keep only the rows flagged as comma candidates.
test_comma, val_comma = (
    frame[frame["comma_candidate"]] for frame in (test_df, val_df)
)

# Project Gutenberg Data

Here we will construct the Gutenberg dataset.

We will not define our own grammar, as this would be quite complicated. Instead, we decided to import a pretrained parser to do part-of-speech tagging, and we chose the spaCy NLP package. As a group, we are aware that spaCy has the capability of … (TODO: this sentence is cut off in the original; finish the thought.)

In [3]:
import spacy

# The model package has to be installed once beforehand:
#   python -m spacy download en_core_web_sm
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)



In [4]:

# Toy context-free grammar over coarse tag symbols, not over words: every
# terminal is a quoted tag name such as 'PRON', 'DET' or 'V'.
# NOTE(review): the mapping from real tokens to these tag symbols is not in
# this cell -- presumably done in helpers; confirm.
#
# Shape of the language:
#   * S is one clause plus final punctuation, or two clauses joined by CONJ.
#   * CLAUSE is NP VP; the VP alternatives spell out the optional AUX, object
#     NP, ADV and PP combinations explicitly.
#   * COMMA is declared as a terminal but no S/CLAUSE/NP/VP/PP rule consumes
#     it, so a tag sequence containing 'COMMA' can never be derived from S.
#     NOTE(review): looks deliberate (keeps 'COMMA' a known terminal) -- confirm.
grammar_str = r"""
S -> CLAUSE PUNCT
S -> CLAUSE CONJ CLAUSE PUNCT

CLAUSE -> NP VP

NP -> PRON
NP -> DET N
NP -> N

VP -> V
VP -> V NP
VP -> V ADV
VP -> V NP ADV
VP -> AUX V
VP -> AUX V NP
VP -> AUX V ADV
VP -> AUX V NP ADV
VP -> V NP PP
VP -> AUX V NP PP

PP -> P NP

PRON -> 'PRON'
DET  -> 'DET'
N    -> 'N'
V    -> 'V'
AUX  -> 'AUX'
P    -> 'P'
CONJ -> 'CONJ'
PUNCT -> 'PUNCT'
COMMA -> 'COMMA'
ADV -> 'ADV'
"""

# Sentence-level parser built straight from the grammar text.
grammar = CFG.fromstring(grammar_str)
s_parser = ChartParser(grammar)

# Clause-level parser: the very same productions, re-rooted at CLAUSE so a
# bare clause (no final PUNCT) can be parsed on its own.
clause_nt = nltk.Nonterminal('CLAUSE')
clause_grammar = CFG(start=clause_nt, productions=grammar.productions())
clause_parser = ChartParser(clause_grammar)

# Publish both parsers as attributes of the helpers module.
# NOTE(review): presumably is_cfg_comma_splice reads them from there -- confirm.
helpers.s_parser, helpers.clause_parser = s_parser, clause_parser

In [6]:
# Hand-picked sentences for a quick check of the comma-splice detector.
examples = [
    "I went home, I slept.",
    "I went home, and I slept.",
    "I went home and I slept.",
    "Every person needs to know a bit about math, so they can manage daily life.",
    "Every person needs to know a bit about math.",
]

for sentence in examples:
    print(f"{sentence} => {is_cfg_comma_splice(sentence)}")

I went home, I slept. => True
I went home, and I slept. => False
I went home and I slept. => False
Every person needs to know a bit about math, so they can manage daily life. => False
Every person needs to know a bit about math. => False


In [7]:
import json

# The file is read as JSON Lines: one JSON document per non-blank line.
# Lines that cannot be parsed are still skipped (best effort), but they are
# counted and reported instead of vanishing silently, and only parse failures
# are caught, so KeyboardInterrupt and genuine bugs are no longer swallowed.
records = []
n_bad_lines = 0
with open("lang-8_data.dat", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError (its subclass) and the
            # big-integer conversion limit; RecursionError covers absurdly
            # deep nesting. Either way the line is unusable.
            n_bad_lines += 1

if n_bad_lines:
    print(f"Skipped {n_bad_lines} malformed line(s) in lang-8_data.dat")


In [8]:
# Flatten the records: each one carries journal-level metadata in positions
# 0-3 and two parallel lists (learner sentences, their corrections) in
# positions 4 and 5. Emit one row per (sentence, corrections) pair.
rows = []

for rec in records:
    shared_fields = {
        "journal_id": rec[0],
        "sentence_id": rec[1],
        "learning_language": rec[2],
        "native_language": rec[3],
    }
    rows.extend(
        {**shared_fields, "sentence": sent, "corrections": corr_list}
        for sent, corr_list in zip(rec[4], rec[5])
    )

In [9]:
import pandas as pd
# One row per learner sentence; preview the first five.
df = pd.DataFrame(data=rows)
df.head(n=5)

Unnamed: 0,journal_id,sentence_id,learning_language,native_language,sentence,corrections
0,1057227,290610,Korean,English,Ïò§Îäò Î∞∞Ïö¥ ÏÉà ÌëúÌòÑ / New expressions I learned today,[Ïò§Îäò Î∞∞Ïö¥ ÏÉà[f-blue]Î°úÏö¥[/f-blue] ÌëúÌòÑ[f-blue]Îì§[/f-blu...
1,1057227,290610,Korean,English,TTMIKÍ∞Ä Ï†úÏûê ÏûêÏ£º Ïì∞Îäî ÌïúÍµ≠ÍµêÏû¨Ïù¥ÏóêÏöî.,"[TTMIKÍ∞Ä Ï†úÏûê ÏûêÏ£º Ïì∞Îäî ÌïúÍµ≠ÍµêÏû¨[sline]Ïù¥[/sline]ÏóêÏöî., TTMI..."
2,1057227,290610,Korean,English,Ïò§ÎäòÏùÄ ÏÉàÎ°ú Î†àÏä® ÎÇòÏôÄÏÑú Í∑∏ Î†àÏä®ÏóêÍ≤åÏÑú ÏÉàÎ°ú ÌëúÌòÑÏù¥ Î∞∞Ïõ†Ïñ¥Ïöî.,[Ïò§ÎäòÏùÄ ÏÉàÎ°ú Î†àÏä® ÎÇòÏôÄÏÑú Í∑∏ Î†àÏä®ÏóêÍ≤åÏÑú ÏÉàÎ°ú ÌëúÌòÑ[f-red]ÏùÑ[/f-red] Î∞∞...
3,1057227,290610,Korean,English,Î∞ëÏóê Í∑∏ ÌëúÌòÑÎì§ Î∂ôÌòîÏñ¥Ïöî.,"[Î∞ëÏóê Í∑∏ ÌëúÌòÑÎì§ Î∂ô[f-red]ÏòÄ[/f-red]Ïñ¥Ïöî., Î∞ëÏóê Í∑∏ ÌëúÌòÑÎì§[f-blu..."
4,1057227,290610,Korean,English,TTMIK is a Korean learning resource that I use...,[]


In [10]:
# Keep only journals written by learners of English, renumbering rows from 0.
df = df.loc[df["learning_language"] == "English"].reset_index(drop=True)

In [11]:
# # Take the first correction as our reference target
# df["first_corr"] = df["corrections"].apply(
#     lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None
# )


# # Label whether the only edits between sentence and correction are comma edits
# df["comma_only_error"] = df.apply(
#     lambda row: comma_only_edit(row["sentence"], row["first_corr"]),
#     axis=1
# )

# df[["sentence", "first_corr", "comma_only_error"]].head()

In [12]:
# # Only keep rows where we actually have a correction
# df_clean = df[df["first_corr"].notnull()].copy()

# # Our label: 1 = pure comma error, 0 = not pure comma error
# df_clean["label"] = df_clean["comma_only_error"].astype(int)

# df_clean[["sentence", "first_corr", "label"]].head()

In [7]:
# df_clean was built and labelled once by the (now commented-out) cells above
# and written to disk; reload that cached copy instead of recomputing it.
# # Save to CSV
# df_clean.to_csv("df_clean.csv", index=False)
df_clean = pd.read_csv('df_clean.csv')

In [8]:
# Class distribution of the cached data. Per the commented-out labelling cell:
# 1 = pure comma error, 0 = not a pure comma error.
df_clean["label"].value_counts()


label
0    1163569
1       3015
Name: count, dtype: int64

In [9]:
# Every positive (label 1) row plus a fixed-size random sample of negatives.
# Despite the name, the result is not 50/50: it comes out at 10,000 negatives
# against roughly 3,000 positives, i.e. only far less skewed than the full data.
df_pos = df_clean[df_clean["label"] == 1]
df_neg = df_clean[df_clean["label"] == 0].sample(n=10000, random_state=42)

# Shuffle once and renumber the rows. One pass is enough: shuffling twice with
# the same inputs and seed only repeats the work and discards the first result.
df_balanced = (
    pd.concat([df_pos, df_neg])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Balanced dataset size:", len(df_balanced))
print(df_balanced["label"].value_counts())

Balanced dataset size: 13015
label
0    10000
1     3015
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

X, y = df_balanced["sentence"], df_balanced["label"]

# First carve off 30 % of the data, stratified on the label ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# ... then halve that held-out part into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

for split_name, split_X in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} size:", len(split_X))

Train size: 9110
Validation size: 1952
Test size: 1953


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Baseline: bag-of-n-grams features fed to a class-weighted logistic regression.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams + bigrams
    max_features=50000,   # cap vocab size
    lowercase=True,
)
logreg_step = LogisticRegression(
    class_weight="balanced",  # helps with imbalance
    max_iter=200,
)
model = Pipeline(steps=[("tfidf", tfidf_step), ("clf", logreg_step)])

model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,200


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the fitted baseline pipeline on the validation split.
y_val_pred = model.predict(X_val)

print("Validation Performance", classification_report(y_val, y_val_pred, digits=4), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")

Validation Performance
              precision    recall  f1-score   support

           0     0.8178    0.7333    0.7733      1500
           1     0.3410    0.4580    0.3909       452

    accuracy                         0.6696      1952
   macro avg     0.5794    0.5956    0.5821      1952
weighted avg     0.7074    0.6696    0.6848      1952

Confusion Matrix:
[[1100  400]
 [ 245  207]]


In [13]:
# Score the fitted baseline pipeline on the held-out test split.
y_test_pred = model.predict(X_test)

test_sections = (
    ("Test Performance", classification_report(y_test, y_test_pred, digits=4)),
    ("Confusion Matrix:", confusion_matrix(y_test, y_test_pred)),
)
for heading, body in test_sections:
    print(heading)
    print(body)

Test Performance
              precision    recall  f1-score   support

           0     0.8190    0.7480    0.7819      1500
           1     0.3516    0.4525    0.3958       453

    accuracy                         0.6795      1953
   macro avg     0.5853    0.6003    0.5888      1953
weighted avg     0.7106    0.6795    0.6923      1953

Confusion Matrix:
[[1122  378]
 [ 248  205]]


In [14]:
import pandas as pd

def _labelled_frame(sentences, labels):
    """Pair a sentence Series with its labels (cast to int) in a two-column frame."""
    return pd.DataFrame({"sentence": sentences, "label": labels.astype(int)})


# NOTE: val_df and test_df are rebound here; from this point on they no longer
# refer to the frames read from validation.csv / test.csv earlier in the notebook.
train_df = _labelled_frame(X_train, y_train)
val_df = _labelled_frame(X_val, y_val)
test_df = _labelled_frame(X_test, y_test)

train_df.head(), val_df.head(), test_df.head()

(                                               sentence  label
 7415     While I was reading  many things surprised me.      1
 9192  Korea people think that guest should be treate...      0
 8970                                 What is her charm?      0
 5418  I dream vyychit english language, I'll be all ...      0
 852          The direction signs are for outsiders too.      1,
                                                 sentence  label
 10018  she's 23years old, and her husband is 29years ...      1
 721              A few days ago, I ran a fever suddenly.      0
 6618   I like baseball and I have been member of base...      0
 12993  I saw this title at 2ch which is the biggest f...      0
 11715  According to the book tittled ENGLISH„ÄÄHACKS, w...      0,
                                                 sentence  label
 12547  It is a beginning from the every moring ,when ...      0
 6483                 In the morning, I had just a fever.      1
 6046           I think tha

In [15]:
from datasets import Dataset, DatasetDict

# Convert each split frame to a Dataset, dropping the shuffled pandas index first.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df, test_df)
)

split_names = ("train", "validation", "test")
raw_datasets = DatasetDict(dict(zip(split_names, (train_ds, val_ds, test_ds))))

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 9110
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1952
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1953
    })
})

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"

# Pretrained encoder with a two-class classification head.
# NOTE: this rebinds `model`, which until now referred to the scikit-learn
# baseline pipeline.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def tokenize_batch(batch):
    """Tokenize the ``"sentence"`` entries of a batch with the global ``tokenizer``.

    Every sequence is padded to, and truncated at, 128 tokens. The tokenizer's
    return value is passed back unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    sentences = batch["sentence"]
    return tokenizer(sentences, **encode_options)

tokenized_datasets = raw_datasets.map(tokenize_batch, batched=True)

# Drop every column other than the three kept below, then set format for PyTorch.
model_columns = ("input_ids", "attention_mask", "label")
unused_columns = [
    name
    for name in tokenized_datasets["train"].column_names
    if name not in model_columns
]
tokenized_datasets = tokenized_datasets.remove_columns(unused_columns)
tokenized_datasets.set_format("torch")

tokenized_datasets

Map:   0%|          | 0/9110 [00:00<?, ? examples/s]

Map:   0%|          | 0/1952 [00:00<?, ? examples/s]

Map:   0%|          | 0/1953 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 9110
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1952
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1953
    })
})

In [18]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Metric objects used by compute_metrics.
accuracy_metric, f1_metric = (
    evaluate.load(metric_id) for metric_id in ("accuracy", "f1")
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and binary F1.

    Predictions are the argmax over the last axis of the logits. Returns a
    dict with the keys ``"accuracy"`` and ``"f1"``, computed with the global
    ``accuracy_metric`` and ``f1_metric`` objects.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy_result = accuracy_metric.compute(predictions=predicted, references=gold)
    f1_result = f1_metric.compute(predictions=predicted, references=gold, average="binary")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

batch_size = 16

# All fine-tuning hyperparameters in one place. Evaluation and checkpointing
# both happen once per epoch, and the checkpoint with the best validation F1
# is reloaded when training finishes.
training_options = dict(
    output_dir="comma_error_distilbert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
)
training_args = TrainingArguments(**training_options)

# Train on the train split; the validation split drives the per-epoch metrics.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)



In [19]:
# Fine-tune for the configured 3 epochs. In the recorded run, validation F1
# rises every epoch (about 0.54 -> 0.59 -> 0.63) while validation loss also
# rises (about 0.39 -> 0.42 -> 0.50), so later epochs trade loss for F1.
trainer.train()

  0%|          | 0/1710 [00:00<?, ?it/s]

{'loss': 0.5445, 'grad_norm': 1.3076748847961426, 'learning_rate': 1.9415204678362573e-05, 'epoch': 0.09}
{'loss': 0.4927, 'grad_norm': 3.141963243484497, 'learning_rate': 1.8830409356725147e-05, 'epoch': 0.18}
{'loss': 0.4761, 'grad_norm': 3.30222749710083, 'learning_rate': 1.824561403508772e-05, 'epoch': 0.26}
{'loss': 0.4613, 'grad_norm': 2.9225587844848633, 'learning_rate': 1.7660818713450293e-05, 'epoch': 0.35}
{'loss': 0.4882, 'grad_norm': 5.1680192947387695, 'learning_rate': 1.7076023391812867e-05, 'epoch': 0.44}
{'loss': 0.4405, 'grad_norm': 10.044022560119629, 'learning_rate': 1.649122807017544e-05, 'epoch': 0.53}
{'loss': 0.4353, 'grad_norm': 5.416391849517822, 'learning_rate': 1.5906432748538013e-05, 'epoch': 0.61}
{'loss': 0.4336, 'grad_norm': 2.3040764331817627, 'learning_rate': 1.5321637426900587e-05, 'epoch': 0.7}
{'loss': 0.4558, 'grad_norm': 3.857893705368042, 'learning_rate': 1.4736842105263159e-05, 'epoch': 0.79}
{'loss': 0.4364, 'grad_norm': 5.030749797821045, 'lear

  0%|          | 0/122 [00:00<?, ?it/s]

{'eval_loss': 0.3940388560295105, 'eval_accuracy': 0.8263319672131147, 'eval_f1': 0.5375170532060027, 'eval_runtime': 20.0628, 'eval_samples_per_second': 97.294, 'eval_steps_per_second': 6.081, 'epoch': 1.0}
{'loss': 0.3918, 'grad_norm': 7.60194206237793, 'learning_rate': 1.2982456140350879e-05, 'epoch': 1.05}
{'loss': 0.3129, 'grad_norm': 6.762577533721924, 'learning_rate': 1.239766081871345e-05, 'epoch': 1.14}
{'loss': 0.3205, 'grad_norm': 5.027698516845703, 'learning_rate': 1.1812865497076024e-05, 'epoch': 1.23}
{'loss': 0.3159, 'grad_norm': 14.19728946685791, 'learning_rate': 1.1228070175438597e-05, 'epoch': 1.32}
{'loss': 0.3187, 'grad_norm': 8.066781997680664, 'learning_rate': 1.0643274853801172e-05, 'epoch': 1.4}
{'loss': 0.3101, 'grad_norm': 8.561522483825684, 'learning_rate': 1.0058479532163743e-05, 'epoch': 1.49}
{'loss': 0.2789, 'grad_norm': 8.975966453552246, 'learning_rate': 9.473684210526315e-06, 'epoch': 1.58}
{'loss': 0.3245, 'grad_norm': 11.130906105041504, 'learning_r

  0%|          | 0/122 [00:00<?, ?it/s]

{'eval_loss': 0.41499006748199463, 'eval_accuracy': 0.8227459016393442, 'eval_f1': 0.5938967136150235, 'eval_runtime': 17.493, 'eval_samples_per_second': 111.588, 'eval_steps_per_second': 6.974, 'epoch': 2.0}
{'loss': 0.2526, 'grad_norm': 9.65039348602295, 'learning_rate': 6.549707602339181e-06, 'epoch': 2.02}
{'loss': 0.2227, 'grad_norm': 5.061675071716309, 'learning_rate': 5.964912280701755e-06, 'epoch': 2.11}
{'loss': 0.2289, 'grad_norm': 12.565583229064941, 'learning_rate': 5.380116959064328e-06, 'epoch': 2.19}
{'loss': 0.2234, 'grad_norm': 2.4709632396698, 'learning_rate': 4.7953216374269005e-06, 'epoch': 2.28}
{'loss': 0.2307, 'grad_norm': 10.547578811645508, 'learning_rate': 4.210526315789474e-06, 'epoch': 2.37}
{'loss': 0.1753, 'grad_norm': 10.339212417602539, 'learning_rate': 3.625730994152047e-06, 'epoch': 2.46}
{'loss': 0.2112, 'grad_norm': 3.7938737869262695, 'learning_rate': 3.04093567251462e-06, 'epoch': 2.54}
{'loss': 0.2266, 'grad_norm': 11.696782112121582, 'learning_ra

  0%|          | 0/122 [00:00<?, ?it/s]

{'eval_loss': 0.5007182955741882, 'eval_accuracy': 0.8160860655737705, 'eval_f1': 0.627979274611399, 'eval_runtime': 17.2012, 'eval_samples_per_second': 113.481, 'eval_steps_per_second': 7.093, 'epoch': 3.0}
{'train_runtime': 977.1069, 'train_samples_per_second': 27.97, 'train_steps_per_second': 1.75, 'train_loss': 0.3297425762254592, 'epoch': 3.0}


TrainOutput(global_step=1710, training_loss=0.3297425762254592, metrics={'train_runtime': 977.1069, 'train_samples_per_second': 27.97, 'train_steps_per_second': 1.75, 'total_flos': 905083501317120.0, 'train_loss': 0.3297425762254592, 'epoch': 3.0})

In [20]:
# Final check on the held-out test split; reports the same metrics as the
# per-epoch validation runs (loss, accuracy, F1).
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
test_results

  0%|          | 0/123 [00:00<?, ?it/s]

{'eval_loss': 0.5070383548736572,
 'eval_accuracy': 0.815668202764977,
 'eval_f1': 0.6194503171247357,
 'eval_runtime': 18.9556,
 'eval_samples_per_second': 103.03,
 'eval_steps_per_second': 6.489,
 'epoch': 3.0}