In [31]:
from datasets import load_dataset
import pandas as pd
import ast
import helpers
from helpers import *
import nltk
from nltk import CFG, ChartParser


In [32]:
# One-time setup, intentionally left commented out: fetches the JFLEG dataset,
# saves a local copy, and exports the "test" and "validation" splits to the
# test.csv / validation.csv files that the next cell reads. Uncomment and run
# once if those CSV files are missing.
# ds = load_dataset("jhu-clsp/jfleg")
# ds.save_to_disk("JFLEG")
# ds["test"].to_csv("test.csv")
# ds["validation"].to_csv("validation.csv")

In [33]:
def _load_comma_candidates(csv_path):
    """Load one exported JFLEG split and flag its comma-change candidates.

    Reads ``csv_path`` and adds two columns:

    * ``corr_list`` -- ``parse_corrections`` applied to the ``corrections`` column.
    * ``comma_candidate`` -- ``comma_change_row(sentence, corr_list)`` per row.

    Both helpers come from the project's ``helpers`` module.

    Returns:
        A ``(full_df, candidates_df)`` tuple, where ``candidates_df`` holds the
        rows of ``full_df`` selected by the ``comma_candidate`` column.
    """
    split_df = pd.read_csv(csv_path)
    split_df["corr_list"] = split_df["corrections"].apply(parse_corrections)
    split_df["comma_candidate"] = split_df.apply(
        lambda row: comma_change_row(row["sentence"], row["corr_list"]), axis=1
    )
    # NOTE(review): boolean-mask indexing assumes comma_change_row returns
    # a bool for every row -- confirm against helpers.
    return split_df, split_df[split_df["comma_candidate"]]


# Same pipeline for both splits; one helper keeps them from drifting apart.
test_df, test_comma = _load_comma_candidates("test.csv")
val_df, val_comma = _load_comma_candidates("validation.csv")

# Project Gutenberg Data

Here we construct the Gutenberg dataset.

We will not define our own grammar, as this would be quite complicated. Instead, we import a pretrained parser to do part-of-speech tagging. We decided to use the spaCy NLP package. As a group, we are aware that spaCy has the capability of … (TODO: this sentence was left unfinished — complete it.)

In [34]:
import spacy

# Prerequisite (run once in the kernel's environment):
#   python -m spacy download en_core_web_sm
# Loads that pipeline and binds it to `nlp`.
nlp = spacy.load(name="en_core_web_sm")



In [None]:

# Small context-free grammar whose terminals are tag symbols ('PRON', 'DET',
# 'N', 'V', ...) rather than actual words.
# NOTE(review): presumably helpers converts each sentence into a sequence of
# these symbols before parsing -- confirm against helpers.
#
# Shape of the grammar:
#   S      : one clause + PUNCT, or two clauses joined by CONJ + PUNCT
#   CLAUSE : NP VP
#   NP     : pronoun | determiner + noun | bare noun
#   VP     : optional AUX, then V, optionally followed by NP and/or ADV, or NP PP
#
# COMMA is declared as a terminal category, but no S/CLAUSE/NP/VP/PP rule
# refers to it, so a token sequence containing 'COMMA' can never parse as S or
# as CLAUSE with these rules.
# NOTE(review): looks intentional (keeps 'COMMA' inside the grammar's
# vocabulary) -- confirm how helpers uses it.
grammar_str = r"""
S -> CLAUSE PUNCT
S -> CLAUSE CONJ CLAUSE PUNCT

CLAUSE -> NP VP

NP -> PRON
NP -> DET N
NP -> N

VP -> V
VP -> V NP
VP -> V ADV
VP -> V NP ADV
VP -> AUX V
VP -> AUX V NP
VP -> AUX V ADV
VP -> AUX V NP ADV
VP -> V NP PP
VP -> AUX V NP PP

PP -> P NP

PRON -> 'PRON'
DET  -> 'DET'
N    -> 'N'
V    -> 'V'
AUX  -> 'AUX'
P    -> 'P'
CONJ -> 'CONJ'
PUNCT -> 'PUNCT'
COMMA -> 'COMMA'
ADV -> 'ADV'
"""

# Sentence-level parser built from the grammar text above (its first rule has S
# on the left-hand side).
grammar = CFG.fromstring(grammar_str)
s_parser = ChartParser(grammar)

# Second parser over the same productions, but with CLAUSE as the start symbol,
# so a bare clause (no final PUNCT) can be recognised on its own.
clause_nt = nltk.Nonterminal('CLAUSE')
clause_grammar = CFG(clause_nt, grammar.productions())
clause_parser = ChartParser(clause_grammar)

# Publish both parsers as attributes on the helpers module.
# NOTE(review): presumably is_cfg_comma_splice reads helpers.s_parser and
# helpers.clause_parser, which would make this cell a prerequisite for calling
# it -- confirm.
helpers.s_parser = s_parser
helpers.clause_parser = clause_parser

In [None]:
# Hand-picked sentences for a quick sanity check of is_cfg_comma_splice:
# clauses joined by a bare comma, by comma + conjunction, by a conjunction
# alone, plus a longer sentence with and without a second clause.
examples = [
    "I went home, I slept.",
    "I went home, and I slept.",
    "I went home and I slept.",
    "Every person needs to know a bit about math, so they can manage daily life.",
    "Every person needs to know a bit about math.",
]

# Print each sentence next to the detector's verdict.
for sentence in examples:
    verdict = is_cfg_comma_splice(sentence)
    print(f"{sentence} => {verdict}")

I went home, I slept. => True
I went home, and I slept. => False
I went home and I slept. => False
Every person needs to know a bit about math, so they can manage daily life. => False
Every person needs to know a bit about math. => False


In [None]:
import json

# Read lang-8_data.dat line by line; every non-empty line is parsed as one
# JSON value and collected in `records`.
records = []
n_malformed = 0  # lines that were not valid JSON and were therefore skipped

with open("lang-8_data.dat", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Best-effort load: a malformed line is skipped, but counted so the
            # data loss is visible. Catching only JSONDecodeError (instead of a
            # bare `except`) lets KeyboardInterrupt and real bugs propagate.
            n_malformed += 1

print(f"Loaded {len(records)} records; skipped {n_malformed} malformed lines.")


In [None]:
# Flatten the records into one dict per learner sentence.
# Each record is indexed positionally:
#   0 journal id, 1 sentence id, 2 learning language, 3 native language,
#   4 learner sentences, 5 corrections (paired with the sentences by position).
rows = [
    {
        "journal_id": rec[0],
        "sentence_id": rec[1],
        "learning_language": rec[2],
        "native_language": rec[3],
        "sentence": sent,
        "corrections": corr_list,
    }
    for rec in records
    # zip stops at the shorter of the two sequences
    for sent, corr_list in zip(rec[4], rec[5])
]

In [None]:
import pandas as pd

# One DataFrame row per flattened sentence dict; preview the first five rows.
df = pd.DataFrame(data=rows)
df.head(5)

Unnamed: 0,journal_id,sentence_id,learning_language,native_language,sentence,corrections
0,1057227,290610,Korean,English,오늘 배운 새 표현 / New expressions I learned today,[오늘 배운 새[f-blue]로운[/f-blue] 표현[f-blue]들[/f-blu...
1,1057227,290610,Korean,English,TTMIK가 제자 자주 쓰는 한국교재이에요.,"[TTMIK가 제자 자주 쓰는 한국교재[sline]이[/sline]에요., TTMI..."
2,1057227,290610,Korean,English,오늘은 새로 레슨 나와서 그 레슨에게서 새로 표현이 배웠어요.,[오늘은 새로 레슨 나와서 그 레슨에게서 새로 표현[f-red]을[/f-red] 배...
3,1057227,290610,Korean,English,밑에 그 표현들 붙혔어요.,"[밑에 그 표현들 붙[f-red]였[/f-red]어요., 밑에 그 표현들[f-blu..."
4,1057227,290610,Korean,English,TTMIK is a Korean learning resource that I use...,[]


In [None]:
def _first_correction(corr):
    """Return the first element of a non-empty list, otherwise None."""
    if isinstance(corr, list) and len(corr) > 0:
        return corr[0]
    return None


# The first listed correction serves as the reference target for each sentence.
df["first_corr"] = df["corrections"].apply(_first_correction)

from helpers import comma_only_edit


def _row_is_comma_only(row):
    """Apply helpers.comma_only_edit to a row's sentence and first correction."""
    return comma_only_edit(row["sentence"], row["first_corr"])


# Per-row flag computed by comma_only_edit(sentence, first_corr).
df["comma_only_error"] = df.apply(_row_is_comma_only, axis=1)

df[["sentence", "first_corr", "comma_only_error"]].head()

In [None]:
# Keep only rows whose learning language is English, then renumber the index
# from zero.
is_english = df["learning_language"].eq("English")
df = df.loc[is_english].reset_index(drop=True)

In [None]:
# Drop rows that have no reference correction; work on an independent copy so
# adding a column does not touch `df`.
has_correction = df["first_corr"].notna()
df_clean = df.loc[has_correction].copy()

# Integer target derived from the comma_only_error flag:
# 1 = pure comma error, 0 = not a pure comma error.
df_clean["label"] = df_clean["comma_only_error"].astype(int)

df_clean[["sentence", "first_corr", "label"]].head()