In [7]:
from datasets import load_dataset
import pandas as pd
import ast
import helpers
from helpers import *
import nltk
from nltk import CFG, ChartParser


In [8]:
# Disabled setup code (presumably a one-time step — confirm): it would load the
# "jhu-clsp/jfleg" dataset, save it to disk under "JFLEG", and write the "test"
# and "validation" splits to test.csv and validation.csv, the two files that
# the following cell reads.
# ds = load_dataset("jhu-clsp/jfleg")
# ds.save_to_disk("JFLEG")
# ds["test"].to_csv("test.csv")
# ds["validation"].to_csv("validation.csv")

In [9]:
# Read both splits from the CSV files in the working directory.
test_df = pd.read_csv("test.csv")
val_df = pd.read_csv("validation.csv")

# Both splits receive the same two derived columns, so handle them in one loop.
for split_df in (test_df, val_df):
    # "corr_list": result of parse_corrections on each "corrections" value.
    split_df["corr_list"] = split_df["corrections"].apply(parse_corrections)
    # "comma_candidate": result of comma_change_row for each row's sentence
    # and its parsed corrections; used below as a row mask.
    split_df["comma_candidate"] = split_df.apply(
        lambda record: comma_change_row(record["sentence"], record["corr_list"]),
        axis=1,
    )

# Keep only the rows whose "comma_candidate" entry is true.
test_comma = test_df.loc[test_df["comma_candidate"]]
val_comma = val_df.loc[val_df["comma_candidate"]]

We will not define our own grammar, as this would be quite complicated. Instead, we decided to import a pretrained parser to do part-of-speech tagging, and we chose the spaCy NLP package. As a group, we are aware that spaCy has the capability of … (TODO: this sentence is unfinished in the notebook; complete it.)

In [10]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline into `nlp`. The model has to be
# installed beforehand with:
#     python -m spacy download en_core_web_sm
# NOTE(review): `nlp` is not referenced by any other cell visible here —
# confirm where it is consumed.
nlp = spacy.load("en_core_web_sm")



In [16]:

# Context-free grammar, written in the text format that is handed to
# CFG.fromstring further down in this cell.
#   * S is one CLAUSE followed by PUNCT, or two CLAUSEs joined by CONJ and
#     followed by PUNCT.
#   * CLAUSE is NP VP; NP, VP and PP expand only to the fixed patterns listed.
#   * Every terminal is an upper-case label such as 'PRON' or 'V' rather than
#     a word (presumably sentences are mapped to these labels before parsing —
#     confirm against helpers).
# NOTE(review): COMMA -> 'COMMA' is declared, but COMMA never appears on the
# right-hand side of any rule, so no S or CLAUSE derivation can contain it.
# Confirm whether it is kept only so that a 'COMMA' token counts as known
# vocabulary for the parsers.
grammar_str = r"""
S -> CLAUSE PUNCT
S -> CLAUSE CONJ CLAUSE PUNCT

CLAUSE -> NP VP

NP -> PRON
NP -> DET N
NP -> N

VP -> V
VP -> V NP
VP -> V ADV
VP -> V NP ADV
VP -> AUX V
VP -> AUX V NP
VP -> AUX V ADV
VP -> AUX V NP ADV
VP -> V NP PP
VP -> AUX V NP PP

PP -> P NP

PRON -> 'PRON'
DET  -> 'DET'
N    -> 'N'
V    -> 'V'
AUX  -> 'AUX'
P    -> 'P'
CONJ -> 'CONJ'
PUNCT -> 'PUNCT'
COMMA -> 'COMMA'
ADV -> 'ADV'
"""

# Sentence-level chart parser built directly from the grammar text.
grammar = CFG.fromstring(grammar_str)
s_parser = ChartParser(grammar)

# Clause-level chart parser: the very same productions, but with CLAUSE as
# the start symbol instead of the one chosen by CFG.fromstring.
clause_nt = nltk.Nonterminal('CLAUSE')
clause_grammar = CFG(start=clause_nt, productions=grammar.productions())
clause_parser = ChartParser(clause_grammar)

# Publish both parsers as attributes on the helpers module.
helpers.s_parser, helpers.clause_parser = s_parser, clause_parser

In [17]:
# Sample sentences passed one by one to is_cfg_comma_splice.
examples = [
    "I went home, I slept.",
    "I went home, and I slept.",
    "I went home and I slept.",
    "Every person needs to know a bit about math, so they can manage daily life.",
    "Every person needs to know a bit about math.",
]

# Print each sentence next to the value is_cfg_comma_splice returns for it.
for sentence in examples:
    verdict = is_cfg_comma_splice(sentence)
    print(sentence, "=>", verdict)

I went home, I slept. => True
I went home, and I slept. => False
I went home and I slept. => False
Every person needs to know a bit about math, so they can manage daily life. => False
Every person needs to know a bit about math. => False
