# Create Corpora

(C) 2023-2024 by [Damir Cavar](http://damir.cavar.me/)

This notebook generates the datasets for the Ellipsis experiments with Large Language Models (LLMs) and other NLP tools.

This code is part of the [NLP-Lab](http://nlp-lab.org/) [Ellipsis Project](https://nlp-lab.org/ellipsis/).

In [142]:
import os
import regex as re
import pandas as pd
from ipydatagrid import DataGrid
import glob
import csv
from nltk.tokenize.treebank import TreebankWordDetokenizer
import random

In [125]:
# Pattern for one corpus entry, matched across lines:
#   - one or more newlines, then a line captured as "ellipsis",
#   - a separator line consisting of "----",
#   - a line captured as "fullform",
#   - zero or more following lines that start with "A: ", "B: " or "#",
#     captured together as "rest".
re_entry = re.compile(
    r"\n+(?P<ellipsis>.+)\n^----\n(?P<fullform>^.+)\n(?P<rest>(^([AB]:\s+.+|#.+)\n)*)",
    re.MULTILINE | re.UNICODE,
)

In [126]:
def parse_txt(filename: str) -> tuple:
    """Extract all ellipsis / full-form sentence pairs from one corpus file.

    The file is read as UTF-8 text and scanned with the module-level
    ``re_entry`` pattern. For every match the ``ellipsis`` and ``fullform``
    groups are collected with surrounding whitespace stripped; the ``rest``
    group is ignored.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A 3-tuple ``(count, ellipsis, fullforms)`` where ``count`` is the
        number of matched entries and the two lists hold the stripped
        ``ellipsis`` and ``fullform`` strings in match order.
    """
    with open(filename, mode='r', encoding='utf-8') as f:
        text = f.read()
    ellipsis = []
    fullforms = []
    for match in re_entry.finditer(text):
        ellipsis.append(match.group('ellipsis').strip())
        fullforms.append(match.group('fullform').strip())
    # Both lists grow in lockstep, so the length of either is the entry count.
    return len(ellipsis), ellipsis, fullforms

In [3]:
def parse_txt_format(filename: str, debug: bool=False) -> int:
    """Count the corpus entries in a file whose path contains "ellipsis".

    Args:
        filename: Path of the text file. Files whose path string does not
            contain the substring "ellipsis" are not opened at all.
        debug: When true, print the full file content before counting.

    Returns:
        The number of matches of the module-level ``re_entry`` pattern in the
        file, or 0 when the path does not contain "ellipsis".
    """
    if "ellipsis" not in filename:
        return 0
    with open(filename, mode='r', encoding='utf-8') as ifp:
        text = ifp.read()
    if debug:
        print(text)
    return sum(1 for _ in re_entry.finditer(text))

In [127]:
# Parse every ../data/*.txt file, recording per-file entry counts in `res`
# and the de-duplicated sentence pairs in `res_data`.
directory = os.path.join("..", "data", "*.txt")
res = []
res_data = set()
for file in glob.glob(directory):
    name = os.path.splitext(os.path.basename(file))[0]
    count, ellipsis, fullform = parse_txt(file)
    if count > 0:
        res.append( (name, count) )
    if ellipsis:
        # The following cell reads x[0] as a label (compared with 'Russian')
        # and x[1] / x[2] as the sentence pair, so each entry must be a
        # 3-tuple. NOTE(review): the file's base name is used as that label
        # because it is the only one available here - confirm that the data
        # files are named accordingly.
        res_data.update( (name,) + pair for pair in zip(ellipsis, fullform) )
# Each element of `res` is (name, count); sort by count, largest first.
res.sort(key=lambda x: x[1], reverse=True)
df = pd.DataFrame(res, columns=("type", "count"), index=None)

In [128]:
# Build the (ellipsis, fullform) pairs for entries labeled 'Russian' and the
# labeled sentence list: ellipsis sentences get label 1, full forms label 0.
# One detokenizer is created and reused instead of one per sentence.
_detokenize = TreebankWordDetokenizer().detokenize
ru_pairs = [ (_detokenize(x[1].split()), _detokenize(x[2].split())) for x in res_data if x[0] == 'Russian' ]
ru_pairs_labeled = [ (_detokenize(e[0].split()), 1) for e in ru_pairs ]
ru_pairs_labeled.extend( (_detokenize(e[1].split()), 0) for e in ru_pairs )

In [129]:
# Read distractor sentences (label 0) from ../data/distractors/*.txt.
# Lines starting with "#" are skipped; a line starting with "# STOP" ends
# reading of the current file. Blank lines are skipped so that no empty
# sentence ends up in the dataset.
distract_directory = os.path.join("..", "data", "distractors", "*.txt")
just_distractors = []
for filename in glob.glob(distract_directory):
    with open(filename, mode='r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith("#"):
                if line.startswith("# STOP"):
                    break
                continue
            just_distractors.append( (line, 0) )
# NOTE: this extends the list in place, so re-running this cell without
# re-running the cell that creates ru_pairs_labeled adds the distractors again.
ru_pairs_labeled.extend(just_distractors)

In [112]:
# Shuffle the labeled examples in place.
# NOTE(review): no random seed is set in the visible cells, so the resulting
# order differs between runs.
random.shuffle(ru_pairs_labeled)

In [130]:
# Preview the first ten labeled examples.
for x in ru_pairs_labeled[:10]:
    print(x)

# Count how many examples carry label 1 (ellipsis) and label 0 (no ellipsis).
count_ellipsis = sum(1 for _, label in ru_pairs_labeled if label == 1)
count_non_ellipsis = sum(1 for _, label in ru_pairs_labeled if label == 0)
print(f"Examples with ellipsis {count_ellipsis}\nExamples without ellipsis {count_non_ellipsis}")

('If I need to ask for money up front later, I will ___.', 1)
('Big cyclical companies are using "all the tricks they can ___ to stabilize earnings," says Mr. Sloan.', 1)
("Jill likes your story even though she hates Bill's ___.", 1)
("Many are quick to emphasize that just because the market can fall as fast as it did ___ Friday doesn't mean it will tank again, despite some disquieting similarities between now and October 1987.", 1)
('His wife also works for the paper, as did his father ___.', 1)
('Did Frank get married first, or ___ Larry ___?', 1)
("They didn't ___.", 1)
('With her keen on him, and him ___ on her, the party should be fun.', 1)
('"A lot of people think I will give away the store, but I can assure you I will not ___," he says.', 1)
('Why, she tried to learn you your book, she tried to learn you your manners, she tried to be good to you every way she knowed how ___.', 1)
Examples with ellipsis 559
Examples without ellipsis 646


In [131]:
def clean_string(s):
    """Return `s` with every "___" marker removed and the text re-detokenized.

    The markers are deleted, runs of two or more whitespace characters are
    collapsed to a single space, and the whitespace-split tokens are joined
    again with ``TreebankWordDetokenizer``.
    """
    without_markers = s.replace("___", "")
    collapsed = re.sub(r"\s\s+", " ", without_markers)
    tokens = collapsed.split()
    return TreebankWordDetokenizer().detokenize(tokens)

In [132]:
# Task 1 data: each sentence with its "___" markers stripped, paired with its
# 0/1 ellipsis label.
clean_ru_pairs_labeled = [
    (clean_string(sentence), label) for sentence, label in ru_pairs_labeled
]
first_task_data = pd.DataFrame(data=clean_ru_pairs_labeled, columns=["sentence", "ellipsis"])
print(first_task_data)

                                               sentence  ellipsis
0     If I need to ask for money up front later, I w...         1
1     Big cyclical companies are using "all the tric...         1
2     Jill likes your story even though she hates Bi...         1
3     Many are quick to emphasize that just because ...         1
4     His wife also works for the paper, as did his ...         1
...                                                 ...       ...
1200  Exports in October stood at $5.29 billion, a m...         0
1201  South Korea's economic boom, which began in 19...         0
1202  Government officials said exports at the end o...         0
1203  Despite the gloomy forecast, South Korea has r...         0
1204  From January to October, the nation's accumula...         0

[1205 rows x 2 columns]


In [133]:
# Write ten shuffled copies of the task 1 data, without header or index
# column. Each pass reshuffles the frame produced by the previous pass.
for i in range(10):
    first_task_data = first_task_data.sample(frac=1).reset_index(drop=True)
    # `header` and `index` are boolean options; pass False explicitly rather
    # than relying on None being falsy.
    first_task_data.to_csv(f'task1_random_{i+1}.csv', header=False, index=False)

In [134]:
# Task 2 data: the first sentence of every pair in ru_pairs, once with its
# "___" markers removed ("without") and once unchanged ("with").
second_task_data = pd.DataFrame([ (clean_string(x[0]), x[0]) for x in ru_pairs ], columns=["without", "with"])
# Write ten shuffled copies, without header or index column. Each pass
# reshuffles the frame produced by the previous pass.
for i in range(10):
    second_task_data = second_task_data.sample(frac=1).reset_index(drop=True)
    # `header` and `index` are boolean options; pass False explicitly rather
    # than relying on None being falsy.
    second_task_data.to_csv(f'task2_random_{i+1}.csv', header=False, index=False)

In [135]:
# Task 3 data: the first sentence of every pair in ru_pairs with its "___"
# markers removed, next to the second sentence of the pair.
# NOTE(review): the column labels "without"/"with" are copied from the task 2
# frame; here the second column holds x[1], not the marked sentence. The
# labels are not written to the CSV files - confirm whether they should change.
third_task_data = pd.DataFrame([ (clean_string(x[0]), x[1]) for x in ru_pairs ], columns=["without", "with"])
# Write ten shuffled copies, without header or index column. Each pass
# reshuffles the frame produced by the previous pass.
for i in range(10):
    third_task_data = third_task_data.sample(frac=1).reset_index(drop=True)
    # `header` and `index` are boolean options; pass False explicitly rather
    # than relying on None being falsy.
    third_task_data.to_csv(f'task3_random_{i+1}.csv', header=False, index=False)