Check for files that couldn't be written to the log because of unicode errors

In [None]:
import logging
import os

In [None]:
base_dir = "D:\\proofread"
os.listdir(base_dir)

In [None]:
# Collect the directory entries whose names cannot be written with the
# platform's default text encoding.
unlogged_files = []

# The scratch file is opened once; each name is written individually so that
# an encoding failure can be attributed to that specific name.
with open("temp.txt", "at") as file:
    for file_name in os.listdir(base_dir):
        try:
            file.write(file_name)
        except UnicodeEncodeError:
            # Only encoding failures mark a name as "unlogged"; any other
            # error (e.g. an OSError) is allowed to surface instead of being
            # silently misclassified.
            unlogged_files.append(file_name)

In [None]:
# Unique 13-character prefixes of the unlogged file names.
unlogged_ids = list({name[:13] for name in unlogged_files})

Remove unlogged ids from sentences.tsv

In [None]:
import pandas as pd

In [None]:
# Reload the sentence table, keep only rows whose file id is not in
# unlogged_ids, and overwrite the table on disk.
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t").loc[
    lambda frame: ~frame["file"].isin(unlogged_ids)
]
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t", index=False)

Check for files with incomplete sentence extraction

In [None]:
import re

In [None]:
# Read every line of the build log; bytes that are not valid UTF-8 are dropped.
with open("C:\\Users\\Banjamin\\sentence-pair-extraction\\buildSentenceDataset.log", mode='r', encoding="utf-8", errors='ignore') as log_file:
    lines = log_file.readlines()

In [None]:
# Keep only the log lines whose text before the first ":" is exactly "INFO".
info_lines = [entry for entry in lines if entry.partition(":")[0] == "INFO"]

In [None]:
# Collect the case ids for which the log reports fewer added sentences than
# the total ("Added X of Y sentences for <case id>" with X != Y).
# The pattern is a raw string so that "\d" reaches the regex engine instead of
# being treated as an (invalid) Python string escape, and it is compiled once
# rather than on every line.
added_pattern = re.compile(r"Added (\d+) of (\d+) sentences for (O-20\d{2}-\d{6})")

incomplete_cases = []
for line in info_lines:
    match = added_pattern.search(line)
    if match is None:
        continue
    if int(match.group(1)) != int(match.group(2)):
        incomplete_cases.append(match.group(3))

In [None]:
# Report how many incomplete cases were found relative to the number of INFO lines.
print(f"{len(incomplete_cases)} cases with incomplete extraction out of {len(info_lines)} total cases")

Remove incomplete cases from sentences.tsv

In [None]:
import pandas as pd

In [None]:
# Reload the sentence table, keep only rows whose file id is not in
# incomplete_cases, and overwrite the table on disk.
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t").loc[
    lambda frame: ~frame["file"].isin(incomplete_cases)
]
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t", index=False)

Build list of case ids to exclude from execution. If correct is set to true, cases with incomplete sentence extraction will be repeated. Otherwise, only cases not in the log will be processed.

In [None]:
# Build the list of case ids that should be skipped on the next run.
#   correct = True  -> only cases whose log line reports all sentences added
#                      are excluded, so incomplete cases get repeated.
#   correct = False -> every case id that appears in an INFO line is excluded.
correct = True
exclude_case_ids = []
info_lines = [l for l in lines if l.split(":")[0] == "INFO"]

# Raw strings keep "\d" as a regex token rather than an invalid Python string
# escape; compiling once avoids rebuilding the patterns for every line.
case_id_pattern = re.compile(r"O-20\d{2}-\d{6}")
added_pattern = re.compile(r"Added (\d+) of (\d+) sentences for (O-20\d{2}-\d{6})")

for line in info_lines:
    if correct:
        # A match of added_pattern always contains a case id, so a separate
        # case-id search is not needed on this branch.
        correct_match = added_pattern.search(line)
        if correct_match is not None and int(correct_match.group(1)) == int(correct_match.group(2)):
            exclude_case_ids.append(correct_match.group(3))
    else:
        processed_match = case_id_pattern.search(line)
        if processed_match is not None:
            exclude_case_ids.append(processed_match.group(0))

Remove incomplete cases from sentences.tsv

In [None]:
import pandas as pd

In [None]:
# Reload the sentence table, keep only rows whose file id is not in
# incomplete_cases, and overwrite the table on disk.
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t").loc[
    lambda frame: ~frame["file"].isin(incomplete_cases)
]
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t", index=False)

Remove certain case IDs from log file

In [None]:
import re
import pickle
import pandas as pd

In [None]:
# Read every line of the build log; bytes that are not valid UTF-8 are dropped.
with open("C:\\Users\\Banjamin\\sentence-pair-extraction\\buildSentenceDataset.log", mode='r', encoding="utf-8", errors='ignore') as log_file:
    lines = log_file.readlines()

In [None]:
# Load the previously pickled collection of file ids with over-long sentences.
# NOTE: unpickling can execute arbitrary code; only load a file that was
# produced by a trusted run of this project.
with open("long_sentence_files", mode='rb') as pickle_file:
    long_sentence_files = pickle.load(pickle_file)

In [None]:
# Copy the log to a temporary file, leaving out every line that mentions a
# case id listed in long_sentence_files.
# The pattern is a raw string (so "\d" is not an invalid Python string escape)
# and is compiled once instead of once per line.
case_id_pattern = re.compile(r"O-20\d{2}-\d{6}")

with open("C:\\Users\\Banjamin\\sentence-pair-extraction\\buildSentenceDataset_temp.log", 'w', encoding="utf-8", errors='ignore') as file:
    for line in lines:
        match = case_id_pattern.search(line)
        # NOTE(review): lines that contain no case id at all are not copied
        # either - confirm that dropping them is intended.
        if match is not None and match.group(0) not in long_sentence_files:
            file.write(line)

In [None]:
# Reload the sentence table, keep only rows whose file id is not in
# long_sentence_files, and overwrite the table on disk.
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t").loc[
    lambda frame: ~frame["file"].isin(long_sentence_files)
]
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t", index=False)

In [None]:
# FIXME: the assignment below has no right-hand side, so this cell is a
# syntax error as written. The lines that follow expect a DataFrame with
# "original" and "file" columns; the intended source expression is not
# recoverable from this notebook and must be filled in.
long_sentence_df = 
# Character length of each original sentence.
long_sentence_df["length"] = long_sentence_df.original.str.len()
long_sentence_df

In [None]:
long_sentence_df.groupby('file').length.max().describe()

Examine dataframe

In [None]:
import pandas as pd
import editdistance
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
# Load the sentence table and replace every missing value with a single space.
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t").fillna(" ")

In [None]:
df.original.str.len().hist()
plt.show()

In [None]:
df.original.str.len().describe()

Some sentences are excessively long (too long for the T5 tokenizer). These may be the files for which sentence tokenization failed.

In [None]:
counts = df[["file", "original"]].groupby("file").count()
counts[counts.original < 10]

In [None]:
# File ids with fewer than ten sentence rows, and the rows belonging to them.
less_than_ten = counts.index[counts["original"] < 10].values
less_than_ten_df = df.loc[df["file"].isin(less_than_ten)]

In [None]:
# Normalised edit distance between original and revised for each row.
# less_than_ten_df is a filtered selection of df, so the column is attached
# with assign(), which returns a new frame, instead of being written into the
# selection in place (which triggers pandas' SettingWithCopy warning).
less_than_ten_df = less_than_ten_df.assign(
    changes=less_than_ten_df.astype(str).apply(
        lambda row: editdistance.eval(row.original, row.revised) / max(len(row.original), len(row.revised)),
        axis=1,
    )
)

In [None]:
less_than_ten_df.head()

In [None]:
single_sentence_files = list(counts[counts.original == 1].index)

In [None]:
df[df.file.isin(single_sentence_files)].original.str.len().hist()
plt.show()

In [None]:
df[df.file.isin(single_sentence_files)].original.str.len().describe()

In [None]:
df[~df.file.isin(single_sentence_files)].original.str.len().describe()

Clearly this does not cover all long sentences. We will use the T5 tokenizer to determine the acceptable sentences directly (using Colab).

### Correct sentence tokenization errors

The spaCy sentence tokenizer sometimes splits sentences in the wrong place.

The following code corrects these sentence tokenization errors. Sentences for which the original and the revised version both start with a character that is not an uppercase letter are joined to the preceding sentence.

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import editdistance
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t")

In [None]:
df.shape

In [None]:
# Walk the file column in row order and print every file id that starts a
# new run after having been seen in an earlier run, i.e. files whose rows are
# not contiguous. `compressed` keeps the distinct file ids in first-seen order.
previous_file = df.file.values[0]
compressed = [previous_file]
# Mirror of `compressed` for O(1) membership tests; checking the list itself
# made the loop quadratic in the number of distinct files.
seen_files = {previous_file}
for file in df.file.values[1:]:
    if file != previous_file:
        if file in seen_files:
            print(file)
        else:
            compressed.append(file)
            seen_files.add(file)
    previous_file = file

In [None]:
df[df.file == "O-2016-000006"]

In [None]:
df[df.original == df.loc[958016].original]

In [None]:
# Remove one file entirely, and for two other files remove only the rows
# whose index lies below the given cut-off.
df = df[
    (df.file != "o-2016-000255")
    & ((df.index >= 1120108) | (df.file != "o-2016-006897"))
    & ((df.index >= 958768) | (df.file != "O-2016-000006"))
]

In [None]:
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t", index=False)

In [None]:
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t")
# Options for handling missing values (both currently disabled):
# drop the affected rows, or replace NAs with a single space.
#df = df.dropna()
#df = df.fillna(" ")

In [None]:
# Strip "/" and carriage-return characters from both ends of each sentence.
# The vectorised .str accessor is used instead of apply(lambda x: x.strip(...))
# so that missing values (NaN) are passed through rather than raising
# AttributeError.
# NOTE(review): the strip set is "/\r"; ordinary spaces are not removed -
# confirm this is the intended character set.
df["original"] = df.original.str.strip("/\r")
df["revised"] = df.revised.str.strip("/\r")

In [None]:
#df = df[df.original.str.len() != 0]
#df = df[df.revised.str.len() != 0]
df = df[~((df.original.str.len() == 0) & (df.revised.str.len() == 0))]

In [None]:
df = df.reset_index(drop=True)
df.head()

In [None]:
df.loc[:10000].file.unique()

### TODO: use the average edit distance per file to remove files that had sentence misalignment during extraction

Papers with a negative skewness in the distribution of the extent of changes are likely to have been incorrectly extracted (i.e., the sentences are misaligned). They can be removed by checking the skewness of the extent of changes for all papers.

In [None]:
# Rows of a single file, with the normalised edit distance between original
# and revised added as "changes". The selection is copied explicitly so the
# new column is written to an independent frame rather than to a slice of df
# (which triggers pandas' SettingWithCopy warning).
temp = df[df.file == "O-2016-002070"].copy()
temp["changes"] = temp.apply(lambda row: editdistance.eval(row.original, row.revised) / max(len(row.original), len(row.revised)), axis=1)

In [None]:
temp.changes.hist()
plt.show()
print("Skewness: ", temp.changes.skew())

The execution of editdistance for the full dataset is too memory demanding for pandas. Instead run the calculation in dask.

In [None]:
df.shape[0] * 0.001

In [None]:
def get_extent_of_changes(row, max_chars=895):
    """Return the edit distance between a row's sentences, normalised by length.

    Args:
        row: Object with string attributes ``original`` and ``revised``.
        max_chars: Pairs whose longer sentence has this many characters or
            more are not evaluated. Defaults to the previously hard-coded 895.

    Returns:
        ``editdistance.eval(original, revised)`` divided by the length of the
        longer sentence; ``0.0`` when both sentences are empty; NaN when the
        longer sentence reaches ``max_chars``.
    """
    max_length = max(len(row.original), len(row.revised))
    if max_length == 0:
        # Two empty strings are identical; avoids a ZeroDivisionError.
        return 0.0
    if max_length >= max_chars:
        # float("nan") instead of np.nan: numpy is not imported in this notebook.
        return float("nan")
    return editdistance.eval(row.original, row.revised) / max_length

In [None]:
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences.tsv", sep="\t", index=False)

In [None]:
# Compute the extent of changes row by row on a string-cast copy of the frame.
df["changes"] = df.astype(str).apply(get_extent_of_changes, axis=1)

In [None]:
df.head()

In [None]:
# Per-file skewness of the "changes" distribution; files below -0.6 are
# treated as misaligned.
skewness = df.groupby(["file"])["changes"].skew()
misalligned_files = skewness.index[skewness < -0.6]

In [None]:
df = df[~df.file.isin(misalligned_files)]

In [None]:
df = df.reset_index(drop=True)

In [None]:
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences_for_tokenizer_correction.tsv", sep="\t", index=False)

Combine incorrectly divided sentences

In [None]:
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences_for_tokenizer_correction.tsv", sep="\t")

In [None]:
# Index labels of rows where neither the original nor the revised sentence
# starts with an uppercase character.
# Only the two relevant columns are cast to str (not the whole frame, twice),
# and the first character is taken with a slice so an empty string yields
# False instead of raising IndexError.
half_sentence_indexes = list(
    df[
        ~df["original"].astype(str).str[:1].str.isupper()
        & ~df["revised"].astype(str).str[:1].str.isupper()
    ].index
)

In [None]:
half_sentence_indexes[:15]

In [None]:
# Group each run of consecutive half-sentence indexes together with the row
# directly before the run: [run_start - 1, run_start, ..., run_end].
# Membership is tested against a set; the previous list lookup made this loop
# quadratic (it was noted as taking approx 16 min).
half_sentence_index_set = set(half_sentence_indexes)
combine_index_lists = []
max_index = 0
for i in tqdm(half_sentence_indexes):
    if i <= max_index:
        # Already absorbed into an earlier run. Index 0 is skipped as well,
        # since it has no preceding row to be joined to.
        continue
    index_list = [i - 1]
    run_index = i
    while run_index in half_sentence_index_set:
        index_list.append(run_index)
        run_index += 1
    combine_index_lists.append(index_list)
    # index_list is strictly increasing, so its last element is its maximum.
    max_index = index_list[-1]

In [None]:
combine_index_lists

An example of incorrect sentence tokenization is shown below. The spaCy tokenizer mistook the period following "Fig" for the end of a sentence.

In [None]:
df.loc[2196].original, df.loc[2196].revised

In [None]:
df.loc[2197].original, df.loc[2197].revised

In [None]:
df.loc[2198].original, df.loc[2198].revised

In [None]:
df.loc[2199].original, df.loc[2199].revised

In [None]:
df.loc[2200].original, df.loc[2200].revised

Depending on the first character of the second sentence fragment, the two fragments should be joined either with a space or without a space.

In [None]:
# Single leading characters: a fragment that starts with one of these is
# appended to the preceding text without a separating space.
no_space_join_1 = ['!', '%', ',', '-', '.', '/', ':', ';', '?', '_', '`', '|', '‐', '‒', '–', '—',]
# Two-character prefixes (a quote followed by a space) that are likewise
# appended without a separating space.
no_space_join_2 = ['" ', "' "]

In [None]:
def join_fragments(fragment_list, no_space_chars=None, no_space_prefixes=None):
    """Join sentence fragments into a single string.

    A fragment is attached directly (no space) when its first character is in
    ``no_space_chars`` or its first two characters are in
    ``no_space_prefixes``; otherwise a single space is inserted before it.
    The first non-empty fragment is never preceded by a separator, and empty
    fragments are skipped.

    Args:
        fragment_list: Iterable of strings, in reading order.
        no_space_chars: Leading characters that suppress the space. Defaults
            to the notebook-level ``no_space_join_1``.
        no_space_prefixes: Two-character prefixes that suppress the space.
            Defaults to the notebook-level ``no_space_join_2``.

    Returns:
        The joined string ("" when there is nothing to join).
    """
    if no_space_chars is None:
        no_space_chars = no_space_join_1
    if no_space_prefixes is None:
        no_space_prefixes = no_space_join_2

    parts = []
    for fragment in fragment_list:
        if not fragment:
            # Nothing to add; also avoids indexing into an empty string.
            continue
        attach_directly = fragment[0] in no_space_chars or fragment[:2] in no_space_prefixes
        # Only insert a separator between fragments. The previous version
        # always sliced off the first character of the result, which deleted
        # a real character when the first fragment needed no leading space.
        if parts and not attach_directly:
            parts.append(" ")
        parts.append(fragment)
    return "".join(parts)

In [None]:
df.loc[1478:1481]

In [None]:
combine_index_lists[1][1:]

In [None]:
# Merge each group of fragment rows into the group's first row.
# Previously noted as taking approx 8 hours: the frame was copied by
# df.drop() once per group. Rows to remove are now collected and dropped in a
# single call after the loop (the groups do not overlap, so later groups are
# unaffected by the deferred drop).
rows_to_drop = []
for index_list in tqdm(combine_index_lists):
    start, stop = index_list[0], index_list[-1]
    if start < 0:
        continue

    fragments = df.loc[start:stop]
    # Only fragments from the same file as the first row are merged; rows from
    # another file are left in place instead of being dropped unmerged.
    same_file = fragments[fragments["file"] == fragments.at[start, "file"]]

    original = join_fragments(same_file["original"]).replace(" ,", ",")
    revised = join_fragments(same_file["revised"]).replace(" ,", ",")

    # Single-step .loc[row, column] assignment writes into df itself;
    # df.loc[start].original = ... assigned to a temporary row object.
    df.loc[start, "original"] = original
    df.loc[start, "revised"] = revised

    rows_to_drop.extend(same_file.index[1:])

df = df.drop(rows_to_drop)

In [None]:
df.loc[812].original

#### TODO: Remove sentences that are too long for the T5 tokenizer (512 tokens) (send to Colab)
#### TODO: After correcting tokenization errors, remove sentences where the original and revised are the same

In [None]:
df[df.original != df.revised]

In [None]:
tdf = pd.DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]})
tdf

In [None]:
tdf.drop([2])

In [None]:
import pandas as pd
import editdistance

In [None]:
df = pd.read_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences_for_tokenizer_correction.tsv", sep="\t")

In [None]:
df = df.drop("changes", axis=1)

In [None]:
df["changes"] = df.astype(str).apply(lambda row: editdistance.eval(row.original, row.revised) / max(len(row.original), len(row.revised)), axis=1)

In [None]:
df.to_csv("C:\\Users\\Banjamin\\sentence-pair-extraction\\sentences_final.tsv", sep="\t", index=False)