# Faster Sentence Splitting
This notebook uses OCRed text for a volume year and splits it into sentences using regular expression pattern matching.<br>
For this notebook to run, there should be an OCRed folder that should contain a .txt file, a .tsv file, and an images sub-folder (more details in the notebook).

To understand the code, please view `sentence_splitting_explained.ipynb`.

In [1]:
from nltk.tokenize import PunktSentenceTokenizer
import pandas as pd
from os import listdir
import re

<br>
Either get the year variable from elsewhere (such as when this notebook is accessed from another file) or specify the year.

In [2]:
# Restore the `year` variable that was saved earlier with IPython's `%store`
# (for example by the notebook that launches this one).
%store -r year

In [3]:
# # If running this notebook independently,
# # Uncomment the following line of code
# year = '1901'

In [4]:
# This is the directory that will contain the OCRed output for the year.
# `year` is passed through str() so it may be given as a string or a number.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
dir_OCR = "/work/otb-lab/OCRed/" + str(year)

print(f"Working on {year} under {dir_OCR}")

Working on 1901 under /work/otb-lab/OCRed/1901


In [5]:
# Forget any `acts_path` left over from a previous run of this notebook in the
# same kernel, so that the file search below cannot pick up a stale value.
if 'acts_path' in globals():
    print("Deleting acts_path")
    del acts_path

In [6]:
# Locate and read the OCRed text file for the year.
#
# Three naming schemes are tried in order; the first one with a readable file wins:
#   1. Acts only - the name contains "txt" and "act" but none of
#                  "joint" / "concurrent" / "bills"            -> actsSep = True
#   2. Combined  - the name contains "both" and "txt" but none of
#                  "joint" / "concurrent" / "bills"            -> actsSep = False
#   3. Combined  - the name contains "acts", "joints" and "txt" -> actsSep = False
#
# Results: `acts_path` (the path that was read), `data` (all the OCRed text as
# one string) and `actsSep` (True when the Acts and Joints are separate files).
def _find_text_files(directory, required, excluded=()):
    '''
    List the files of a directory whose name fits a naming scheme.

    Parameters
    ----------
    directory : str
        The directory to search (not recursive)
    required : tuple of str
        Substrings that must all occur in the lowercased file name
    excluded : tuple of str
        Substrings of which none may occur in the lowercased file name

    Returns
    -------
    list of str
        The matching paths ("<directory>/<file>"), in the order given by listdir
    '''
    matches = []
    for file in listdir(directory):
        name = file.lower()
        if all(part in name for part in required) and not any(part in name for part in excluded):
            matches.append(directory + "/" + file)
    return matches


_not_acts = ("joint", "concurrent", "bills")
_naming_schemes = [
    # (required substrings, excluded substrings, value for actsSep)
    (("txt", "act"), _not_acts, True),
    (("both", "txt"), _not_acts, False),
    (("acts", "joints", "txt"), (), False),
]

_read_error = None  # Last error hit while reading a candidate, if any

for _required, _excluded, _separate in _naming_schemes:
    _matches = _find_text_files(dir_OCR, _required, _excluded)
    if not _matches:
        continue

    # The Acts-only search keeps its first match, the combined searches their last one
    _path = _matches[0] if _separate else _matches[-1]
    print(_path)

    try:
        with open(_path, 'r') as f:
            _text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        # The file could not be read: remember why and fall back to the next scheme
        _read_error = err
        continue

    acts_path, data, actsSep = _path, _text, _separate
    break
else:
    # No scheme produced a readable file: fail with the real cause instead of
    # an unrelated NameError further down the notebook
    if _read_error is not None:
        raise _read_error
    raise FileNotFoundError(f"No OCRed text file found in {dir_OCR}")


# `data` now holds all the OCRed text as a single string
# data

/work/otb-lab/OCRed/1901/1901_Acts.txt


In [7]:
# The page count is the number of "\n\n" occurrences in the text plus one
# (presumably the OCR output separates pages with a blank line - TODO confirm).
print("The number of pages OCRed for {year} is: {count}".format(year = year, count = (data.count("\n\n")+1)))

The number of pages OCRed for 1901 is: 258


<br>

## A. Training the tokenizer

In [8]:
# The text is passed to the constructor so that the tokenizer is trained on this
# volume before the same text is split into sentences.
sent_tokenizer = PunktSentenceTokenizer(data)
sentences = sent_tokenizer.tokenize(data)

# A list of tokens/sentences as separated by nltk's PunktSentenceTokenizer
# sentences

<br>

## B. Creating the dataframe

In [9]:
# Build the DataFrame directly from the tokenized sentences, one row per sentence
df = pd.DataFrame({"sentence": sentences})

In [10]:
# Remove leading and trailing whitespace from every sentence
df['sentence'] = df['sentence'].str.strip()

In [11]:
# At this point the dataframe has exactly one row per tokenized sentence
print("Length of the initial dataframe:", df.shape[0], "\nThis is the number of tokenized sentences.")

Length of the initial dataframe: 1864 
This is the number of tokenized sentences.


<br>

## C. Adding page file names

In [12]:
# Locate the folder that holds the page images for this year.
# The folder name differs between years, so every known name is tried in order
# and the first one that can be listed is used.
# `year` goes through str() here as well, matching how `dir_OCR` is built.
_image_dir_names = ["images", "Images", "images.zip", "Images.zip", str(year)]

for _name in _image_dir_names:
    dir_imgs = dir_OCR + "/" + _name
    try:
        imgs = listdir(dir_imgs)
        break
    except (FileNotFoundError, NotADirectoryError):
        # Missing, or present but not a folder (e.g. an actual .zip archive):
        # move on to the next candidate name
        continue
else:
    raise FileNotFoundError(
        f"No images directory found under {dir_OCR} (tried: {', '.join(_image_dir_names)})")

print(f"The images directory is {dir_imgs}")

# Keep only the image files (names containing a jpg/tiff marker), sorted by name
imgs = sorted(img for img in imgs if any(marker in img for marker in ("jpg", "tiff", "JPG", "TIFF")))
print("The number of image files for this year is:", len(imgs))

The images directory is /work/otb-lab/OCRed/1901/images
The number of image files for this year is: 270


In [13]:
# The file type is whatever follows the LAST "." of the first image's name,
# so names that contain additional dots still give the real extension.
fileType = imgs[0].rsplit(".", 1)[1]
print(f"The files are of type: {fileType}")

The files are of type: jpg


<br>

In [14]:
# Locate the .tsv file with the word-level OCR output that belongs to the text
# file read above:
#   - Acts and Joints separate (actsSep) : the name contains "act"
#   - Acts and Joints combined           : the name contains "both" or "joints"
# When several files qualify, the last one listed is used.
words_path = None  # Start clean so a path from an earlier run is never reused

for file in listdir(dir_OCR):
    name = file.lower()
    if "tsv" not in name:
        continue

    if actsSep:
        wanted = "act" in name
    else:
        wanted = "both" in name or "joints" in name

    if wanted:
        words_path = dir_OCR + '/' + file

if words_path is None:
    raise FileNotFoundError(f"No matching .tsv file found in {dir_OCR}")

df_words = pd.read_table(words_path)
print(words_path)

/work/otb-lab/OCRed/1901/1901_Acts_data.tsv


<br>

In [15]:
# Prepare the word table:
#   1. drop the columns which are unnecessary for our analysis
#   2. drop every row that has a missing value in a remaining column
#      (presumably the rows without a word in the "text" column - TODO confirm)
#   3. relabel the "name" column to "page"
df_words = (
    df_words
    .drop(columns=["left", "top", "width", "height", "conf"])
    .dropna()
    .rename(columns={"name": "page"})
)

# Renumber the remaining rows 0..n-1 (index named 'row_number') so that they
# can be addressed by position after the rows above were dropped
df_words = df_words.assign(row_number=range(len(df_words))).set_index('row_number')

# Add an empty 'start_page' and 'end_page' column to the sentence dataframe
df['start_page'] = pd.NA
df['end_page'] = pd.NA

In [16]:
# Remove "\n\n" from the original dataframe as they will interfere with the analysis.
# The replacement is the empty string, so the text on either side is joined
# directly, without a space in between.
df['sentence'] = df['sentence'].str.replace("\n\n", "", regex = False)

In [17]:
# Give every sentence a start and an end page by walking through the word table
# in step with the sentences: a sentence that splits into n space-separated
# pieces is assumed to span the next n rows of df_words.
#
# NOTE(review): the end page is read at position tracker + n, i.e. one row past
# the sentence's last piece - confirm whether tracker + n - 1 was intended.

# The page of every word as a plain list: much cheaper to index than df_words.iloc
page_of_word = df_words['page'].tolist()

# Tracker for df_words: position of the first word of the current sentence
words_trkr = 0

start_pages = []
end_pages = []

for sentence in df['sentence']:

    n_words = len(sentence.split(" "))

    # Page of the first word; when the tracker has run past the end of the word
    # table, step back by one sentence, and as a last resort use the final page
    try:
        start_page = page_of_word[words_trkr]
    except IndexError:
        words_trkr -= n_words
        try:
            start_page = page_of_word[words_trkr]
        except IndexError:
            start_page = page_of_word[-1]

    # Page at the end of the sentence, with the same kind of fallbacks
    try:
        end_page = page_of_word[words_trkr + n_words]
    except IndexError:
        try:
            end_page = page_of_word[words_trkr]
        except IndexError:
            end_page = page_of_word[-1]

    # Keep only the part of the page name before the first "."
    start_pages.append(start_page.split(".")[0])
    end_pages.append(end_page.split(".")[0])

    # Update tracker
    words_trkr += n_words

# Assign the page numbers to their respective columns in the dataframe
df['start_page'] = start_pages
df['end_page'] = end_pages

<br>

## D. Further Cleaning and Regex
Remove unnecessary words in the sentences which do not contribute to the overall meaning.

In [18]:
# Work on a copy so that the cleaned sentences can be compared with the
# untouched ones in `df`
df_cleaned = df.copy()

# Maps the name of each cleaning step to the number of matches it handled
errorsDict = {}

<br>

### 1. Removing section identifiers

In [19]:
# Count the section identifiers (e.g. "SECTION 1." / "Sec. 2") found at the very
# start or at the very end of a sentence. The pattern reads:
#   - two or more letters taken from the word "SECTION" (either case),
#   - up to two separators (".", ",", ":", ";" or a space),
#   - one or two digits or digit look-alikes (O, o, I, i, l, !, Z, S, G),
#   - optionally one more character and/or a space.
# NOTE(review): the "." in the last group is unescaped, so it matches ANY
# character; together with the look-alike letters this lets ordinary words match
# (for example a sentence starting with "It is ") - confirm this is acceptable.
errorsDict['section identifiers'] = df_cleaned['sentence'].str.count(pat = r"^(S|s|E|e|C|c|T|t|I|i|O|o|N|n){2,}(\.|,|:|;| ){0,2}[0Oo1Iil!2Z5S6G\d]{1,2}(. |.| |)|(S|s|E|e|C|c|T|t|I|i|O|o|N|n){2,}(\.|,|:|;| ){0,2}[0Oo1Iil!2Z5S6G\d]{1,2}(. |.| |)$").sum()

In [20]:
# Delete the section identifiers at the start or end of each sentence.
# The pattern must stay identical to the one used for the count recorded in
# errorsDict['section identifiers'].
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = r"^(S|s|E|e|C|c|T|t|I|i|O|o|N|n){2,}(\.|,|:|;| ){0,2}[0Oo1Iil!2Z5S6G\d]{1,2}(. |.| |)|(S|s|E|e|C|c|T|t|I|i|O|o|N|n){2,}(\.|,|:|;| ){0,2}[0Oo1Iil!2Z5S6G\d]{1,2}(. |.| |)$",
                                                            repl = "",
                                                            regex = True)

<br>

### 2. Removing end of line hyphenation

In [21]:
# Count every hyphen that is directly followed by a space, anywhere in a sentence
errorsDict['EOL hyphenation'] = df_cleaned['sentence'].str.count(pat = '[-][ ]').sum()

In [22]:
# Delete each "hyphen + space" pair so that the two halves of a word broken at
# the end of a line are joined again.
# NOTE(review): this also removes any intentional hyphen that is followed by a space.
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = '[-][ ]',
                                                            repl = "",
                                                            regex = True)

<br>

### 3. Relocating incorrect "Approved ..." phrases

In [23]:
# A separate and special method is needed for this match
# because the match will be appended to the previous law
def replaceInDF(rgx_match: re.Pattern, df: pd.DataFrame, retCount = False):
    '''
    Move the matches of a regex pattern out of the sentence in which they are
    found and onto the end of the previous sentence.

    For every row whose sentence matches, the first match is appended
    (after a single space) to the sentence of the row before it, and all
    matches are removed from the row itself. A match in the very first row has
    no previous sentence to move to, so it is only removed.

    Rows are addressed by position throughout, so the function does not depend
    on the labels of the dataframe's index.

    Parameters
    ----------
    rgx_match : re.Pattern
        A regular expression pattern that will be searched for in the df
    df: pandas.Dataframe
        A Pandas dataframe to search and replace in.
        Should contain a 'sentence' column of strings. It is modified in place.
    retCount: Boolean
        A flag to identify whether the function should return the number of matches.

    Returns
    -------
    if retCount == True:
        A tuple consisting of:
            df: pandas.Dataframe
            The same Dataframe, with the matches relocated
            errorCount: int
            The total number of matches that were removed.
    else:
        df: pandas.Dataframe
        The same Dataframe, with the matches relocated
    '''

    sentence_col = df.columns.get_loc('sentence')
    errorCount = 0

    for i in range(df.shape[0]):

        current = df.iat[i, sentence_col]

        # Look for matches; a sentence without one is left untouched
        m = re.search(rgx_match, current)
        if not m:
            continue

        # Add the match to the previous sentence. Row 0 has none: indexing
        # with i-1 == -1 there would address the wrong row.
        if i > 0:
            df.iat[i - 1, sentence_col] = df.iat[i - 1, sentence_col] + " " + str(m.group())

        # Remove the matched patterns from the sentence itself
        cleaned, numError = re.subn(rgx_match, '', current)
        df.iat[i, sentence_col] = cleaned
        errorCount += numError

    if retCount:
        return df, errorCount

    return df

In [24]:
# Two spellings of the approval phrase are recognised:
#   1. at the start of a sentence: "Approved the <day>th day of <Month>, A. D. <year>"
#   2. anywhere in a sentence:     "Approved <Month> <day>th, A. D. <year>"
# <day> is one or two digits or digit look-alikes followed by th/st/nd/rd,
# <Month> is a capitalised word and <year> is any four characters.
rgx_match = re.compile(
    r'^Approved the [0Oo1Iil!2Z5S6G\d]{1,2}(?:th|st|nd|rd) day of [A-Z][a-z]+, A\. D\. .{4}(. |.| |)\b|Approved [A-Z][a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(?:th|st|nd|rd), A\. D\. .{4}(. |.| |)\b')

In [25]:
# Move each "Approved ..." phrase to the end of the previous sentence and record
# how many were found.
# Alternative call without the count:
# df_cleaned = replaceInDF(rgx_match, df_cleaned, retCount=False)
df_cleaned, errorsDict['Approved phrases'] = replaceInDF(rgx_match, df_cleaned, retCount=True)

<br>

### 4. Removing Act separators

In [26]:
# Count the runs of one or more em dashes that open a sentence and are followed
# (after optional whitespace) by a letter
errorsDict['Act seperators'] = df_cleaned['sentence'].str.count(pat = r'^—+(?=\s*[A-Za-z])').sum()

In [27]:
# Delete the leading em dashes; the lookahead keeps the whitespace and the
# letter that follow them in the sentence
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = r'^—+(?=\s*[A-Za-z])',
                                                            repl = "",
                                                            regex = True)

<br>

### 5. Removing incorrect numbers at the start

In [28]:
# Count the sentences that open with one to three digits or digit look-alikes
# (O, o, I, i, l, !, Z, S, G), optionally followed by one more character and/or
# a space.
# NOTE(review): the look-alike letters and the unescaped "." (any character)
# mean that ordinary sentence openings such as "If " or "In " also match -
# confirm this is intended.
errorsDict['Incorrect starting nums'] = df_cleaned['sentence'].str.count(pat = r'^[0Oo1Iil!2Z5S6G\d]{1,3}(. |.| |)').sum()

In [29]:
# Delete the leading number (or digit look-alike run) from each sentence.
# The pattern must stay identical to the one used for the count recorded in
# errorsDict['Incorrect starting nums'].
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = r'^[0Oo1Iil!2Z5S6G\d]{1,3}(. |.| |)',
                                                            repl = "",
                                                            regex = True)

<br>

### 6. Removing session headers

In [30]:
# Every sentence that comes before the first one containing "act"
# (case-insensitive) is treated as session-header text and removed.
disregarded = 0  # Count for the number removed

for sent in df_cleaned['sentence']:

    # The first sentence that mentions "act" ends the header
    if 'act' in sent.lower().strip():
        break

    disregarded += 1

# Drop all the leading header rows with a single call
df_cleaned.drop(index=df_cleaned.index[:disregarded], inplace=True)

In [31]:
# Renumber the rows from 0 again now that the leading rows have been dropped
df_cleaned.reset_index(drop=True, inplace=True)

In [32]:
# Report the number of header sentences removed and record it with the other counts
print(f"Total number of sentences disregarded: {disregarded}.")
errorsDict['Session headers'] = disregarded

Total number of sentences disregarded: 7.


<br>

### 7. Converting to uppercase

In [33]:
def upperIfNeeded(sentence, ratio = 0.50):
    '''
    Upper-case a sentence that is already mostly written in capitals.

    The sentence is split on single spaces. A piece counts as "already
    uppercase" when it consists of letters only, is longer than one character,
    is fully uppercase and is not the word "SECTION". When the share of such
    pieces among all pieces is strictly greater than `ratio`, the whole
    sentence is upper-cased.

    Parameters
    ----------
    sentence: str
        The sentence to check and possibly convert to uppercase
    ratio: float
        The share of uppercase pieces that has to be exceeded (default 0.50)

    Returns
    -------
    sentence: str
        The uppercase version of the sentence if the share is exceeded,
        else the sentence as it was given.
    '''

    words = sentence.split(" ")

    # Number of pieces that are already proper uppercase words
    upper_words = sum(
        1 for word in words
        if word != "SECTION" and len(word) > 1 and word.isalpha() and word.isupper()
    )

    return sentence.upper() if upper_words / len(words) > ratio else sentence

In [34]:
# Apply upperIfNeeded to each sentence and store the result back in the
# 'sentence' column (mostly-uppercase sentences become fully uppercase)
df_cleaned['sentence'] = df_cleaned['sentence'].map(upperIfNeeded)

<br>

### Results
<b>Note:</b> The error fixing above is not perfect and some errors are still present in the dataframe after performing these operations.
Also, some errors are too random to match and search using a pattern, and are still present in the dataframe.
<br>The output below shows the number of errors corrected for this volume

In [35]:
# Number of matches handled by each cleaning step above
errorsDict

{'section identifiers': 528,
 'EOL hyphenation': 1826,
 'Approved phrases': 114,
 'Act seperators': 4,
 'Incorrect starting nums': 340,
 'Session headers': 7}

<br>

## E. Character length
Add the character length feature. This is added here because the lengths of the sentences might have changed during the cleaning process above.

In [36]:
# Number of characters in each cleaned sentence
df_cleaned["length"] = df_cleaned['sentence'].str.len()

In [37]:
# Preview the first rows of the cleaned dataframe
df_cleaned.head()

Unnamed: 0,sentence,start_page,end_page,length
0,AN ACT TO GRANT TO THE CITY COUNCIL OF CHARLES...,35,35,179
1,Be it enacted by the General Assembly of the S...,35,36,888
2,The grant herein made is upon the express cond...,36,36,287
3,c. 3.,36,36,5
4,That in case of the failure of the United Stat...,36,36,269


Get rid of sentences with a low number of characters as they might not form meaningful sentences.
<br>However, first, get the statistics on the length column to avoid removing meaningful sentences.

<br>
Define a cutoff for the sentences. All sentences at or below this length will be removed.

In [38]:
# Cutoff in characters: only sentences strictly longer than this are kept
cut_len = 30

In [39]:
# Number of sentences before the short ones are removed (used for the reduction %)
ilen = df_cleaned.shape[0]

In [40]:
# Keep only the sentences that are strictly longer than the cutoff, then report
# how many remain and the share that was removed
df_cleaned = df_cleaned[ df_cleaned["length"] > cut_len]
print("Length of the cleaned dataframe: ", df_cleaned.shape[0])
print("Reduction of about {:.2f}%".format( (1 - df_cleaned.shape[0]/ilen) * 100))

Length of the cleaned dataframe:  1359
Reduction of about 26.82%


In [41]:
# Renumber the remaining rows from 0 and name the index "index"
df_cleaned.reset_index(drop=True, inplace=True)
df_cleaned.index.name = "index"

# Rearrange columns: keep the first column in place and move the last column
# to the position right after it
cols = df_cleaned.columns.tolist()
cols = [cols[0]] + cols[-1:] + cols[1:-1]
df_cleaned = df_cleaned[cols]

<br>

## F. Adding features

### 1. Adding ID

In [42]:
def addPrefix(fileName: str, nameLen: int) -> str:
    '''
    Left-pad a name with 0's until it is nameLen characters long.

    Names can be of different lengths, so this brings them all to one fixed
    width. A name that is already nameLen characters or longer is returned
    unchanged.

    Parameters
    ----------
    fileName : str
        The name that needs to be padded
        The name shouldn't carry a file extension, such as '.tiff'
    nameLen : int
        The length of the expected name
        Ex. '00034.jpg' has a name of length 5,
        so nameLen should be 5

    Returns
    -------
    str
        The name, prefixed with 0's up to a length of nameLen
    '''

    return fileName.rjust(nameLen, "0")

In [43]:
# Turn the index into a regular column and call it "id".
# The rename expects that column to be named "index".
df_cleaned.reset_index(inplace=True)
df_cleaned.rename(columns={"index" : "id"}, inplace=True)

In [44]:
# The number of digits in the index label of the last row, which decides how
# many 0's are prefixed to the shorter ids so that all ids have the same width
maxNumLength = len(str(df_cleaned.last_valid_index()))

# Final id format: "<year>_<zero-padded row number>"
df_cleaned['id'] = df_cleaned.apply(lambda x: str(year) + "_" + addPrefix( str(x['id']), maxNumLength ), axis=1)

<br>

### 2. Adding the remaining identifiers

In [45]:
# Insert two constant columns at positions 1 and 2 of the dataframe.
# NOTE(review): 'law_type' is set to 'Acts' for every row, including the case
# where the combined Acts-and-Joints file was read (actsSep == False) - confirm.
df_cleaned.insert(1, 'law_type', 'Acts')
df_cleaned.insert(2, 'state', 'SOUTH CAROLINA')

<br>

## Exporting

In [46]:
# # Export the final dataframe to csv for viewing
# df_cleaned.to_csv(f"{year}_faster.csv", index=False)