# Sentence Splitting
This notebook uses OCRed text for a all volume years in a folder and splits them into sentences using regular expression pattern matching.<br>
For this notebook to run, there should be an OCRed folder that should contain a .txt file, a .tsv file, and an images sub-folder (more details in the notebook).
<br>There should also be an accompanying `splitting_functs.py` file which has most of the functions required to run this notebook.

In [1]:
# from nltk.tokenize import PunktSentenceTokenizer
import nltk
import pandas as pd
from os import listdir
import re
import sys
from tqdm import tqdm  # For printing out progress bar
from numpy import nan, array_split
import multiprocessing

from splitting_functs import *

pd.set_option('display.max_colwidth', None)

In [2]:
# Read all folder names in the OCR (or a specified) directory
# ocred_path = '/work/otb-lab/OCRed'
ocred_path = '/Users/nitingupta/Desktop/OTB/OCRed'

years = [name for name in listdir(ocred_path) if not name.startswith('.')]
years.sort()
print(years)

['1892', '1893', '1894', '1901', '1918', '1921', '1928', '1948', '1956']


<br>

## Aquiring data

In [3]:
# Create an empty list for the final dataframe 
df = []

In [4]:
# Set up the progress bar
progress_bar = tqdm(total=len(years), file=sys.stderr)

for year in years:
    
    # Update the progress bar
    progress_bar.set_description(f"Processing year {year}")
    
    # This is the directory that will contain the OCRed output:
    dir_OCR = ocred_path + '/' + str(year)
    
    if 'acts_path' in globals():
        del acts_path
        
    acts_path, actsSep = getActsPaths(dir_OCR)
    
    if acts_path is None:
        sys.exit(f'acts path not found for {year}')

    with open(acts_path, 'r') as f:
        # This variable holds all the OCRed text as a String
        data = f.read()
        
    count = data.count("\n\n")+1
    if count < 100:
        sys.exit(f'Count seems low for {year}. Count: {count}')
    # print("The number of pages OCRed for {year} is: {count}".format(year = year, count = count))
    
    # Training the tokenizer
    sent_tokenizer = nltk.PunktSentenceTokenizer(data)
    sentences = sent_tokenizer.tokenize(data)  # A List of tokens/sentences as seperated by nltk's PunktSentenceTokenizer

    # Create a temporary dataframe
    df_temp = pd.DataFrame()
    df_temp["sentence"] = sentences
    
    # A flag to keep track of the first index for each year
    df_temp['first'] = False
    
    # Strip sentences of trailing and leading whitespaces
    df_temp['sentence'] = df_temp['sentence'].str.strip()
    # Remove "\n\n" from the original dataframe as they will interfere with the analysis
    df_temp['sentence'] = df_temp['sentence'].str.replace("\n\n", "", regex = False)
    
    # print("Length of the initial dataframe:", df.shape[0], "\nThis is the number of tokenized sentences.")
    
    # Remove session headers
    df_temp = removeSessionHeaders(df_temp)
    
    imgs, dir_imgs = getImgs(dir_OCR, year)
    # print("The number of image files for this year is:", len(imgs))
    
    fileType = imgs[0].split(".")[1]
    
    df_words = getWordsFrame(acts_path, actsSep)
    
    # Add an empty 'start_page' and 'end_page' column
    df_temp['start_page'] = pd.NA
    df_temp['end_page'] = pd.NA
    
    df_temp = getStartEndPages(df_temp, df_words)
    
    # Adding year
    df_temp.insert(0, 'year', year)
    
    # Get the images path
    df_temp = getImgsPath(df_temp, fileType, dir_imgs)
    
    df_temp.at[0, 'first'] = True
    
    # Append this year's dataframe to the final dataframe
    df.append(df_temp)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.set_description(f"Processed the list")
progress_bar.close()

# Convert the list to a dataframe
df = pd.concat(df, ignore_index=True)

Processed the list: 100%|██████████| 9/9 [00:12<00:00,  1.38s/it]  


In [5]:
print(f"Length of the dataframe: {df.shape[0]}")
display(df)

Length of the dataframe: 37387


Unnamed: 0,year,sentence,first,start_page,end_page,path
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Repre- sentatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
2,1892,Src.,False,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
3,1892,2.,False,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
4,1892,"The officers of this battalion shall consist of a Lieu- tenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
...,...,...,...,...,...,...
37382,1956,"The Commissioner of Agriculture of South Caro- lina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",False,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
37383,1956,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of im- porting noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confine- ment.",False,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
37384,1956,Repeal: SECTION 2. All acts or parts of acts inconsistent herewith are hereby repealed.,False,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
37385,1956,Time effective: SECTION 3. This act shall take effect upon its approval by the Governor.,False,01448,01449,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg


<br>

## Pre-Cleaning

In [6]:
# New dataframe so that the results of the matching can be compared
df_cleaned = df.copy()

# A new dictionary to keep track of the number of errors
errorsDict = {}

<br>

### Correcting Some Important Words

In [7]:
target_words = ['section']  # Add more target words
print("Correcting the following word spellings in the dataframe:", target_words)

Correcting the following word spellings in the dataframe: ['section']


In [8]:
num_cores = multiprocessing.cpu_count()
chunks = array_split(df_cleaned, num_cores)

# Initialize NLTK spell checker
nltk.download('words', quiet=True)
spell = nltk.corpus.words.words()

with multiprocessing.Pool(num_cores) as pool:
    threshold = 1.5  # Adjust the threshold as needed
    
    params = [(chunk, spell, target_words, threshold) for chunk in chunks]    
    processed_chunks = pool.starmap(process_chunk, params)

df_cleaned = pd.concat(processed_chunks, ignore_index=True)

In [9]:
df_cleaned.drop(['sentence'], axis = 1, inplace=True)
df_cleaned.rename({'corrected_sentence':'sentence'}, axis=1, inplace=True)

In [10]:
df_cleaned.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Repre- sentatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieu- tenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,


In [11]:
df_cleaned['org_words'].value_counts().head()

            36255
“section      458
sections      320
seotion       146
srction        41
Name: org_words, dtype: int64

<br>

### Removing End-Of-Line Hyphenation

In [12]:
errorsDict['EOL hyphenation'] = df_cleaned['sentence'].str.count(pat = '[-][ ]').sum()
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = '[-][ ]',
                                                            repl = "",
                                                            regex = True)

In [13]:
errorsDict

{'EOL hyphenation': 23151}

### Relocating Incorrect "Approved ..." Phrases

In [14]:
rgx_match = re.compile(
    r'^(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){1,2} day of [a-z]+, a\. d\. .{4}(. |.| |)\b)|(approved [a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){1,2}, a\. d\. .{4}(. |.| |)\b)', 
    flags=re.IGNORECASE)

# Search for matches in the 'sentence' column
matches = df_cleaned['sentence'].str.extract(rgx_match)[3]

# Remove the matched patterns from sentences
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(rgx_match, '', n=-1)

# Add matches to the previous sentence
df_cleaned['sentence'] = df_cleaned['sentence'].str.cat(matches.shift(-1), 
                                                        sep=' ', na_rep='')

errorsDict['Approved phrases'] = matches.count().sum()

In [15]:
errorsDict

{'EOL hyphenation': 23151, 'Approved phrases': 755}

### Removing Act Seperators

In [16]:
errorsDict['Act seperators'] = df_cleaned['sentence'].str.count(pat = r'^—+(?=\s*[A-Za-z])').sum()
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = r'^—+(?=\s*[A-Za-z])',
                                                            repl = '',
                                                            regex = True)

In [17]:
errorsDict

{'EOL hyphenation': 23151, 'Approved phrases': 755, 'Act seperators': 216}

In [18]:
print(f"Length of the dataframe: {df_cleaned.shape[0]}")

Length of the dataframe: 37387


<br>

## Adding Section and Act Labels
Add Section and Act labels for each sentence.

In [19]:
df_updated = df_cleaned.copy()

<br>

In [20]:
def getAct(sentence, fallback, pattern, group = 1):
    """
    Get Act labels for the given sentence.
    If the match is an empty string (’’) then the last act is updated by 1.
    Else the matched act number is the new act number.

    Note: Joint Resolutions might be present in the OCRed text, 
    but are not needed here. So, to maintain continuity with Act labels, 
    Joint Resolutions are labeled as a new Act.

    Requires a lastAct variable to keep track of the previous act.
    
    Parameters
    ----------
    sentence : str
        The sentence to output the act for.
    fallback : str
        If no act is found, then return this str instead.
    pat : re.Pattern
        The pattern to search for.
    group : int
        The group to extract from the match.

    Returns
    -------
    str
        The Act number for this `sentence`.
    """  

    global lastAct
    res = pattern.search(sentence.lower())

    # If a Joint Resolution starts with this sentence...
    if 'joint' in sentence.lower().split()[:3] and 'resolution' in sentence.lower().split()[:4]:
        lastAct = str(int(lastAct) + 1)
        return lastAct
    
    # Else if a new Act starts...
    elif res:
        if res.group(group) == '':  # Act number not given, but act is started
            lastAct = str(int(lastAct) + 1)

        else:  # Act number given
            lastAct = res.group(group)

        return lastAct
    
    return fallback

In [21]:
df_updated['act'] = None

# Compile the regex pattern
pattern = re.compile(r'^([\d]*)([. ]*)(?:an act|act)')
# Initialize lastAct
lastAct = '0'

# Apply the function to the DataFrame
df_updated['act'] = df_updated.apply(lambda x: getAct(x['sentence'], x['act'], pattern), axis=1)

In [22]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,


<br>

<br>

In [23]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^([\d]*)([. ]*)(?:an act|act).*?(?:section)\s+([\w]{1,4})[\s.]*(?:be it)')
df_updated['section_rgx1'] = df_updated['sentence'].str.lower().str.extract(pattern)[2]

In [24]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0,
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,,
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,,
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,,


<br>

In [25]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^(S|s|E|e|r|C|c){1,}(T|t|I|i|O|o|N|n)*(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)(. |.| |){1,3}')
df_updated['section_rgx3'] = df_updated['sentence'].str.lower().str.extract(pattern)[3]

In [26]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0,,
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,,,1.0
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,,,
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,,,


<br>

In [27]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'(s|e|r|c){1,}(t|i|o|n)*(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)(. |.| |){1,3}$')

# Search for matches in the 'sentence' column
matches = df_updated['sentence'].str.lower().str.extract(pattern)[3]
# Add match to the next row
matches = matches.shift(1)

df_updated['section_rgx4'] = matches

# Replace the first 'None' value to NaN
df_updated.at[0, 'section_rgx4'] = nan

In [28]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0,,,
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,,,1.0,
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,,,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,,,,
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,,,,


<br>

In [29]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'§(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)')
df_updated['section_rgx5'] = df_updated['sentence'].str.lower().str.extract(pattern)[1]

In [30]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0,,,,
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,,,1.0,,
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,,,,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,,,,,
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,,,,,


<br>

In [31]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^([0Oo1Iil!2Z5S6G\d]{1,3})(. |.| |)')
df_updated['section_rgx6'] = df_updated['sentence'].str.lower().str.extract(pattern)[0]

<br>

In [32]:
# Combine all section columns
df_updated['section_comb'] = df_updated['section_rgx1'].fillna(df_updated['section_rgx3']).fillna(df_updated['section_rgx4']).fillna(df_updated['section_rgx5']).fillna(df_updated['section_rgx6'])

In [33]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0,,,,,,
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,,,1.0,,,,1.0
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,,,,,,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,,,,,,2.0,2.0
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,,,,,,,


<br>

In [34]:
# Initialize lastAct
lastAct = '0'

# Go through sentences and label all occurences of new acts with their section numbers
def labelSections(row):
    """
    Label every sentence that has a new act number (something different from the last row's) 
    with 0 if a section does not already exists there.
    
    Requires lastAct variable to be initialized outside of this function.
    
    Parameters
    ----------
    row : pandas.Dataframe row
        The row to perform operation on

    Returns
    -------
    pandas.Dataframe row
        The modified row.
    """
    
    global lastAct
    
    # If at the first index or if the previous act num is not equal to this act
    if row['first'] == True or lastAct != row['act']:
        
        # If this section value is None, label with 0
        if pd.isnull(row['section_comb']):
            row['section_comb'] = 0
            
    # Update lastAct...
    lastAct = row['act']
     
    return row


# Apply the function to the DataFrame
df_updated = df_updated.apply(labelSections, axis=1)

In [35]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0,,,,,,0.0
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,,,1.0,,,,1.0
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,,,,,,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,,,,,,2.0,2.0
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,,,,,,,


<br>

In [36]:
map_dict = {
    'o':0,
    'O':0,
    'I':1,
    'i':1,
    'l':1,
    '!':1,
    'Z':2,
    'z':2,
    'S':5,
    'G':6
}

df_updated['section_comb'] = df_updated.apply(lambda x: fixCol(x['section_comb'], map_dict), axis =1)
df_updated['act'] = df_updated.apply(lambda x: fixCol(x['act'], map_dict), axis =1)

In [37]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1.0,,,,,,0.0
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,,,1.0,,,,1.0
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,,,,,,,
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,,,,,,2.0,2.0
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,,,,,,,


<br>

In [38]:
def fillMissing(value, firstIndex):
    """
    Fill in missing values for a column by setting missing values to the previous value.
    Not using ffill() from Pandas since we need to account for special cases (basically
    reseting the lastValid index) in the case that a new volume starts.
    Requires lastValid variable to be defined outside of the function.

    Parameters
    ----------
    value : str
        Value to check and fill.
    firstIndex : bool
        A flag for whether this row's value is the first value of a volume.
        Basically whether a new volume is starting.

    Returns
    -------
    str
        The new value.
    """

    global lastValid

    # If at the first index
    if firstIndex == True:
            
        # If a value doesn't exist, set that value and lastValid to 0
        if pd.isnull(value):
            value = 0
        
        # Set the lastValid's value to this value
        lastValid = value
        return value        


    # Else, if not at the first index
    # If a value does not exist
    if pd.isnull(value):
        # Set value to the lastValid value
        value = lastValid

    # If a value exists, that update lastValid
    else:
        lastValid = value

    return value

In [39]:
lastValid = 0
df_updated['section_comb'] = df_updated.apply(lambda x: fillMissing(x['section_comb'], x['first']), axis =1)

lastValid = 0
df_updated['act'] = df_updated.apply(lambda x: fillMissing(x['act'], x['first']), axis =1)

In [40]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1892,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,False,,1,,,,,,0
1,1892,False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,,1,,1.0,,,,1
2,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,False,,1,,,,,,1
3,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,False,,1,,,,,2.0,2
4,1892,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,,1,,,,,,2


<br>

In [41]:
df_updated.rename({'section_comb':'section'}, axis=1, inplace=True)

In [42]:
cols_keep = ['year', 'sentence', 'start_page', 'end_page', 'act', 'section', 'path']
df_updated.drop(
    list(set(df_updated.columns.tolist()) - set(cols_keep)), axis = 1, inplace=True)

In [43]:
print(f"Length of the dataframe: {df_updated.shape[0]}")
display(df_updated)

Length of the dataframe: 37387


Unnamed: 0,year,start_page,end_page,path,sentence,act,section
0,1892,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,1,0
1,1892,045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",1,1
2,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,1,1
3,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,1,2
4,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",1,2
...,...,...,...,...,...,...,...
37382,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",1132,1
37383,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",1132,1
37384,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,Repeal: SECTION 2. All acts or parts of acts inconsistent herewith are hereby repealed.,1132,1
37385,1956,01448,01449,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,Time effective: SECTION 3. This act shall take effect upon its approval by the Governor.,1132,1


<br>

## Post-Cleaning
Some cleaning after adding Section and Acts.

In [44]:
def upperIfNeeded(sentence, ratio = 0.50):
    """
    Convert the given sentence list into an uppercase sentence list
    if the ratio of uppercase words (not including the ones with a mix of digits 
    or words like "SECTION") to the total words is greater than a fixed value.
    Needs an `uppered` variable to be defined outside of this scope.
    
    Parameters
    ----------
    sentence: str
         A str of sentence to check and convert to uppercase
        
    Returns
    -------
    str
        If check is approved the return an uppercase version of str.
        Else return the sentence.
    """

    global uppered
    
    # A count of the number of already uppercased words
    count = 0
    
    # Check whether the word consists of only letters,
    # has a length greater than 1, is uppercase, and 
    # isn't "SECTION"
    for word in sentence.split(" "):
        if word.isalpha() and len(word) > 1 and word.isupper() and word != "SECTION":
            count += 1

    # If the count to words ratio is greater
    # return all uppercase words
    if (count/len(sentence.split(" ")) > ratio):
        uppered += 1
        return sentence.upper()        
    
    # Else, return the original sentence list
    return sentence

In [45]:
uppered = 0
df_updated['sentence'] = df_updated.apply(lambda x: upperIfNeeded(x['sentence']), axis=1)
errorsDict['Uppercased'] = uppered

In [46]:
errorsDict

{'EOL hyphenation': 23151,
 'Approved phrases': 755,
 'Act seperators': 216,
 'Uppercased': 608}

<br>

## Character Length
Add the character length feature.
<br>This is added here because the lengths of the sentences might have changed during the cleaning process above.

In [47]:
df_updated["length"] = df_updated['sentence'].str.len()

In [48]:
df_updated.head()

Unnamed: 0,year,start_page,end_page,path,sentence,act,section,length
0,1892,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,1,0,108
1,1892,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",1,1,476
2,1892,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,Src.,1,1,5
3,1892,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,2.,1,2,3
4,1892,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",1,2,355


<br>

### Removing Sentences With Low Character Length

Get rid of sentences with a low number of characters as they might not form meaningful sentences.
<br>Define a cutoff for the sentences. All sentences belows this length will be removed.
<br>Our research has shown that 30 character limit seems to be optimal for keeping informative sentences in the corpus.

In [49]:
cut_len = 30

In [50]:
# Initial length
ilen = df_updated.shape[0]

In [51]:
df_updated = df_updated[ df_updated["length"] > cut_len ]
print("Length of the cleaned dataframe: ", df_updated.shape[0])
print("Reduction of about {:.2f}%".format( (1 - df_updated.shape[0]/ilen) * 100))

Length of the cleaned dataframe:  28910
Reduction of about 22.67%


In [52]:
df_updated.reset_index(drop=True, inplace=True)

In [53]:
df_updated

Unnamed: 0,year,start_page,end_page,path,sentence,act,section,length
0,1892,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,1,0,108
1,1892,045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",1,1,476
2,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",1,2,355
3,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,They shall be paid the same as battalion staffs in the Volunteer Troops.,1,2,73
4,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"There shall also be attached to the staff the following petty officers: One Master-at-Arms, two Yeomen, one Hospital Steward, one Chief Bugler, who shall receive the same pay as the non-commissioned staff of a battalion of infantry.",1,2,233
...,...,...,...,...,...,...,...,...
28905,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",1132,1,228
28906,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",1132,1,411
28907,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,Repeal: SECTION 2. All acts or parts of acts inconsistent herewith are hereby repealed.,1132,1,88
28908,1956,01448,01449,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,Time effective: SECTION 3. This act shall take effect upon its approval by the Governor.,1132,1,89


<br>

## Dropping duplicates

In [54]:
print(f"The number of dropped sentences is {df_updated[df_updated.duplicated(subset=['sentence'])].shape[0]}")

The number of dropped sentences is 1784


In [55]:
df_dropped = df_updated.drop_duplicates(subset=['sentence'])

In [56]:
print(f"Length of the dataframe: {df_dropped.shape[0]}")
display(df_dropped)

Length of the dataframe: 27126


Unnamed: 0,year,start_page,end_page,path,sentence,act,section,length
0,1892,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,1,0,108
1,1892,045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",1,1,476
2,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",1,2,355
3,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,They shall be paid the same as battalion staffs in the Volunteer Troops.,1,2,73
4,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"There shall also be attached to the staff the following petty officers: One Master-at-Arms, two Yeomen, one Hospital Steward, one Chief Bugler, who shall receive the same pay as the non-commissioned staff of a battalion of infantry.",1,2,233
...,...,...,...,...,...,...,...,...
28903,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132 An Act To Require The Commissioner Of Agriculture Of South Carolina To Determine What Are Noxious Weeds And Plants; To Prevent Importation Into The State Of Such Weeds And Plants And To Provide A Penalty For The Violation of The Provisions Of This Act.,1132,113,258
28904,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,Be it enacted by the General Assembly of the State of South Carolina Commissioner of Agriculture—regulate importation of noxious weeds—penalties SECTION 1.,1132,0,156
28905,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",1132,1,228
28906,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",1132,1,411


<br>

## Adding Features

In [57]:
import warnings
warnings.filterwarnings("ignore")

<br>

### Adding ID

In [58]:
# The final dataframe (will be concatenated) containing the id's
df_final = []

# Loop through each year in the dataframe
for year in years:
    
    # Make a temporary dataframe that only contains rows for this year
    df_temp = df_dropped[df_dropped.year.str.startswith(year)]

    # Reset it's index
    df_temp.reset_index(drop=True, inplace=True)

    # Add a new column with the index value
    df_temp['id'] = df_temp.index.values
    
    # Get the length of the id of the last row in the dataframe, which is used to assess how many 0's will be prefixed to the other ids
    maxNumLength = len(str(df_temp.last_valid_index()))

    # Add the updated id
    df_temp['id'] = df_temp.apply(lambda x: str(year) + "_" + addPrefix( str(x['id']), maxNumLength ), axis=1)
        
    # Append to the final dataframe
    df_final.append(df_temp)

In [59]:
# Concatenate the final dataframe
df_final = pd.concat(df_final, ignore_index=True)

# Set it's index to the new ids
df_final.set_index('id', inplace=True)

In [60]:
print(f"Length of the dataframe: {df_final.shape[0]}")
display(df_final)

Length of the dataframe: 27126


Unnamed: 0_level_0,year,start_page,end_page,path,sentence,act,section,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1892_0000,1892,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,1,0,108
1892_0001,1892,045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",1,1,476
1892_0002,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",1,2,355
1892_0003,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,They shall be paid the same as battalion staffs in the Volunteer Troops.,1,2,73
1892_0004,1892,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,"There shall also be attached to the staff the following petty officers: One Master-at-Arms, two Yeomen, one Hospital Steward, one Chief Bugler, who shall receive the same pay as the non-commissioned staff of a battalion of infantry.",1,2,233
...,...,...,...,...,...,...,...,...
1956_4308,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132 An Act To Require The Commissioner Of Agriculture Of South Carolina To Determine What Are Noxious Weeds And Plants; To Prevent Importation Into The State Of Such Weeds And Plants And To Provide A Penalty For The Violation of The Provisions Of This Act.,1132,113,258
1956_4309,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,Be it enacted by the General Assembly of the State of South Carolina Commissioner of Agriculture—regulate importation of noxious weeds—penalties SECTION 1.,1132,0,156
1956_4310,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",1132,1,228
1956_4311,1956,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",1132,1,411


<br>

### Adding the Remaining Features

In [61]:
df_final.insert(1, 'law_type', pd.NA)
df_final.insert(2, 'state', 'SOUTH CAROLINA')

In [62]:
df_final['law_type'] = df_final.apply(lambda x : addJoints(x['sentence']), axis=1)

In [63]:
def fixJoints(row_law_type, row_Act_label):
    """
    Fix incorrect labels which should be "Joint Resolution".
    
    Parameters
    ----------
    row_law_type : str
        The current row's law type.
        Ex. 'Act' or 'Joint Resolution'

    row_Act_label: str
       The current row's Act label/value.

    Returns
    -------
    str
        Either "Joint Resolution" or `row_law_type`.
    
    """
    global joint_label

    # If the row's law_type is 'Joint Resolution', then assign the label number to `joint_label`.
    # This row is already a 'Joint Resolution'
    if row_law_type == 'Joint Resolution':
        joint_label = row_Act_label
        return 'Joint Resolution'
    
    # If this row is not a 'Joint Resolution', then
    # If the row's Act label is the same as `joint_label` then that row should be a Joint Resolution.
    elif row_Act_label == joint_label:
        return 'Joint Resolution'
    
    # Otherwise, that row is an act
    else:
        return row_law_type
        
        
joint_label = -1
df_final['law_type'] = df_final.apply(lambda x: fixJoints(x['law_type'], x['act']), axis = 1)

<br>

## Some Final Touches

In [64]:
df_final.drop('year', axis=1, inplace=True)

In [65]:
cols_keep.remove('year')
cols_keep.insert(0, 'state')
cols_keep.insert(0, 'law_type')
cols_keep.insert(3, 'length')

In [66]:
cols_keep

['law_type',
 'state',
 'sentence',
 'length',
 'start_page',
 'end_page',
 'act',
 'section',
 'path']

In [67]:
df_final = df_final[cols_keep]

In [68]:
print(f"Length of the dataframe: {df_final.shape[0]}")
display(df_final)

Length of the dataframe: 27126


Unnamed: 0_level_0,law_type,state,sentence,length,start_page,end_page,act,section,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1892_0000,Act,SOUTH CAROLINA,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,108,045,045,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
1892_0001,Act,SOUTH CAROLINA,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",476,045,046,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
1892_0002,Act,SOUTH CAROLINA,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",355,046,046,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
1892_0003,Act,SOUTH CAROLINA,They shall be paid the same as battalion staffs in the Volunteer Troops.,73,046,046,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
1892_0004,Act,SOUTH CAROLINA,"There shall also be attached to the staff the following petty officers: One Master-at-Arms, two Yeomen, one Hospital Steward, one Chief Bugler, who shall receive the same pay as the non-commissioned staff of a battalion of infantry.",233,046,046,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
...,...,...,...,...,...,...,...,...,...
1956_4308,Act,SOUTH CAROLINA,1132 An Act To Require The Commissioner Of Agriculture Of South Carolina To Determine What Are Noxious Weeds And Plants; To Prevent Importation Into The State Of Such Weeds And Plants And To Provide A Penalty For The Violation of The Provisions Of This Act.,258,01448,01448,1132,113,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
1956_4309,Act,SOUTH CAROLINA,Be it enacted by the General Assembly of the State of South Carolina Commissioner of Agriculture—regulate importation of noxious weeds—penalties SECTION 1.,156,01448,01448,1132,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
1956_4310,Act,SOUTH CAROLINA,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",228,01448,01448,1132,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
1956_4311,Act,SOUTH CAROLINA,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",411,01448,01448,1132,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg


<br>

## Exporting

In [69]:
# df_final.to_csv('final_splits_testing.csv')

In [70]:
# df_test = df_final.drop(['law_type', 'state', 'length', 'end_page', 'path'], axis=1)
# df_test.to_csv(f'{year}_testing.csv')
# print(f'exporting {year}')