# Sentence Splitting
This notebook uses OCRed text for a all volume years in a folder and splits them into sentences using regular expression pattern matching.<br>
For this notebook to run, there should be an OCRed folder that should contain a .txt file, a .tsv file, and an images sub-folder (more details in the notebook).
<br>There should also be an accompanying `splitting_functs.py` file which has most of the functions required to run this notebook.

In [1]:
# from nltk.tokenize import PunktSentenceTokenizer
import nltk
import pandas as pd
import re
import sys
import multiprocessing
import os

from tqdm import tqdm  # For printing out progress bar
from numpy import nan, array_split

from splitting_functs import *

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_colwidth', None)

In [2]:
# Read all folder names in the OCR (or a specified) directory
ocred_path = '/work/otb-lab/OCRed'
# ocred_path = '/Users/nitingupta/Desktop/OTB/OCRed'

years = [name for name in os.listdir(ocred_path) if not name.startswith('.')]
years.sort()
print(years)

['1868-69', '1869-1870', '1870-1871', '1871', '1871-1872', '1872-1873', '1873', '1873-1874', '1874', '1875-76', '1877-78', '1878', '1879', '1880', '1881-82', '1883', '1884', '1885', '1886-1887', '1888', '1889', '1890', '1891', '1892', '1893', '1894', '1896', '1897', '1898', '1899', '1900', '1901', '1902', '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913', '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921', '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1958b', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968']


<br>

## Aquiring data

In [3]:
# Create an empty list for the final dataframe 
df = []

In [4]:
# Set up the progress bar
progress_bar = tqdm(total=len(years), file=sys.stderr)

for year in years:
    
    # Update the progress bar
    progress_bar.set_description(f"Processing year {year}")
    
    # This is the directory that will contain the OCRed output:
    dir_OCR = ocred_path + '/' + str(year)
    
    if 'acts_path' in globals():
        del acts_path
        
    acts_path, actsSep = getActsPaths(dir_OCR)
    
    if acts_path is None:
        sys.exit(f'acts path not found for {year}')

    with open(acts_path, 'r') as f:
        # This variable holds all the OCRed text as a String
        data = f.read()
        
    count = data.count("\n\n")+1
    if count < 100:
        sys.exit(f'Count seems low for {year}. Count: {count}')
    # print("The number of pages OCRed for {year} is: {count}".format(year = year, count = count))
    
    # Training the tokenizer
    sent_tokenizer = nltk.PunktSentenceTokenizer(data)
    sentences = sent_tokenizer.tokenize(data)  # A List of tokens/sentences as seperated by nltk's PunktSentenceTokenizer

    # Create a temporary dataframe
    df_temp = pd.DataFrame()
    df_temp["sentence"] = sentences
    
    # A flag to keep track of the first index for each year
    df_temp['first'] = False
    
    # Strip sentences of trailing and leading whitespaces
    df_temp['sentence'] = df_temp['sentence'].str.strip()
    # Remove "\n\n" from the original dataframe as they will interfere with the analysis
    df_temp['sentence'] = df_temp['sentence'].str.replace("\n\n", "", regex = False)
    
    # print("Length of the initial dataframe:", df.shape[0], "\nThis is the number of tokenized sentences.")
    
    # Remove session headers
    df_temp = removeSessionHeaders(df_temp)
    
    imgs, dir_imgs = getImgs(dir_OCR, year)
    # print("The number of image files for this year is:", len(imgs))
    
    fileType = imgs[0].split(".")[1]
    
    df_words = getWordsFrame(acts_path, actsSep)
    
    # Add an empty 'start_page' and 'end_page' column
    df_temp['start_page'] = pd.NA
    df_temp['end_page'] = pd.NA
    
    df_temp = getStartEndPages(df_temp, df_words)
    
    # Adding year
    df_temp.insert(0, 'year', year)
    
    # Get the images path
    df_temp = getImgsPath(df_temp, fileType, dir_imgs)
    
    df_temp.at[0, 'first'] = True
    
    # Append this year's dataframe to the final dataframe
    df.append(df_temp)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.set_description(f"Processed the list")
progress_bar.close()

# Convert the list to a dataframe
df = pd.concat(df, ignore_index=True)

Processed the list: 100%|██████████| 100/100 [04:41<00:00,  2.81s/it]      


In [5]:
print(f"Length of the dataframe: {df.shape[0]}")
display(df)

Length of the dataframe: 596688


Unnamed: 0,year,sentence,first,start_page,end_page,path
0,1868-69,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PRO- VIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE ME- CHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTY-TWO.",True,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
1,1868-69,"Whereas, by an Act of Congress, approved the twenty-third day of July, in the year of our Lord one thousand eight hundred and sixty-six, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixty-two, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixty-two, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the me- chanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
2,1868-69,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assem- bly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefits-of the said Act of Con- gress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixty-two, entitled “An Act donating pub- lic lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,071,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
3,1868-69,Sec. 2.,False,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg
4,1868-69,"Upon the passage of this Act, the Governor of the State is au- thorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg
...,...,...,...,...,...,...
596683,1968,"Section 14-2563.1 of the 1962 Code, relating to the purchase and sale of real estate by the Lancaster County Board of Directors, is amended by striking it and inserting : “Section 14-2563.1.",False,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
596684,1968,The board is authorized to buy any real estate needed for county purposes.,False,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
596685,1968,"The board is further authorized to sell any real estate belonging to the county, except school property, when the property is no longer needed for county purposes.",False,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
596686,1968,"No purchase or sale shall be made unless the written approval of the county board of administrators is first obtained.” SECTION 9. Time effective——This act shall take effect upon ap- proval by the Governor, except Sections 7 and 8, which shall be effective July 1, 1969.",False,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff


<br>

## Pre-Cleaning

In [6]:
# New dataframe so that the results of the matching can be compared
df_cleaned = df.copy()

# A new dictionary to keep track of the number of errors
errorsDict = {}

### Correcting Some Important Words

In [7]:
target_words = ['section']  # Add more target words
print("Correcting the following word spellings in the dataframe:", target_words)

Correcting the following word spellings in the dataframe: ['section']


In [8]:
num_cores = multiprocessing.cpu_count()
chunks = array_split(df_cleaned, num_cores)
print(f'Using {num_cores} cores.')

with multiprocessing.Pool(num_cores) as pool:
    threshold = 1.5  # Adjust the threshold as needed
    
    params = [(chunk, target_words, threshold) for chunk in chunks]    
    processed_chunks = pool.starmap(correct_chunk, params)

df_cleaned = pd.concat(processed_chunks, ignore_index=True)

Using 64 cores.


In [9]:
df_cleaned.drop(['sentence'], axis = 1, inplace=True)
df_cleaned.rename({'corrected_sentence':'sentence'}, axis=1, inplace=True)

In [10]:
df_cleaned.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PRO- VIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE ME- CHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTY-TWO.",False,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twenty-third day of July, in the year of our Lord one thousand eight hundred and sixty-six, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixty-two, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixty-two, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the me- chanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assem- bly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefits-of the said Act of Con- gress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixty-two, entitled “An Act donating pub- lic lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is au- thorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,


In [11]:
df_cleaned['org_words'].value_counts().head()

            579472
“section      8367
sections      4568
seotion       1307
secrion        682
Name: org_words, dtype: int64

<br>

### Splitting Sentences Based On "Approved ..." Phrases
Some "Approved..." phrases appear at the end of an Act. Sometimes a phrase, such as a new Act, might also by added on to the end of this phrase.
The phrases after the incorrect "Approved" phrases should be split into a new sentence.

In [12]:
approved_rgx_strings = [r'(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})', 
                        r'(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))', 
                        r'(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))', 
                        r'(approved: [a-z| |.]+, [a-z]+.)', 
                        r'(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})']

In [13]:
approved_rgx = []
for string in approved_rgx_strings:
    approved_rgx.append( re.compile(string, re.IGNORECASE) )

In [14]:
approved_rgx

[re.compile(r'(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
 re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved: [a-z| |.]+, [a-z]+.)', re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})',
 re.IGNORECASE|re.UNICODE)]

In [15]:
df_new = df_cleaned.copy()

for i, rgx in enumerate(approved_rgx_strings):
    print(f'Working on pattern {i+1} using {os.cpu_count()} cores.')
    
    # Split the DataFrame using multiprocessing
    new_rows = []

    # Create a multiprocessing pool with the number of desired processes
    with multiprocessing.Pool() as pool:
        args_list = [(row, rgx, list(df_new.columns)) for _, row in df_new.iterrows()]

        # Use starmap to pass the tuples as separate arguments to process_row
        results = pool.starmap(process_row, args_list)

    # Flatten the list of lists into a single list of split rows
    new_rows = [item for sublist in results for item in sublist]

    # Create a new DataFrame with the split rows
    df_new = pd.DataFrame(new_rows)

Working on pattern 1 using 64 cores.
Working on pattern 2 using 64 cores.
Working on pattern 3 using 64 cores.
Working on pattern 4 using 64 cores.
Working on pattern 5 using 64 cores.


In [16]:
df_new.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PRO- VIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE ME- CHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTY-TWO.",False,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twenty-third day of July, in the year of our Lord one thousand eight hundred and sixty-six, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixty-two, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixty-two, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the me- chanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assem- bly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefits-of the said Act of Con- gress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixty-two, entitled “An Act donating pub- lic lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is au- thorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,


In [17]:
if df_new.shape[0] > df_cleaned.shape[0]:
    errorsDict['Split Sentences'] = df_new.shape[0] - df_cleaned.shape[0]
    df_cleaned = df_new
    display(errorsDict)

{'Split Sentences': 1232}

<br>

### Relocating Incorrect "Approved ..." Phrases
Since "Approved..." phrases appear in different formats throughout the years, the code uses multiple Regex strings to match the first occurence.
<br>By testing on the entire corpus, these 5 strings match the formats in all years. However, there will be some outliers (errors) that can not be captured by these strings.

In [18]:
# Add the caret in the regex strings...
for i, string in enumerate(approved_rgx_strings):
    approved_rgx_strings[i] = '^' + string
approved_rgx_strings

['^(approved the [0Oo1Iil!2Z5S6G\\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
 '^(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 '^(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 '^(approved: [a-z| |.]+, [a-z]+.)',
 '^(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\\d]{1,2}(.| |,){0,1})']

In [19]:
approved_rgx = []
for string in approved_rgx_strings:
    approved_rgx.append( re.compile(string, re.IGNORECASE) )

In [20]:
approved_rgx

[re.compile(r'^(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
 re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved: [a-z| |.]+, [a-z]+.)', re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})',
 re.IGNORECASE|re.UNICODE)]

In [21]:
errorsDict['Approved phrases'] = 0

for rgx_match in approved_rgx:
    
    # Search for matches in the 'sentence' column
    matches = df_cleaned['sentence'].str.extract(rgx_match)[0]
    
    # Remove the matched patterns from sentences
    df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(rgx_match, '', n=-1)
    
    # Add matches to the end of previous sentence
    df_cleaned['sentence'] = df_cleaned['sentence'].str.cat(matches.shift(-1), 
                                                            sep=' ', na_rep='')
    
    errorsDict['Approved phrases'] += matches.count().sum()

In [22]:
errorsDict

{'Split Sentences': 1232, 'Approved phrases': 28346}

In [23]:
df_cleaned.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PRO- VIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE ME- CHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTY-TWO.",False,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twenty-third day of July, in the year of our Lord one thousand eight hundred and sixty-six, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixty-two, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixty-two, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the me- chanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assem- bly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefits-of the said Act of Con- gress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixty-two, entitled “An Act donating pub- lic lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is au- thorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,


<br>

### Removing End-Of-Line Hyphenation
Whenever a word in the sentence continues from the end of a line to the beginning of the next line and is joined by a hyphen, the OCRed sentence also contains that hyphen and a space.
For example, 'Commander-in-Chief' is OCRed as 'Com- mander-in-Chief'
The following code implements regex patterns to remove "- " in the text since each hyphenated word is split with "- ".

In [24]:
errorsDict['EOL hyphenation'] = df_cleaned['sentence'].str.count(pat = r'(—|_|-)( )*').sum()
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = r'(—|_|-)( )*',
                                                            repl = "",
                                                            regex = True)

In [25]:
errorsDict

{'Split Sentences': 1232, 'Approved phrases': 28346, 'EOL hyphenation': 653842}

In [26]:
df_cleaned.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,


<br>

### Removing Act Seperators
The horizontal lines differentiating one Act from another show up as U+2014 : EM DASH characters (one or multiple) in the OCR.
<br>For example, '——- —— AN ACT...' or '—— AN ACT...'

In [27]:
errorsDict['Act seperators'] = df_cleaned['sentence'].str.count(pat = r'^—+(?=\s*[A-Za-z])').sum()
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = r'^—+(?=\s*[A-Za-z])',
                                                            repl = '',
                                                            regex = True)

In [28]:
errorsDict

{'Split Sentences': 1232,
 'Approved phrases': 28346,
 'EOL hyphenation': 653842,
 'Act seperators': 0}

In [29]:
print(f"Length of the dataframe: {df_cleaned.shape[0]}")

Length of the dataframe: 597920


<br>

## Adding Section and Act Labels
Add Section and Act labels for each sentence.
<br><br><b>NOTE:</b> Volumes before, and including, 1894 have Act labels in their marginalias. During the marginlia removal process, these Act labels are removed. 
To assign each sentence an Act label, the code beklow utilizes Regex patterns finding Act labels in the text. Thus, for volumes with Act labels in their marginalias, the Act labels are highly inaccurate.
<br>That is not to say that Act or Section labels for other volumes will be a 100% accurate due to the nature of errors arising from marginalia removals, OCR, and non-perfect Regex matches.

In [30]:
df_updated = df_cleaned.copy()

In [31]:
start_act_labels = {
                    '1869-1870': 188, 
                    '1870-71': 301, 
                    '1872-1873': 217, 
                    '1873-1874': 388, 
                    '1873': 388, 
                    '1874': 647, 
                    '1877-78': 294, 
                    '1878': 545, 
                    '1880': 262, 
                    '1881-82': 431, 
                    '1883': 164, 
                    '1884': 415, 
                    '1886': 237, 
                    '1889': 166, 
                    '1890': 433, 
                    '1891': 670, 
                    '1893': 283, 
                    '1894': 506
}

In [32]:
def getAct(year, sentence, fallback, first, pattern, group = 1):
    """
    Get Act labels for the given sentence.
    If the match is an empty string (’’) then the last act is updated by 1.
    Else the matched act number is the new act number.

    Note: Joint Resolutions might be present in the OCRed text, 
    but are not needed here. So, to maintain continuity with Act labels, 
    Joint Resolutions are labeled as a new Act.

    Requires a lastAct variable to keep track of the previous act.
    
    Parameters
    ----------
    sentence : str
        The sentence to output the act for.
    fallback : str
        If no act is found, then return this str instead.
    pat : re.Pattern
        The pattern to search for.
    group : int
        The group to extract from the match.

    Returns
    -------
    str
        The Act number for this `sentence`.
    """  

    global lastAct
    res = pattern.search(sentence.lower())
    
    # See if `year` is in start_act_labels for a new volume start
    if first:
        if year in start_act_labels:
            lastAct = str(start_act_labels[year] - 1)
        # Else, check if a new volume starts
        else:
            lastAct = '0'
    
    # If a Joint Resolution starts with this sentence...
    if 'joint' in sentence.lower().split()[:3] and 'resolution' in sentence.lower().split()[:4]:
        lastAct = str(int(lastAct) + 1)
        return lastAct
    
    # Else if a new Act starts...
    elif res:
        if res.group(group) == '':  # Act number not given, but act is started
            lastAct = str(int(lastAct) + 1)

        else:  # Act number given
            lastAct = res.group(group)

        return lastAct
    
    return fallback

In [33]:
df_updated['act'] = None

# Compile the regex pattern
pattern = re.compile(r'^([\d]*)([. ]*)(?:an act|act)')

# Initialize lastAct
lastAct = '0'

# Apply the function to the DataFrame
df_updated['act'] = df_updated.apply(lambda x: getAct(x['year'], x['sentence'], x['act'], x['first'], pattern, 1), axis=1)

In [34]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,


<br>

In [35]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^([\d]*)([. ]*)(?:an act|act).*?(?:section)\s+([\w]{1,4})[\s.]*(?:be it)')
df_updated['section_rgx1'] = df_updated['sentence'].str.lower().str.extract(pattern)[2]

In [36]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,,


<br>

In [37]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^(S|s|E|e|r|C|c){1,}(T|t|I|i|O|o|N|n)*(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)(. |.| |){1,3}')
df_updated['section_rgx3'] = df_updated['sentence'].str.lower().str.extract(pattern)[3]

In [38]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0,,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,,,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,,,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,,,2.0
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,,,


<br>

In [39]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'(s|e|r|c){1,}(t|i|o|n)*(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)(. |.| |){1,3}$')

# Search for matches in the 'sentence' column
matches = df_updated['sentence'].str.lower().str.extract(pattern)[3]
# Add match to the next row
matches = matches.shift(1)

df_updated['section_rgx4'] = matches

# Replace the first 'None' value to NaN
df_updated.at[0, 'section_rgx4'] = nan

In [40]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0,,,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,,,,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,,,,1.0
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,,,2.0,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,,,,2.0


<br>

In [41]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'§(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)')
df_updated['section_rgx5'] = df_updated['sentence'].str.lower().str.extract(pattern)[1]

In [42]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0,,,,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,,,,,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,,,,1.0,
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,,,2.0,,
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,,,,2.0,


<br>

In [43]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^([0Oo1Iil!2Z5S6G\d]{1,3})(. |.| |)')
df_updated['section_rgx6'] = df_updated['sentence'].str.lower().str.extract(pattern)[0]

<br>

In [44]:
# Combine all section columns
df_updated['section_comb'] = df_updated['section_rgx1'].fillna(df_updated['section_rgx3']).fillna(df_updated['section_rgx4']).fillna(df_updated['section_rgx5']).fillna(df_updated['section_rgx6'])

In [45]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0,,,,,,
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,,,,,,,
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,,,,1.0,,,1.0
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,,,2.0,,,,2.0
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,,,,2.0,,,2.0


<br>

In [46]:
# Initialize lastAct
lastAct = '0'

# Go through sentences and label all occurences of new acts with their section numbers
def labelSections(row):
    """
    Label every sentence that has a new act number (something different from the last row's) 
    with 0 if a section does not already exists there.
    
    Requires lastAct variable to be initialized outside of this function.
    
    Parameters
    ----------
    row : pandas.Dataframe row
        The row to perform operation on

    Returns
    -------
    pandas.Dataframe row
        The modified row.
    """
    
    global lastAct
    
    # If at the first index or if the previous act num is not equal to this act
    if row['first'] == True or lastAct != row['act']:
        
        # If this section value is None, label with 0
        if pd.isnull(row['section_comb']):
            row['section_comb'] = 0
            
    # Update lastAct...
    lastAct = row['act']
     
    return row


# Apply the function to the DataFrame
df_updated = df_updated.apply(labelSections, axis=1)

In [47]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0,,,,,,0
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,,,,,,,0
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,,,,1.0,,,1
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,,,2.0,,,,2
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,,,,2.0,,,2


<br>

In [48]:
map_dict = {
    'o':0,
    'O':0,
    'I':1,
    'i':1,
    'l':1,
    '!':1,
    'Z':2,
    'z':2,
    'S':5,
    'G':6
}

df_updated['section_comb'] = df_updated.apply(lambda x: fixCol(x['section_comb'], map_dict), axis =1)
df_updated['act'] = df_updated.apply(lambda x: fixCol(x['act'], map_dict), axis =1)

In [49]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1.0,,,,,,0
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,,,,,,,0
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,,,,1.0,,,1
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,,,2.0,,,,2
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,,,,2.0,,,2


<br>

In [50]:
def fillMissing(value, firstIndex):
    """
    Fill in missing values for a column by setting missing values to the previous value.
    Not using ffill() from Pandas since we need to account for special cases (basically
    reseting the lastValid index) in the case that a new volume starts.
    Requires lastValid variable to be defined outside of the function.

    Parameters
    ----------
    value : str
        Value to check and fill.
    firstIndex : bool
        A flag for whether this row's value is the first value of a volume.
        Basically whether a new volume is starting.

    Returns
    -------
    str
        The new value.
    """

    global lastValid

    # If at the first index
    if firstIndex == True:
            
        # If a value doesn't exist, set that value and lastValid to 0
        if pd.isnull(value):
            value = 0
        
        # Set the lastValid's value to this value
        lastValid = value
        return value        


    # Else, if not at the first index
    # If a value does not exist
    if pd.isnull(value):
        # Set value to the lastValid value
        value = lastValid

    # If a value exists, that update lastValid
    else:
        lastValid = value

    return value

In [51]:
lastValid = 0
df_updated['section_comb'] = df_updated.apply(lambda x: fillMissing(x['section_comb'], x['first']), axis =1)

lastValid = 0
df_updated['act'] = df_updated.apply(lambda x: fillMissing(x['act'], x['first']), axis =1)

In [52]:
df_updated.head()

Unnamed: 0,year,first,start_page,end_page,path,sentence,flag,org_words,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_rgx6,section_comb
0,1868-69,True,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",False,,1,,,,,,0
1,1868-69,False,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",False,,1,,,,,,0
2,1868-69,False,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",False,,1,,,1.0,,,1
3,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,False,,1,,2.0,,,,2
4,1868-69,False,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",False,,1,,,2.0,,,2


<br>

In [53]:
df_updated.rename({'section_comb':'section'}, axis=1, inplace=True)

In [54]:
cols_keep = ['year', 'sentence', 'start_page', 'end_page', 'act', 'section', 'path']
df_updated.drop(
    list(set(df_updated.columns.tolist()) - set(cols_keep)), axis = 1, inplace=True)

In [55]:
print(f"Length of the dataframe: {df_updated.shape[0]}")
display(df_updated)

Length of the dataframe: 597920


Unnamed: 0,year,start_page,end_page,path,sentence,act,section
0,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT AccEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",1,0
1,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1,0
2,1868-69,071,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",1,1
3,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,1,2
4,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",1,2
...,...,...,...,...,...,...,...
597915,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"Section 142563.1 of the 1962 Code, relating to the purchase and sale of real estate by the Lancaster County Board of Directors, is amended by striking it and inserting : section 142563.1.",1374,1425
597916,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,The board is authorized to buy any real estate needed for county purposes.,1374,1425
597917,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"The board is further authorized to sell any real estate belonging to the county, except school property, when the property is no longer needed for county purposes.",1374,1425
597918,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"No purchase or sale shall be made unless the written approval of the county board of administrators is first obtained.” SECTION 9. Time effectiveThis act shall take effect upon approval by the Governor, except section 7 and 8, which shall be effective July 1, 1969. Approved the 12th day of July, 1968.",1374,1425


<br>

## Post-Cleaning
Some cleaning after adding Section and Acts.

In [56]:
def upperIfNeeded(sentence, ratio = 0.50):
    """
    Convert the given sentence list into an uppercase sentence list
    if the ratio of uppercase words (not including the ones with a mix of digits 
    or words like "SECTION") to the total words is greater than a fixed value.
    Needs an `uppered` variable to be defined outside of this scope.
    
    Parameters
    ----------
    sentence: str
         A str of sentence to check and convert to uppercase
        
    Returns
    -------
    str
        If check is approved the return an uppercase version of str.
        Else return the sentence.
    """

    global uppered
    
    # A count of the number of already uppercased words
    count = 0
    
    # Check whether the word consists of only letters,
    # has a length greater than 1, is uppercase, and 
    # isn't "SECTION"
    for word in sentence.split(" "):
        if word.isalpha() and len(word) > 1 and word.isupper() and word != "SECTION":
            count += 1

    # If the count to words ratio is greater
    # return all uppercase words
    if (count/len(sentence.split(" ")) > ratio):
        uppered += 1
        return sentence.upper()        
    
    # Else, return the original sentence list
    return sentence

In [57]:
uppered = 0
df_updated['sentence'] = df_updated.apply(lambda x: upperIfNeeded(x['sentence']), axis=1)
errorsDict['Uppercased'] = uppered

In [58]:
errorsDict

{'Split Sentences': 1232,
 'Approved phrases': 28346,
 'EOL hyphenation': 653842,
 'Act seperators': 0,
 'Uppercased': 2497}

<br>

## Character Length
Add the character length feature.
<br>This is added here because the lengths of the sentences might have changed during the cleaning process above.

In [59]:
df_updated["length"] = df_updated['sentence'].str.len()

In [60]:
df_updated.head()

Unnamed: 0,year,start_page,end_page,path,sentence,act,section,length
0,1868-69,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT ACCEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",1,0,285
1,1868-69,71,71,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1,0,1161
2,1868-69,71,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",1,1,979
3,1868-69,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,Sec. 2.,1,2,12
4,1868-69,72,72,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",1,2,197


<br>

### Removing Sentences With Low Character Length

Get rid of sentences with a low number of characters as they might not form meaningful sentences.
<br>Define a cutoff for the sentences. All sentences belows this length will be removed.
<br>Our research has shown that 30 character limit seems to be optimal for keeping informative sentences in the corpus.

In [61]:
cut_len = 25

In [62]:
# Initial length
ilen = df_updated.shape[0]

In [63]:
df_updated = df_updated[ df_updated["length"] > cut_len ]
print("Length of the cleaned dataframe: ", df_updated.shape[0])
print("Reduction of about {:.2f}%".format( (1 - df_updated.shape[0]/ilen) * 100))

Length of the cleaned dataframe:  469638
Reduction of about 21.45%


In [64]:
df_updated.reset_index(drop=True, inplace=True)

In [65]:
df_updated

Unnamed: 0,year,start_page,end_page,path,sentence,act,section,length
0,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT ACCEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",1,0,285
1,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1,0,1161
2,1868-69,071,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",1,1,979
3,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",1,2,197
4,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight.",1,1,122
...,...,...,...,...,...,...,...,...
469633,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,Buy and sell real estate.,1374,8,30
469634,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"Section 142563.1 of the 1962 Code, relating to the purchase and sale of real estate by the Lancaster County Board of Directors, is amended by striking it and inserting : section 142563.1.",1374,1425,192
469635,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,The board is authorized to buy any real estate needed for county purposes.,1374,1425,79
469636,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"The board is further authorized to sell any real estate belonging to the county, except school property, when the property is no longer needed for county purposes.",1374,1425,168


<br>

## Dropping duplicates

In [66]:
print(f"The number of dropped sentences is {df_updated[df_updated.duplicated(subset=['sentence'])].shape[0]}")

The number of dropped sentences is 74173


In [67]:
df_dropped = df_updated.drop_duplicates(subset=['sentence'])

In [68]:
print(f"Length of the dataframe: {df_dropped.shape[0]}")
display(df_dropped)

Length of the dataframe: 395465


Unnamed: 0,year,start_page,end_page,path,sentence,act,section,length
0,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT ACCEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",1,0,285
1,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1,0,1161
2,1868-69,071,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",1,1,979
3,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",1,2,197
4,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight.",1,1,122
...,...,...,...,...,...,...,...,...
469633,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,Buy and sell real estate.,1374,8,30
469634,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"Section 142563.1 of the 1962 Code, relating to the purchase and sale of real estate by the Lancaster County Board of Directors, is amended by striking it and inserting : section 142563.1.",1374,1425,192
469635,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,The board is authorized to buy any real estate needed for county purposes.,1374,1425,79
469636,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"The board is further authorized to sell any real estate belonging to the county, except school property, when the property is no longer needed for county purposes.",1374,1425,168


<br>

## Adding Features

In [69]:
import warnings
warnings.filterwarnings("ignore")

<br>

### Adding ID

In [70]:
# The final dataframe (will be concatenated) containing the id's
df_final = []

# Loop through each year in the dataframe
for year in years:
    
    # Make a temporary dataframe that only contains rows for this year
    df_temp = df_dropped[df_dropped.year.str.startswith(year)]

    # Reset it's index
    df_temp.reset_index(drop=True, inplace=True)

    # Add a new column with the index value
    df_temp['id'] = df_temp.index.values
    
    # Get the length of the id of the last row in the dataframe, which is used to assess how many 0's will be prefixed to the other ids
    maxNumLength = len(str(df_temp.last_valid_index()))

    # Add the updated id
    df_temp['id'] = df_temp.apply(lambda x: str(year) + "_" + addPrefix( str(x['id']), maxNumLength ), axis=1)
        
    # Append to the final dataframe
    df_final.append(df_temp)

In [71]:
# Concatenate the final dataframe
df_final = pd.concat(df_final, ignore_index=True)

# Set it's index to the new ids
df_final.set_index('id', inplace=True)

In [72]:
print(f"Length of the dataframe: {df_final.shape[0]}")
display(df_final)

Length of the dataframe: 403481


Unnamed: 0_level_0,year,start_page,end_page,path,sentence,act,section,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1868-69_0000,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT ACCEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",1,0,285
1868-69_0001,1868-69,071,071,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1,0,1161
1868-69_0002,1868-69,071,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",1,1,979
1868-69_0003,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",1,2,197
1868-69_0004,1868-69,072,072,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight.",1,1,122
...,...,...,...,...,...,...,...,...
1968_6685,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,Buy and sell real estate.,1374,8,30
1968_6686,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"Section 142563.1 of the 1962 Code, relating to the purchase and sale of real estate by the Lancaster County Board of Directors, is amended by striking it and inserting : section 142563.1.",1374,1425,192
1968_6687,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,The board is authorized to buy any real estate needed for county purposes.,1374,1425,79
1968_6688,1968,1000,1000,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,"The board is further authorized to sell any real estate belonging to the county, except school property, when the property is no longer needed for county purposes.",1374,1425,168


<br>

### Adding the Remaining Features

In [73]:
df_final.insert(1, 'law_type', pd.NA)
df_final.insert(2, 'state', 'SOUTH CAROLINA')

In [74]:
df_final['law_type'] = df_final.apply(lambda x : addJoints(x['sentence']), axis=1)

In [75]:
def fixJoints(row_law_type, row_Act_label):
    """
    Fix incorrect labels which should be "Joint Resolution".
    
    Parameters
    ----------
    row_law_type : str
        The current row's law type.
        Ex. 'Act' or 'Joint Resolution'

    row_Act_label: str
       The current row's Act label/value.

    Returns
    -------
    str
        Either "Joint Resolution" or `row_law_type`.
    
    """
    global joint_label

    # If the row's law_type is 'Joint Resolution', then assign the label number to `joint_label`.
    # This row is already a 'Joint Resolution'
    if row_law_type == 'Joint Resolution':
        joint_label = row_Act_label
        return 'Joint Resolution'
    
    # If this row is not a 'Joint Resolution', then
    # If the row's Act label is the same as `joint_label` then that row should be a Joint Resolution.
    elif row_Act_label == joint_label:
        return 'Joint Resolution'
    
    # Otherwise, that row is an act
    else:
        return row_law_type
        
        
joint_label = -1
df_final['law_type'] = df_final.apply(lambda x: fixJoints(x['law_type'], x['act']), axis = 1)

<br>

## Some Final Touches

In [76]:
df_final.drop('year', axis=1, inplace=True)

In [77]:
cols_keep.remove('year')
cols_keep.insert(0, 'state')
cols_keep.insert(0, 'law_type')
cols_keep.insert(3, 'length')

In [78]:
cols_keep

['law_type',
 'state',
 'sentence',
 'length',
 'start_page',
 'end_page',
 'act',
 'section',
 'path']

In [79]:
df_final = df_final[cols_keep]

In [80]:
print(f"Length of the dataframe: {df_final.shape[0]}")
display(df_final)

Length of the dataframe: 403481


Unnamed: 0_level_0,law_type,state,sentence,length,start_page,end_page,act,section,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1868-69_0000,Act,SOUTH CAROLINA,"AN ACT ACCEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",285,071,071,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
1868-69_0001,Act,SOUTH CAROLINA,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1161,071,071,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
1868-69_0002,Act,SOUTH CAROLINA,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",979,071,072,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
1868-69_0003,Act,SOUTH CAROLINA,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",197,072,072,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg
1868-69_0004,Act,SOUTH CAROLINA,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight.",122,072,072,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg
...,...,...,...,...,...,...,...,...,...
1968_6685,Act,SOUTH CAROLINA,Buy and sell real estate.,30,1000,1000,1374,8,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
1968_6686,Act,SOUTH CAROLINA,"Section 142563.1 of the 1962 Code, relating to the purchase and sale of real estate by the Lancaster County Board of Directors, is amended by striking it and inserting : section 142563.1.",192,1000,1000,1374,1425,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
1968_6687,Act,SOUTH CAROLINA,The board is authorized to buy any real estate needed for county purposes.,79,1000,1000,1374,1425,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
1968_6688,Act,SOUTH CAROLINA,"The board is further authorized to sell any real estate belonging to the county, except school property, when the property is no longer needed for county purposes.",168,1000,1000,1374,1425,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff


<br>

## Exporting

In [95]:
saveDir = './results/'

In [96]:
# Check if the direcotry exists
if os.path.exists(saveDir):
    print('Directory exists!')
else:
#     print(f'Creating directory at {saveDir}')
#     os.makedirs(saveDir)
    print(f'Directory does not exist!')

Directory exists!


In [97]:
# # Save one csv containg all years
# df_final.to_csv(f'{saveDir}final_splits.csv')

In [98]:
# # Save a csv for each year
# for year in years:
#     df_final[df_final.index.to_series().str.startswith(year)].to_csv(f'{saveDir}/{year}_final_splits.csv')