# Sentence Splitting
This notebook uses OCRed text for a all volume years in a folder and splits them into sentences using regular expression pattern matching.<br>
For this notebook to run, there should be an OCRed folder that should contain a .txt file, a .tsv file, and an images sub-folder (more details in the notebook).

In [1]:
from nltk.tokenize import PunktSentenceTokenizer
import pandas as pd
from os import listdir
import re
import sys
from tqdm import tqdm  # For printing out progress bar
from numpy import nan
from spellchecker import SpellChecker

pd.set_option('display.max_colwidth', None)

In [2]:
# Read all folder names in the OCR (or a specified) directory
# ocred_path = '/work/otb-lab/OCRed'
ocred_path = '/Users/nitingupta/Desktop/OTB/OCRed'

years = [name for name in listdir(ocred_path) if not name.startswith('.')]
years.sort()
print(years)

['1892', '1893', '1894', '1901', '1918', '1921', '1928', '1948', '1956']


<br>

## Aquiring data

In [3]:
def getActsPaths(dir_OCR):
    """
    This function searches the given OCR directory path to find a path to a 
    text file containing the OCRed output for that year's Acts.
    The function accounts for the fact that there might be many variations in 
    the filename since the Acts and Joints could be seperate or mixed.
    
    Note:
        If the Acts and Joints are seperate for the give year, 
        the acts path will likely contain a filename in the format: 
        `{year}_Acts.txt`.
    
        However, if the Acts and Joints are mixed for the give year, 
        the acts path will might contain a filename as followes: 
        `{year}_both.txt` or `{year}_Acts_Joints.txt`.

    Parameters
    ----------
    dir_OCR : str
        The path for a year's OCR folder.

    Returns
    -------
    tuple of (str, bool)
        The path to the acts text file which is appended to `dir_OCR`.
        A flag identifying whether the Acts and Joints are seperate 
        for this year's volume. See note above.
    """

    # If the Acts and Joints were seperate for the year
    try:
        # Lists of strings that should and should not be in the file name
        mustContain = ['txt']
        eitherContain = ['act', 'acts']
        notContain = ['joint', 'joints', 'concurrent', 'concurrents', 
                      'bill', 'bills']

        for file in listdir(dir_OCR):
            file_lowered = file.lower()

            # Check if each of the mustContain strings are in the name
            # and any of the eitherContain strings are in the name
            # and each of the notContain strings are not in the name
            if all([x in file_lowered for x in mustContain]) and \
               any([x in file_lowered for x in eitherContain]) and \
               all([x not in file_lowered for x in notContain]):
                acts_path = dir_OCR + "/" + file
                break

        # If a path was found
        if 'acts_path' in locals():
            # The flag being True means that the Acts and Joints are seperate
            return (acts_path, True)
        else:
            raise Exception

    # However, if the Acts and Joints were not seperate for this year, 
    # then a FileNotFoundError will be returned for the above code.
    # So, catch that error and read in the combined file
    except:
        # Some years might contain 'both' as a keyword in the filename,
        # but some might contain 'acts_joints' insteads. 
        # So try both possibilities.

        # Try for 'both'
        try:
            # Lists of strings that should and should not be in the file name
            doContain = ['txt', 'both']
            notContain = ['joint', 'joints', 'concurrent', 
                          'concurrents', 'bill', 'bills']

            for file in listdir(dir_OCR):
                file_lowered = file.lower()

                # Check if each of the doContain strings are in the name and 
                # each of the notContain strings are not in the name
                if all([x in file_lowered for x in doContain]) and \
                   all([x not in file_lowered for x in notContain]):
                    acts_path = dir_OCR + "/" + file
                    break
            
            # If a path was found
            if 'acts_path' in locals():
                # True means that the Acts and Joints are seperate
                return (acts_path, True)
            else:
                raise Exception
                    
        # Try 'acts_joints'
        except:
        
            # Lists of strings that should and should not be in the file name
            mustContain = ['txt']
            eitherContain1 = ['act', 'acts']
            eitherContain2 = ['joints', 'joint']
            notContain = ['concurrent', 'concurrents', 'bill', 'bills']

            for file in listdir(dir_OCR):
                file_lowered = file.lower()

                # Check if each of the mustContain strings are in the name
                # and any of the eitherContain strings are in the name
                # and each of the notContain strings are not in the name
                if all([x in file_lowered for x in mustContain]) and \
                   any([x in file_lowered for x in eitherContain1]) and \
                   any([x in file_lowered for x in eitherContain2]) and \
                   all([x not in file_lowered for x in notContain]):
                    acts_path = dir_OCR + "/" + file
                    break

                    
        # After either of the above (nested) try-except statements, 
        # execute the following...
        if 'acts_path' in locals():
            # False means that the Acts and Joints are not seperate
            return (acts_path, False)
        else:
            return (None, False)

In [4]:
def removeSessionHeaders(df):
    """
    This function removes session headers (containing information about the
    session held) which appear at the start of each volume.
    To remove them, the code removes all sentences until the first valid 
    sentence appears, which usually starts with "An Acts ...".
    
    Parameters
    ----------
    df : pandas.Dataframe
        The dataframe to remove session headers from.

    Returns
    -------
    pandas.Dataframe
        The modified dataframe which session headers removed.
    """
    
    for i, sent in enumerate(df['sentence']):

        # If the sentence with "an" is found, exit the loop
        if 'an act' in sent.lower().strip():
               break

        # Else, disregard the sentence since it does not start with "an"
        df.drop(index=i, inplace=True)
    
    # Reset the index
    df.reset_index(drop=True, inplace=True)
    
    return df

In [5]:
def getImgs(dir_OCR):
    """
    This function searches the gives OCR directory path for the images 
    sub-folder. It then returns the path this sub-folder and the list of all 
    images contained in it.

    Parameters
    ----------
    dir_OCR : str
        The path for a year's OCR folder.

    Returns
    -------
    tuple of (list, str)
        The list of all images contained in the images sub-folder.
        Str path (an extension of `dir_OCR`) to the images sub-folder.
    """
    
    # Since many variation might exists, nested try-excepts are needed.
    try:
        dir_imgs = dir_OCR + "/images"
        imgs = listdir(dir_imgs)

    except FileNotFoundError:
        try:
            dir_imgs = dir_OCR + "/Images"
            imgs = listdir(dir_imgs)

        except FileNotFoundError:
            try:
                dir_imgs = dir_OCR + "/images.zip"
                imgs = listdir(dir_imgs)

            except FileNotFoundError:
                try:
                    dir_imgs = dir_OCR + "/Images.zip"
                    imgs = listdir(dir_imgs)

                except FileNotFoundError:
                    dir_imgs = dir_OCR + "/" + year
                    imgs = listdir(dir_imgs)
    
    # Only keep images which have a valid extensions
    imgs = [img for img in imgs if "jpg" in img or "tiff" in img or "JPG" in img or "TIFF" in img]
    imgs.sort()
    return imgs, dir_imgs

In [6]:
def getWordsFrame(acts_path, actsSep):
    """
    This function reads in the path to the acts file and returns a Pandas 
    Dataframe containing the each word in the corpus and its filename.
    
    Parameters
    ----------
    acts_path : str
        The path to the acts text file.
    actsSep : bool
        Flag for whether the the Acts and Joints are seperate for this volume.

    Returns
    -------
    pandas.Dataframe
        A dataframe containing words and their page numbers (filenames).
    """

    # Most likely, the tsv file with be similar to the 'acts_path'
    # but will have '_data' added before the file extension
    # Ex. if 'acts_path' = '1928_Acts.txt', 
    # then 'word_path' = '1928_Acts_data.tsv'
    try:
        words_path = acts_path.split('.')[0] + '_data.tsv'
        # print(words_path)
        df_words = pd.read_table(words_path)

    # If that file does not exist, then search
    except:

        if actsSep:
            # Lists of strings that should and should not be in the file name
            mustContain = ['tsv', 'data']
            eitherContain = ['act', 'acts']
            notContain = ['joint', 'joints', 'concurrent', 'concurrents', 
                          'bill', 'bills']
            
            for file in listdir(dir_OCR):
                file_lowered = file.lower()

                # Check if each of the mustContain strings are in the name
                # and any of the eitherContain strings are in the name
                # and each of the notContain strings are not in the name
                if all([x in file_lowered for x in mustContain]) and \
                   any([x in file_lowered for x in eitherContain]) and \
                   all([x not in file_lowered for x in notContain]):
                    words_path = dir_OCR + '/' + file                
            
        else:
            # Lists of strings that should and should not be in the file name
            mustContain = ['tsv', 'data', 'both']
            eitherContain = ['joint', 'joints']
            notContain = ['concurrent', 'concurrents', 'bill', 'bills']

            for file in listdir(dir_OCR):
                file_lowered = file.lower()

                # Check if each of the mustContain strings are in the name
                # and any of the eitherContain strings are in the name
                # and each of the notContain strings are not in the name
                if all([x in file_lowered for x in mustContain]) and \
                   any([x in file_lowered for x in eitherContain]) and \
                   all([x not in file_lowered for x in notContain]):
                    words_path = dir_OCR + '/' + file               

        df_words = pd.read_table(words_path)

        
    # Drop the columns which are unessecary for our analysis
    df_words.drop(columns=["left", "top", "width", "height", "conf"], inplace=True)

    # Drop the rows which don't contain a word in the "text" column
    df_words.dropna(inplace=True)
    # Reset index
    df_words.reset_index(drop=True)

    # Relabel the "name" column to "page" column
    df_words.rename(columns={"name": "page"}, inplace=True)

    return df_words    

In [7]:
def getStartEndPages(df, df_words):
    """
    This function reads in given dataframes and fills in the start and end 
    pages for each sentence.
    The data in the two dataframes must match.
    
    Parameters
    ----------
    df : pandas.Dataframe
        The original dataframe which will have start and end pages assigned for 
        each row.
    df_words : pandas.Dataframe
        A dataframe containing words and their page numbers (filenames).

    Returns
    -------
    pandas.Dataframe
        A dataframe with labelled start and end pages.
    """

    # Tracker for df_words:
    words_trkr = 0

    # Loop over the original dataframe
    for i in range(0, df.shape[0]):

        # For each sentence, extract the first and last word
        tmp_sentence = df.iloc[i]['sentence'].split(" ")
        start, last = tmp_sentence[0], tmp_sentence[-1]

        # Get the page number for the start and end word
        try:
            start_page = df_words.iloc[words_trkr]['page']
        except IndexError:
            try:
                words_trkr -= len(tmp_sentence)
                start_page = df_words.iloc[words_trkr]['page']
            except:
                start_page = df_words['page'].iloc[-1]

        try:
            end_page = df_words.iloc[words_trkr + len(tmp_sentence)]['page']
        except IndexError:
            try:
                end_page = df_words.iloc[words_trkr]['page']
            except:
                end_page = df_words['page'].iloc[-1]


        # Remove the filename from the pages:
        start_page = start_page.split(".")[0]
        end_page = end_page.split(".")[0]


        # Assign the page number to their respective columns in the dataframe
        df.at[i, 'start_page'] = start_page
        df.at[i, 'end_page'] = end_page

        # Update tracker
        words_trkr += len(tmp_sentence)
    
    return df

In [8]:
def getImgsPath(df):
    """
    This function adds an online image path to each sentence based on its 
    start page.
    
    Parameters
    ----------
    df : pandas.Dataframe
        The dataframe containing start pages for which links will be assigned.

    Returns
    -------
    pandas.Dataframe
        A dataframe with online links for each sentence.
    """
    
    pre_path = 'https://emailsc.sharepoint.com/:i:/r/sites/' + \
        'COTEAM-ULIB-OntheBooks/Shared%20Documents/General/' + \
        "/".join(dir_imgs.split("/")[-3:]) + '/'
    
    df['path'] = pre_path + df['start_page'].astype(str) + '.' + fileType
    return df

In [9]:
# Create an empty list for the final dataframe 
df = []

In [10]:
# Set up the progress bar
progress_bar = tqdm(total=len(years), file=sys.stderr)

for year in years:
    
    # Update the progress bar
    progress_bar.set_description(f"Processing year {year}")
    
    # This is the directory that will contain the OCRed output:
    dir_OCR = ocred_path + '/' + str(year)
    
    if 'acts_path' in globals():
        del acts_path
        
    acts_path, actsSep = getActsPaths(dir_OCR)
    
    if acts_path is None:
        sys.exit(f'acts path not found for {year}')

    with open(acts_path, 'r') as f:
        # This variable holds all the OCRed text as a String
        data = f.read()
        
    count = data.count("\n\n")+1
    if count < 100:
        sys.exit(f'Count seems low for {year}. Count: {count}')
    # print("The number of pages OCRed for {year} is: {count}".format(year = year, count = count))
    
    # Training the tokenizer
    sent_tokenizer = PunktSentenceTokenizer(data)
    sentences = sent_tokenizer.tokenize(data)  # A List of tokens/sentences as seperated by nltk's PunktSentenceTokenizer

    # Create a temporary dataframe
    df_temp = pd.DataFrame()
    df_temp["sentence"] = sentences
    
    # A flag to keep track of the first index for each year
    df_temp['first'] = False
    
    # Strip sentences of trailing and leading whitespaces
    df_temp['sentence'] = df_temp['sentence'].str.strip()
    # Remove "\n\n" from the original dataframe as they will interfere with the analysis
    df_temp['sentence'] = df_temp['sentence'].str.replace("\n\n", "", regex = False)
    
    # print("Length of the initial dataframe:", df.shape[0], "\nThis is the number of tokenized sentences.")
    
    # Remove session headers
    df_temp = removeSessionHeaders(df_temp)
    
    imgs, dir_imgs = getImgs(dir_OCR)
    # print("The number of image files for this year is:", len(imgs))
    
    fileType = imgs[0].split(".")[1]
    
    df_words = getWordsFrame(acts_path, actsSep)
    
    # Add an empty 'start_page' and 'end_page' column
    df_temp['start_page'] = pd.NA
    df_temp['end_page'] = pd.NA
    
    df_temp = getStartEndPages(df_temp, df_words)
    
    # Adding year
    df_temp.insert(0, 'year', year)
    
    # Get the images path
    df_temp = getImgsPath(df_temp)
    
    df_temp.at[0, 'first'] = True
    
    # Append this year's dataframe to the final dataframe
    df.append(df_temp)
    
    # Update the progress bar
    progress_bar.update(1)

# Close the progress bar
progress_bar.set_description(f"Processed the list")
progress_bar.close()

# Convert the list to a dataframe
df = pd.concat(df, ignore_index=True)

Processed the list: 100%|███████████████████████████████| 9/9 [00:08<00:00,  1.11it/s]


In [11]:
df.shape[0]

37387

In [12]:
df

Unnamed: 0,year,sentence,first,start_page,end_page,path
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Repre- sentatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
2,1892,Src.,False,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
3,1892,2.,False,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
4,1892,"The officers of this battalion shall consist of a Lieu- tenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
...,...,...,...,...,...,...
37382,1956,"The Commissioner of Agriculture of South Caro- lina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",False,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
37383,1956,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of im- porting noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confine- ment.",False,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
37384,1956,Repeal: SECTION 2. All acts or parts of acts inconsistent herewith are hereby repealed.,False,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
37385,1956,Time effective: SECTION 3. This act shall take effect upon its approval by the Governor.,False,01448,01449,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg


<br>

## Pre-Cleaning

In [13]:
# New dataframe so that the results of the matching can be compared
df_cleaned = df.copy()

# A new dictionary to keep track of the number of errors
errorsDict = {}

<br>

### Correcting Some Important Words
Correcting the following word spellings in the dataframe:
- Section

In [14]:
spell = SpellChecker(distance=1)

In [15]:
# Define a function to correct words in the 'sentence'
def correct_words(text):
    flag = False
    words = text.split()
    corrected_words = []
    
    for word in words:
        corrected_word = word
    
        # Check if the word is a minor misspelling of "section"
        if spell.correction(word) == "section":
            corrected_word = "section"
            flag = True
        
        corrected_words.append(corrected_word)

    return (' '.join(corrected_words), flag)

In [16]:
df_cleaned['corrected'] = False
df_cleaned['sentence'], df_cleaned['corrected'] = df.apply(lambda x: correct_words(x['sentence']), axis=1, result_type='expand').T.values

### Removing End-Of-Line Hyphenation

In [17]:
errorsDict['EOL hyphenation'] = df_cleaned['sentence'].str.count(pat = '[-][ ]').sum()
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = '[-][ ]',
                                                            repl = "",
                                                            regex = True)

In [18]:
errorsDict

{'EOL hyphenation': 23151}

### Relocating Incorrect "Approved ..." Phrases

In [19]:
rgx_match = re.compile(
    r'^(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){1,2} day of [a-z]+, a\. d\. .{4}(. |.| |)\b)|(approved [a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){1,2}, a\. d\. .{4}(. |.| |)\b)', 
    flags=re.IGNORECASE)

# Search for matches in the 'sentence' column
matches = df_cleaned['sentence'].str.extract(rgx_match)

# Remove the matched patterns from sentences
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(rgx_match, '', n=-1)

# Add matches to the previous sentence
df_cleaned['sentence'] = df_cleaned['sentence'].str.cat(matches.shift(-1), 
                                                        sep=' ', na_rep='')

errorsDict['Approved phrases'] = matches.count().sum()

In [20]:
errorsDict

{'EOL hyphenation': 23151, 'Approved phrases': 5052}

### Removing Act Seperators

In [21]:
errorsDict['Act seperators'] = df_cleaned['sentence'].str.count(pat = r'^—+(?=\s*[A-Za-z])').sum()
df_cleaned['sentence'] = df_cleaned['sentence'].str.replace(pat = r'^—+(?=\s*[A-Za-z])',
                                                            repl = '',
                                                            regex = True)

In [22]:
errorsDict

{'EOL hyphenation': 23151, 'Approved phrases': 5052, 'Act seperators': 216}

In [23]:
df_cleaned.shape[0]

37387

<br>

## Adding Section and Act Labels

In [24]:
df_updated = df_cleaned.copy()

<br>

In [25]:
def getAct(sentence, fallback, pat, group = 1):
    """
    Get Act labels for the given sentence.
    If the match is an empty string (’’) then the last act is updated by 1.
    Else the matched act number is the new act number.

    Requires a lastAct variable to keep track of the previous act.
    
    Parameters
    ----------
    sentence : str
        The sentence to output the act for.
    fallback : str
        If no act is found, then return this str instead.
    pat : re.Pattern
        The pattern to search for.
    group : int
        The group to extract from the match.

    Returns
    -------
    str
        The Act number for this `sentence`.
    """  

    global lastAct
    res = pattern.search(sentence.lower())
    
    if res:    
        if res.group(group) == '':  # Act number not given, but act is started
            lastAct = str(int(lastAct) + 1)
        else:
            lastAct = res.group(group)

        return lastAct

    return fallback

In [26]:
df_updated['act'] = None

# Compile the regex pattern
pattern = re.compile(r'^([\d]*)([. ]*)(?:an act|act)')
# Initialize lastAct
lastAct = '0'

# Apply the function to the DataFrame
df_updated['act'] = df_updated.apply(lambda x: getAct(x['sentence'], x['act'], pattern), axis=1)

In [27]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,


<br>

In [28]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^([\d]*)([. ]*)(?:an act|act).*?(?:section)\s+([\w]{1,4})[\s.]*(?:be it)')
df_updated['section_rgx1'] = df_updated['sentence'].str.lower().str.extract(pattern)[2]

In [29]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0,
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,,
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,


<br>

In [30]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'^(S|s|E|e|r|C|c){1,}(T|t|I|i|O|o|N|n)*(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)(. |.| |){1,3}')
df_updated['section_rgx3'] = df_updated['sentence'].str.lower().str.extract(pattern)[3]

In [31]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1,section_rgx3
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0,,
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,,,1.0
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,


<br>

In [32]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'(s|e|r|c){1,}(t|i|o|n)*(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)(. |.| |){1,3}$')

# Search for matches in the 'sentence' column
matches = df_updated['sentence'].str.lower().str.extract(pattern)[3]
# Add match to the next row
matches = matches.shift(1)

df_updated['section_rgx4'] = matches

# Replace the first 'None' value to NaN
df_updated.at[0, 'section_rgx4'] = nan

In [33]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1,section_rgx3,section_rgx4
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0,,,
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,,,1.0,
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,


<br>

In [34]:
# Go through sentences and label some occurences of new sections with their numbers
pattern = re.compile(r'§(\.|,|:|;| ){0,2}([\d]{1,3}[\w]?)')
df_updated['section_rgx5'] = df_updated['sentence'].str.lower().str.extract(pattern)[1]

In [35]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0,,,,
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,,,1.0,,
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,


<br>

In [36]:
# Combine 'section_rgx1', 'section_rgx3', 'section_rgx5', and 'section_rgx4'
df_updated['section_comb'] = df_updated['section_rgx1'].fillna(df_updated['section_rgx3']).fillna(df_updated['section_rgx4']).fillna(df_updated['section_rgx5'])

In [37]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_comb
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0,,,,,
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,,,1.0,,,1.0
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,


<br>

In [38]:
# Initialize lastAct
lastAct = '0'

# Go through sentences and label all occurences of new acts with their section numbers
def labelSections(row):
    """
    Label every sentence that has a new act number (something different from the last row's) 
    with 0 if a section does not already exists there.
    
    Requires lastAct variable to be initialized outside of this function.
    
    Parameters
    ----------
    row : pandas.Dataframe row
        The row to perform operation on

    Returns
    -------
    pandas.Dataframe row
        The modified row.
    """
    
    global lastAct
    
    # If at the first index or if the previous act num is not equal to this act
    if row['first'] == True or lastAct != row['act']:
        
        # If this section value is None, label with 0
        if pd.isnull(row['section_comb']):
            row['section_comb'] = 0
            
    # Update lastAct...
    lastAct = row['act']
     
    return row


# Apply the function to the DataFrame
df_updated = df_updated.apply(labelSections, axis=1)

In [39]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_comb
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0,,,,,0.0
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,,,1.0,,,1.0
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,


<br>

In [40]:
map_dict = {
    'o':0,
    'O':0,
    'I':1,
    'i':1,
    'l':1,
    '!':1,
    'Z':2,
    'z':2,
    'S':5,
    'G':6
}

def fixCol(colValue, map_dict):
    """
    Fix the given value, if required, by replacing each letter with the
    given replacements from the map_dict.

    Parameters
    ----------
    colValue : str
        The value to check and replace.
    map_dict : dict
        The dictionary containing mappings for incorrect letters.

    Returns
    -------
    str
        The fixed value.
    """

    if pd.isnull(colValue):
        return colValue
    
    ret = ''
    
    # Since a number might be more than one character, such as '15',
    # iterate over each character
    for char in str(colValue):
        if char in map_dict:
            ret += str(map_dict[char])
        else:
            ret += char
        
    return ret

In [41]:
df_updated['section_comb'] = df_updated.apply(lambda x: fixCol(x['section_comb'], map_dict), axis =1)
df_updated['act'] = df_updated.apply(lambda x: fixCol(x['act'], map_dict), axis =1)

In [42]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_comb
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1.0,,,,,0.0
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,,,1.0,,,1.0
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,,,,,,


<br>

In [43]:
def fillMissing(value, firstIndex):
    """
    Fill in missing values for a column by setting missing values to the previous value.
    Not using ffill() from Pandas since we need to account for special cases (basically
    reseting the lastValid index) in the case that a new volume starts.
    Requires lastValid variable to be defined outside of the function.

    Parameters
    ----------
    value : str
        Value to check and fill.
    firstIndex : bool
        A flag for whether this row's value is the first value of a volume.
        Basically whether a new volume is starting.

    Returns
    -------
    str
        The new value.
    """

    global lastValid

    # If at the first index
    if firstIndex == True:
            
        # If a value doesn't exist, set that value and lastValid to 0
        if pd.isnull(value):
            value = 0
        
        # Set the lastValid's value to this value
        lastValid = value
        return value        


    # Else, if not at the first index
    # If a value does not exist
    if pd.isnull(value):
        # Set value to the lastValid value
        value = lastValid

    # If a value exists, that update lastValid
    else:
        lastValid = value

    return value

In [44]:
lastValid = 0
df_updated['section_comb'] = df_updated.apply(lambda x: fillMissing(x['section_comb'], x['first']), axis =1)

lastValid = 0
df_updated['act'] = df_updated.apply(lambda x: fillMissing(x['act'], x['first']), axis =1)

In [45]:
df_updated.head()

Unnamed: 0,year,sentence,first,start_page,end_page,path,corrected,act,section_rgx1,section_rgx3,section_rgx4,section_rgx5,section_comb
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,True,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1,,,,,0
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",False,45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,False,1,,1.0,,,1
2,1892,Src.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,1,,,,,1
3,1892,2.,False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,1,,,,,1
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",False,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,False,1,,,,,1


<br>

In [46]:
df_updated.drop(['section_rgx1', 'section_rgx3', 'section_rgx4', 'section_rgx5', 'first', 'corrected'], axis=1, inplace=True)

In [47]:
df_updated.rename({'section_comb':'section'}, axis=1, inplace=True)

In [48]:
df_updated.shape[0]

37387

In [49]:
df_updated

Unnamed: 0,year,sentence,start_page,end_page,path,act,section
0,1892,AN ACT to CoNSsTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,0
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,1
2,1892,Src.,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1
3,1892,2.,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1
...,...,...,...,...,...,...,...
37382,1956,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0
37383,1956,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0
37384,1956,Repeal: SECTION 2. All acts or parts of acts inconsistent herewith are hereby repealed.,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0
37385,1956,Time effective: SECTION 3. This act shall take effect upon its approval by the Governor.,01448,01449,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0


<br>

## Post-Cleaning
Some cleaning after adding Section and Acts.

In [50]:
def upperIfNeeded(sentence, ratio = 0.50):
    """
    Convert the given sentence list into an uppercase sentence list
    if the ratio of uppercase words (not including the ones with a mix of digits 
    or words like "SECTION") to the total words is greater than a fixed value.
    Needs an `uppered` variable to be defined outside of this scope.
    
    Parameters
    ----------
    sentence: str
         A str of sentence to check and convert to uppercase
        
    Returns
    -------
    str
        If check is approved the return an uppercase version of str.
        Else return the sentence.
    """

    global uppered
    
    # A count of the number of already uppercased words
    count = 0
    
    # Check whether the word consists of only letters,
    # has a length greater than 1, is uppercase, and 
    # isn't "SECTION"
    for word in sentence.split(" "):
        if word.isalpha() and len(word) > 1 and word.isupper() and word != "SECTION":
            count += 1

    # If the count to words ratio is greater
    # return all uppercase words
    if (count/len(sentence.split(" ")) > ratio):
        uppered += 1
        return sentence.upper()        
    
    # Else, return the original sentence list
    return sentence

In [51]:
uppered = 0
df_updated['sentence'] = df_updated.apply(lambda x: upperIfNeeded(x['sentence']), axis=1)
errorsDict['Uppercased'] = uppered

In [52]:
errorsDict

{'EOL hyphenation': 23151,
 'Approved phrases': 5052,
 'Act seperators': 216,
 'Uppercased': 407}

<br>

## Character Length
Add the character length feature.
<br>This is added here because the lengths of the sentences might have changed during the cleaning process above.

In [53]:
df_updated["length"] = df_updated['sentence'].str.len()

In [54]:
df_updated.head()

Unnamed: 0,year,sentence,start_page,end_page,path,act,section,length
0,1892,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,45,45,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,0,113
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",45,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,1,481
2,1892,Src.,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,10
3,1892,2.,46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,8
4,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",46,46,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,360


<br>

### Removing Sentences With Low Character Length

Get rid of sentences with a low number of characters as they might not form meaningful sentences.
<br>Define a cutoff for the sentences. All sentences belows this length will be removed.
<br>Our research has shown that 30 character limit seems to be optimal for keeping informative sentences in the corpus.

In [55]:
cut_len = 30

In [56]:
# Initial length
ilen = df_updated.shape[0]

In [57]:
df_updated = df_updated[ df_updated["length"] > cut_len ]
print("Length of the cleaned dataframe: ", df_updated.shape[0])
print("Reduction of about {:.2f}%".format( (1 - df_updated.shape[0]/ilen) * 100))

Length of the cleaned dataframe:  29461
Reduction of about 21.20%


In [58]:
df_updated.reset_index(drop=True, inplace=True)

In [59]:
df_updated

Unnamed: 0,year,sentence,start_page,end_page,path,act,section,length
0,1892,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,0,113
1,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,1,481
2,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,360
3,1892,They shall be paid the same as battalion staffs in the Volunteer Troops.,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,78
4,1892,"There shall also be attached to the staff the following petty officers: One Master-at-Arms, two Yeomen, one Hospital Steward, one Chief Bugler, who shall receive the same pay as the non-commissioned staff of a battalion of infantry.",046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,238
...,...,...,...,...,...,...,...,...
29456,1956,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,233
29457,1956,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,416
29458,1956,Repeal: SECTION 2. All acts or parts of acts inconsistent herewith are hereby repealed.,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,93
29459,1956,Time effective: SECTION 3. This act shall take effect upon its approval by the Governor.,01448,01449,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,94


<br>

## Adding Features

In [60]:
import warnings
warnings.filterwarnings("ignore")

<br>

### Adding ID

In [61]:
def addPrefix(fileName: str, nameLen: int) -> str:
    """
    Since the fileNames from the excel parsing could be any of any length
    (ranging from 1-3), this function appends a string of 0's to the 
    start of the input so that it is the specified nameLen lengths long.
    
    Parameters
    ----------
    fileName : str
        The file name that needs to be prefixed
        The fileName shouldn't have a prefix, such as '.tiff'
    nameLen : int
        The length of the expected name of the file
        Ex. '00034.jpg' would have length of 5
        so nameLen should be 5

    Returns
    -------
    str
        A length nameLen file name (prefixed with 0's)
    """
    
    # prefix_length = nameLen - len(fileName)
    prefix = "0" * (nameLen - len(fileName))
    
    return prefix + fileName

In [62]:
# The final dataframe (will be concatenated) containing the id's
df_final = []

# Loop through each year in the dataframe
for year in years:
    
    # Make a temporary dataframe that only contains rows for this year
    df_temp = df_updated[df_updated.year.str.startswith(year)]

    # Reset it's index
    df_temp.reset_index(drop=True, inplace=True)

    # Add a new column with the index value
    df_temp['id'] = df_temp.index.values
    
    # Get the length of the id of the last row in the dataframe, which is used to assess how many 0's will be prefixed to the other ids
    maxNumLength = len(str(df_temp.last_valid_index()))

    # Add the updated id
    df_temp['id'] = df_temp.apply(lambda x: str(year) + "_" + addPrefix( str(x['id']), maxNumLength ), axis=1)
        
    # Append to the final dataframe
    df_final.append(df_temp)

In [63]:
# Concatenate the final dataframe
df_final = pd.concat(df_final, ignore_index=True)

# Set it's index to the new ids
df_final.set_index('id', inplace=True)

In [64]:
df_final.shape[0]

29461

In [65]:
df_final

Unnamed: 0_level_0,year,sentence,start_page,end_page,path,act,section,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1892_0000,1892,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,045,045,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,0,113
1892_0001,1892,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",045,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg,1,1,481
1892_0002,1892,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,360
1892_0003,1892,They shall be paid the same as battalion staffs in the Volunteer Troops.,046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,78
1892_0004,1892,"There shall also be attached to the staff the following petty officers: One Master-at-Arms, two Yeomen, one Hospital Steward, one Chief Bugler, who shall receive the same pay as the non-commissioned staff of a battalion of infantry.",046,046,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg,1,1,238
...,...,...,...,...,...,...,...,...
1956_5040,1956,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,233
1956_5041,1956,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,416
1956_5042,1956,Repeal: SECTION 2. All acts or parts of acts inconsistent herewith are hereby repealed.,01448,01448,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,93
1956_5043,1956,Time effective: SECTION 3. This act shall take effect upon its approval by the Governor.,01448,01449,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg,1132,0,94


<br>

### Adding the Remaining Identifiers

In [66]:
df_final.insert(1, 'law_type', 'Acts')
df_final.insert(2, 'state', 'SOUTH CAROLINA')

<br>

## Some Final Touches

In [67]:
df_final.drop('year', axis=1, inplace=True)

In [68]:
# Rearrange columns
cols = df_final.columns.tolist()
cols = cols[:3] + [cols[-1]] + cols[3:5] + cols[-3:-1] + [cols[5]]
# cols = cols[:3] + [cols[-1]] + cols[-3:-1] + cols[3:5] + [cols[5]]
df_final = df_final[cols]

<br>

### Dropping duplicates

In [69]:
print(f"The number of dropped sentences is {df_final[df_final.duplicated(subset=['sentence'])].shape[0]}")

The number of dropped sentences is 1607


In [70]:
df_dropped = df_final.drop_duplicates(subset=['sentence'])
# df_final.drop_duplicates(subset=['sentence'], ignore_index=True, inplace=True)

In [71]:
df_dropped.shape[0]

27854

In [72]:
df_dropped

Unnamed: 0_level_0,law_type,state,sentence,length,start_page,end_page,act,section,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1892_0000,Acts,SOUTH CAROLINA,AN ACT TO CONSSTITUTE A BATTALION TO BE KNOWN AS THE NAVAL BATTALION OF VOLUNTEER TROOPS OF SOUTH CAROLINA.,113,045,045,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
1892_0001,Acts,SOUTH CAROLINA,"SEcTION 1. Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That there shall be allowed, in addition to the companies of the Vol-t unteer Troops of the State of South Carolina as now provided by law, not more than four companies of Naval Militia, which shall constitute a battalion, to be known as the Naval Battalion of the Volunteer Troops of South Carolina.",481,045,046,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/045.jpg
1892_0002,Acts,SOUTH CAROLINA,"The officers of this battalion shall consist of a Lieutenant Commander, who shall be appointed by the Governor, and whose rank and pay shall assimilate to that of a Major of infantry, and a staff, to consist of one Adjutant, one Ordnance Officer, one Paymaster, who shall be the mustering officer, and one Surgeon, each with the rank of First Lieutenant.",360,046,046,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
1892_0003,Acts,SOUTH CAROLINA,They shall be paid the same as battalion staffs in the Volunteer Troops.,78,046,046,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
1892_0004,Acts,SOUTH CAROLINA,"There shall also be attached to the staff the following petty officers: One Master-at-Arms, two Yeomen, one Hospital Steward, one Chief Bugler, who shall receive the same pay as the non-commissioned staff of a battalion of infantry.",238,046,046,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1892/images/046.jpg
...,...,...,...,...,...,...,...,...,...
1956_5038,Acts,SOUTH CAROLINA,1132 An Act To Require The Commissioner Of Agriculture Of South Carolina To Determine What Are Noxious Weeds And Plants; To Prevent Importation Into The State Of Such Weeds And Plants And To Provide A Penalty For The Violation of The Provisions Of This Act.,263,01448,01448,1132,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
1956_5039,Acts,SOUTH CAROLINA,Be it enacted by the General Assembly of the State of South Carolina Commissioner of Agriculture—regulate importation of noxious weeds—penalties SECTION 1.,161,01448,01448,1132,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
1956_5040,Acts,SOUTH CAROLINA,"The Commissioner of Agriculture of South Carolina shall determine what are noxious weeds and plants, and shall publish such determination in suitable rules and regulations which shall be duly promulgated in accordance with law.",233,01448,01448,1132,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg
1956_5041,Acts,SOUTH CAROLINA,"Any such plants or weeds unlawfully imported into the State may be seized and confiscated, and in addition thereto any person found guilty of importing noxious weeds or plants into the State in violation of the terms of this act or any rule or regulation duly promulgated by the Commissioner of Agriculture shall be fined not more than one hundred dollars or sentenced to not more than thirty days confinement.",416,01448,01448,1132,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1956/images/01448.jpg


<br>

## Exporting

In [73]:
# df_dropped.to_csv('final_splits_testing.csv')

In [74]:
# df_test = df_dropped.drop(['law_type', 'state', 'length', 'end_page', 'path'], axis=1)
# df_test.to_csv(f'{year}_testing.csv')
# print(f'exporting {year}')

<br>

In [78]:
df_dropped[df_dropped.index.str.startswith('1928')].head(10)

Unnamed: 0_level_0,law_type,state,sentence,length,start_page,end_page,act,section,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1928_0000,Acts,SOUTH CAROLINA,"574. AN ACT to Amend an Act Entitled “An Act to Raise Revenue for the Support of the State Government,” Approved Twenty-second Day, April, 1927, so as to Repeal the License Tax on Sporting Goods, Cut Glass, Etched Glass, Art Glass, and Twenty-two Caliber Cartridges and to Provide for a License Tax on Soft Drinks, Admissions, Contractors, Ammunition, Candy, Playing Cards, Manufactured Tobacco Products and Chain Stores and to Levy a Tax on Documents, for the Support of the State Government.",499,45,45,574,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00045.jpg
1928_0001,Acts,SOUTH CAROLINA,"73 of the Acts of the General Assembly of 1927, approved April 22, 1927, page 121, Acts of 1927, be, and the same is, hereby amended by immediately repealing all provisions in said Act levying a license tax on sporting goods, cut glass, etched glass, art glass, and 22 calibre cartridges.",294,45,45,574,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00045.jpg
1928_0002,Acts,SOUTH CAROLINA,Sec. 2. That said Act No.,31,45,45,574,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00045.jpg
1928_0003,Acts,SOUTH CAROLINA,"73 of the Acts of 1927, approved 22nd day of April, 1927, be, and the same is hereby amended by striking out all after the enacting words and inserting in lieu thereof, the following: DOCUMENTARY TAX Section 1.",216,45,45,574,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00045.jpg
1928_0004,Acts,SOUTH CAROLINA,"Taxes Levied.—That on and after the passage of this Act, there shall be levied, collected and paid, for and in respect of the several bonds, debentures or certificates of stock and indebtedness, and other documents, instruments, matters and things mentioned and described in Schedule A of this Act, or for or in respect of the vellum, parchment, or paper upon which such instrument, matter or things, or any of them, are written or printed, by any person who makes, signs, issues, sells, removes, consigns or ships the same or for whose benefit or use the same are made, signed, issued, sold, removed, consigned, or shipped, the several taxes specified in such schedule.",676,45,46,574,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00045.jpg
1928_0005,Acts,SOUTH CAROLINA,"Exemption, Governmental and Municipal Securities.—There shall not be taxed under this Act any bond, note or other instrument, issued by the United States, or by any foreign government, or by any State, Territory, or the District of Columbia, or local sub-division thereof, or municipal or other corporation exercising the taxing power; or any bond of indemnity required to be filed by any person to secure payment of any pension, allowance, allotment, relief, or insurance by the United States, or to secure a duplicate for, or the payment of any bond, note, certificate of indebtedness, war-savings certificate, warrant or check issued by the United States.",664,46,46,574,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00046.jpg
1928_0006,Acts,SOUTH CAROLINA,"Penalties for Evasion of Stamp Tax.—That whoever : (a) Makes, signs, issues or accepts, or causes to be made, signed, issued or accepted, any instrument, document or paper of any kind, or description whatsoever without the full amount of tax thereon being duly paid; (b) Makes use of any adhesive stamp to denote any tax imposed by this Act without cancelling or obliterating such stamps as hereinafter provided ; Is guilty of misdemeanor, and, upon conviction, shall pay a fine of not more than One Hundred ($100.00) Dollars, or be imprisoned not more than thirty (30) days, for each offense.",599,46,46,574,3,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00046.jpg
1928_0007,Acts,SOUTH CAROLINA,"Cancellation of Stamps.—That whenever an adhesive stamp is used for denoting any tax imposed by this Act on documents except as hereinafter provided, the person using or affixing the same shall write, or stamp, or cause to be written or stamped thereon, the initials of his or its name and date upon which the same is attached or used, so that the same may not again be used.",381,46,46,574,3,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00046.jpg
1928_0008,Acts,SOUTH CAROLINA,"Stamps shall be affixed in such manner that their removal will require continued application of steam or water: Provided, That the South Carolina Tax Commission may prescribe such other method for the cancellation of such stamps as it may deem expedient.",260,46,47,574,3,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00046.jpg
1928_0009,Acts,SOUTH CAROLINA,"Penalties for Fraud in Use or Reuse of Stamps.— That whoever : (a) Fraudulently cuts, tears, or removes from any vellum, parchment, paper, instrument, or writing upon which any tax is imposed by this Act, any adhesive stamp used in pursuance of this Act; (b) Fraudulently uses, joins, fixes, or places to, with, or upon any vellum, parchment, paper, instrument, or writing, upon which any tax is imposed by this Act (1) any adhesive stamp which has been cut, torn or removed from any other vellum, parchment, paper, instrument, or writing, upon which any tax is imposed by this Act; or, (2) any adhesive stamp of insufficient value; or, (3) any forged or counterfeited stamp; (c) Wilfully removes, or alters the cancellation, or defacing marks of, or otherwise prepares, any adhesive stamp, with intent to use, or cause the same to be used, after it has already been used, or knowingly or wilfully buys, sells, offers for sale, or gives away, any such washed or restored stamp to any person for use, or knowingly uses the same; (d) Knowingly, and without lawful excuse (the burden of proof of such excuse being on the accused) has in possession any washed, restored or altered stamp, which has been removed from any vellum, parchment, paper, instrument, or writing ; (e) Knowingly or wilfully prepares, buys, sells, offers for sale, or has in his or its possession any counterfeit stamps; Is guilty of a misdemeanor, and, upon conviction, shall be punished by a fine of not more than One Thousand ($1,000.00) Dollars, or by imprisonment for not more than five (5) years, or both.",1585,47,47,574,4,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1928/images/00047.jpg
