In [1]:
import os
import nltk
import pandas as pd
import multiprocessing

from splitting_functs_discrepancy import *
from numpy import nan, array_split

pd.set_option('display.max_colwidth', None)

## Data Aquiring

In [2]:
path = "./data/1877_571-586.csv"
df = pd.read_csv(path)

In [3]:
df.head()

Unnamed: 0,Year,Page_No,Act,Coding (Axton)
0,1877,571,an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1.,0
1,1877,571,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.” approved march 22, 1878.",0
2,1877,571,—— an act to alter and amend the schoo.t law of sour carolina.,1
3,1877,571,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners.",0
4,1877,571,of this board the state superintendent of education shall be ez officio chairman.,0


In [4]:
df.shape[0]

132

## Initial Data Cleaning
Concatenate all adjacent row which have the same values for all columns except the `Act` column into a single row.

In [5]:
# Initialize an empty list to store the resulting rows
result_rows = []

# Initialize variables to track the current group
current_group = [df.iloc[0]['Year'], df.iloc[0]['Page_No'], df.iloc[0]['Coding (Axton)']]
current_text = df.iloc[0]['Act']

# Iterate through the DataFrame
for index, row in df.iterrows():
    if [row['Year'], row['Page_No'], row['Coding (Axton)']] == current_group:
        # If the current row has the same values in the three columns, concatenate the text
        current_text = current_text + ' ' + row['Act']
    else:
        # If the current row has different values, add the current row to the result
        result_rows.append([current_group[0], current_group[1], current_group[2], current_text])
        # Update the current group and text
        current_group = [row['Year'], row['Page_No'], row['Coding (Axton)']]
        current_text = row['Act']

# Add the last group to the result
result_rows.append([current_group[0], current_group[1], current_group[2], current_text])

# Create a new DataFrame from the result list
result_df = pd.DataFrame(result_rows, columns=['Year', 'Page_No', 'Coding (Axton)', 'Act'])

In [6]:
result_df.head()

Unnamed: 0,Year,Page_No,Coding (Axton),Act
0,1877,571,0,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.” approved march 22, 1878."
1,1877,571,1,—— an act to alter and amend the schoo.t law of sour carolina.
2,1877,571,0,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners. of this board the state superintendent of education shall be ez officio chairman. the clerk of the state superintendent of education, as hereinafter provided for, shall be clerk of the state board of examiners. he shall be custodian of its records, papers and effects, and shall keep minutes of its proceedings; and said records, papers and minutes shall be kept in the office of the state superintendent of education and shall be open to inspection. sec. 2. that the said board shall meet on the call of its chairman, or upon the request of a majority of its members, at the office of the state superintendent of education, or at such other place as may be designated in the call. a majority of the board shall constitute a quorum for transacting business. sec. 3. that the official seal of the state superintendent of education shall be used for the authentication of the acts of the state board of examiners."
3,1877,572,0,"sec. 4. that the state board of examiners shall constitute an advisory body with whom the state superintendent of education shall have the right to consult when he is in doubt as to his official duty; and shall have power to review all decisions of the county boards of examiners, as hereinafter provided for appeals to the state board of examiners must be made through the county boards of examiners in writing and must distinctly set forth the question of law as well as the facts of the case upon which the appeal is taken, and the decision of the state board shall be final upon the matter in issue. sec. 5. that the state board of examiners shall have power 1st. to adopt rules and regulations not inconsistent with the laws of the state for its own government and for the government of the free public schools 2d. to prescribe and enforce rules for the examination of teachers 3d. to prescribe a standard of proficiency before county boards of examiners which will entitle persons examined by such boards to certificates as teachers. to prescribe and enforce the course of study in the free public schools, 5th. to prescribe and to enforce as far as practicable the use of a uniform series of text books in the free public schools, except in the city of charleston provided, that the state board of examiners shall not have power without permission of the general assembly of the state to change a text book within five (5) years from the date of its adoption."
4,1877,572,2,"to grant teachers’ state certificates, and to revoke them for immoral or unprofessional conduct, profanity or evident unfitness for teaching."


In [7]:
result_df.shape[0]

49

Remove the codings for 1 and 2

In [8]:
result_df['Coding (Axton)'] = result_df['Coding (Axton)'].map({
    0:0,
    1:'',
    2:''
})

Assign the number of words for each sentence.

In [9]:
result_df['num_words'] = result_df['Act'].str.split(" ").str.len()

In [10]:
result_df.head()

Unnamed: 0,Year,Page_No,Coding (Axton),Act,num_words
0,1877,571,0.0,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.” approved march 22, 1878.",158
1,1877,571,,—— an act to alter and amend the schoo.t law of sour carolina.,13
2,1877,571,0.0,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners. of this board the state superintendent of education shall be ez officio chairman. the clerk of the state superintendent of education, as hereinafter provided for, shall be clerk of the state board of examiners. he shall be custodian of its records, papers and effects, and shall keep minutes of its proceedings; and said records, papers and minutes shall be kept in the office of the state superintendent of education and shall be open to inspection. sec. 2. that the said board shall meet on the call of its chairman, or upon the request of a majority of its members, at the office of the state superintendent of education, or at such other place as may be designated in the call. a majority of the board shall constitute a quorum for transacting business. sec. 3. that the official seal of the state superintendent of education shall be used for the authentication of the acts of the state board of examiners.",242
3,1877,572,0.0,"sec. 4. that the state board of examiners shall constitute an advisory body with whom the state superintendent of education shall have the right to consult when he is in doubt as to his official duty; and shall have power to review all decisions of the county boards of examiners, as hereinafter provided for appeals to the state board of examiners must be made through the county boards of examiners in writing and must distinctly set forth the question of law as well as the facts of the case upon which the appeal is taken, and the decision of the state board shall be final upon the matter in issue. sec. 5. that the state board of examiners shall have power 1st. to adopt rules and regulations not inconsistent with the laws of the state for its own government and for the government of the free public schools 2d. to prescribe and enforce rules for the examination of teachers 3d. to prescribe a standard of proficiency before county boards of examiners which will entitle persons examined by such boards to certificates as teachers. to prescribe and enforce the course of study in the free public schools, 5th. to prescribe and to enforce as far as practicable the use of a uniform series of text books in the free public schools, except in the city of charleston provided, that the state board of examiners shall not have power without permission of the general assembly of the state to change a text book within five (5) years from the date of its adoption.",261
4,1877,572,,"to grant teachers’ state certificates, and to revoke them for immoral or unprofessional conduct, profanity or evident unfitness for teaching.",20


In [11]:
result_df.shape[0]

49

## Sentence Tokenize
Tokenize each sentence and split each new tokenized sentence into its own line.

In [12]:
result_df_2 = []
for row in result_df.itertuples():
    
    # Pass through the tokenizer
    if row.num_words > 1:
        act_tokenizer = nltk.PunktSentenceTokenizer(row.Act)
        sub_acts = act_tokenizer.tokenize(row.Act)  # A list of Acts as seperated by the tokenizer
        # print(len(sub_acts))
        # print(sub_acts)
        # print("\n")
    else:
        sub_acts = [row.Act]

    # For each new (sub) Act generated, add it to a new dataframe with the same values for the remaining columns
    for sub_act in sub_acts:
        data = {
                "Year": row.Year,
                "Page_No": row.Page_No,
                "Act": sub_act,
                "Coding (Axton)": row[3],
                "num_words": row.num_words
            }
        temp = pd.Series(data).to_frame().transpose()
        result_df_2.append(temp)

In [13]:
result_df_2 = pd.concat(result_df_2, ignore_index=True)

In [14]:
result_df_2.head()

Unnamed: 0,Year,Page_No,Act,Coding (Axton),num_words
0,1877,571,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.” approved march 22, 1878.",0.0,158
1,1877,571,—— an act to alter and amend the schoo.t law of sour carolina.,,13
2,1877,571,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners.",0.0,242
3,1877,571,of this board the state superintendent of education shall be ez officio chairman.,0.0,242
4,1877,571,"the clerk of the state superintendent of education, as hereinafter provided for, shall be clerk of the state board of examiners.",0.0,242


## More Cleaning
Follow cleaning in `sentence_splitting.ipynb`

In [15]:
# New dataframe so that the results of the matching can be compared
df_cleaned = result_df_2.copy()

### Correcting Some Important Words

In [16]:
target_words = ['section']  # Add more target words
print("Correcting the following word spellings in the dataframe:", target_words)

Correcting the following word spellings in the dataframe: ['section']


In [17]:
num_cores = multiprocessing.cpu_count()
chunks = array_split(df_cleaned, num_cores)
print(f'Using {num_cores} cores.')

with multiprocessing.Pool(num_cores) as pool:
    threshold = 1.5  # Adjust the threshold as needed
    
    params = [(chunk, target_words, threshold, 'Act') for chunk in chunks]    
    processed_chunks = pool.starmap(correct_chunk, params)

df_cleaned = pd.concat(processed_chunks, ignore_index=True)

Using 8 cores.


  return bound(*args, **kwds)


In [18]:
df_cleaned[df_cleaned.flag == True]

Unnamed: 0,Year,Page_No,Act,Coding (Axton),num_words,corrected_column,flag,org_words
18,1877,573,"sec. 7. that a state superintendent of education shall be elected at each general election in the same manner as other state officers, who shall enter upon the duties of his office at the time prescribed by law. sec. 8. that he shall, before entering upon the duties of his office, give bond for the use of the state of south carolina in the penal sum of five thousand (5,000) dollars, with good and sufficient sureties, to be approved by the governor, conditioned for the faithful and impartial performance of the duties of his office; and he shall, also, at the time of giving bond, take and subscribe the oath prescribed in sectiou 30 of article ii of the constitution of the state, which oath shall be endorsed upon the back of said bond, and the bond shall be filed with and preserved by the secretary of state.",0,403,"sec. 7. that a state superintendent of education shall be elected at each general election in the same manner as other state officers, who shall enter upon the duties of his office at the time prescribed by law. sec. 8. that he shall, before entering upon the duties of his office, give bond for the use of the state of south carolina in the penal sum of five thousand (5,000) dollars, with good and sufficient sureties, to be approved by the governor, conditioned for the faithful and impartial performance of the duties of his office; and he shall, also, at the time of giving bond, take and subscribe the oath prescribed in section 30 of article ii of the constitution of the state, which oath shall be endorsed upon the back of said bond, and the bond shall be filed with and preserved by the secretary of state.",True,sectiou


In [19]:
df_cleaned.drop(['Act', 'flag', 'org_words'], axis = 1, inplace=True)
df_cleaned.rename({'corrected_column':'Act'}, axis=1, inplace=True)

In [20]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1877,571,0.0,158,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.” approved march 22, 1878."
1,1877,571,,13,—— an act to alter and amend the schoo.t law of sour carolina.
2,1877,571,0.0,242,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners."
3,1877,571,0.0,242,of this board the state superintendent of education shall be ez officio chairman.
4,1877,571,0.0,242,"the clerk of the state superintendent of education, as hereinafter provided for, shall be clerk of the state board of examiners."


In [21]:
df_cleaned.shape[0]

128

### Splitting Sentences Based On "Approved ..." Phrases
Some "Approved..." phrases appear at the end of an Act. Sometimes a phrase, such as a new Act, might also by added on to the end of this phrase.
The phrases after the incorrect "Approved" phrases should be split into a new sentence.

In [22]:
approved_rgx_strings = [r'(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})', 
                        r'(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))', 
                        r'(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))', 
                        r'(approved: [a-z| |.]+, [a-z]+.)', 
                        r'(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})']

In [23]:
approved_rgx = []
for string in approved_rgx_strings:
    approved_rgx.append( re.compile(string, re.IGNORECASE) )

In [24]:
approved_rgx

[re.compile(r'(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved: [a-z| |.]+, [a-z]+.)', re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})',
            re.IGNORECASE|re.UNICODE)]

In [25]:
df_new = df_cleaned.copy()

for i, rgx in enumerate(approved_rgx_strings):
    print(f'Working on pattern {i+1} using {os.cpu_count()} cores.')
    
    # Split the DataFrame using multiprocessing
    new_rows = []

    # Create a multiprocessing pool with the number of desired processes
    with multiprocessing.Pool() as pool:
        args_list = [(row, rgx, list(df_new.columns), 'Act') for _, row in df_new.iterrows()]

        # Use starmap to pass the tuples as separate arguments to process_row
        results = pool.starmap(process_row, args_list)

    # Flatten the list of lists into a single list of split rows
    new_rows = [item for sublist in results for item in sublist]

    # Create a new DataFrame with the split rows
    df_new = pd.DataFrame(new_rows)

Working on pattern 1 using 8 cores.
Working on pattern 2 using 8 cores.
Working on pattern 3 using 8 cores.
Working on pattern 4 using 8 cores.
Working on pattern 5 using 8 cores.


In [26]:
df_new.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1877,571,0.0,158,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.”"
1,1877,571,0.0,158,
2,1877,571,,13,—— an act to alter and amend the schoo.t law of sour carolina.
3,1877,571,0.0,242,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners."
4,1877,571,0.0,242,of this board the state superintendent of education shall be ez officio chairman.


In [27]:
df_new.shape[0]

129

In [28]:
if df_new.shape[0] > df_cleaned.shape[0]:
    print("Split sentences", df_new.shape[0] - df_cleaned.shape[0])
    df_cleaned = df_new

Split sentences 1


### Relocating Incorrect "Approved ..." Phrases
Since "Approved..." phrases appear in different formats throughout the years, the code uses multiple Regex strings to match the first occurence.
<br>By testing on the entire corpus, these 5 strings match the formats in all years. However, there will be some outliers (errors) that can not be captured by these strings.

In [29]:
# Add the caret in the regex strings...
for i, string in enumerate(approved_rgx_strings):
    approved_rgx_strings[i] = '^' + string
approved_rgx_strings

['^(approved the [0Oo1Iil!2Z5S6G\\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
 '^(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 '^(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 '^(approved: [a-z| |.]+, [a-z]+.)',
 '^(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\\d]{1,2}(.| |,){0,1})']

In [30]:
approved_rgx = []
for string in approved_rgx_strings:
    approved_rgx.append( re.compile(string, re.IGNORECASE) )

In [31]:
approved_rgx

[re.compile(r'^(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved: [a-z| |.]+, [a-z]+.)', re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})',
            re.IGNORECASE|re.UNICODE)]

In [32]:
modified = 0

for rgx_match in approved_rgx:
    
    # Search for matches in the 'sentence' column
    matches = df_cleaned['Act'].str.extract(rgx_match)[0]
    
    # Remove the matched patterns from sentences
    df_cleaned['Act'] = df_cleaned['Act'].str.replace(rgx_match, '', n=-1, regex=True)
    
    # Add matches to the end of previous sentence
    df_cleaned['Act'] = df_cleaned['Act'].str.cat(matches.shift(-1), 
                                                            sep=' ', na_rep='')
    
    modified += matches.count().sum()

In [33]:
print(modified)

1


In [34]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1877,571,0.0,158,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.”"
1,1877,571,0.0,158,
2,1877,571,,13,—— an act to alter and amend the schoo.t law of sour carolina.
3,1877,571,0.0,242,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners."
4,1877,571,0.0,242,of this board the state superintendent of education shall be ez officio chairman.


In [35]:
df_cleaned.shape[0]

129

### Removing End-Of-Line Hyphenation
Whenever a word in the sentence continues from the end of a line to the beginning of the next line and is joined by a hyphen, the OCRed sentence also contains that hyphen and a space.
For example, 'Commander-in-Chief' is OCRed as 'Com- mander-in-Chief'
The following code implements regex patterns to remove "- " in the text since each hyphenated word is split with "- ".

In [36]:
modified = df_cleaned['Act'].str.count(pat = r'(—|_|-)( )*').sum()
df_cleaned['Act'] = df_cleaned['Act'].str.replace(pat = r'(—|_|-)( )*',
                                                            repl = "",
                                                            regex = True)

In [37]:
modified

9

In [38]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1877,571,0.0,158,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.”"
1,1877,571,0.0,158,
2,1877,571,,13,an act to alter and amend the schoo.t law of sour carolina.
3,1877,571,0.0,242,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners."
4,1877,571,0.0,242,of this board the state superintendent of education shall be ez officio chairman.


In [39]:
df_cleaned.shape[0]

129

### Removing Act Seperators
The horizontal lines differentiating one Act from another show up as U+2014 : EM DASH characters (one or multiple) in the OCR.
<br>For example, '——- —— AN ACT...' or '—— AN ACT...'

In [40]:
modified = df_cleaned['Act'].str.count(pat = r'^—+(?=\s*[A-Za-z])').sum()
df_cleaned['Act'] = df_cleaned['Act'].str.replace(pat = r'^—+(?=\s*[A-Za-z])',
                                                            repl = '',
                                                            regex = True)

In [41]:
modified

0

In [42]:
df_cleaned.shape[0]

129

### Uppercasing

In [43]:
def upperIfNeeded(sentence, ratio = 0.50):
    """
    Convert the given sentence list into an uppercase sentence list
    if the ratio of uppercase words (not including the ones with a mix of digits 
    or words like "SECTION") to the total words is greater than a fixed value.
    Needs an `uppered` variable to be defined outside of this scope.
    
    Parameters
    ----------
    sentence: str
         A str of sentence to check and convert to uppercase
        
    Returns
    -------
    str
        If check is approved the return an uppercase version of str.
        Else return the sentence.
    """

    global uppered
    
    # A count of the number of already uppercased words
    count = 0
    
    # Check whether the word consists of only letters,
    # has a length greater than 1, is uppercase, and 
    # isn't "SECTION"
    for word in sentence.split(" "):
        if word.isalpha() and len(word) > 1 and word.isupper() and word != "SECTION":
            count += 1

    # If the count to words ratio is greater
    # return all uppercase words
    if (count/len(sentence.split(" ")) > ratio):
        uppered += 1
        return sentence.upper()        
    
    # Else, return the original sentence list
    return sentence

In [44]:
uppered = 0
df_cleaned['Act'] = df_cleaned.apply(lambda x: upperIfNeeded(x['Act']), axis=1)
print(uppered)

0


## Some Final Touches

### Remove Unneeded Columns

In [45]:
df_cleaned.drop('num_words', axis=1, inplace=True)

In [46]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),Act
0,1877,571,0.0,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.”"
1,1877,571,0.0,
2,1877,571,,an act to alter and amend the schoo.t law of sour carolina.
3,1877,571,0.0,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners."
4,1877,571,0.0,of this board the state superintendent of education shall be ez officio chairman.


### Remove empty rows

In [47]:
df_cleaned = df_cleaned[~df_cleaned.Act.str.isspace()]

### Rearrange Columns

In [48]:
cols = list(df.columns)
df_cleaned = df_cleaned[cols]

In [49]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Act,Coding (Axton)
0,1877,571,"an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. an act ro amenp an act entitleed “an act to esttabliish uniformity in the sessions of the circuit couurts.” section 1. be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that section 5 of an act entitled “an act to establish uniformity in the sessions of the circuit courts,” approved june 9th, 1877, be, and the same is hereby, amended by striking out the eleventh line thereof and inserting in lieu thereof “third monday in february, second monday in june and third monday in september.” sec. 2. that the said section be amended on line 15, at the end of the section, by adding the words ‘‘and the fourth monday in june.”",0.0
2,1877,571,an act to alter and amend the schoo.t law of sour carolina.,
3,1877,571,"be it enacted by the senate and house of representatives of the state of south carolina, now met and sitting in general assembly, and by the authority of the same, that the state superintendent of education and four (4) persons to be appointed by the governor, by and with the advice and consent of the senate, who shall hold office for two years and until their successors may be appointed, unless sooner removed by the governor, shall constitute the state board of examiners.",0.0
4,1877,571,of this board the state superintendent of education shall be ez officio chairman.,0.0
5,1877,571,"the clerk of the state superintendent of education, as hereinafter provided for, shall be clerk of the state board of examiners.",0.0


## Save

In [50]:
# Original file name without file type
og_fname = path.split('/')[-1].split('.')[0]

df_cleaned.to_csv(f'./{og_fname}_updated.csv', index=False)