# Discrepancy Fix
Fixing and cleaning data files.

In [1]:
import os
import nltk
import pandas as pd
import multiprocessing

from splitting_functs_discrepancy import *
from numpy import nan, array_split

pd.set_option('display.max_colwidth', None)

## Data Aquiring

In [2]:
path = "./data/1925_324act.csv"
df = pd.read_csv(path, encoding='utf8')

In [4]:
cols = list(df.columns)

In [5]:
df.head()

Unnamed: 0,Year,Page_No,Act,Coding (Axton)
0,1925,324,"‘an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils.",1
1,1925,324,"division of dillon county into high school districts—schools.—be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county.",1
2,1925,324,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.,2
3,1925,324,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education.",1
4,1925,324,"trustees.—except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county.",2


In [6]:
df.shape[0]

23

## Remove Unneeded Characters
Some unusual characters appear in the text. This section finds and removes them.
<br>Do note that the code will remove any character that falls outside the standard ASCII range (0-127).
But, this might also remove legitimate non-ASCII characters that are part of other languages or symbols.

In [7]:
# Function to find and list unique non-ASCII characters
def find_weird_characters(text):
    # Use a regular expression to find non-ASCII characters
    weird_characters = re.findall(r'[^\x00-\x7F]', text)
    return list(set(weird_characters))  # Return unique characters

# Apply the find_weird_characters function to the text column
df['weird_characters'] = df['Act'].apply(find_weird_characters)

In [8]:
# Function to remove non-ASCII characters
def remove_non_ascii(text):
    # Use a regular expression to find and remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# Apply the remove_non_ascii function to the text column
df['cleaned_act'] = df['Act'].apply(remove_non_ascii)

In [9]:
# Display those rows which have weird characters
df[df['weird_characters'].str.len() != 0]

Unnamed: 0,Year,Page_No,Act,Coding (Axton),weird_characters,cleaned_act
0,1925,324,"‘an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils.",1,[‘],"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,"division of dillon county into high school districts—schools.—be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county.",1,[—],"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
4,1925,324,"trustees.—except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county.",2,[—],"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."
6,1925,324,"powers of trustees—bond issues.—that the high school board of trustees of each and all of said high school districts, to be established hereunder shall be and hereby is vested with the power to do all acts necessary or adapted to the establishment and maintenance of a high school in each and all of their respective high school districts, and to that end may purchase lands and erect, or purchase high school buildings in each of said districts and equip the same, and for the purpose of providing the funds for such purchase of lands, or lands and buildings, and the erection and equipment of high school buildings, may notwithstanding any other or similar limits contained in any general or special act or any act making a special provision in or about the subject matter of any general act, issue and sell the bonds of each of said high school districts in an amount not exceeding eight per centum of the assessed valuation of the taxable property in each of said high school districts : provided, that this.",2,[—],"powers of trusteesbond issues.that the high school board of trustees of each and all of said high school districts, to be established hereunder shall be and hereby is vested with the power to do all acts necessary or adapted to the establishment and maintenance of a high school in each and all of their respective high school districts, and to that end may purchase lands and erect, or purchase high school buildings in each of said districts and equip the same, and for the purpose of providing the funds for such purchase of lands, or lands and buildings, and the erection and equipment of high school buildings, may notwithstanding any other or similar limits contained in any general or special act or any act making a special provision in or about the subject matter of any general act, issue and sell the bonds of each of said high school districts in an amount not exceeding eight per centum of the assessed valuation of the taxable property in each of said high school districts : provided, that this."
8,1925,324,"execution of bonds—pledge of credit—terms of bonds—sale.—such bonds shall be signed by the chairman and clerk of the high school board of trustees of the respective high school districts issuing same, and if a corporate seal shall have been adopted it shall be affixed thereto, and such bonds may pledge the full faith and credit of the high school district issuing same.",0,[—],"execution of bondspledge of creditterms of bondssale.such bonds shall be signed by the chairman and clerk of the high school board of trustees of the respective high school districts issuing same, and if a corporate seal shall have been adopted it shall be affixed thereto, and such bonds may pledge the full faith and credit of the high school district issuing same."
10,1925,324,"each and ali of such bonds shall also have coupons for the payment of interest attached, which coupons may be executed by the facsimile signature of the clerk; such bonds to bear interest at a rate, not exceeding six per centum per annum, payable semiannually, as the respective high school boards of trustees issuing same shall determine, and shall be sold at not less than par. § 5. tax for payment.—notwithstanding the limitations contained in a general or special act or any act making’a special provision on or about the subject matter of any general act, there shall be levied in each year after the issuance of such bonds in any and all high school districts issuing same and continuing so long as any of said bonds are outstanding and unpaid, a tax upon all the taxable property in such district issuing same sufficient to pay the interest upon all said bonds so issued and to pay the annual installments of the principal as and when the same shall mature, which tax shall be levied by the county auditor or in such other manner as may be provided by law, in the same manner and at the same time as other taxes in the county are levied and collected.",2,"[’, —, §]","each and ali of such bonds shall also have coupons for the payment of interest attached, which coupons may be executed by the facsimile signature of the clerk; such bonds to bear interest at a rate, not exceeding six per centum per annum, payable semiannually, as the respective high school boards of trustees issuing same shall determine, and shall be sold at not less than par. 5. tax for payment.notwithstanding the limitations contained in a general or special act or any act makinga special provision on or about the subject matter of any general act, there shall be levied in each year after the issuance of such bonds in any and all high school districts issuing same and continuing so long as any of said bonds are outstanding and unpaid, a tax upon all the taxable property in such district issuing same sufficient to pay the interest upon all said bonds so issued and to pay the annual installments of the principal as and when the same shall mature, which tax shall be levied by the county auditor or in such other manner as may be provided by law, in the same manner and at the same time as other taxes in the county are levied and collected."
11,1925,324,"special tax for maintenance, etc., of schools.—for the purpose of maintaining and supporting the high schools to be established under the authority of this act, including the costs of transporting and conveying pupils to and from such.",1,[—],"special tax for maintenance, etc., of schools.for the purpose of maintaining and supporting the high schools to be established under the authority of this act, including the costs of transporting and conveying pupils to and from such."
13,1925,324,§ 7. tax rate uniform.—that all lawful levies for maintenance and all lawful levies for bonds shall be at the same uniform rate on the assessed valuation of each and all of the common school districts federated or consolidated as high school districts.,2,"[—, §]",7. tax rate uniform.that all lawful levies for maintenance and all lawful levies for bonds shall be at the same uniform rate on the assessed valuation of each and all of the common school districts federated or consolidated as high school districts.
14,1925,324,"lake view high school district.—in consolidating the existing common school districts of the county into three centralized high school districts as provided in section one of this act, the centralized high school district created by an act of the general assembly approved on the 19th day of march, 1924, and known as the lake view high school district, shall be counted as and regarded as one of said centralized high school districts, and two additional high school districts shall be created as provided for herein: provided, however, that said existing high school district shall in no way be affected by this act except that the county board of education may, if it deems best in the laying out of high school districts, add one or more common school districts to and incorporate same in said existing centralized high school district.",2,[—],"lake view high school district.in consolidating the existing common school districts of the county into three centralized high school districts as provided in section one of this act, the centralized high school district created by an act of the general assembly approved on the 19th day of march, 1924, and known as the lake view high school district, shall be counted as and regarded as one of said centralized high school districts, and two additional high school districts shall be created as provided for herein: provided, however, that said existing high school district shall in no way be affected by this act except that the county board of education may, if it deems best in the laying out of high school districts, add one or more common school districts to and incorporate same in said existing centralized high school district."
15,1925,324,"bonds tax exempt.—that any and all bonds issued or to be issued under the authority of this act shall be exempt from state, county and municipal taxation.",0,[—],"bonds tax exempt.that any and all bonds issued or to be issued under the authority of this act shall be exempt from state, county and municipal taxation."


In [10]:
df.drop(['Act', 'weird_characters'], axis=1, inplace=True)
df.rename({'cleaned_act': 'Act'}, axis=1, inplace=True)

In [11]:
df

Unnamed: 0,Year,Page_No,Coding (Axton),Act
0,1925,324,1,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,1,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
2,1925,324,2,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
3,1925,324,1,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
4,1925,324,2,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."
5,1925,324,2,"in any high school district to be established hereunder containing a town having population of more then one thousand inhabitants, the high school board of trustees for such district shall be composed of the board of trustees of the common school district containing such town, and one member from each of the other common school districts forming such high school district, to be appointed by the county board of education: provided, however, that each common school district forming a part of any high school district to be established heretinder shall have at least one representative on the board of high school trustees of the high school district of which common school district forms a part."
6,1925,324,2,"powers of trusteesbond issues.that the high school board of trustees of each and all of said high school districts, to be established hereunder shall be and hereby is vested with the power to do all acts necessary or adapted to the establishment and maintenance of a high school in each and all of their respective high school districts, and to that end may purchase lands and erect, or purchase high school buildings in each of said districts and equip the same, and for the purpose of providing the funds for such purchase of lands, or lands and buildings, and the erection and equipment of high school buildings, may notwithstanding any other or similar limits contained in any general or special act or any act making a special provision in or about the subject matter of any general act, issue and sell the bonds of each of said high school districts in an amount not exceeding eight per centum of the assessed valuation of the taxable property in each of said high school districts : provided, that this."
7,1925,324,2,act shall not be deemed or held to intend or purport to authorize the issuance of bonds in excess of any limit imposed by the constitution of this state.
8,1925,324,0,"execution of bondspledge of creditterms of bondssale.such bonds shall be signed by the chairman and clerk of the high school board of trustees of the respective high school districts issuing same, and if a corporate seal shall have been adopted it shall be affixed thereto, and such bonds may pledge the full faith and credit of the high school district issuing same."
9,1925,324,0,"any and all bonds which may be issued under the authority of this act shall be serial coupon bonds and be payable in annual installments of one-twentieth part of the principal each year from the date or dates of their issue, so that the entire issue of such bonds for each respective high school district issuing same shall mature at the end of twenty years from the date or dates of issue."


## Grouping Adjacent Rows
Concatenate all adjacent row which have the same values for all columns except the `Act` column into a single row.

In [12]:
# Initialize an empty list to store the resulting rows
result_rows = []

# Initialize variables to track the current group
current_group = [df.iloc[0]['Year'], df.iloc[0]['Page_No'], df.iloc[0]['Coding (Axton)']]
current_text = df.iloc[0]['Act']

# Iterate through the DataFrame
for index, row in df.iterrows():
    if index == 0:
        continue
    if [row['Year'], row['Page_No'], row['Coding (Axton)']] == current_group:
        # If the current row has the same values in the three columns, concatenate the text
        current_text = current_text + ' ' + row['Act']
    else:
        # If the current row has different values, add the current row to the result
        result_rows.append([current_group[0], current_group[1], current_group[2], current_text])
        # Update the current group and text
        current_group = [row['Year'], row['Page_No'], row['Coding (Axton)']]
        current_text = row['Act']

# Add the last group to the result
result_rows.append([current_group[0], current_group[1], current_group[2], current_text])

# Create a new DataFrame from the result list
result_df = pd.DataFrame(result_rows, columns=['Year', 'Page_No', 'Coding (Axton)', 'Act'])

In [13]:
result_df.head()

Unnamed: 0,Year,Page_No,Coding (Axton),Act
0,1925,324,1,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils. division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
1,1925,324,2,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
2,1925,324,1,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
3,1925,324,2,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county. in any high school district to be established hereunder containing a town having population of more then one thousand inhabitants, the high school board of trustees for such district shall be composed of the board of trustees of the common school district containing such town, and one member from each of the other common school districts forming such high school district, to be appointed by the county board of education: provided, however, that each common school district forming a part of any high school district to be established heretinder shall have at least one representative on the board of high school trustees of the high school district of which common school district forms a part. powers of trusteesbond issues.that the high school board of trustees of each and all of said high school districts, to be established hereunder shall be and hereby is vested with the power to do all acts necessary or adapted to the establishment and maintenance of a high school in each and all of their respective high school districts, and to that end may purchase lands and erect, or purchase high school buildings in each of said districts and equip the same, and for the purpose of providing the funds for such purchase of lands, or lands and buildings, and the erection and equipment of high school buildings, may notwithstanding any other or similar limits contained in any general or special act or any act making a special provision in or about the subject matter of any general act, issue and sell the bonds of each of said high school districts in an amount not exceeding eight per centum of the assessed valuation of the taxable property in each of said high school districts : provided, that this. act shall not be deemed or held to intend or purport to authorize the issuance of bonds in excess of any limit imposed by the constitution of this state."
4,1925,324,0,"execution of bondspledge of creditterms of bondssale.such bonds shall be signed by the chairman and clerk of the high school board of trustees of the respective high school districts issuing same, and if a corporate seal shall have been adopted it shall be affixed thereto, and such bonds may pledge the full faith and credit of the high school district issuing same. any and all bonds which may be issued under the authority of this act shall be serial coupon bonds and be payable in annual installments of one-twentieth part of the principal each year from the date or dates of their issue, so that the entire issue of such bonds for each respective high school district issuing same shall mature at the end of twenty years from the date or dates of issue."


In [14]:
result_df.shape[0]

11

Remove all quotation marks

In [15]:
result_df['Act'] = result_df['Act'].str.replace(pat = '“', repl = '')  # Not a normal quotation mark, but a special character

In [16]:
result_df['Act'] = result_df['Act'].str.replace(pat = '”', repl = '')  # Not a normal quotation mark, but a special character

In [17]:
result_df['Act'] = result_df['Act'].str.replace(pat = '"', repl = '')

Remove the codings for 1 and 2

In [18]:
result_df['Coding (Axton)'] = result_df['Coding (Axton)'].map({
    0:0,
    1:'',
    2:''
})

Assign the number of words for each sentence.

In [19]:
result_df['num_words'] = result_df['Act'].str.split(" ").str.len()

In [20]:
result_df.head()

Unnamed: 0,Year,Page_No,Coding (Axton),Act,num_words
0,1925,324,,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils. division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county.",190
1,1925,324,,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.,30
2,1925,324,,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education.",85
3,1925,324,,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county. in any high school district to be established hereunder containing a town having population of more then one thousand inhabitants, the high school board of trustees for such district shall be composed of the board of trustees of the common school district containing such town, and one member from each of the other common school districts forming such high school district, to be appointed by the county board of education: provided, however, that each common school district forming a part of any high school district to be established heretinder shall have at least one representative on the board of high school trustees of the high school district of which common school district forms a part. powers of trusteesbond issues.that the high school board of trustees of each and all of said high school districts, to be established hereunder shall be and hereby is vested with the power to do all acts necessary or adapted to the establishment and maintenance of a high school in each and all of their respective high school districts, and to that end may purchase lands and erect, or purchase high school buildings in each of said districts and equip the same, and for the purpose of providing the funds for such purchase of lands, or lands and buildings, and the erection and equipment of high school buildings, may notwithstanding any other or similar limits contained in any general or special act or any act making a special provision in or about the subject matter of any general act, issue and sell the bonds of each of said high school districts in an amount not exceeding eight per centum of the assessed valuation of the taxable property in each of said high school districts : provided, that this. act shall not be deemed or held to intend or purport to authorize the issuance of bonds in excess of any limit imposed by the constitution of this state.",404
4,1925,324,0.0,"execution of bondspledge of creditterms of bondssale.such bonds shall be signed by the chairman and clerk of the high school board of trustees of the respective high school districts issuing same, and if a corporate seal shall have been adopted it shall be affixed thereto, and such bonds may pledge the full faith and credit of the high school district issuing same. any and all bonds which may be issued under the authority of this act shall be serial coupon bonds and be payable in annual installments of one-twentieth part of the principal each year from the date or dates of their issue, so that the entire issue of such bonds for each respective high school district issuing same shall mature at the end of twenty years from the date or dates of issue.",134


In [21]:
result_df.shape[0]

11

## Sentence Tokenize
Tokenize each sentence and split each new tokenized sentence into its own line.

In [22]:
# Train the sentence tokenizer on the whole text
act_tokenizer = nltk.PunktSentenceTokenizer(result_df.Act.str.cat(sep=". "))

In [23]:
result_df_2 = []
for row in result_df.itertuples():
    
    # Pass through the tokenizer
    if row.num_words > 1:
        sub_acts = act_tokenizer.tokenize(row.Act)  # A list of Acts as seperated by the tokenizer
        # print(len(sub_acts))
        # print(sub_acts)
        # print("\n")
    else:
        sub_acts = [row.Act]

    # For each new (sub) Act generated, add it to a new dataframe with the same values for the remaining columns
    for sub_act in sub_acts:
        data = {
                "Year": row.Year,
                "Page_No": row.Page_No,
                "Act": sub_act,
                "Coding (Axton)": row[3],
                "num_words": row.num_words
            }
        temp = pd.Series(data).to_frame().transpose()
        result_df_2.append(temp)

In [24]:
result_df_2 = pd.concat(result_df_2, ignore_index=True)

In [25]:
result_df_2.head()

Unnamed: 0,Year,Page_No,Act,Coding (Axton),num_words
0,1925,324,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils.",,190
1,1925,324,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county.",,190
2,1925,324,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.,,30
3,1925,324,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education.",,85
4,1925,324,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county.",,404


## More Cleaning
Follow cleaning in `sentence_splitting.ipynb`

In [26]:
# New dataframe so that the results of the matching can be compared
df_cleaned = result_df_2.copy()

### Trim extra spaces around text

In [27]:
df_cleaned['Act'] = df_cleaned.Act.str.strip()

### Correcting Some Important Words

In [28]:
target_words = ['section']  # Add more target words
print("Correcting the following word spellings in the dataframe:", target_words)

Correcting the following word spellings in the dataframe: ['section']


In [29]:
num_cores = multiprocessing.cpu_count()
chunks = array_split(df_cleaned, num_cores)
print(f'Using {num_cores} cores.')

with multiprocessing.Pool(num_cores) as pool:
    threshold = 1.5  # Adjust the threshold as needed
    
    params = [(chunk, target_words, threshold, 'Act') for chunk in chunks]    
    processed_chunks = pool.starmap(correct_chunk, params)

df_cleaned = pd.concat(processed_chunks, ignore_index=True)

Using 8 cores.


  return bound(*args, **kwds)


In [30]:
df_cleaned[df_cleaned.flag == True]

Unnamed: 0,Year,Page_No,Act,Coding (Axton),num_words,corrected_column,flag,org_words


In [31]:
df_cleaned.drop(['Act', 'flag', 'org_words'], axis = 1, inplace=True)
df_cleaned.rename({'corrected_column':'Act'}, axis=1, inplace=True)

In [32]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1925,324,,190,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,,190,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
2,1925,324,,30,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
3,1925,324,,85,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
4,1925,324,,404,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."


In [33]:
df_cleaned.shape[0]

21

### Splitting Sentences Based On "Approved ..." Phrases
Some "Approved..." phrases appear at the end of an Act. Sometimes a phrase, such as a new Act, might also by added on to the end of this phrase.
The phrases after the incorrect "Approved" phrases should be split into a new sentence.

In [34]:
approved_rgx_strings = [r'(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})', 
                        r'(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))', 
                        r'(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))', 
                        r'(approved: [a-z| |.]+, [a-z]+.)', 
                        r'(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})']

In [35]:
approved_rgx = []
for string in approved_rgx_strings:
    approved_rgx.append( re.compile(string, re.IGNORECASE) )

In [36]:
approved_rgx

[re.compile(r'(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved: [a-z| |.]+, [a-z]+.)', re.IGNORECASE|re.UNICODE),
 re.compile(r'(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})',
            re.IGNORECASE|re.UNICODE)]

In [37]:
df_new = df_cleaned.copy()

for i, rgx in enumerate(approved_rgx_strings):
    print(f'Working on pattern {i+1} using {os.cpu_count()} cores.')
    
    # Split the DataFrame using multiprocessing
    new_rows = []

    # Create a multiprocessing pool with the number of desired processes
    with multiprocessing.Pool() as pool:
        args_list = [(row, rgx, list(df_new.columns), 'Act') for _, row in df_new.iterrows()]

        # Use starmap to pass the tuples as separate arguments to process_row
        results = pool.starmap(process_row, args_list)

    # Flatten the list of lists into a single list of split rows
    new_rows = [item for sublist in results for item in sublist]

    # Create a new DataFrame with the split rows
    df_new = pd.DataFrame(new_rows)

Working on pattern 1 using 8 cores.
Working on pattern 2 using 8 cores.
Working on pattern 3 using 8 cores.
Working on pattern 4 using 8 cores.
Working on pattern 5 using 8 cores.


In [38]:
df_new.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1925,324,,190,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,,190,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
2,1925,324,,30,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
3,1925,324,,85,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
4,1925,324,,404,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."


In [39]:
df_new.shape[0]

21

In [40]:
if df_new.shape[0] > df_cleaned.shape[0]:
    print("Split sentences", df_new.shape[0] - df_cleaned.shape[0])
    df_cleaned = df_new

### Relocating Incorrect "Approved ..." Phrases
Since "Approved..." phrases appear in different formats throughout the years, the code uses multiple Regex strings to match the first occurence.
<br>By testing on the entire corpus, these 5 strings match the formats in all years. However, there will be some outliers (errors) that can not be captured by these strings.

In [41]:
# Add the caret in the regex strings...
for i, string in enumerate(approved_rgx_strings):
    approved_rgx_strings[i] = '^' + string
approved_rgx_strings

['^(approved the [0Oo1Iil!2Z5S6G\\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
 '^(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 '^(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\\. d\\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
 '^(approved: [a-z| |.]+, [a-z]+.)',
 '^(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\\d]{1,2}(.| |,){0,1})']

In [42]:
approved_rgx = []
for string in approved_rgx_strings:
    approved_rgx.append( re.compile(string, re.IGNORECASE) )

In [43]:
approved_rgx

[re.compile(r'^(approved the [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2} day of [a-z]+(,| |.){1,2}((a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |)){0,1})',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved( |,|){0,2}[a-z]+ [0Oo1Iil!2Z5S6G\d]{1,2}(t|h|s|t|n|d|r|d){0,2}, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved the [a-z]+-[a-z]+ day of [a-z]+, (a\. d\.(. |.| |){1,2}){0,1}.{4}(. |.| |))',
            re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved: [a-z| |.]+, [a-z]+.)', re.IGNORECASE|re.UNICODE),
 re.compile(r'^(approved( |,|){0,2}[a-z]+(.| |,){1,2}[0Oo1Iil!2Z5S6G\d]{1,2}(.| |,){0,1})',
            re.IGNORECASE|re.UNICODE)]

In [44]:
modified = 0

for rgx_match in approved_rgx:
    
    # Search for matches in the 'sentence' column
    matches = df_cleaned['Act'].str.extract(rgx_match)[0]
    
    # Remove the matched patterns from sentences
    df_cleaned['Act'] = df_cleaned['Act'].str.replace(rgx_match, '', n=-1, regex=True)
    
    # Add matches to the end of previous sentence
    df_cleaned['Act'] = df_cleaned['Act'].str.cat(matches.shift(-1), 
                                                            sep=' ', na_rep='')
    
    modified += matches.count().sum()

In [45]:
print(modified)

0


In [46]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1925,324,,190,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,,190,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
2,1925,324,,30,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
3,1925,324,,85,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
4,1925,324,,404,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."


In [47]:
df_cleaned.shape[0]

21

### Removing End-Of-Line Hyphenation
Whenever a word in the sentence continues from the end of a line to the beginning of the next line and is joined by a hyphen, the OCRed sentence also contains that hyphen and a space.
For example, 'Commander-in-Chief' is OCRed as 'Com- mander-in-Chief'
The following code implements regex patterns to remove "- " in the text since each hyphenated word is split with "- ".

In [48]:
modified = df_cleaned['Act'].str.count(pat = r'(—|_|-)( )*').sum()
df_cleaned['Act'] = df_cleaned['Act'].str.replace(pat = r'(—|_|-)( )*',
                                                            repl = "",
                                                            regex = True)

In [49]:
modified

2

In [50]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1925,324,,190,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,,190,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
2,1925,324,,30,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
3,1925,324,,85,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
4,1925,324,,404,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."


In [51]:
df_cleaned.shape[0]

21

### Removing Act Seperators
The horizontal lines differentiating one Act from another show up as U+2014 : EM DASH characters (one or multiple) in the OCR.
<br>For example, '——- —— AN ACT...' or '—— AN ACT...'

In [52]:
modified = df_cleaned['Act'].str.count(pat = r'^—+(?=\s*[A-Za-z])').sum()
df_cleaned['Act'] = df_cleaned['Act'].str.replace(pat = r'^—+(?=\s*[A-Za-z])',
                                                            repl = '',
                                                            regex = True)

In [53]:
modified

0

In [54]:
df_cleaned.shape[0]

21

### Uppercasing

In [55]:
def upperIfNeeded(sentence, ratio = 0.50):
    """
    Convert the given sentence list into an uppercase sentence list
    if the ratio of uppercase words (not including the ones with a mix of digits 
    or words like "SECTION") to the total words is greater than a fixed value.
    Needs an `uppered` variable to be defined outside of this scope.
    
    Parameters
    ----------
    sentence: str
         A str of sentence to check and convert to uppercase
        
    Returns
    -------
    str
        If check is approved the return an uppercase version of str.
        Else return the sentence.
    """

    global uppered
    
    # A count of the number of already uppercased words
    count = 0
    
    # Check whether the word consists of only letters,
    # has a length greater than 1, is uppercase, and 
    # isn't "SECTION"
    for word in sentence.split(" "):
        if word.isalpha() and len(word) > 1 and word.isupper() and word != "SECTION":
            count += 1

    # If the count to words ratio is greater
    # return all uppercase words
    if (count/len(sentence.split(" ")) > ratio):
        uppered += 1
        return sentence.upper()        
    
    # Else, return the original sentence list
    return sentence

In [56]:
uppered = 0
df_cleaned['Act'] = df_cleaned.apply(lambda x: upperIfNeeded(x['Act']), axis=1)
print(uppered)

0


## Trim extra spaces around text (again)

In [57]:
df_cleaned['Act'] = df_cleaned.Act.str.strip()

## Remove unneeded phrases

In [58]:
pat = r'^((s|e|r|c){1,}(\S)+)(\.|,|:|;| ){0,2}([\d]{1,3})(. |.| |){1}'
df_cleaned['Act'] = df_cleaned['Act'].str.replace(pat, '', flags=re.IGNORECASE, regex=True)

In [59]:
df_cleaned

Unnamed: 0,Year,Page_No,Coding (Axton),num_words,Act
0,1925,324,,190,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,,190,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
2,1925,324,,30,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
3,1925,324,,85,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
4,1925,324,,404,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."
5,1925,324,,404,"in any high school district to be established hereunder containing a town having population of more then one thousand inhabitants, the high school board of trustees for such district shall be composed of the board of trustees of the common school district containing such town, and one member from each of the other common school districts forming such high school district, to be appointed by the county board of education: provided, however, that each common school district forming a part of any high school district to be established heretinder shall have at least one representative on the board of high school trustees of the high school district of which common school district forms a part."
6,1925,324,,404,"powers of trusteesbond issues.that the high school board of trustees of each and all of said high school districts, to be established hereunder shall be and hereby is vested with the power to do all acts necessary or adapted to the establishment and maintenance of a high school in each and all of their respective high school districts, and to that end may purchase lands and erect, or purchase high school buildings in each of said districts and equip the same, and for the purpose of providing the funds for such purchase of lands, or lands and buildings, and the erection and equipment of high school buildings, may notwithstanding any other or similar limits contained in any general or special act or any act making a special provision in or about the subject matter of any general act, issue and sell the bonds of each of said high school districts in an amount not exceeding eight per centum of the assessed valuation of the taxable property in each of said high school districts : provided, that this."
7,1925,324,,404,act shall not be deemed or held to intend or purport to authorize the issuance of bonds in excess of any limit imposed by the constitution of this state.
8,1925,324,0.0,134,"execution of bondspledge of creditterms of bondssale.such bonds shall be signed by the chairman and clerk of the high school board of trustees of the respective high school districts issuing same, and if a corporate seal shall have been adopted it shall be affixed thereto, and such bonds may pledge the full faith and credit of the high school district issuing same."
9,1925,324,0.0,134,"any and all bonds which may be issued under the authority of this act shall be serial coupon bonds and be payable in annual installments of onetwentieth part of the principal each year from the date or dates of their issue, so that the entire issue of such bonds for each respective high school district issuing same shall mature at the end of twenty years from the date or dates of issue."


## Some Final Touches

### Remove Unneeded Columns

In [60]:
df_cleaned.drop('num_words', axis=1, inplace=True)

In [61]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Coding (Axton),Act
0,1925,324,,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils."
1,1925,324,,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county."
2,1925,324,,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.
3,1925,324,,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education."
4,1925,324,,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county."


### Remove empty rows

In [62]:
df_cleaned = df_cleaned[df_cleaned['Act'].str.len() != 0]

### Remove rows which have only spaces

In [63]:
df_cleaned = df_cleaned[~df_cleaned.Act.str.isspace()]

### Remove rows with lower lengths

In [64]:
df_cleaned = df_cleaned[df_cleaned['Act'].str.len() > 5]

### Rearrange Columns

In [65]:
cols

['Year', 'Page_No', 'Act', 'Coding (Axton)']

In [66]:
df_cleaned = df_cleaned[cols]

## Save

In [67]:
df_cleaned.head()

Unnamed: 0,Year,Page_No,Act,Coding (Axton)
0,1925,324,"an act to require the county board of education for dillon county to divide the county of dillon into three high school districts, and to provide for the appointment of trustees for said high schools; to authorize each of said high school districts to issue bonds to provide high school buildings; and to levy a tax to pay the interest and principal on said bonds, and to levy a tax in each district to pay the expenses of maintaining such high schools, including transportation of high school pupils.",
1,1925,324,"division of dillon county into high school districtsschools.be it enacted by the general assembly of the state of: south carolina: that the county board of education of dillon county is hereby authorized and required as soon as practicable after the approval of this act to divide the county of dillon into three centralized high school districts by consolidating for high school purposes the existing common school districts, two or more common school districts to form one centralized high school districts ; such high school districts to be so laid off and formed as to best serve the educational interests of said county.",
2,1925,324,each high school district so formed and established by action of said county board shall be a body corporate with a board of high school trustees as hereinafter provided for.,
3,1925,324,"and the said board of high school trustees for each centralized high school district to be established hereunder are hereby authorized, directed and empowered to establish a central high school in each of their respective high school districts at such point in each high school district as said high school trustees deem best; and each of said high schools, when so established, shall be entitled to all the privileges and benefits of centralized high schools approved and accepted by the state board a of education.",
4,1925,324,"trustees.except in such high school districts as may contain an incorporated town having a population of more than one thousand inhabitants, the board of high school trustees of each high school district to be established under the terms of this act shall be composed of five members to be appointed by the county board of education for the same term and with the same qualifications as are now provided for by the law for trustees of the common school districts of the county.",


In [68]:
# Original file name without file type
og_fname = path.split('/')[-1].split('.')[0]

df_cleaned.to_csv(f'./updated_data/{og_fname}_updated.csv', index=False)