# Discrepancy Fix
Fixing and cleaning all 100 data files.
You must have `final_splits.csv` file which should contain the aggregated, previously split sentences.

In [1]:
import os
import nltk
import pandas as pd
import multiprocessing

from splitting_functs_discrepancy import *
from numpy import nan, array_split

pd.set_option('display.max_colwidth', None)

## Data Aquiring

In [2]:
path = "/work/otb-lab/Split_Cleanup_Updated/original_results/final_splits.csv"
df = pd.read_csv(path, encoding='utf8')

In [3]:
# Either of the following 2 lines of code might be needed for some files
# df = df.drop('Unnamed: 0', axis=1)
# df = df.rename({'Act_No': 'Page_No'}, axis = 1)

In [4]:
cols = list(df.columns)
cols

['id',
 'law_type',
 'state',
 'sentence',
 'length',
 'start_page',
 'end_page',
 'act',
 'section',
 'path']

In [5]:
df.head()

Unnamed: 0,id,law_type,state,sentence,length,start_page,end_page,act,section,path
0,1868-69_0000,Act,SOUTH CAROLINA,"AN ACT ACCEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",285,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
1,1868-69_0001,Act,SOUTH CAROLINA,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1161,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
2,1868-69_0002,Act,SOUTH CAROLINA,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",979,71,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
3,1868-69_0003,Act,SOUTH CAROLINA,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",197,72,72,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg
4,1868-69_0004,Act,SOUTH CAROLINA,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight.",122,72,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg


In [6]:
df.shape[0]

403481

## Cleaning

### Remove Unneeded Characters
Some unusual characters appear in the text. This section finds and removes them.
<br>Do note that the code will remove any character that falls outside the standard ASCII range (0-127).
But, this might also remove legitimate non-ASCII characters that are part of other languages or symbols.

In [7]:
# Function to find and list unique non-ASCII characters
def find_weird_characters(text):
    # Use a regular expression to find non-ASCII characters
    weird_characters = re.findall(r'[^\x00-\x7F]', text)
    return list(set(weird_characters))  # Return unique characters

# Apply the find_weird_characters function to the text column
df['weird_characters'] = df['sentence'].apply(find_weird_characters)

In [8]:
# Function to remove non-ASCII characters
def remove_non_ascii(text):
    # Use a regular expression to find and remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# Apply the remove_non_ascii function to the text column
df['cleaned_sent'] = df['sentence'].apply(remove_non_ascii)

In [9]:
# Display those rows which have weird characters
df[df['weird_characters'].str.len() != 0]

Unnamed: 0,id,law_type,state,sentence,length,start_page,end_page,act,section,path,weird_characters,cleaned_sent
0,1868-69_0000,Act,SOUTH CAROLINA,"AN ACT ACCEPTING THE BENEFITS OF “AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS,” APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",285,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"[”, “]","AN ACT ACCEPTING THE BENEFITS OF AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS, APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO."
1,1868-69_0001,Act,SOUTH CAROLINA,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled “ An Act to amend the fifth Section of an Act entitled ‘An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,’” approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1161,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"[”, “, ’, ‘]","Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled An Act to amend the fifth Section of an Act entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1."
2,1868-69_0002,Act,SOUTH CAROLINA,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled “An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts,” and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",979,71,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"[”, “]","Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire."
9,1868-69_0009,Act,SOUTH CAROLINA,"7 WENTYFIVE Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the Governor be, and he is hereby, authorized and empowered to negotiate a loan of one hundred and twentyfive thousand dollars, or so much thereof as is necessary to meet the current expenses of the State, at the lowest rate of interest possible; and that, for this purpose, he is authorized’ to use, as collateral security, such an amount of the Bills Receivable, bonds, stocks, or other securities, owned by the State, as may be necessary to effect the said loan; and the State officers having such Bills Receivable, bonds, stocks, or other securities, in their custody, are hereby authorized and required to deliver the same to the Governor, when called on, for this purpose.",864,72,72,2,7,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,[’],"7 WENTYFIVE Be tt enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the Governor be, and he is hereby, authorized and empowered to negotiate a loan of one hundred and twentyfive thousand dollars, or so much thereof as is necessary to meet the current expenses of the State, at the lowest rate of interest possible; and that, for this purpose, he is authorized to use, as collateral security, such an amount of the Bills Receivable, bonds, stocks, or other securities, owned by the State, as may be necessary to effect the said loan; and the State officers having such Bills Receivable, bonds, stocks, or other securities, in their custody, are hereby authorized and required to deliver the same to the Governor, when called on, for this purpose."
13,1868-69_0013,Act,SOUTH CAROLINA,"Srcrion 1. Be it enacted by the Senate and House of Representatives of the State pf South Carolina, now met and sitting in General Assembly, and by the authority of the same, That William C. Langley, Samuel Keyser and Charles D. ook, and others, and their associates and successors, are hereb orate, under te made and created a body politic and co: the name and sty of “Langley Manufacturing Company ~p for the purpose of manufacturing cotton yarns and cloths, paper, and such other fabrics as the demands of the community may require, and for procuring and making such machine to carry on said manufactures; and also for the transaction of all such business as may be connected with the above purposes, with a capital of three hundred thousand dollars, with the privilege to increase it to any extent not exceeding six hundred thousand dollars, the consent of a m obtained.",879,72,73,3,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,[“],"Srcrion 1. Be it enacted by the Senate and House of Representatives of the State pf South Carolina, now met and sitting in General Assembly, and by the authority of the same, That William C. Langley, Samuel Keyser and Charles D. ook, and others, and their associates and successors, are hereb orate, under te made and created a body politic and co: the name and sty of Langley Manufacturing Company ~p for the purpose of manufacturing cotton yarns and cloths, paper, and such other fabrics as the demands of the community may require, and for procuring and making such machine to carry on said manufactures; and also for the transaction of all such business as may be connected with the above purposes, with a capital of three hundred thousand dollars, with the privilege to increase it to any extent not exceeding six hundred thousand dollars, the consent of a m obtained."
...,...,...,...,...,...,...,...,...,...,...,...,...
403440,1968_6649,Act,SOUTH CAROLINA,"The Senator and the House Members shall instruct the county auditor to levy the millage which in their discretion is necessary for school purposes on or before April fifteenth.” SECTION 2. Time effective This act shall take effect upon approval by the Governor. Approved the 9th day of July, 1968.",301,996,996,1371,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/0996.tiff,[”],"The Senator and the House Members shall instruct the county auditor to levy the millage which in their discretion is necessary for school purposes on or before April fifteenth. SECTION 2. Time effective This act shall take effect upon approval by the Governor. Approved the 9th day of July, 1968."
403453,1968_6662,Act,SOUTH CAROLINA,"Any board member who serves for eight consecutive years shall not be eligible for reappointment for at least one year after the expiration of his last term.” SECTION 2. Time effective This act shall take effect upon approval by the Governor. Approved the 12th day of July, 1968.",282,997,997,1373,50,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/0997.tiff,[”],"Any board member who serves for eight consecutive years shall not be eligible for reappointment for at least one year after the expiration of his last term. SECTION 2. Time effective This act shall take effect upon approval by the Governor. Approved the 12th day of July, 1968."
403457,1968_6666,Act,SOUTH CAROLINA,"R1238 is amended by adding the following at the end of the section: “Effective January 1, 1969, the county supervisor shall have authority to pay claims authorized by the annual county supply appropriation approved by the Board of Commissioners without further approval of the board.” SECTION 2. Time effectiveThis act shall take effect upon approval by the Governor. Approved the 12th day of July, 1968.",408,997,998,1374,1238,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/0997.tiff,"[”, “]","R1238 is amended by adding the following at the end of the section: Effective January 1, 1969, the county supervisor shall have authority to pay claims authorized by the annual county supply appropriation approved by the Board of Commissioners without further approval of the board. SECTION 2. Time effectiveThis act shall take effect upon approval by the Governor. Approved the 12th day of July, 1968."
403475,1968_6684,Act,SOUTH CAROLINA,"In the event of a tie vote on any question before the board, the county board of administrators shall be called to sit with the board and vote upon such matters, and a majority vote of the whole shall settle the question at issue.” SECTION 8.",247,1000,1000,1374,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff,[”],"In the event of a tie vote on any question before the board, the county board of administrators shall be called to sit with the board and vote upon such matters, and a majority vote of the whole shall settle the question at issue. SECTION 8."


In [10]:
df.drop(['sentence', 'weird_characters'], axis=1, inplace=True)
df.rename({'cleaned_sent': 'sentence'}, axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,id,law_type,state,length,start_page,end_page,act,section,path,sentence
0,1868-69_0000,Act,SOUTH CAROLINA,285,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT ACCEPTING THE BENEFITS OF AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS, APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO."
1,1868-69_0001,Act,SOUTH CAROLINA,1161,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled An Act to amend the fifth Section of an Act entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1."
2,1868-69_0002,Act,SOUTH CAROLINA,979,71,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire."
3,1868-69_0003,Act,SOUTH CAROLINA,197,72,72,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned."
4,1868-69_0004,Act,SOUTH CAROLINA,122,72,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight."


### Trim extra spaces around text

In [12]:
df['sentence'] = df['sentence'].str.strip()

### Remove unneeded phrases
Remove words like 'section' or 'sec' from the start of the text.

In [13]:
pat = r'^((s|e|r|c){1,}(\S)+)(\.|,|:|;| ){0,2}([\d]{1,3})(. |.| |){1}'
df['sentence'] = df['sentence'].str.replace(pat, '', flags=re.IGNORECASE, regex=True, n=1)

In [14]:
df.head()

Unnamed: 0,id,law_type,state,length,start_page,end_page,act,section,path,sentence
0,1868-69_0000,Act,SOUTH CAROLINA,285,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"AN ACT ACCEPTING THE BENEFITS OF AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS, APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO."
1,1868-69_0001,Act,SOUTH CAROLINA,1161,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled An Act to amend the fifth Section of an Act entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1."
2,1868-69_0002,Act,SOUTH CAROLINA,979,71,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire."
3,1868-69_0003,Act,SOUTH CAROLINA,197,72,72,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned."
4,1868-69_0004,Act,SOUTH CAROLINA,122,72,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight."


### Rearrange Columns

In [15]:
# Rearrange to match the original columns
df = df[cols]

In [16]:
df

Unnamed: 0,id,law_type,state,sentence,length,start_page,end_page,act,section,path
0,1868-69_0000,Act,SOUTH CAROLINA,"AN ACT ACCEPTING THE BENEFITS OF AN ACT DONATING PUBLIC LANDS TO THE SEVERAL STATES AND TERRITORIES WHICH MAY PROVIDE COLLEGES FOR THE BENEFIT OF AGRICULTURE AND THE MECHANIC ARTS, APPROVED THE SECOND DAY OF JULY, IN THE YEAR OF OUR LORD ONE THOUSAND EIGHT HUNDRED AND SIXTYTWO.",285,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
1,1868-69_0001,Act,SOUTH CAROLINA,"Whereas, by an Act of Congress, approved the twentythird day of July, in the year of our Lord one thousand eight hundred and sixtysix, entitled An Act to amend the fifth Section of an Act entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, approved the second day of July, ig the year of our Lord one thousand eight hundred and sixtytwo, so as to extend the time within which the provisions of said Act shall be accepted and such colleges established, it was, among other things, by the Senate and House of Representatives of the United States of America, in Congress assembled, enacted that the time in which the several States may comply with the provisions of the said Act of July second, eighteen hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, is extended so that the acceptance of the benefits of the said Act may be expressed within three years from the passage of the Act first above mentioned: Section 1.",1161,71,71,1,0,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
2,1868-69_0002,Act,SOUTH CAROLINA,"Beit enacted by the Senate and House of Representatives of the State of South Carolina, now met and sitting in General Assembly, and by the authority of the same, That the State of South Carolina does hereby express its acceptance of the benefitsof the said Act of Congress, approved on the second day of July, in the year of our Lord one thousand eight hundred and sixtytwo, entitled An Act donating public lands to the several States and Territories which may provide colleges for the benefit of agriculture and the mechanic arts, and does hereby assent to the provisions in said Act contained, and to the conditions on which the grant of land and scrip by said Act authorized is made, and binds herself to the faithful performance of all the stipulations by her to be assumed in said Act contained; and it is further desired, that the State may be allowed to use the same for the establishment and support of a system of common free schools, if the State may so desire.",979,71,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/071.jpg
3,1868-69_0003,Act,SOUTH CAROLINA,"Upon the passage of this Act, the Governor of the State is authorized to take such measures as he may deem necessary to secure the early realization of the benefits of the Act above mentioned.",197,72,72,1,2,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg
4,1868-69_0004,Act,SOUTH CAROLINA,"In the Senate House, the twentysecond day of July, in the year of our Lord one thousand eight hundred and sixtyeight.",122,72,72,1,1,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1868-69/images/072.jpg
...,...,...,...,...,...,...,...,...,...,...
403476,1968_6685,Act,SOUTH CAROLINA,Buy and sell real estate.,30,1000,1000,1374,8,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
403477,1968_6686,Act,SOUTH CAROLINA,"63.1 of the 1962 Code, relating to the purchase and sale of real estate by the Lancaster County Board of Directors, is amended by striking it and inserting : section 142563.1.",192,1000,1000,1374,1425,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
403478,1968_6687,Act,SOUTH CAROLINA,The board is authorized to buy any real estate needed for county purposes.,79,1000,1000,1374,1425,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff
403479,1968_6688,Act,SOUTH CAROLINA,"The board is further authorized to sell any real estate belonging to the county, except school property, when the property is no longer needed for county purposes.",168,1000,1000,1374,1425,https://emailsc.sharepoint.com/:i:/r/sites/COTEAM-ULIB-OntheBooks/Shared%20Documents/General/OCRed/1968/images/1000.tiff


## Save

In [17]:
# Create an updated directory if it doesn't exist
os.makedirs('./updated_data', exist_ok=True)

In [18]:
# Original file name without file type
og_fname = path.split('/')[-1].split('.')[0]

In [19]:
updated_fname = og_fname + '_Nov3'
updated_fname

'final_splits_Nov3'

In [20]:
# Save the big csv
df.to_csv(f'./updated_data/{updated_fname}.csv', index=False)

### Split into 100 csvs

In [21]:
# Get a list of years
years = list(df['id'].str.split('_').str[0].unique())

In [22]:
# Save a csv for each year
for year in years:
    df[df['id'].str.startswith(year)].to_csv(f'./updated_data/{year}_{updated_fname}.csv', index=False)