### Text Scraping

In [165]:
from urllib.request import urlopen
import pandas as pd
import numpy as np
import pdfplumber
import requests

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download("stopwords")
# nltk.download("wordnet")

from bs4 import BeautifulSoup

import difflib

from io import BytesIO
import copy

from wordcloud import WordCloud


<IPython.core.display.Javascript object>

In [59]:
#load in supreme court database
#replace read_csv file name with your file name and directory
scdf = pd.read_csv("SCDB_2024_01_caseCentered_Citation 2.csv")

In [311]:
#check number of cases with avaliable oral argument transcripts within the considered time period
all_rel = scdf[scdf["term"].isin(list(range(2000, 2024, 1)))]
len(all_rel[~all_rel["dateArgument"].isna()])

1664

In [218]:
def extract_argument_transcripts(year):
    """
    Extract all URLs for oral arguments for a given year on the supreme court oral arguments page 
    
    Inputs: 
    - year (int or string)

    Outputs:
    - td_df (pd df): pd dataframe with case name, date argued, truncated oral arg. string and full oral_arg link, matched to informaton
    in Spaeth et al. (2024)'s supreme court data base by case name
    """
    numyear = year
    styear = str(year)
    url = f"https://www.supremecourt.gov/oral_arguments/argument_transcript/{styear}"
    request = requests.get(url)

    soup = BeautifulSoup(request.text, "html.parser")
    
    td = {"case_names" : [],
          "scdb_match" : [],
          "dateArg" : [],
          "oral_arg" : []}
    
    #find all links on page
    for link in soup.find_all('a'):
        raw_link = link.get('href')
        string_link = str(raw_link)
        #set different search strings based on SC data storage patterns
        if year >= 2010: 
            search_string = "/argument_transcripts/"
        else:
            search_string = "pdfs/transcripts"

        #obtain case name based on page setup
        if search_string in string_link:
            casename = link.next_element.next_element.next_element.next_element
            cleaned_casename = str(casename).upper()

            #dictionaries to deal with scraped text title mismatching SCDB convention
            acc = {"EPA" : "ENVIRONMENTAL PROTECTION AGENCY", "CFPB" : "CONSUMER FINANCIAL PROTECTION BUREAU", 
                  "FBI" : "FEDERAL BUREAU OF INVESTIGATION", "CIA" : "CENTRAL INTELLIGENCE AGENCY",
                  "SEC" : "SECURITIES AND EXCHANGE COMMISSION", "NRA" : "NATIONAL RIFLE ASSOCIATION",
                  "INS" : "IMMIGRATION AND NATURALIZATION SERVICE",
                  "NLRB" : "NATIONAL LABOR RELATIONS BOARD",
                  "DOE" : "DEPARTMENT OF EDUCATION",
                  "EEOC" : "EQUAL EMPLOYMENT OPPORTUNITY COMMISSION",
                  "UNITED STATES DIST. COURT FOR D. C." : "UNITED STATES DISTRICT COURT FOR THE DISTRICT OF COLUMBIA",
                  "FCC" : "FEDERAL COMMUNICATIONS COMMISSION",
                  "NASA" : "NATIONAL AERONAUTICS AND SPACE ADMINISTRATION",
                  "FAA" : "FEDERAL AVIATION ADMINISTRATION",
                  "FDIC" : "FEDERAL DEPOSIT INSURANCE CORPORATION", 
                  "NAT. FED'N OF INDEP. BUS." : "NATIONAL FEDERATION OF INDEPENDENT BUSINESS",
                  "IRS" : "INTERNAL REVENUE SERVICE",
                  "CFPB" : "CONSUMER FINANCIAL PROTECTION BUREAU"}

            short = {"COMM'N" : "COMMISSION", 
                    "ASSN." : "ASSOCIATION",
                    "INS." : "INSURANCE",
                    "INT'L" : "INTERNATIONAL",
                    "SERVS." : "SERVICES",
                    "CORP." : "CORPORATION",
                    "DEPT." : "DEPARTMENT"}

            special_match = {"RAYMOND " : "RAYMOND V. WILLIAM T. HENDON", 
                             "ZADVYDAS V. UNDERDOWN" : "KESTUTIS ZADVYDAS V. CHRISTINE G. DAVIS AND IMMIGRATION AND NATURALIZATION SERVICE",
                            "CALCANO-MARTINEZ V. UNITED STATES" : "DEBORIS CALCANO-MARTINEZ, et al. V. IMMIGRATION AND NATURALIZATION SERVICE",
                            "HUNT V. CROMARTIE" : "JAMES B. HUNT, JR., GOVERNOR OF NORTH CAROLINA, et al. V. MARTIN CROMARTIE et al.",
                            "OWASSO INDEPENDENT SCHOOL DIST. NO. I-011 V. FALVO" : "OWASSO INDEPENDENT SCHOOL DISTRICT NO. I-011, AKA OWASSO PUBLIC SCHOOLS, et al. V. KRISTJA J. FALVO, PARENT AND NEXT FRIEND OF HER MINOR CHILDREN, ELIZABETH PLETAN, PHILIP PLETAN, AND ERICA PLETAN",
                            "MEYER V. HOLLY " : "DAVID MEYER, INDIVIDUALLY AND IN HIS CAPACITY AS PRESIDENT AND DESIGNATED OFFICER/BROKER OF TRIAD, INC., ETC. V. EMMA MARY ELLEN HOLLEY, ET VIR, et al.",
                            "MOSELEY V. V. SECRET CATALOGUE, INC." : "VICTOR MOSELEY AND CATHY MOSELEY, DBA VICTOR'S LITTLE SECRET V. V SECRET CATALOGUE, INC., et al.",
                            "VIRGINIA V. MARYLAND" : "VIRGINIA V. MARYLAND",
                            "ASHCROFT V. RAICH" : "ALBERTO R. GONZALES, ATTORNEY GENERAL, et al. V. ANGEL MCCLARY RAICH et al.",
                            "ROUSEY V. JACOWAY" : "RICHARD GERALD ROUSEY, ET UX. V. JILL R. JACOWAY",
                            "HUDSON V. MICHIGAN (REARGUED)" : "BOOKER T. HUDSON, JR. V. MICHIGAN",
                            "CENTRAL VA. COMMUNITY COLLEGE V. KATZ" : "CENTRAL VIRGINIA COMMUNITY COLLEGE, et al. V. BERNARD KATZ, LIQUIDATING SUPERVISOR FOR WALLACE'S BOOKSTORES, INC",
                            "BOBBY V. BIES" : "DAVID BOBBY, WARDEN V. MICHAEL BIES",
                            "BARTLETT V. STRICKLAND" : "GARY BARTLETT, EXECUTIVE DIRECTOR OF THE NORTH CAROLINA STATE BOARD OF ELECTIONS et al. V. DWIGHT STRICKLAND et al.",
                            "HENDERSON V. SHINSEKI" : "DORETHA H. HENDERSON, AUTHORIZED REPRESENTATIVE OF DAVID L. HENDERSON, DECEASED V. PETITIONER C. ERIC K. SHINESKI, SECRETARY OF VETERANS AFFAIRS",
                            "CHASE BANK USA, N. A. V. MCCOY" : "CHASE BANK USA, NA., PETITIONER V. JAMES A. MCCOY, INDIVIDUALLY AND ON BEHALF OF ALL OTHERS SIMILARLY SITUATED",
                            "SCHUETTE V. BAMN" : "SCHUETTE V. COALITION TO DEFEND AFFIRMATIVE ACTION",
                            "TRINITY LUTHERAN CHURCH OF COLUMBIA, INC. V. COMER" : "TRINITY LUTHERAN CHURCH V. COMER, DIRECTOR, MISSOURI DEPARTMENT OF NATURAL RESOURCES",
                            "OH ADJUTANT GEN.'S DEPT. V. FLRA" : "OHIO ADJUTANT GENERAL'S DEPARTMENT V. FEDERAL LABOR RELATIONS AUTHORITY",
                            "VIDAL, UNDER SEC. OF COMM. V. ELSTER" : "VIDAL V. ELSTER"}


            #deal with a few exceptional circumstance case names where the names are not correctly extracted from transcript
            if (cleaned_casename == "IN RE GRAND JURY") or (cleaned_casename == "NETCHOICE, LLC V. PAXTON"):
                cleaned_casename = "CANT MATCH V. CANT MATCH"
            if "REARGUED" in cleaned_casename:
                cleaned_casename = cleaned_casename.replace(" (REARGUED)", "")


            #clean case name string based on dictionaries above
            if cleaned_casename in special_match:
                cleaned_casename = special_match[cleaned_casename]
                
            if any(shortening in cleaned_casename for shortening in short):
                matches = [key for key in short if key in cleaned_casename]
                for item in matches:
                    cleaned_casename = cleaned_casename.replace(item, short[item])
        
            #split case names for future processing
            casename_list = cleaned_casename.split(" ")
            casename_parties = cleaned_casename.split(" V. ")

            #print(casename_parties)
            casename_parties1 = casename_parties[0]
            casename_parties2 = casename_parties[1]
            
            if casename_parties1 in acc:
                casename_parties1 = acc[casename_parties1]
            if casename_parties2 in acc:
                casename_parties2 = acc[casename_parties2]
            
            parties_search = f"{casename_parties1}.*{casename_parties2}|{casename_parties2}.*{casename_parties1}"

            if casename_parties[1] == "UNITED STATES":
                casename_list = casename_list[:-2] + [casename_parties2]
            elif casename_parties[0] == "UNITED STATES":
                casename_list =  [casename_parties1] + casename_list[2:]

            #print(cleaned_casename)
            #print(casename_list)

            date = casename.next_element.next_element.next_element

            cleaned_link = str(raw_link)
            cleaned_link = cleaned_link.replace("..", "")
            #print(cleaned_link)
            
            td["case_names"].append(cleaned_casename)
            td["dateArg"].append(date)
            td["oral_arg"].append(cleaned_link)

            # print(casename)
            # print(raw_link)
            year_df = scdf[(scdf["term"] == numyear) | (scdf["term"] == numyear - 1) | (scdf["term"] == numyear + 1)]
            
            matched_list = difflib.get_close_matches(cleaned_casename, year_df["caseName"], cutoff = 0.55)
            #print(matched_list)
            
            if (len(casename_list) == 3):
                print("hit")
                search = year_df[year_df['caseName'].str.contains(parties_search)==True]

                if len(search) != 0:
                    print("hit if")
                    matched_name = search["caseName"].values[0]
                    td["scdb_match"].append(matched_name)
                else:
                    if len(matched_list) != 0:
                        print("hit if in else")
                        matched_name = matched_list[0]
                        td["scdb_match"].append(matched_name)
                    else:
                        print("hit else in else")
                        td["scdb_match"].append("NA")
                        
            elif (len(casename_list) != 3) & (len(matched_list) != 0):
                matched_name = matched_list[0]
                td["scdb_match"].append(matched_name)
            elif (len(casename_list) != 3) & (len(matched_list) == 0):

                search = year_df[year_df['caseName'].str.contains(parties_search)==True]
                
                if len(search) == 1:
                    matched_name = search["caseName"].values[0]
                    td["scdb_match"].append(matched_name)
                else:
                    td["scdb_match"].append("NA")
            else:
                td["scdb_match"].append("NA")
                
            
    td_df = pd.DataFrame(td)
    td_df.insert(0, "year", numyear)

    if numyear >= 2010:
        td_df["full_link"] = "https://www.supremecourt.gov/oral_arguments" + td_df["oral_arg"]
    else:
        td_df["full_link"] = "https://www.supremecourt.gov" + td_df["oral_arg"]
    td_df.replace("NA", pd.NA, inplace = True)
    
    return td_df


In [None]:
#note: these functions need to be finetuned since they aren't perfomring a perfect split of the text
def split_text(text):
    """
    Function to split text into different speaker utterances with speaker tags
    
    Inputs: 
    - text(string) : oral argument text

    Outputs:
    - split_text(list of tuples, (speaker tag, text):  lists of tuples containing speaker tag and the text for the utterance
    """
    pattern = re.compile(r"(QUESTION|MS\.|MRS\.|JUSTICE \w+|CHIEF JUSTICE \w+|MR\. \w+):\s*(.*?)\s*(?=(?:QUESTION|MS\.|MRS\.|JUSTICE \w+|CHIEF JUSTICE \w+|MR\. \w+):|$)", re.DOTALL)
    
    results = [(match[0], match[1].strip()) for match in pattern.findall(text)]
    
    return results
    
def split_by_speaker_category(results):
    """
    Function to split oral argument transcript text into questions posed by justices and answers issued by petitioners / respondents

    Inputs:
    - results (list of tuples, (speaker tag, text)) : can be obtained from function split_text

    Outputs: 
    - tuple (justice text, attorneys text): tuple of strings containing all utterances by justices and all utterances by attorneys
    """
    judicial_text = []
    attorney_text = []
    
    for speaker, text in results:
        if speaker.startswith("QUESTION") or "JUSTICE" in speaker:
            judicial_text.append(text)
        elif speaker.startswith("MR.") or speaker.startswith("MRS.") or speaker.startswith("MS."):
            attorney_text.append(text)
    
    return " ".join(judicial_text), " ".join(attorney_text)

In [157]:
def extract_text(url, identifier):
    """
    Use PDF plumber to extract all text from a given document 
    
    Inputs:
    -url(str): url for pdf document to extract
    -identifier(string): identifer to allow for easier merging / pose processing of documents. default = the name of each case

    Outputs
    - df(pd df): dataframe containing all text within a document, all text extracted by split_by_speaker_category to be texts uttered by
    justices or by attorneys
    OR
    - error message: attempting to load the page with URL encountered a status 200 error.
    """
    response = requests.get(url)
    if response.status_code == 200:
        document_dict = {"id" : [identifier],
                         "raw_text" : [],
                        "all_text" : [], 
                        "justice_text" : [],
                        "filer_text" : []}
        all_text = ""
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            doc = pdf.pages
            for p in doc:
                page_num = p.page_number
                print(f"Extracting Page {page_num}")

               
                if page_num == 1:
                    #get information about speakers
                    text = p.extract_text()
                    #test_text = test_text + text

                    attorneys = re.findall(r"([A-Z].*)(?=, ESQ.)", text)

                    for i, atty in enumerate(attorneys):
                        idx = str(i+1)
                        at_col_name = "attorney_" + idx
                        at_ln_col = "attorney_" + idx + "_LN"

                        split_name = atty.split()
                        ln = split_name[-1]
                        if ln.find(".") != -1:
                            ln = split_name[-2]
                            ln = ln.replace(",", "")
                        
                        document_dict[at_col_name] = [atty]
                        document_dict[at_ln_col] = [ln]
                elif page_num == 2:
                    #skips second page
                    continue
                    
                else:
                    #get all text in a page
                    text = p.extract_text()
    
                    #Do some preliminary cleaning
                    text = text.replace("- -", "")
                    text = text.replace("\'", "")
                    text = re.sub(r'\d+', "", text)
                    text = text.replace("Heritage Reporting Corporation", "")
                    text = text.replace("Official — Subject to Final Review", "")
                    
                    aldo = r"\nALDERSON REPORTING COMPANY, INC\.\n FOURTEENTH STREET, N\.W\.\nSUITE \nWASHINGTON, D\.C\. \n\(\)-\n\(\) FOR DEPO"
                    pattern = re.compile(aldo, re.S)
                    text = re.sub(pattern, ' ', text)

                                 
                    all_text = all_text + text

            clean_text = re.findall(r'(?:MR\.|MS\.|MRS\.|QUESTION|JUSTICE|CHIEF JUSTICE)\s+[A-Z]+:\s*(.*?)(?=\n (?:MR\.|MS\.|MRS\.|QUESTION|JUSTICE|CHIEF JUSTICE|QUESTION:|$))', all_text, re.DOTALL)
            clean_text = " ".join(clean_text)
            clean_text = clean_text.replace("\n", "")

            raw_text = all_text.replace("\n", "")

            split = split_text(raw_text)
            justice_text, filer_text = split_by_speaker_category(split)
            
            document_dict["raw_text"] = raw_text
            document_dict["all_text"] = clean_text
            document_dict["justice_text"] = justice_text
            document_dict["filer_text"] = filer_text

            df = pd.DataFrame(document_dict)
        return df     
    else:
        errormessage = f"Failed to fetch PDF. Status code: {response.status_code}"
        return errormessage


In [158]:
def extract_text_for_year(dictionary, year, case_name_col = "case_names"):
    """
    Function to extract the oral arguemnt texts for a given year

    Inputs: 
    - dictionary (dict): Dictionary containing identifiers and links to oral arguments, can be obtained from extract_argument_transcripts
    - year (int) : year information to print

    Outputs:
    - fdf (pd df): dataframe containing URL to extract, as well as all text for each case within the dataframe
    """
    print(f"working on {year}\n")
    to_it = dictionary
    to_concat = []

    for i, row in to_it.iterrows():
        url = row["full_link"]
        identifier = row[case_name_col]
        print(identifier)
        text_df = extract_text(url, identifier)
        to_concat.append(text_df)

        
    text_df = pd.concat(to_concat)

    fdf = pd.merge(to_it, text_df, left_on = case_name_col, right_on = "id")

    return fdf

In [312]:
#obtain links of all oral arguments from supreme court website 
%%time
oa_storage = {}
years = list(range(2000, 2025, 1))
for yr in years: 
    print(yr)
    doc_name = f"./matched_df/to_match_df_{yr}.csv"
    df = extract_argument_transcripts(yr)
    oa_storage[yr] = df

    df.to_csv(doc_name)

2000
['BECKER', 'MONTGOMERY']
BECKER V. MONTGOMERY
['BECKER', 'V.', 'MONTGOMERY']
hit
hit if
['TYLER', 'CAIN']
TYLER V. CAIN
['TYLER', 'V.', 'CAIN']
hit
hit if
['NEW HAMPSHIRE', 'MAINE']
NEW HAMPSHIRE V. MAINE
['NEW', 'HAMPSHIRE', 'V.', 'MAINE']
['UNITED STATES', 'UNITED FOODS, INC.']
UNITED STATES V. UNITED FOODS, INC.
['UNITED STATES', 'V.', 'UNITED', 'FOODS,', 'INC.']
['ALABAMA', 'BOZEMAN']
ALABAMA V. BOZEMAN
['ALABAMA', 'V.', 'BOZEMAN']
hit
hit if
['CEDRIC KUSHNER PROMOTIONS, LTD.', 'KING']
CEDRIC KUSHNER PROMOTIONS, LTD. V. KING
['CEDRIC', 'KUSHNER', 'PROMOTIONS,', 'LTD.', 'V.', 'KING']
['NORFOLK SHIPBUILDING & DRYDOCK CORPORATION', 'GARRIS']
NORFOLK SHIPBUILDING & DRYDOCK CORPORATION V. GARRIS
['NORFOLK', 'SHIPBUILDING', '&', 'DRYDOCK', 'CORPORATION', 'V.', 'GARRIS']
['IDAHO', 'UNITED STATES ']
IDAHO V. UNITED STATES 
['IDAHO', 'V.', 'UNITED', 'STATES', '']
['POLLARD', 'E. I. DUPONT DE NEMOURS & CO.']
POLLARD V. E. I. DUPONT DE NEMOURS & CO.
['POLLARD', 'V.', 'E.', 'I.', 'DUPONT'

In [313]:
#merge oral argument transcript dictionary with supreme court database case vars
oa_dict = {}
for year in oa_storage:
    merged = pd.merge(oa_storage[year], scdf, how = "left", left_on = "scdb_match", right_on = "caseName")
    
    oa_dict[year] = merged

oa_dict = pd.concat(oa_dict)
oa_dict = oa_dict.drop_duplicates(subset = "caseId", keep = "first")
oa_dict = oa_dict.reset_index(drop = True)

In [314]:
#check number of cases
len(oa_dict)

1639

In [319]:
#remove all non-unanimous cases from df
oa_non_unan = {}
sum_non_unan = {"year" : [],
               "number_total" : [],
               "non_unan" : []}

years = list(range(2000, 2024, 1))
for year in years:
    df = oa_dict[oa_dict["year"] == year]
    
    #skip 2024 since supreme court data base does not yet contain 2024 case information
    if year == 2024:
        continue

    #filter unmatched cases and drop duplicate matches
    df = df[~df["caseId"].isna()]
    df = df.drop_duplicates(subset = "caseId", keep = "first")

    #filter non-unanimous votes from dataframe
    new_df = df[(df["majVotes"] != 9) & (df["minVotes"] != 9)]
    
    oa_non_unan[year] = new_df
    
    sum_non_unan["year"].append(year)
    sum_non_unan["number_total"].append(len(df))
    sum_non_unan["non_unan"].append(len(new_df))

#construct summary statistics for cases
summary_df_nonunan = pd.DataFrame(sum_non_unan)
summary_df_nonunan["percentage_nonunan"] =  (summary_df_nonunan["non_unan"] / summary_df_nonunan["number_total"]) * 100

print(summary_df_nonunan.to_latex(index = False, float_format="%.2f"))


\begin{tabular}{rrrr}
\toprule
year & number_total & non_unan & percentage_nonunan \\
\midrule
2000 & 81 & 49 & 60.49 \\
2001 & 75 & 49 & 65.33 \\
2002 & 72 & 44 & 61.11 \\
2003 & 71 & 42 & 59.15 \\
2004 & 65 & 46 & 70.77 \\
2005 & 74 & 47 & 63.51 \\
2006 & 69 & 46 & 66.67 \\
2007 & 69 & 48 & 69.57 \\
2008 & 70 & 45 & 64.29 \\
2009 & 75 & 45 & 60.00 \\
2010 & 74 & 58 & 78.38 \\
2011 & 68 & 43 & 63.24 \\
2012 & 70 & 40 & 57.14 \\
2013 & 68 & 29 & 42.65 \\
2014 & 71 & 49 & 69.01 \\
2015 & 76 & 67 & 88.16 \\
2016 & 68 & 63 & 92.65 \\
2017 & 63 & 41 & 65.08 \\
2018 & 67 & 45 & 67.16 \\
2019 & 54 & 37 & 68.52 \\
2020 & 59 & 38 & 64.41 \\
2021 & 62 & 47 & 75.81 \\
2022 & 58 & 31 & 53.45 \\
2023 & 59 & 35 & 59.32 \\
\bottomrule
\end{tabular}



In [317]:
#total number of successfully matched cases
sum(summary_df_nonunan["number_total"])

1638

In [318]:
#total number of non-unanimous cases 
sum(summary_df_nonunan["non_unan"])

1084

In [320]:
#due to extraction time, we break up our time periods into periods of 5 years 
%%time
yrs = list(range(2000, 2006, 1))
for year in yrs:
    to_it = oa_non_unan[year]
    oas[year] = extract_text_for_year(to_it, year)
        

working on 2000

TYLER V. CAIN
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracting Page 50
NEW HAMPSHIRE V. MAINE
Extra

In [321]:
%%time
yrs = list(range(2006, 2011, 1))
for year in yrs:
    to_it = oa_non_unan[year]
    oas[year] = extract_text_for_year(to_it, year)
        

working on 2006

POWEREX CORPORATION V. RELIANT ENERGY SERVICES, INC.
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracti

In [323]:
%%time
yrs = list(range(2011, 2015, 1))
for year in yrs:
    to_it = oa_non_unan[year]
    oas[year] = extract_text_for_year(to_it, year)

working on 2011

CHRISTOPHER V. SMITHKLINE BEECHAM CORPORATION
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracting Page

In [326]:
%%time
yrs = list(range(2015, 2021, 1))
for year in yrs:
    to_it = oa_non_unan[year]
    oas[year] = extract_text_for_year(to_it, year)
        

working on 2015

UNITED STATES V. TEXAS
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracting Page 50
Extracting Page 51


In [327]:
%%time
yrs = list(range(2021, 2024, 1))
for year in yrs:
    to_it = oa_non_unan[year]
    oas[year] = extract_text_for_year(to_it, year)
        

working on 2021

UNITED STATES V. WASHINGTON
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracting Page 50
Extracting Pag

In [234]:
#check that all years have been properly processed
oas.keys()

dict_keys([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [328]:
#save oral arguments as json file
for year in oas:
    df = oas[year]
    path = "/Users/eclin/Desktop/ML_Project/oral_arg/"
    end = f"arguments{year}.json"

    file_name = path+end

    df.to_json(file_name)