In [378]:
from urllib.request import urlopen
import pandas as pd
import numpy as np
import pdfplumber
import requests

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download("stopwords")
# nltk.download("wordnet")

from bs4 import BeautifulSoup

import difflib

from io import BytesIO

In [379]:
#load in supreme court database
#replace read_csv file name with your file name and directory
scdf = pd.read_csv("SCDB_2024_01_caseCentered_Citation 2.csv")

In [380]:
def extract_argument_transcripts(year):
    """
    Extract all URLs on a given page
    Inputs: 
    - year:int or string

    Outputs:
    - td_df:pd df with case name, date argued, truncated oral arg. string and full oral_arg link
    """
    numyear = year
    styear = str(year)
    url = f"https://www.supremecourt.gov/oral_arguments/argument_transcript/{styear}"
    request = requests.get(url)

    soup = BeautifulSoup(request.text, "html.parser")
    
    td = {"case_names" : [],
          "scdb_match" : [],
          "dateArg" : [],
          "oral_arg" : []}
    #find all links on page
    for link in soup.find_all('a'):
        raw_link = link.get('href')
        string_link = str(raw_link)
        #set different search strings based on SC data storage
        if year >= 2010: 
            search_string = "/argument_transcripts/"
        else:
            search_string = "pdfs/transcripts"
            
        if search_string in string_link:
            casename = link.next_element.next_element.next_element.next_element
            cleaned_casename = str(casename).upper()
            date = casename.next_element.next_element.next_element

            cleaned_link = str(raw_link)
            cleaned_link = cleaned_link.replace("..", "")
            #print(cleaned_link)
            
            td["case_names"].append(cleaned_casename)
            td["dateArg"].append(date)
            td["oral_arg"].append(cleaned_link)

            # print(casename)
            # print(raw_link)
            matched_list = difflib.get_close_matches(cleaned_casename, df["caseName"])
            if len(matched_list) != 0:
                matched_name = matched_list[0]
                td["scdb_match"].append(matched_name)
            else:
                td["scdb_match"].append("NA")
                
            
    td_df = pd.DataFrame(td)
    td_df.insert(0, "year", numyear)

    if numyear >= 2010:
        td_df["full_link"] = "https://www.supremecourt.gov/oral_arguments" + td_df["oral_arg"]
    else:
        td_df["full_link"] = "https://www.supremecourt.gov" + td_df["oral_arg"]
    td_df.replace("NA", pd.NA, inplace = True)
    
    return td_df


In [381]:
years = list(range(2000, 2025, 1))
oa_storage = {}

for year in years:
    print(year)
    oadf = extract_argument_transcripts(year)
    oa_storage[year] = oadf
oa_storage

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024


{2000:     year                                         case_names  \
 0   2000                               BECKER V. MONTGOMERY   
 1   2000                                      TYLER V. CAIN   
 2   2000                             NEW HAMPSHIRE V. MAINE   
 3   2000                UNITED STATES V. UNITED FOODS, INC.   
 4   2000                                 ALABAMA V. BOZEMAN   
 ..   ...                                                ...   
 76  2000            LEGAL SERVICES CORPORATION V. VELAZQUEZ   
 77  2000                         CLEVELAND V. UNITED STATES   
 78  2000                                   ARTUZ V. BENNETT   
 79  2000  BOARD OF TRUSTEES OF THE UNIV. OF ALA. V. GARRETT   
 80  2000  BRENTWOOD ACADEMY V. TENNESSEE SECONDARY ATHLE...   
 
                                            scdb_match   dateArg  \
 0                                                <NA>  04/16/01   
 1                                TALLEY v. CALIFORNIA  04/16/01   
 2                  

In [206]:
#merge oral argument transcript dictionary with supreme court database case vars
oa_dict = {}
for year in oa_storage:
    merged = pd.merge(oa_storage[year], scdf, how = "left", left_on = "scdb_match", right_on = "caseName")
    oa_dict[year] = merged
oa_dict

{2000:     year                                         case_names  \
 0   2000                               BECKER V. MONTGOMERY   
 1   2000                                      TYLER V. CAIN   
 2   2000                             NEW HAMPSHIRE V. MAINE   
 3   2000                             NEW HAMPSHIRE V. MAINE   
 4   2000                UNITED STATES V. UNITED FOODS, INC.   
 ..   ...                                                ...   
 79  2000            LEGAL SERVICES CORPORATION V. VELAZQUEZ   
 80  2000                         CLEVELAND V. UNITED STATES   
 81  2000                                   ARTUZ V. BENNETT   
 82  2000  BOARD OF TRUSTEES OF THE UNIV. OF ALA. V. GARRETT   
 83  2000  BRENTWOOD ACADEMY V. TENNESSEE SECONDARY ATHLE...   
 
                                            scdb_match   dateArg  \
 0                                                <NA>  04/16/01   
 1                                TALLEY v. CALIFORNIA  04/16/01   
 2                  

In [351]:
def extract_questions_and_answers(text):
    # Extract all text following "QUESTION: "
    questions = re.findall(r'QUESTION:\s*(.*?)(?=\n (?:MR\.|MS\.|MRS\.|JUSTICE|CHIEF JUSTICE|QUESTION:|$))', text, re.DOTALL)
    
    # Extract responses following speaker names (e.g., "MR. OLSON:", "MS. SMITH:")
    answers = re.findall(r'(?:MR\.|MS\.|MRS\.|JUSTICE|CHIEF JUSTICE)\s+[A-Z]+:\s*(.*?)(?=\n (?:MR\.|MS\.|MRS\.|JUSTICE|CHIEF JUSTICE|QUESTION:|$))', text, re.DOTALL)
    
    # Join extracted texts into separate strings
    questions_text = " ".join(q.strip() for q in questions)
    answers_text = " ".join(a.strip() for a in answers)


    questions_text = questions_text.replace("\n", "")
    answers_text = answers_text.replace("\n", "")
    
    return questions_text, answers_text


In [372]:
def extract_text(url, identifier):
    """
    Use PDF plumber to extract all text
    Inputs:
    -url:str url to extract

    Outputs
    all_text:str all text in the document
    """
    response = requests.get(url)
    if response.status_code == 200:
        document_dict = {"id" : [identifier],
                        "all_text" : [],
                        "question_text" : [],
                        "answer_text" : []}
        all_text = ""
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            doc = pdf.pages
            for p in doc:
                page_num = p.page_number
                print(f"Extracting Page {page_num}")

               
                if page_num == 1:
                    #get information about speakers
                    text = p.extract_text()
                    #test_text = test_text + text

                    attorneys = re.findall(r"([A-Z].*)(?=, ESQ.)", text)

                    for i, atty in enumerate(attorneys):
                        idx = str(i+1)
                        at_col_name = "attorney_" + idx
                        at_ln_col = "attorney_" + idx + "_LN"

                        split_name = atty.split()
                        ln = split_name[-1]
                        if ln.find(".") != -1:
                            ln = split_name[-2]
                            ln = ln.replace(",", "")
                        
                        document_dict[at_col_name] = [atty]
                        document_dict[at_ln_col] = [ln]
                elif page_num == 2:
                    #skips second page
                    continue
                    
                else:
                    #get all text in a page
                    text = p.extract_text()
    
                    #Do some preliminary cleaning
                    text = text.replace("- -", "")
                    text = text.replace("\'", "")
                    text = re.sub(r'\d+', "", text)
                    text = text.replace("Heritage Reporting Corporation", "")
                    text = text.replace("Official — Subject to Final Review", "")
                    text = text.replace("ALDERSON REPORTING COMPANY, INC.  FOURTEENTH STREET, N.W. SUITE  WASHINGTON, D.C.  ()- () FOR DEPO", "")
                    
                    all_text = all_text + text

            q_text, a_text = extract_questions_and_answers(all_text)
            document_dict["question_text"] = [q_text]
            document_dict["answer_text"] = [a_text]

            all_text = all_text.replace("\n", "")
            document_dict["all_text"] = all_text

            df = pd.DataFrame(document_dict)
        return df     
    else:
        errormessage = f"Failed to fetch PDF. Status code: {response.status_code}"
        return errormessage


In [304]:
oa_dict[2000]["full_link"][50]

'https://www.supremecourt.gov/pdfs/transcripts/2000/00-949.pdf'

In [385]:
def extract_text_for_year(year):
    print(f"working on {year}\n")
    to_it = oa_dict[year]
    to_concat = []

    for i, row in to_it.iterrows():
        url = row["full_link"]
        identifier = row["caseName"]
        print(identifier)
        text_df = extract_text(url, identifier)
        to_concat.append(text_df)

        
    text_df = pd.concat(to_concat)

    fdf = pd.merge(to_it, text_df, left_on = "caseName", right_on = "id")

    return fdf

In [389]:
%%time
yrs = list(range(2000, 2006))
for year in yrs:
    oas[year] = extract_text_for_year(year)


working on 2000

nan
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
TALLEY v. CALIFORNIA
Extracting Page 1
Extracting Page 2


Unnamed: 0,year,case_names,scdb_match,dateArg,oral_arg,full_link,caseId,docketId,caseIssuesId,voteId,...,question_text,answer_text,attorney_1,attorney_1_LN,attorney_2,attorney_2_LN,attorney_3,attorney_3_LN,attorney_4,attorney_4_LN
0,2000,BECKER V. MONTGOMERY,,04/16/01,/pdfs/transcripts/2000/00-6374.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,,,,,...,I should know this -- when you file a notice o...,"Well hear argument now in No.-, Dale Becker v....",JEFFREY S. SUTTON,SUTTON,STEWART A. BAKER,BAKER,,,,
1,2000,BECKER V. MONTGOMERY,,04/16/01,/pdfs/transcripts/2000/00-6374.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,,,,,...,"Mr. Ferrini, how does your case differ from th...","Well hear now argument next in -, Norfolk Ship...",JAMES T. FERRINI,FERRINI,PATRICK H. O'DONNELL,O'DONNELL,,,,
2,2000,BECKER V. MONTGOMERY,,04/16/01,/pdfs/transcripts/2000/00-6374.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,,,,,...,"-- forum, the law doesnt apply to me because i...","We will hear argument next in number , the Imm...",EDWIN S. KNEEDLER,KNEEDLER,LUCAS GUTTENTAG,GUTTENTAG,,,,
3,2000,BECKER V. MONTGOMERY,,04/16/01,/pdfs/transcripts/2000/00-6374.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,,,,,...,"Well, what is it on the face of the statutory ...","Well hear argument now in Number -, Lorillard ...",,,,,,,,
4,2000,BECKER V. MONTGOMERY,,04/16/01,/pdfs/transcripts/2000/00-6374.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,,,,,...,Could you tell me how the test works? I take i...,Well hear argument now in number -. Saucier ag...,PAUL D. CLEMENT,CLEMENT,JOHN K. BOYD,BOYD,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,2000,LEGAL SERVICES CORPORATION V. VELAZQUEZ,LEGAL SERVICES CORPORATION v. CARMEN VELAZQUEZ...,10/04/00,/pdfs/transcripts/2000/99-603.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,2000-028,2000-028-01,2000-028-01-01,2000-028-01-01-01,...,"Mr. Levine, does the statute at issue here in ...","Well hear argument next in Number -, Legal Ser...",ALAN LEVINE,LEVINE,EDWIN S. KNEEDLER,KNEEDLER,BURT NEUBORNE,NEUBORNE,,
328,2000,CLEVELAND V. UNITED STATES,CLEVELAND v. UNITED STATES,10/10/00,/pdfs/transcripts/2000/99-804.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,1946-002,1946-002-01,1946-002-01-01,1946-002-01-01-01,...,What would they be punished under? What kind o...,"Well hear argument now in Number -, Carl W. Cl...",PAUL MOGIN,MOGIN,MICHAEL R. DREEBEN,DREEBEN,,,,
329,2000,ARTUZ V. BENNETT,IANCU v. BRUNETTI,10/10/00,/pdfs/transcripts/2000/99-1238.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,2018-064,2018-064-01,2018-064-01-01,2018-064-01-01-01,...,I take it it all comes up because were constru...,"Well hear argument next in Number -, Christoph...",JOHN M. CASTELLANO,CASTELLANO,DAN SCHWEITZER,SCHWEITZER,ALAN S. FUTERFAS,FUTERFAS,,
330,2000,BOARD OF TRUSTEES OF THE UNIV. OF ALA. V. GARRETT,BOARD OF TRUSTEES OF THE UNIVERSITY OF ALABAMA...,10/11/00,/pdfs/transcripts/2000/99-1240.pdf,https://www.supremecourt.gov/pdfs/transcripts/...,2000-022,2000-022-01,2000-022-01-01,2000-022-01-01-01,...,"Lets -- Mr. Sutton, there were congressional f...","Well hear argument now in Number -, the Board ...",JEFFREY S. SUTTON,SUTTON,MICHAEL GOTTESMAN,GOTTESMAN,SETH P. WAXMAN,WAXMAN,,


In [390]:
%%time
yrs = list(range(2006, 2011))
for year in yrs:
    oas[year] = extract_text_for_year(year)

working on 2001

UNITED STATES v. SCOTT
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracting Page 50
Extracting Page 51


In [396]:
%%time
yrs = list(range(2011, 2016))
for year in yrs:
    oas[year] = extract_text_for_year(year)


working on 2011

nan
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracting Page 50
Extracting Page 51
Extracting Page 52


In [399]:
%%time
yrs = list(range(2016, 2021))
for year in yrs:
    oas[year] = extract_text_for_year(year)


working on 2016

PERRY v. MERIT SYSTEMS PROTECTION BD.
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Extracting Page 50
Extr

In [409]:
%%time
yrs = list(range(2020, 2025))
for year in yrs:
    oas[year] = extract_text_for_year(year)


working on 2020

YELLEN v. CONFEDERATED TRIBES OF THE CHEHALIS RESERVATION
Extracting Page 1
Extracting Page 2
Extracting Page 3
Extracting Page 4
Extracting Page 5
Extracting Page 6
Extracting Page 7
Extracting Page 8
Extracting Page 9
Extracting Page 10
Extracting Page 11
Extracting Page 12
Extracting Page 13
Extracting Page 14
Extracting Page 15
Extracting Page 16
Extracting Page 17
Extracting Page 18
Extracting Page 19
Extracting Page 20
Extracting Page 21
Extracting Page 22
Extracting Page 23
Extracting Page 24
Extracting Page 25
Extracting Page 26
Extracting Page 27
Extracting Page 28
Extracting Page 29
Extracting Page 30
Extracting Page 31
Extracting Page 32
Extracting Page 33
Extracting Page 34
Extracting Page 35
Extracting Page 36
Extracting Page 37
Extracting Page 38
Extracting Page 39
Extracting Page 40
Extracting Page 41
Extracting Page 42
Extracting Page 43
Extracting Page 44
Extracting Page 45
Extracting Page 46
Extracting Page 47
Extracting Page 48
Extracting Page 49
Ext

PDFSyntaxError: No /Root object! - Is this really a PDF?

In [414]:
#save oral arguments as CSV
for year in oas:
    df = oas[year]
    path = "/Users/eclin/Desktop/ML_Project/oral_arg/"
    end = f"arguments{year}.csv"

    file_name = path+end

    df.to_csv(file_name)