In [1]:
import requests

# Fetch the site's robots.txt and print it so the crawl rules can be read
# before any scraping is done.
url = 'http://caselaw.findlaw.com/robots.txt'
response  = requests.get(url)
print(response.text)

# Findlaw robots.txt file

User-Agent: *
Disallow:
Sitemap: https://caselaw.findlaw.com/sitemap.xml


In [2]:
# Request a single year-listing page (year 2000) as a trial fetch.
url_test = 'https://caselaw.findlaw.com/court/us-supreme-court/years/2000'

response = requests.get(url_test)

In [3]:
# Display the HTTP status code carried by the current `response` object.
response.status_code

200

In [4]:
from bs4 import BeautifulSoup
import time, os
import pandas as pd 

In [5]:
def table_data(soup):
    """Extract the data rows of the case-listing table from a parsed page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a page containing a
        ``<table class="responsive-card-table unstriped">``.

    Returns
    -------
    list of list of str
        One list per table row after the first row: the stripped text of
        each ``<td>`` cell, followed by the ``href`` of every link in that
        row that actually has one.

    Raises
    ------
    AttributeError
        If the page has no matching table, because ``soup.find`` then
        returns ``None``.
    """

    def get_data(tr, tag='td'):
        """Return the cell texts of ``tr`` followed by its link targets."""
        text_list = [cell.get_text(strip=True) for cell in tr.find_all(tag)]
        # `select()` takes a CSS selector and does not filter on keyword
        # attributes, so `select('a', href=True)` also yielded anchors that
        # have no href and `a['href']` then raised KeyError. `find_all`
        # does honour `href=True`.
        href = [a['href'] for a in tr.find_all('a', href=True)]
        return text_list + href

    table = soup.find('table', class_='responsive-card-table unstriped')
    # The first <tr> is skipped; it is treated as the header row.
    return [get_data(tr, 'td') for tr in table.find_all('tr')[1:]]

In [6]:
# One listing-page URL per year, 1968 through 2021 inclusive.
urls = [
    'https://caselaw.findlaw.com/court/us-supreme-court/years/' + str(year)
    for year in range(1968, 2022)
]

# Column labels shared by every per-year frame.
YEAR_COLUMNS = ['Description', 'Date', 'Docket #', 'URL']

# Empty placeholder; this cell never fills it.
df_years = pd.DataFrame(data=None, columns=YEAR_COLUMNS)
list_of_year_dfs = []

for url in urls:
    response = requests.get(url)
    # Fail fast on an HTTP error status instead of handing an error page to
    # the parser, where it would only surface as an unrelated AttributeError.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table_contents = table_data(soup)

    # One frame per year; they are collected in a list rather than appended
    # to a growing frame.
    df = pd.DataFrame(table_contents, columns=YEAR_COLUMNS)
    list_of_year_dfs.append(df)

In [7]:
# Stack the per-year frames row-wise (concat's default axis) into one frame
# and renumber the rows 0..n-1.
df_years = pd.concat(list_of_year_dfs, ignore_index=True)
df_years.shape

(7970, 4)

In [8]:
def get_case_data(series):
    """Download each opinion page and collect its text as one-row DataFrames.

    Parameters
    ----------
    series : pandas.Series
        Opinion-page URLs. Only ``series.items()`` is used, so any object
        whose ``.items()`` yields ``(label, url)`` pairs works.

    Returns
    -------
    list of pandas.DataFrame
        One single-row frame per page that could be parsed. The row label
        is the text of the first two ``<h3>`` headings inside the content
        container, joined by a space. The only column, ``'Raw Text'``,
        holds the stripped text of every ``<p>`` in the container, joined
        by single spaces.

    Notes
    -----
    A page that lacks the ``caselawcontent searchable-content`` container,
    or has fewer than two ``<h3>`` headings inside it, is skipped with a
    warning. Previously such a page raised and discarded every result
    already collected in the run.
    """
    import warnings  # local import: only needed for the skip notices below

    list_of_cases_df = []

    def get_data(div):
        """Build the one-row frame for a content ``<div>``; None if unusable."""
        headings = [h3.get_text(strip=False) for h3 in div.find_all('h3')]
        if len(headings) < 2:
            return None
        id_ = headings[0] + ' ' + headings[1]
        text = ' '.join(p.get_text(strip=True) for p in div.find_all('p'))
        return pd.DataFrame.from_dict({id_: text}, orient='index', columns=['Raw Text'])

    for index, link in series.items():
        page = requests.get(link).text
        soup = BeautifulSoup(page, "html.parser")

        div = soup.find('div', class_='caselawcontent searchable-content')
        df = get_data(div) if div is not None else None
        if df is None:
            warnings.warn(
                'Skipping {!r} (row {!r}): expected case content not found'.format(link, index)
            )
            continue

        list_of_cases_df.append(df)

    return list_of_cases_df

In [9]:
# Show the listing row(s) whose URL is this specific opinion page.
target_url = 'https://caselaw.findlaw.com/us-supreme-court/405/1030.html'
df_years.loc[df_years['URL'] == target_url]

Unnamed: 0,Description,Date,Docket #,URL
1265,"VOLPE, SEC. OF TRANS. ET AL. v. D.C. FED'N. OF...","March 27, 1972",No. 71-931,https://caselaw.findlaw.com/us-supreme-court/4...


In [10]:
# Exclude one opinion page from the scrape, identified by its URL rather than
# by a hard-coded row label. The label (1265 in the recorded run) is only
# valid for one particular scrape of the listing pages. Looking it up also
# makes this cell safe to re-run: a second run finds nothing left to drop
# instead of raising KeyError.
# NOTE(review): the notebook does not record why this page is excluded;
# presumably get_case_data cannot parse it. Confirm.
excluded_url = 'https://caselaw.findlaw.com/us-supreme-court/405/1030.html'
rows_to_drop = df_years.index[df_years.URL == excluded_url].tolist()
df_years = df_years.drop(rows_to_drop, axis=0)

In [11]:
# Scrape every opinion page listed in df_years.URL. Despite its name,
# `cases_dict` is a list of DataFrames: that is what get_case_data returns.
cases_dict = get_case_data(df_years.URL)

KeyboardInterrupt: 

In [56]:
# Combine the per-case frames row-wise (concat's default axis) into one frame,
# keeping each frame's own row label.
df_opinions = pd.concat(cases_dict)

In [60]:
# Write the combined opinions frame to a CSV file; the relative path resolves
# against the current working directory.
df_opinions.to_csv('opinions_data.csv')

In [61]:
# Display the (rows, columns) shape of the combined opinions frame.
df_opinions.shape

(7916, 1)