# Data collection and Preparation
### Software Package & Built in Function Documentation
Beautiful Soup - https://www.crummy.com/software/BeautifulSoup/bs4/doc/ <br>
requests - http://docs.python-requests.org/en/master/  <br>
Regular Expressions - https://docs.python.org/2/library/re.html  <br>
Pandas - https://pandas.pydata.org/pandas-docs/stable/10min.html  <br>
Numpy - http://www.numpy.org/  <br>
Pickle - https://docs.python.org/2/library/pickle.html  <br>

In [5]:
# Import Scientific Packages into Python Kernel
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import html5lib

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64;x64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/92.0.4515.131 Safari/537.36'}

In [3]:
url = "http://caselaw.findlaw.com/court/us-supreme-court/years/"
years = [url + str(year) for year in range(1760,2019)]

In [4]:
# Define a method that executes your url request and returns the data (HTML or XML) as an Object 
def Beautiful_soup_grabber(link):
    
    response = requests.get(link, headers = headers) #optional add timeout (seconds) keeps requests from running indefinitely 
    
    return BeautifulSoup(response.text, "lxml") #Returns BeautifulSoup object, which represents the document as a nested data structure

In [6]:
# Define a method which calls the above method for each year within the range you've requested and convert result object into table
def year_getter(years):
    
    y = {}
    for year in years:
        soup = Beautiful_soup_grabber(year)
        souplist = soup.findAll("a")
        
        #use regular expressions  
        for i in souplist:
            if re.search("us-supreme-court", str(i)) and not re.search("years", str(i)) and not re.search("/court/", str(i)):
                b = i["href"]
                y[b] = [re.sub("[^0-9]", "", b.split("/")[-1]), re.findall(r'\d+', year)[0]]
    
    return pd.DataFrame(y).transpose().reset_index() #converts results to data frame table using pandas

In [7]:
df_links = year_getter(years)
df_links.columns = ["case_url", "docket", "year"]
df_links.head(7)

Unnamed: 0,case_url,docket,year
0,https://caselaw.findlaw.com/us-supreme-court/1...,4,1760
1,https://caselaw.findlaw.com/us-supreme-court/1...,5,1762
2,https://caselaw.findlaw.com/us-supreme-court/1...,8,1763
3,https://caselaw.findlaw.com/us-supreme-court/1...,7,1763
4,https://caselaw.findlaw.com/us-supreme-court/1...,11,1764
5,https://caselaw.findlaw.com/us-supreme-court/1...,92,1764
6,https://caselaw.findlaw.com/us-supreme-court/1...,91,1764


In [8]:
df_links.shape

(23487, 3)

In [9]:
# Python Object serialization - 
# “Pickling” is the process whereby a Python object hierarchy is converted into a byte stream
df_links.to_pickle("supcourt_yearlist.pickle") 

In [3]:
df_links = pd.read_pickle("supcourt_yearlist.pickle")

In [4]:
##gets full text (maj opinion and dissent) in one string

import unicodedata
import string
from unicodedata import normalize

def  supcourttext(link):

    allitems = []
    response = requests.get(link, headers =  headers)
    page = response.text
    soup = BeautifulSoup(page, "html5lib")
    
    pagesoup = soup.find_all(class_="caselawcontent searchable-content") 
    
    for item in pagesoup:
        txtt = item.get_text()
        cleantxt = unicodedata.normalize('NFKD',txtt)

        #cleantxt= re.sub(r'\s+', '', cleantxt)
        cleantxt= cleantxt.replace('\n', ' ')
        cleantxt= cleantxt.replace('\t', ' ')
        allitems.append(cleantxt)
    
    return ' '.join(allitems)

In [5]:
#split dataframe into three temporary dfs (caselaw can detect too may requests from a scraper and block your ip)
supcourt_df1 = df_links.iloc[0:3000]
supcourt_df2 = df_links.iloc[3000:6000]
supcourt_df3 = df_links.iloc[6000:9000]
supcourt_df4 = df_links.iloc[9000:12000]
supcourt_df5 = df_links.iloc[12000:15000]
supcourt_df6 = df_links.iloc[15000:18000]
supcourt_df7 = df_links.iloc[18000:21000]
supcourt_df8 = df_links.iloc[21000:23487]

In [7]:
supcourt_df1.loc[:,("fulltext")] = supcourt_df1.case_url.apply(supcourttext)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [10]:
supcourt_df2.loc[:,("fulltext")] = supcourt_df2.case_url.apply(supcourttext)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [12]:
supcourt_df3.loc[:,("fulltext")] = supcourt_df3.case_url.apply(supcourttext)

In [None]:
supcourt_df4.loc[:,("fulltext")] = supcourt_df4.case_url.apply(supcourttext)

In [None]:
supcourt_df5.loc[:,("fulltext")] = supcourt_df5.case_url.apply(supcourttext)

In [None]:
supcourt_df6.loc[:,("fulltext")] = supcourt_df6.case_url.apply(supcourttext)

In [None]:
supcourt_df7.loc[:,("fulltext")] = supcourt_df7.case_url.apply(supcourttext)

In [16]:
supcourt_df8.loc[:,("fulltext")] = supcourt_df8.case_url.apply(supcourttext)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [None]:
supcourt_df1.to_pickle("temp1.pickle") 
supcourt_df2.to_pickle("temp2.pickle") 
supcourt_df3.to_pickle("temp3.pickle") 
supcourt_df4.to_pickle("temp4.pickle") 
supcourt_df5.to_pickle("temp5.pickle") 
supcourt_df6.to_pickle("temp6.pickle") 
supcourt_df7.to_pickle("temp7.pickle") 
supcourt_df8.to_pickle("temp8.pickle") 

In [23]:
def checkDataframe(df):
    print(df.shape)
    print(df.head(2))
    # data for first index
    print(df.iloc[0, :].fulltext)

In [31]:
# To test the pickle files are saved properly
df = pd.read_pickle("temp8.pickle")
checkDataframe(df)

(2487, 4)
                                                case_url docket  year  \
21000  https://caselaw.findlaw.com/us-supreme-court/5...    157  1991   
21001  https://caselaw.findlaw.com/us-supreme-court/5...    160  1991   

                                                fulltext  
21000                                                ...  
21001                                                ...  
                                                  United States Supreme Court             TOIBB v. RADLOFF(1991)             No. 90-368             Argued: April 22, 1991Decided: June 13, 1991                                                  Petitioner Toibb filed a voluntary petition for relief under Chapter 7 of the Bankruptcy Code, disclosing, inter alia, assets that included stock in an electric power company. When he discovered that the stock had substantial value, he decided to avoid its liquidation by moving to convert his Chapter 7 case to one under Chapter 11's reorganization p

In [32]:
supcourt_df1 = pd.read_pickle("temp1.pickle")
supcourt_df2 = pd.read_pickle("temp2.pickle")
supcourt_df3 = pd.read_pickle("temp3.pickle")
supcourt_df4 = pd.read_pickle("temp4.pickle")
supcourt_df5 = pd.read_pickle("temp5.pickle")
supcourt_df6 = pd.read_pickle("temp6.pickle")
supcourt_df7 = pd.read_pickle("temp7.pickle")
supcourt_df8 = pd.read_pickle("temp8.pickle")

In [33]:
#putting it all together
full_project = pd.concat([supcourt_df1, supcourt_df2, supcourt_df3, supcourt_df4, supcourt_df5, supcourt_df6, supcourt_df7, supcourt_df8])

In [34]:
full_project.to_pickle("full_proj_preproc.pickle")

In [6]:
final_dataset = pd.read_pickle("full_proj_preproc.pickle")
final_dataset.head(7)

AttributeError: Can't get attribute 'new_block' on <module 'pandas.core.internals.blocks' from 'C:\\Users\\gaikw\\anaconda3\\lib\\site-packages\\pandas\\core\\internals\\blocks.py'>

In [39]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23487 entries, 0 to 23486
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   case_url  23487 non-null  object
 1   docket    23487 non-null  object
 2   year      23487 non-null  int64 
 3   fulltext  23487 non-null  object
dtypes: int64(1), object(3)
memory usage: 734.1+ KB


In [38]:
final_dataset['year'] =pd.to_numeric(final_dataset.year)

In [40]:
year_wise_records = final_dataset.groupby('year').count()

In [41]:
year_wise_records

Unnamed: 0_level_0,case_url,docket,fulltext
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1760,1,1,1
1762,1,1,1
1763,2,2,2
1764,3,3,3
1767,1,1,1
...,...,...,...
2014,102,102,102
2015,87,87,87
2016,102,102,102
2017,85,85,85


In [42]:
year_wise_records.reset_index(inplace = True)
year_wise_records.drop('case_url', axis = 1, inplace = True )
year_wise_records.drop('fulltext', axis = 1, inplace = True )
year_wise_records.columns = ['year', 'number']
year_wise_records.head(5)

Unnamed: 0,year,docket
0,1760,1
1,1762,1
2,1763,2
3,1764,3
4,1767,1


In [43]:
year_wise_records.shape

(231, 2)

In [49]:
import matplotlib.pyplot as plt
%matplotlib inline

years = []
for i in range(1790, 2020, 20):
    years.append(i)

plt.rcParams.update({'font.size': 28})    

plt.figure(figsize=(20,10))
plt.title('Supreme court cases over time')
plt.ylabel('Number of cases')
plt.xlabel('Year')
plt.xlim(1790, 2020)
plt.ylim(0,350)
plt.xticks(years)
plt.plot(df["year"],df["number"], linewidth = 4.0);

ModuleNotFoundError: No module named 'matplotlib'