# Use a helper function to download all opinions from CL

To avoid repeatedly calling the CourtListener (CL) API, this notebook downloads the opinions once and saves them locally for use in the experiments.

# Import libraries

In [1]:
import pandas as pd

from cl_utils import save_opinions_df

# Load the dataset

In [2]:
# Read the raw dataset from disk, then display its row count as the cell output.
df = pd.read_csv("data/dataset.csv")
df.shape[0]

1084

# Clean up the dataset to only get unique opinions

In [3]:
# Columns that together identify one citing opinion.
columns = ["citing_opinion_id", "court", "docket_id", "cluster_id"]

# Keep only the identifying columns, collapse repeated rows, and renumber the
# index from 0 so it is contiguous after the drop.
df = df.loc[:, columns].drop_duplicates(ignore_index=True)

# Number of unique opinions remaining.
df.shape[0]

133

# Download the opinions and save

In [4]:
# Hand the de-duplicated frame to the cl_utils helper and keep what it returns.
# NOTE(review): judging by the output below, the helper logs progress per opinion
# and the returned frame carries extra `opinion_types` / `opinion_sources`
# columns — confirm against cl_utils.save_opinions_df.
# `df` is rebound here, so re-running this cell feeds the already-processed
# frame back into the helper.
df = save_opinions_df(df)

# Preview the first five rows.
df.head(5)

Saved 1 opinions for opinion: 91306
Completed: 0
Saved 4 opinions for opinion: 92059
Saved 1 opinions for opinion: 92291
Saved 1 opinions for opinion: 93311
Saved 1 opinions for opinion: 93904
Saved 1 opinions for opinion: 98124
Saved 3 opinions for opinion: 98917
Saved 1 opinions for opinion: 99004
Saved 1 opinions for opinion: 99901
Saved 1 opinions for opinion: 101368
Saved 1 opinions for opinion: 101750
Completed: 10
Saved 1 opinions for opinion: 101913
Saved 1 opinions for opinion: 102164
Saved 1 opinions for opinion: 102784
Saved 3 opinions for opinion: 103012
Saved 4 opinions for opinion: 103172
Saved 3 opinions for opinion: 103214
Saved 1 opinions for opinion: 103198
Saved 4 opinions for opinion: 103292
Saved 4 opinions for opinion: 103290
Saved 1 opinions for opinion: 103351
Completed: 20
Saved 4 opinions for opinion: 103557
Saved 3 opinions for opinion: 103505
Saved 1 opinions for opinion: 103522
Saved 3 opinions for opinion: 103555
Saved 1 opinions for opinion: 103493
Saved 



Saved 4 opinions for opinion: 105014
Saved 3 opinions for opinion: 104965
Saved 3 opinions for opinion: 105319
Saved 5 opinions for opinion: 105525
Saved 4 opinions for opinion: 105543
Saved 6 opinions for opinion: 106235
Saved 5 opinions for opinion: 106285
Saved 1 opinions for opinion: 106576
Completed: 50
Saved 3 opinions for opinion: 106883
Saved 1 opinions for opinion: 107121
Saved 3 opinions for opinion: 107081
Saved 4 opinions for opinion: 107252
Saved 3 opinions for opinion: 107464
Saved 1 opinions for opinion: 107473
Saved 6 opinions for opinion: 107564
Saved 3 opinions for opinion: 107343
Saved 3 opinions for opinion: 107689
Saved 4 opinions for opinion: 107685
Completed: 60
Saved 4 opinions for opinion: 107746
Saved 1 opinions for opinion: 107606
Saved 1 opinions for opinion: 107679
Saved 4 opinions for opinion: 107980
Saved 4 opinions for opinion: 107965
Saved 4 opinions for opinion: 107979
Saved 4 opinions for opinion: 107748
Saved 4 opinions for opinion: 107973
Saved 3 op



Saved 1 opinions for opinion: 112906
Saved 7 opinions for opinion: 117947
Completed: 120
Saved 4 opinions for opinion: 117931
Saved 3 opinions for opinion: 117958
Saved 5 opinions for opinion: 118023
Saved 4 opinions for opinion: 118011
Saved 4 opinions for opinion: 118133
Saved 6 opinions for opinion: 118155
Saved 1 opinions for opinion: 118149
Saved 4 opinions for opinion: 118230
Saved 4 opinions for opinion: 118317
Saved 4 opinions for opinion: 118386
Completed: 130
Saved 4 opinions for opinion: 118428
Saved 4 opinions for opinion: 118273


Unnamed: 0,citing_opinion_id,court,docket_id,cluster_id,opinion_types,opinion_sources
0,91306,scotus,2297559,91306,[010combined],[html_with_citations]
1,92059,scotus,96842,92059,"[040dissent, 030concurrence, 020lead, 010combi...","[xml_harvard, xml_harvard, xml_harvard, html_w..."
2,92291,scotus,2449371,92291,[010combined],[html_with_citations]
3,93311,scotus,2397685,93311,[010combined],[html_with_citations]
4,93904,scotus,229746,93904,[010combined],[html_with_citations]


# Take a look at the distribution of types and sources

In [5]:
# Flatten every row's collection of opinion types into one set of distinct values.
opinion_types = {
    kind
    for row_kinds in df['opinion_types']
    for kind in row_kinds
}
opinion_types

{'010combined',
 '020lead',
 '030concurrence',
 '035concurrenceinpart',
 '040dissent'}

In [6]:
# Count how many rows share each exact `opinion_types` value. Each cell is
# counted as a whole, so order and repeated entries within a cell matter.
# NOTE(review): cells appear to be list-like — confirm against cl_utils.
df['opinion_types'].value_counts()

opinion_types
[010combined]                                                                                                     41
[040dissent, 020lead, 010combined]                                                                                20
[040dissent, 030concurrence, 020lead, 010combined]                                                                20
[040dissent, 040dissent, 020lead, 010combined]                                                                    12
[030concurrence, 020lead, 010combined]                                                                            10
[040dissent, 030concurrence, 030concurrence, 020lead, 010combined]                                                 5
[030concurrence, 030concurrence, 020lead, 010combined]                                                             4
[040dissent, 030concurrence, 030concurrence, 030concurrence, 020lead, 010combined]                                 3
[040dissent, 035concurrenceinpart, 020lead, 010com

In [7]:
# Flatten every row's collection of opinion sources into one set of distinct values.
opinion_sources = {
    source
    for row_sources in df['opinion_sources']
    for source in row_sources
}
opinion_sources

{'html_with_citations', 'xml_harvard'}

In [8]:
# Count how many rows share each exact `opinion_sources` value. Each cell is
# counted as a whole, so order and repeated entries within a cell matter.
# NOTE(review): cells appear to be list-like — confirm against cl_utils.
df['opinion_sources'].value_counts()

opinion_sources
[html_with_citations]                                                                                               41
[xml_harvard, xml_harvard, xml_harvard, html_with_citations]                                                        41
[xml_harvard, xml_harvard, html_with_citations]                                                                     31
[xml_harvard, xml_harvard, xml_harvard, xml_harvard, html_with_citations]                                           10
[xml_harvard, xml_harvard, xml_harvard, xml_harvard, xml_harvard, html_with_citations]                               8
[xml_harvard, xml_harvard, xml_harvard, xml_harvard, xml_harvard, xml_harvard, xml_harvard, html_with_citations]     1
[xml_harvard, xml_harvard, xml_harvard, xml_harvard, xml_harvard, xml_harvard, html_with_citations]                  1
Name: count, dtype: int64

# Save the df for future use

In [9]:
# Validate BEFORE persisting. Previously the CSV was written first and the
# missing-value check ran in a later cell, so an incomplete frame could already
# be on disk (and picked up by later experiments) by the time the check failed.
missing_per_column = df.isna().sum()
assert missing_per_column.sum() == 0, (
    "Refusing to save: missing values found in columns "
    f"{missing_per_column[missing_per_column > 0].to_dict()}"
)

# NOTE(review): `opinion_types` / `opinion_sources` appear to hold list-like
# values; to_csv writes such cells as their string representation, so code that
# reads this file back may need to parse them — confirm with the consumers.
df.to_csv("data/opinion_dataset.csv", index=False)