In [1]:
import pandas as pd

from utils.cl_utils import get_opinions_in_cluster

# Get raw opinions & metadata for citing decisions

In [2]:
df = pd.read_csv("data/citing_dataset.csv")
len(df)

133

In [3]:
df.head()

Unnamed: 0,citing_cluster_id,citing_decision_name
0,91306,"Morgan v. United States,113 U.S. 476 (1885)"
1,92059,"In re Ayers,123 U.S. 443 (1887)"
2,92291,"Leloup v. Port of Mobile,127 U.S. 640 (1888)"
3,93311,"Brenham v. German American Bank,144 U.S. 173 ..."
4,93904,"Roberts v. Lewis,153 U.S. 367 (1894)"


In [4]:
results = {}
filepath = "data/raw_citing_opinions"

for cluster_id in df["citing_cluster_id"].to_list():
    results[cluster_id] = get_opinions_in_cluster(cluster_id, filepath)

In [5]:
records = []
for case_id, case_data in results.items():
    record = {'citing_cluster_id': case_id}
    record.update({k: v for k, v in case_data.items()})
    
    opinion_filenames = [op['opinion_filename'] for op in case_data.get('opinions', [])]
    record['opinion_filenames'] = opinion_filenames
    
    records.append(record)

result_df = pd.DataFrame(records)
result_df.head()

Unnamed: 0,citing_cluster_id,case_law_url,case_name_short,case_name,case_name_full,citation_names,opinions,opinion_filenames
0,91306,https://www.courtlistener.com/opinion/91306/mo...,Morgan,Morgan v. United States,MORGAN & Another v. UNITED STATES; UNITED STAT...,"[113 U.S. 476, 5 S. Ct. 588, 28 L. Ed. 1044, 1...","[{'opinion_id': 91306, 'opinion_api': 'https:/...",[91306_010combined.txt]
1,92059,https://www.courtlistener.com/opinion/92059/in...,In Re Ayers,In Re Ayers,In RE AYERS; IN RE SCOTT; IN RE McCABE,"[123 U.S. 443, 8 S. Ct. 164, 31 L. Ed. 216, 18...","[{'opinion_id': 9417465, 'opinion_api': 'https...","[92059_020lead.txt, 92059_030concurrence.txt, ..."
2,92291,https://www.courtlistener.com/opinion/92291/le...,Leloup,Leloup v. Port of Mobile,Leloup v. Port of Mobile,"[127 U.S. 640, 8 S. Ct. 1380, 32 L. Ed. 311, 1...","[{'opinion_id': 92291, 'opinion_api': 'https:/...",[92291_010combined.txt]
3,93311,https://www.courtlistener.com/opinion/93311/br...,Brenham,Brenham v. German American Bank,Brenham v. German American Bank,"[144 U.S. 173, 12 S. Ct. 559, 36 L. Ed. 390, 1...","[{'opinion_id': 93311, 'opinion_api': 'https:/...",[93311_010combined.txt]
4,93904,https://www.courtlistener.com/opinion/93904/ro...,Roberts,Roberts v. Lewis,Roberts v. Lewis,"[153 U.S. 367, 14 S. Ct. 945, 38 L. Ed. 747, 1...","[{'opinion_id': 93904, 'opinion_api': 'https:/...",[93904_010combined.txt]


In [6]:
result = df.merge(result_df, how="left", on="citing_cluster_id")
result.head()

Unnamed: 0,citing_cluster_id,citing_decision_name,case_law_url,case_name_short,case_name,case_name_full,citation_names,opinions,opinion_filenames
0,91306,"Morgan v. United States,113 U.S. 476 (1885)",https://www.courtlistener.com/opinion/91306/mo...,Morgan,Morgan v. United States,MORGAN & Another v. UNITED STATES; UNITED STAT...,"[113 U.S. 476, 5 S. Ct. 588, 28 L. Ed. 1044, 1...","[{'opinion_id': 91306, 'opinion_api': 'https:/...",[91306_010combined.txt]
1,92059,"In re Ayers,123 U.S. 443 (1887)",https://www.courtlistener.com/opinion/92059/in...,In Re Ayers,In Re Ayers,In RE AYERS; IN RE SCOTT; IN RE McCABE,"[123 U.S. 443, 8 S. Ct. 164, 31 L. Ed. 216, 18...","[{'opinion_id': 9417465, 'opinion_api': 'https...","[92059_020lead.txt, 92059_030concurrence.txt, ..."
2,92291,"Leloup v. Port of Mobile,127 U.S. 640 (1888)",https://www.courtlistener.com/opinion/92291/le...,Leloup,Leloup v. Port of Mobile,Leloup v. Port of Mobile,"[127 U.S. 640, 8 S. Ct. 1380, 32 L. Ed. 311, 1...","[{'opinion_id': 92291, 'opinion_api': 'https:/...",[92291_010combined.txt]
3,93311,"Brenham v. German American Bank,144 U.S. 173 ...",https://www.courtlistener.com/opinion/93311/br...,Brenham,Brenham v. German American Bank,Brenham v. German American Bank,"[144 U.S. 173, 12 S. Ct. 559, 36 L. Ed. 390, 1...","[{'opinion_id': 93311, 'opinion_api': 'https:/...",[93311_010combined.txt]
4,93904,"Roberts v. Lewis,153 U.S. 367 (1894)",https://www.courtlistener.com/opinion/93904/ro...,Roberts,Roberts v. Lewis,Roberts v. Lewis,"[153 U.S. 367, 14 S. Ct. 945, 38 L. Ed. 747, 1...","[{'opinion_id': 93904, 'opinion_api': 'https:/...",[93904_010combined.txt]


In [7]:
result = result.rename(columns={'case_law_url': 'citing_url', 
                                'case_name_short': 'citing_name_short',
                                'case_name': 'citing_name',
                                'case_name_full': 'citing_name_full',
                                'citation_names': 'citing_citations',
                                'opinions': 'citing_opinions',
                                'opinion_filenames': 'citing_filenames'})
result.head()

Unnamed: 0,citing_cluster_id,citing_decision_name,citing_url,citing_name_short,citing_name,citing_name_full,citing_citations,citing_opinions,citing_filenames
0,91306,"Morgan v. United States,113 U.S. 476 (1885)",https://www.courtlistener.com/opinion/91306/mo...,Morgan,Morgan v. United States,MORGAN & Another v. UNITED STATES; UNITED STAT...,"[113 U.S. 476, 5 S. Ct. 588, 28 L. Ed. 1044, 1...","[{'opinion_id': 91306, 'opinion_api': 'https:/...",[91306_010combined.txt]
1,92059,"In re Ayers,123 U.S. 443 (1887)",https://www.courtlistener.com/opinion/92059/in...,In Re Ayers,In Re Ayers,In RE AYERS; IN RE SCOTT; IN RE McCABE,"[123 U.S. 443, 8 S. Ct. 164, 31 L. Ed. 216, 18...","[{'opinion_id': 9417465, 'opinion_api': 'https...","[92059_020lead.txt, 92059_030concurrence.txt, ..."
2,92291,"Leloup v. Port of Mobile,127 U.S. 640 (1888)",https://www.courtlistener.com/opinion/92291/le...,Leloup,Leloup v. Port of Mobile,Leloup v. Port of Mobile,"[127 U.S. 640, 8 S. Ct. 1380, 32 L. Ed. 311, 1...","[{'opinion_id': 92291, 'opinion_api': 'https:/...",[92291_010combined.txt]
3,93311,"Brenham v. German American Bank,144 U.S. 173 ...",https://www.courtlistener.com/opinion/93311/br...,Brenham,Brenham v. German American Bank,Brenham v. German American Bank,"[144 U.S. 173, 12 S. Ct. 559, 36 L. Ed. 390, 1...","[{'opinion_id': 93311, 'opinion_api': 'https:/...",[93311_010combined.txt]
4,93904,"Roberts v. Lewis,153 U.S. 367 (1894)",https://www.courtlistener.com/opinion/93904/ro...,Roberts,Roberts v. Lewis,Roberts v. Lewis,"[153 U.S. 367, 14 S. Ct. 945, 38 L. Ed. 747, 1...","[{'opinion_id': 93904, 'opinion_api': 'https:/...",[93904_010combined.txt]


In [20]:
double_check = result[["citing_cluster_id", "citing_decision_name", "citing_name"]]

def normalized_5gram(text):
    words = text.lower()[:5]
    return ''.join(words)

# Apply to both columns and compare
double_check['first_5gram_match'] = double_check.apply(
    lambda row: normalized_5gram(row['citing_decision_name']) == normalized_5gram(row['citing_name']),
    axis=1
)

double_check["first_5gram_match"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double_check['first_5gram_match'] = double_check.apply(


first_5gram_match
True     132
False      1
Name: count, dtype: int64

In [21]:
double_check[double_check["first_5gram_match"]==False]

Unnamed: 0,citing_cluster_id,citing_decision_name,citing_name,first_5gram_match
16,103214,"O’Malley v. Woodrough,307 U.S. 277 (1939)",O'MALLEY v. Woodrough,False


In [8]:
result.to_json("data/citing_opinions_metadata.json")

# Get raw opinions & metadata for cited decisions

In [22]:
df = pd.read_csv("data/cited_dataset.csv")
len(df)

971

In [23]:
df.head()

Unnamed: 0,cited_cluster_id,cited_decision_name
0,88061,Texas v. White (1869)
1,88994,Vermilye & Co. v. Adams Express Co. (1875)
2,87633,Murray v. Lardner (1865)
3,88240,Texas v. Hardenberg (1869)
4,88693,Huntington v. Texas (1873)


In [24]:
results = {}
filepath = "data/raw_cited_opinions"
invalid = []

for cluster_id in df["cited_cluster_id"].to_list():
    try:
        results[cluster_id] = get_opinions_in_cluster(cluster_id, filepath)
    except:
        invalid.append(cluster_id)

ERROR:root:Error: Status code 404 for URL: https://www.courtlistener.com/api/rest/v4/clusters/3483537
ERROR:root:Error: Status code 404 for URL: https://www.courtlistener.com/api/rest/v4/clusters/3678043
ERROR:root:Error: Status code 404 for URL: https://www.courtlistener.com/api/rest/v4/clusters/3847696
ERROR:root:Error: Status code 404 for URL: https://www.courtlistener.com/api/rest/v4/clusters/3663548


In [25]:
invalid

[3483537, 3678043, 3847696, 3663548]

In [26]:
replaces = {3483537: 3485674, 
            3678043: 3931352,
            3847696: 4088490,
            3663548: 3917155}

for cluster_id in replaces.values():
    results[cluster_id] = get_opinions_in_cluster(cluster_id, filepath)    

In [27]:
df['cited_cluster_id'] = df['cited_cluster_id'].replace(replaces)

In [28]:
records = []
for case_id, case_data in results.items():
    record = {'cited_cluster_id': case_id}
    record.update({k: v for k, v in case_data.items()})
    
    opinion_filenames = [op['opinion_filename'] for op in case_data.get('opinions', [])]
    record['opinion_filenames'] = opinion_filenames
    
    records.append(record)

result_df = pd.DataFrame(records)
result_df.head()

Unnamed: 0,cited_cluster_id,case_law_url,case_name_short,case_name,case_name_full,citation_names,opinions,opinion_filenames
0,88061,https://www.courtlistener.com/opinion/88061/te...,White,Texas v. White,Texas v. White Et Al.,"[74 U.S. 700, 19 L. Ed. 227, 7 Wall. 700, 1868...","[{'opinion_id': 9416757, 'opinion_api': 'https...","[88061_020lead.txt, 88061_040dissent.txt, 8806..."
1,88994,https://www.courtlistener.com/opinion/88994/ve...,,Vermilye & Co. v. Adams Express Co.,Vermilye & Co. v. Adams Express Company,"[88 U.S. 138, 22 L. Ed. 609, 21 Wall. 138, 187...","[{'opinion_id': 88994, 'opinion_api': 'https:/...",[88994_010combined.txt]
2,87633,https://www.courtlistener.com/opinion/87633/mu...,Murray,Murray v. Lardner,Murray v. Lardner,"[69 U.S. 110, 17 L. Ed. 857, 2 Wall. 110, 1864...","[{'opinion_id': 87633, 'opinion_api': 'https:/...",[87633_010combined.txt]
3,88240,https://www.courtlistener.com/opinion/88240/te...,Hardenberg,Texas v. Hardenberg,Texas v. Hardenberg,"[77 U.S. 68, 19 L. Ed. 839, 10 Wall. 68, 1869 ...","[{'opinion_id': 88240, 'opinion_api': 'https:/...",[88240_010combined.txt]
4,88693,https://www.courtlistener.com/opinion/88693/hu...,Huntington,Huntington v. Texas,Huntington v. Texas; Texas v. Huntington,"[83 U.S. 402, 21 L. Ed. 316, 16 Wall. 402, 187...","[{'opinion_id': 88693, 'opinion_api': 'https:/...",[88693_010combined.txt]


In [29]:
result_df["case_law_url"].isna().sum()

np.int64(0)

In [30]:
result = df.merge(result_df, how="left", on="cited_cluster_id")
result.head()

Unnamed: 0,cited_cluster_id,cited_decision_name,case_law_url,case_name_short,case_name,case_name_full,citation_names,opinions,opinion_filenames
0,88061,Texas v. White (1869),https://www.courtlistener.com/opinion/88061/te...,White,Texas v. White,Texas v. White Et Al.,"[74 U.S. 700, 19 L. Ed. 227, 7 Wall. 700, 1868...","[{'opinion_id': 9416757, 'opinion_api': 'https...","[88061_020lead.txt, 88061_040dissent.txt, 8806..."
1,88994,Vermilye & Co. v. Adams Express Co. (1875),https://www.courtlistener.com/opinion/88994/ve...,,Vermilye & Co. v. Adams Express Co.,Vermilye & Co. v. Adams Express Company,"[88 U.S. 138, 22 L. Ed. 609, 21 Wall. 138, 187...","[{'opinion_id': 88994, 'opinion_api': 'https:/...",[88994_010combined.txt]
2,87633,Murray v. Lardner (1865),https://www.courtlistener.com/opinion/87633/mu...,Murray,Murray v. Lardner,Murray v. Lardner,"[69 U.S. 110, 17 L. Ed. 857, 2 Wall. 110, 1864...","[{'opinion_id': 87633, 'opinion_api': 'https:/...",[87633_010combined.txt]
3,88240,Texas v. Hardenberg (1869),https://www.courtlistener.com/opinion/88240/te...,Hardenberg,Texas v. Hardenberg,Texas v. Hardenberg,"[77 U.S. 68, 19 L. Ed. 839, 10 Wall. 68, 1869 ...","[{'opinion_id': 88240, 'opinion_api': 'https:/...",[88240_010combined.txt]
4,88693,Huntington v. Texas (1873),https://www.courtlistener.com/opinion/88693/hu...,Huntington,Huntington v. Texas,Huntington v. Texas; Texas v. Huntington,"[83 U.S. 402, 21 L. Ed. 316, 16 Wall. 402, 187...","[{'opinion_id': 88693, 'opinion_api': 'https:/...",[88693_010combined.txt]


In [31]:
result = result.rename(columns={'case_law_url': 'cited_url', 
                                'case_name_short': 'cited_name_short',
                                'case_name': 'cited_name',
                                'case_name_full': 'cited_name_full',
                                'citation_names': 'cited_citations',
                                'opinions': 'cited_opinions',
                                'opinion_filenames': 'cited_filenames'})
result.head()

Unnamed: 0,cited_cluster_id,cited_decision_name,cited_url,cited_name_short,cited_name,cited_name_full,cited_citations,cited_opinions,cited_filenames
0,88061,Texas v. White (1869),https://www.courtlistener.com/opinion/88061/te...,White,Texas v. White,Texas v. White Et Al.,"[74 U.S. 700, 19 L. Ed. 227, 7 Wall. 700, 1868...","[{'opinion_id': 9416757, 'opinion_api': 'https...","[88061_020lead.txt, 88061_040dissent.txt, 8806..."
1,88994,Vermilye & Co. v. Adams Express Co. (1875),https://www.courtlistener.com/opinion/88994/ve...,,Vermilye & Co. v. Adams Express Co.,Vermilye & Co. v. Adams Express Company,"[88 U.S. 138, 22 L. Ed. 609, 21 Wall. 138, 187...","[{'opinion_id': 88994, 'opinion_api': 'https:/...",[88994_010combined.txt]
2,87633,Murray v. Lardner (1865),https://www.courtlistener.com/opinion/87633/mu...,Murray,Murray v. Lardner,Murray v. Lardner,"[69 U.S. 110, 17 L. Ed. 857, 2 Wall. 110, 1864...","[{'opinion_id': 87633, 'opinion_api': 'https:/...",[87633_010combined.txt]
3,88240,Texas v. Hardenberg (1869),https://www.courtlistener.com/opinion/88240/te...,Hardenberg,Texas v. Hardenberg,Texas v. Hardenberg,"[77 U.S. 68, 19 L. Ed. 839, 10 Wall. 68, 1869 ...","[{'opinion_id': 88240, 'opinion_api': 'https:/...",[88240_010combined.txt]
4,88693,Huntington v. Texas (1873),https://www.courtlistener.com/opinion/88693/hu...,Huntington,Huntington v. Texas,Huntington v. Texas; Texas v. Huntington,"[83 U.S. 402, 21 L. Ed. 316, 16 Wall. 402, 187...","[{'opinion_id': 88693, 'opinion_api': 'https:/...",[88693_010combined.txt]


In [77]:
double_check = result[["cited_cluster_id", "cited_decision_name", "cited_name"]]

def normalized_5gram(text):
    words = text.lower()[:5]
    return ''.join(words)

# Apply to both columns and compare
double_check['first_5gram_match'] = double_check.apply(
    lambda row: normalized_5gram(row['cited_decision_name']) == normalized_5gram(row['cited_name']),
    axis=1
)

double_check["first_5gram_match"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double_check['first_5gram_match'] = double_check.apply(


first_5gram_match
True     947
False     24
Name: count, dtype: int64

In [78]:
double_check[double_check["first_5gram_match"]==False]

Unnamed: 0,cited_cluster_id,cited_decision_name,cited_name,first_5gram_match
68,4727363,Roy & Roy v. Northern Pacific Railway Co. (1906),Adair Holdings v. Johnson,False
69,5562591,Planters' Rice-Mill Co. v. Merchants' National...,Mullane v. Roberge,False
71,3589385,Pennsylvania Gas Co. v. Public Service Commiss...,Universal Insurance v. State Board of Tax Appeals,False
72,3486521,West Virginia & Maryland Gas Co. v. Towers (1919),Cahill v. Maryland Life Insurance,False
119,3548121,"Reno Smelting, Milling & Reduction Works v. St...",Federal Land Bank v. McColgan,False
144,3484932,Gordy v. Dennis (1939),State v. Mayor of Baltimore,False
160,102489,Becker v. St. Louis Union Trust Co (1935),Becker v. St. Louis Union Trust Co.,False
287,100830,Childers v. Beaver (1926),Childers v. Beaver,False
325,3981612,Bell v. Hill (1934),"McGinnis v. Three Rivers Convalescent, Unpubli...",False
350,3648247,State v. . Whitaker (1947),Lampman v. . Cochran,False


In [79]:
result[result["cited_cluster_id"].isin(double_check[double_check["cited_name"]==""]["cited_cluster_id"].to_list())]

Unnamed: 0,cited_cluster_id,cited_decision_name,cited_url,cited_name_short,cited_name,cited_name_full,cited_citations,cited_opinions,cited_filenames
558,268703,(1965),https://www.courtlistener.com/opinion/268703/u...,,,United States of America Ex Rel. George Heteny...,"[348 F.2d 844, 1965 U.S. App. LEXIS 4912]","[{'opinion_id': 268703, 'opinion_api': 'https:...",[268703_010combined.txt]
570,288328,(1970),https://www.courtlistener.com/opinion/288328/a...,,,"Adolfo Perez and Emma Perez, Husband and Wife,...",[421 F.2d 619],"[{'opinion_id': 288328, 'opinion_api': 'https:...",[288328_010combined.txt]
592,307980,"John Jordan, by His Next Friend, Jeanette Brya...",https://www.courtlistener.com/opinion/307980/j...,,,"John Jordan, by His Next Friend, Jeanette Brya...","[472 F.2d 985, 1973 U.S. App. LEXIS 12101]","[{'opinion_id': 307980, 'opinion_api': 'https:...",[307980_010combined.txt]
595,305771,"Edna Rothstein v. George K. Wyman, as Commissi...",https://www.courtlistener.com/opinion/305771/e...,,,"Edna Rothstein v. George K. Wyman, as Commissi...","[467 F.2d 226, 1972 U.S. App. LEXIS 7624]","[{'opinion_id': 305771, 'opinion_api': 'https:...",[305771_010combined.txt]
625,321071,(1974),https://www.courtlistener.com/opinion/321071/n...,,,"Nancy Dukes, D/B/A Louisiana Concessions v. Th...",[501 F.2d 706],"[{'opinion_id': 321071, 'opinion_api': 'https:...",[321071_010combined.txt]
770,409686,(1982),https://www.courtlistener.com/opinion/409686/i...,,,"Independence Tube Corporation, Plaintiff-Count...",[691 F.2d 310],"[{'opinion_id': 409686, 'opinion_api': 'https:...",[409686_010combined.txt]
830,507765,(1988),https://www.courtlistener.com/opinion/507765/i...,,,In Re the Beer Institute (Formerly Named Unite...,"[849 F.2d 753, 1988 U.S. App. LEXIS 8016]","[{'opinion_id': 507765, 'opinion_api': 'https:...",[507765_010combined.txt]
890,663717,(1994),https://www.courtlistener.com/opinion/663717/a...,,,"Adarand Constructors, Inc., a Colorado Corpora...","[16 F.3d 1537, 40 Cont. Cas. Fed. 76,739, 1994...","[{'opinion_id': 663717, 'opinion_api': 'https:...",[663717_010combined.txt]
925,658639,(1994),https://www.courtlistener.com/opinion/658639/s...,,,"Seminole Tribe of Florida v. State of Florida,...","[11 F.3d 1016, 1994 U.S. App. LEXIS 661]","[{'opinion_id': 658639, 'opinion_api': 'https:...",[658639_010combined.txt]
931,438842,(1984),https://www.courtlistener.com/opinion/438842/b...,,,"Betty-Louise Felton, Charlotte Green, Barbara ...","[739 F.2d 48, 1984 U.S. App. LEXIS 20692]","[{'opinion_id': 438842, 'opinion_api': 'https:...",[438842_010combined.txt]


# Review for wrong/changed cluster ids

In [80]:
replaces = {4727363: 4920593,
            5562591: 5712559,
            3589385: 3607596,
            3486521: 3488586,
            3981612: 4209365,
            3648247: 3902209,
            3526916: 3551998,
            3580565: 3599253,
            3620827: 3637215,
            3548121: 3568583,
            3484932: 3487033
           }

replace_results = {}
for cluster_id in replaces.values():
    replace_results[cluster_id] = get_opinions_in_cluster(cluster_id, filepath)   

In [81]:
result['cited_cluster_id'] = result['cited_cluster_id'].replace(replaces)

In [82]:
records = []
for case_id, case_data in replace_results.items():
    record = {'cited_cluster_id': case_id}
    record.update({k: v for k, v in case_data.items()})
    
    opinion_filenames = [op['opinion_filename'] for op in case_data.get('opinions', [])]
    record['opinion_filenames'] = opinion_filenames
    
    records.append(record)

result_df = pd.DataFrame(records)
result_df.head()

Unnamed: 0,cited_cluster_id,case_law_url,case_name_short,case_name,case_name_full,citation_names,opinions,opinion_filenames
0,4920593,https://www.courtlistener.com/opinion/4920593/...,,Roy & Roy v. Northern Pacific Railway Co.,Roy & Roy v. Northern Pacific Railway Company,"[42 Wash. 572, 85 P. 53, 1906 Wash. LEXIS 620]","[{'opinion_id': 4727363, 'opinion_api': 'https...",[4920593_020lead.txt]
1,5712559,https://www.courtlistener.com/opinion/5712559/...,,Planters' Rice-Mill Co. v. Merchants' National...,The Planters' Rice-Mill Company v. The Merchan...,"[78 Ga. 574, 3 S.E. 327]","[{'opinion_id': 5562591, 'opinion_api': 'https...",[5712559_020lead.txt]
2,3607596,https://www.courtlistener.com/opinion/3607596/...,,Pennsylvania Gas Co. v. Public Service Commission,In the Matter of the Application of Pennsylvan...,"[122 N.E. 260, 225 N.Y. 397, 1919 N.Y. LEXIS 1...","[{'opinion_id': 3589385, 'opinion_api': 'https...",[3607596_020lead.txt]
3,3488586,https://www.courtlistener.com/opinion/3488586/...,,West Virginia & Maryland Gas Co. v. Towers,WEST VIRGINIA & MARYLAND GAS COMPANY vs. ALBER...,"[106 A. 265, 134 Md. 137, 1919 Md. LEXIS 54]","[{'opinion_id': 3486521, 'opinion_api': 'https...",[3488586_020lead.txt]
4,4209365,https://www.courtlistener.com/opinion/4209365/...,Hill,Bell v. Hill,"W. G. Bell Et Al. v. Fred G. Hill, County Cler...","[74 S.W.2d 113, 123 Tex. 531, 1934 Tex. LEXIS ...","[{'opinion_id': 3981612, 'opinion_api': 'https...",[4209365_020lead.txt]


In [83]:
result_df = result_df.rename(columns={'case_law_url': 'cited_url', 
                                'case_name_short': 'cited_name_short',
                                'case_name': 'cited_name',
                                'case_name_full': 'cited_name_full',
                                'citation_names': 'cited_citations',
                                'opinions': 'cited_opinions',
                                'opinion_filenames': 'cited_filenames'})
result_df.head()

Unnamed: 0,cited_cluster_id,cited_url,cited_name_short,cited_name,cited_name_full,cited_citations,cited_opinions,cited_filenames
0,4920593,https://www.courtlistener.com/opinion/4920593/...,,Roy & Roy v. Northern Pacific Railway Co.,Roy & Roy v. Northern Pacific Railway Company,"[42 Wash. 572, 85 P. 53, 1906 Wash. LEXIS 620]","[{'opinion_id': 4727363, 'opinion_api': 'https...",[4920593_020lead.txt]
1,5712559,https://www.courtlistener.com/opinion/5712559/...,,Planters' Rice-Mill Co. v. Merchants' National...,The Planters' Rice-Mill Company v. The Merchan...,"[78 Ga. 574, 3 S.E. 327]","[{'opinion_id': 5562591, 'opinion_api': 'https...",[5712559_020lead.txt]
2,3607596,https://www.courtlistener.com/opinion/3607596/...,,Pennsylvania Gas Co. v. Public Service Commission,In the Matter of the Application of Pennsylvan...,"[122 N.E. 260, 225 N.Y. 397, 1919 N.Y. LEXIS 1...","[{'opinion_id': 3589385, 'opinion_api': 'https...",[3607596_020lead.txt]
3,3488586,https://www.courtlistener.com/opinion/3488586/...,,West Virginia & Maryland Gas Co. v. Towers,WEST VIRGINIA & MARYLAND GAS COMPANY vs. ALBER...,"[106 A. 265, 134 Md. 137, 1919 Md. LEXIS 54]","[{'opinion_id': 3486521, 'opinion_api': 'https...",[3488586_020lead.txt]
4,4209365,https://www.courtlistener.com/opinion/4209365/...,Hill,Bell v. Hill,"W. G. Bell Et Al. v. Fred G. Hill, County Cler...","[74 S.W.2d 113, 123 Tex. 531, 1934 Tex. LEXIS ...","[{'opinion_id': 3981612, 'opinion_api': 'https...",[4209365_020lead.txt]


In [84]:
result.set_index('cited_cluster_id', inplace=True)
result_df.set_index('cited_cluster_id', inplace=True)

result.update(result_df)

result.reset_index(inplace=True)

In [91]:
double_check = result[["cited_cluster_id", "cited_decision_name", "cited_name"]]

def normalized_5gram(text):
    words = text.lower()[:5]
    return ''.join(words)

# Apply to both columns and compare
double_check['first_5gram_match'] = double_check.apply(
    lambda row: normalized_5gram(row['cited_decision_name']) == normalized_5gram(row['cited_name']),
    axis=1
)

double_check["first_5gram_match"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double_check['first_5gram_match'] = double_check.apply(


first_5gram_match
True     958
False     13
Name: count, dtype: int64

In [92]:
double_check[double_check["first_5gram_match"]==False]

Unnamed: 0,cited_cluster_id,cited_decision_name,cited_name,first_5gram_match
160,102489,Becker v. St. Louis Union Trust Co (1935),Becker v. St. Louis Union Trust Co.,False
287,100830,Childers v. Beaver (1926),Childers v. Beaver,False
558,268703,(1965),,False
570,288328,(1970),,False
592,307980,"John Jordan, by His Next Friend, Jeanette Brya...",,False
595,305771,"Edna Rothstein v. George K. Wyman, as Commissi...",,False
625,321071,(1974),,False
770,409686,(1982),,False
830,507765,(1988),,False
890,663717,(1994),,False


In [87]:
result.to_json("data/cited_opinions_metadata.json")

# Fix the cited cluster id in dataset.csv and cited_dataset.csv

In [93]:
dataset = pd.read_csv("data/dataset.csv")
dataset.head()

Unnamed: 0,filename,citing_cluster_id,citing_decision_name,cited_cluster_id,cited_decision_name,overruled
0,0001.91306_cites_88061.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88061,Texas v. White (1869),yes
1,0002.91306_cites_88994.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88994,Vermilye & Co. v. Adams Express Co. (1875),no
2,0003.91306_cites_87633.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",87633,Murray v. Lardner (1865),no
3,0004.91306_cites_88240.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88240,Texas v. Hardenberg (1869),no
4,0005.91306_cites_88693.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88693,Huntington v. Texas (1873),no


In [94]:
replaces = {4727363: 4920593,
            5562591: 5712559,
            3589385: 3607596,
            3486521: 3488586,
            3981612: 4209365,
            3648247: 3902209,
            3526916: 3551998,
            3580565: 3599253,
            3620827: 3637215,
            3548121: 3568583,
            3484932: 3487033,
            3483537: 3485674, 
            3678043: 3931352,
            3847696: 4088490,
            3663548: 3917155
           }

In [95]:
dataset['cited_cluster_id'] = dataset['cited_cluster_id'].replace(replaces)

In [100]:
assert sorted(dataset['cited_cluster_id'].unique()) == sorted(result['cited_cluster_id'].unique())

In [104]:
dataset.to_csv("data/dataset.csv", index=False)

In [101]:
cited_dataset = pd.read_csv("data/cited_dataset.csv")
cited_dataset.head()

Unnamed: 0,cited_cluster_id,cited_decision_name
0,88061,Texas v. White (1869)
1,88994,Vermilye & Co. v. Adams Express Co. (1875)
2,87633,Murray v. Lardner (1865)
3,88240,Texas v. Hardenberg (1869)
4,88693,Huntington v. Texas (1873)


In [102]:
cited_dataset['cited_cluster_id'] = cited_dataset['cited_cluster_id'].replace(replaces)

In [103]:
assert sorted(cited_dataset['cited_cluster_id'].unique()) == sorted(result['cited_cluster_id'].unique())

In [105]:
cited_dataset.to_csv("data/cited_dataset.csv", index=False)

# Review for wrong/changed cluster ids

In [2]:
result = pd.read_json("data/cited_opinions_metadata.json")

In [4]:
filepath = "data/raw_cited_opinions"

replaces = {108444: 8985546,
            97237: 5726506,
            268703: 8887067,
            663717: 1879640,
            1087822: 103707,
            97966: 8180960,
            305771: 8901897,
            2443377: 103664,
            2516349: 95716,
            108280: 8982256,
            1116658: 104182,
            104816: 8928994,
           }

replace_results = {}
for cluster_id in replaces.values():
    replace_results[cluster_id] = get_opinions_in_cluster(cluster_id, filepath)   

In [5]:
result['cited_cluster_id'] = result['cited_cluster_id'].replace(replaces)

In [6]:
records = []
for case_id, case_data in replace_results.items():
    record = {'cited_cluster_id': case_id}
    record.update({k: v for k, v in case_data.items()})
    
    opinion_filenames = [op['opinion_filename'] for op in case_data.get('opinions', [])]
    record['opinion_filenames'] = opinion_filenames
    
    records.append(record)

result_df = pd.DataFrame(records)
result_df.head()

Unnamed: 0,cited_cluster_id,case_law_url,case_name_short,case_name,case_name_full,citation_names,opinions,opinion_filenames
0,8985546,https://www.courtlistener.com/opinion/8985546/...,Grove,"Dun & Bradstreet, Inc. v. Grove","Dun & Bradstreet, Inc. v. Grove, Trustee","[404 U.S. 898, 92 S. Ct. 204, 30 L. Ed. 2d 175...","[{'opinion_id': 8977565, 'opinion_api': 'https...",[8985546_040dissent.txt]
1,5726506,https://www.courtlistener.com/opinion/5726506/...,Brantley,Brantley v. State,BRANTLEY v. State,"[132 Ga. 573, 1909 Ga. LEXIS 363, 64 S.E. 676]","[{'opinion_id': 5576766, 'opinion_api': 'https...",[5726506_020lead.txt]
2,8887067,https://www.courtlistener.com/opinion/8887067/...,Wilkins,United States ex rel. Hetenyi v. Wilkins,UNITED STATES of America ex rel. George HETENY...,[348 F.2d 844],"[{'opinion_id': 8873151, 'opinion_api': 'https...","[8887067_020lead.txt, 8887067_040dissent.txt]"
3,1879640,https://www.courtlistener.com/opinion/1879640/...,Skinner,"Adarand Constructors, Inc. v. Skinner","ADARAND CONSTRUCTORS, INC., Plaintiff, v. Samu...","[790 F. Supp. 240, 38 Cont. Cas. Fed. 76,325, ...","[{'opinion_id': 1879640, 'opinion_api': 'https...",[1879640_010combined.txt]
4,103707,https://www.courtlistener.com/opinion/103707/e...,,Ex Parte Quirin,Ex Parte Quirin. Ex Parte Haupt. Ex Parte Kerl...,[317 U.S. 1],"[{'opinion_id': 103707, 'opinion_api': 'https:...",[103707_010combined.txt]


In [7]:
result_df = result_df.rename(columns={'case_law_url': 'cited_url', 
                                'case_name_short': 'cited_name_short',
                                'case_name': 'cited_name',
                                'case_name_full': 'cited_name_full',
                                'citation_names': 'cited_citations',
                                'opinions': 'cited_opinions',
                                'opinion_filenames': 'cited_filenames'})
result_df.head()

Unnamed: 0,cited_cluster_id,cited_url,cited_name_short,cited_name,cited_name_full,cited_citations,cited_opinions,cited_filenames
0,8985546,https://www.courtlistener.com/opinion/8985546/...,Grove,"Dun & Bradstreet, Inc. v. Grove","Dun & Bradstreet, Inc. v. Grove, Trustee","[404 U.S. 898, 92 S. Ct. 204, 30 L. Ed. 2d 175...","[{'opinion_id': 8977565, 'opinion_api': 'https...",[8985546_040dissent.txt]
1,5726506,https://www.courtlistener.com/opinion/5726506/...,Brantley,Brantley v. State,BRANTLEY v. State,"[132 Ga. 573, 1909 Ga. LEXIS 363, 64 S.E. 676]","[{'opinion_id': 5576766, 'opinion_api': 'https...",[5726506_020lead.txt]
2,8887067,https://www.courtlistener.com/opinion/8887067/...,Wilkins,United States ex rel. Hetenyi v. Wilkins,UNITED STATES of America ex rel. George HETENY...,[348 F.2d 844],"[{'opinion_id': 8873151, 'opinion_api': 'https...","[8887067_020lead.txt, 8887067_040dissent.txt]"
3,1879640,https://www.courtlistener.com/opinion/1879640/...,Skinner,"Adarand Constructors, Inc. v. Skinner","ADARAND CONSTRUCTORS, INC., Plaintiff, v. Samu...","[790 F. Supp. 240, 38 Cont. Cas. Fed. 76,325, ...","[{'opinion_id': 1879640, 'opinion_api': 'https...",[1879640_010combined.txt]
4,103707,https://www.courtlistener.com/opinion/103707/e...,,Ex Parte Quirin,Ex Parte Quirin. Ex Parte Haupt. Ex Parte Kerl...,[317 U.S. 1],"[{'opinion_id': 103707, 'opinion_api': 'https:...",[103707_010combined.txt]


In [8]:
result.set_index('cited_cluster_id', inplace=True)
result_df.set_index('cited_cluster_id', inplace=True)

result.update(result_df)

result.reset_index(inplace=True)

In [9]:
double_check = result[["cited_cluster_id", "cited_decision_name", "cited_name"]]

def normalized_5gram(text):
    words = text.lower()[:5]
    return ''.join(words)

# Apply to both columns and compare
double_check['first_5gram_match'] = double_check.apply(
    lambda row: normalized_5gram(row['cited_decision_name']) == normalized_5gram(row['cited_name']),
    axis=1
)

double_check["first_5gram_match"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  double_check['first_5gram_match'] = double_check.apply(


first_5gram_match
True     957
False     14
Name: count, dtype: int64

In [10]:
double_check[double_check["first_5gram_match"]==False]

Unnamed: 0,cited_cluster_id,cited_decision_name,cited_name,first_5gram_match
54,8180960,The Minnesota Rate Cases (1913),Simpson v. Shepard,False
160,102489,Becker v. St. Louis Union Trust Co (1935),Becker v. St. Louis Union Trust Co.,False
287,100830,Childers v. Beaver (1926),Childers v. Beaver,False
558,8887067,(1965),United States ex rel. Hetenyi v. Wilkins,False
570,288328,(1970),,False
592,307980,"John Jordan, by His Next Friend, Jeanette Brya...",,False
595,8901897,"Edna Rothstein v. George K. Wyman, as Commissi...",Rothstein v. Wyman,False
625,321071,(1974),,False
770,409686,(1982),,False
830,507765,(1988),,False


In [11]:
result.to_json("data/cited_opinions_metadata.json")

# Fix the cited cluster id in dataset.csv and cited_dataset.csv

In [12]:
dataset = pd.read_csv("data/dataset.csv")
dataset.head()

Unnamed: 0,filename,citing_cluster_id,citing_decision_name,cited_cluster_id,cited_decision_name,overruled
0,0001.91306_cites_88061.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88061,Texas v. White (1869),yes
1,0002.91306_cites_88994.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88994,Vermilye & Co. v. Adams Express Co. (1875),no
2,0003.91306_cites_87633.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",87633,Murray v. Lardner (1865),no
3,0004.91306_cites_88240.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88240,Texas v. Hardenberg (1869),no
4,0005.91306_cites_88693.txt,91306,"Morgan v. United States,113 U.S. 476 (1885)",88693,Huntington v. Texas (1873),no


In [13]:
dataset['cited_cluster_id'] = dataset['cited_cluster_id'].replace(replaces)

In [14]:
assert sorted(dataset['cited_cluster_id'].unique()) == sorted(result['cited_cluster_id'].unique())

In [15]:
dataset.to_csv("data/dataset.csv", index=False)

In [16]:
cited_dataset = pd.read_csv("data/cited_dataset.csv")
cited_dataset.head()

Unnamed: 0,cited_cluster_id,cited_decision_name
0,88061,Texas v. White (1869)
1,88994,Vermilye & Co. v. Adams Express Co. (1875)
2,87633,Murray v. Lardner (1865)
3,88240,Texas v. Hardenberg (1869)
4,88693,Huntington v. Texas (1873)


In [17]:
cited_dataset['cited_cluster_id'] = cited_dataset['cited_cluster_id'].replace(replaces)

In [18]:
assert sorted(cited_dataset['cited_cluster_id'].unique()) == sorted(result['cited_cluster_id'].unique())

In [19]:
cited_dataset.to_csv("data/cited_dataset.csv", index=False)