In [9]:
# Import libraries
import os
import pandas as pd
import glob
import warnings
import timeit
import ast  # To safely evaluate string representations of lists

In [3]:
# Read all csv files in data folder and concatenate them into one dataframe
# Define the data folder first
current_path = os.getcwd()
parent_dir = os.path.dirname(current_path)
grandparent_dir = os.path.dirname(parent_dir)
docket_csv_path = os.path.join(grandparent_dir, 'data', 'dockets','individual_csvs')
print(docket_csv_path)

/Users/eshan23/eshanprashar_git_profile/judges-conflicts/data/dockets/individual_csvs


In [6]:
# Function to read csvs using glob
# Suppress only specific warnings, such as DtypeWarning
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# Defining load_data function
def load_data(data_dir):
    '''
    Base approach to load data from csv files in a directory
    '''
    all_files = glob.glob(data_dir + "/*.csv")
    print(f"Found {len(all_files)} files in the directory")

    # Measure time using timeit
    start_time = timeit.default_timer()

    li = []
    for filename in all_files:
        try:
            df = pd.read_csv(filename, index_col=None, header=0, ) # Turn warnings off 
            li.append(df)
        except Exception as e:
            print(f"Error reading file {filename}: {e}")

    elapsed_time = timeit.default_timer() - start_time
    print(f"Time taken to load {len(all_files)} files using base approach: {elapsed_time:.2f} seconds")
    return pd.concat(li, axis=0, ignore_index=True)

In [7]:
# Load dockets
df = load_data(docket_csv_path)

Found 1403 files in the directory
Time taken to load 1403 files using base approach: 5.00 seconds


In [8]:
# Examine the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194066 entries, 0 to 194065
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   absolute_url                 194066 non-null  object 
 1   attorney                     163967 non-null  object 
 2   caseName                     194066 non-null  object 
 3   caseNameFull                 166072 non-null  object 
 4   citation                     194066 non-null  object 
 5   citeCount                    194066 non-null  int64  
 6   cluster_id                   194066 non-null  int64  
 7   court                        194066 non-null  object 
 8   court_citation_string        194066 non-null  object 
 9   court_id                     194066 non-null  object 
 10  dateArgued                   385 non-null     object 
 11  dateFiled                    194066 non-null  object 
 12  dateReargued                 0 non-null       float64
 13 

In [11]:
# Function to safely extract the 'author_id' and 'snippet' fields
def extract_author_id(opinion):
    if isinstance(opinion, str):
        # Convert string representation of list to a Python object
        opinion = ast.literal_eval(opinion)
    if isinstance(opinion, list) and len(opinion) > 0:
        return opinion[0].get('author_id', None)
    return None

def extract_snippet(opinion):
    if isinstance(opinion, str):
        # Convert string representation of list to a Python object
        opinion = ast.literal_eval(opinion)
    if isinstance(opinion, list) and len(opinion) > 0:
        return opinion[0].get('snippet', None)
    return None

# Apply the functions to extract 'author_id' and 'snippet'
df['author_id'] = df['opinions'].apply(extract_author_id)
df['snippet'] = df['opinions'].apply(extract_snippet)

In [26]:
# Export this as a csv
df.to_csv(os.path.join(grandparent_dir, 'data', 'dockets/intermediate_dfs', 'all_dockets_judges_w_fin_dis.csv'), index=False)

In [12]:
# Examine count of unique docket_id by author_id
df.groupby('author_id')['docket_id'].nunique().sort_values(ascending=False)

author_id
1844    2098
1124    1828
201     1666
1911    1601
3150    1483
        ... 
807        1
2371       1
2374       1
2385       1
2228       1
Name: docket_id, Length: 1403, dtype: int64

In [13]:
# Create bins for docket counts
bins = [0, 5, 10, 50, 100, 250,500,750,1000,1250,1500,1750,2000,2500]

# Function to assign bin to each row based on unique docket ids for author_id
def assign_bin(docket_count):
    for i in range(len(bins)-1):
        if docket_count >= bins[i] and docket_count < bins[i+1]:
            return f"{bins[i]}-{bins[i+1]-1}"
    return f">{bins[-1]}"

# Apply the function to create a new column 'docket_count_bin'
df['docket_count_bin'] = df.groupby('author_id')['docket_id'].transform('nunique').apply(assign_bin)

# Examine the dataframe
df.head()

Unnamed: 0,absolute_url,attorney,caseName,caseNameFull,citation,citeCount,cluster_id,court,court_citation_string,court_id,...,procedural_history,scdb_id,sibling_ids,source,status,suitNature,syllabus,author_id,snippet,docket_count_bin
0,/opinion/2182594/garcia-v-nationwide-mutual-in...,"Richard H. Honaker, Honaker Law Offices, Rock ...",Garcia v. Nationwide Mutual Insurance,"Michael GARCIA, Plaintiff, v. NATIONWIDE MUTUA...","['821 F. Supp. 2d 1264', '2011 U.S. Dist. LEXI...",0,2182594,"District Court, D. Wyoming",D. Wyo.,wyd,...,,,[2182594],LU,Published,,,2986,"\n821 F.Supp.2d 1264 (2011)\nMichael GARCIA, P...",0-4
1,/opinion/2579421/scarpulla-v-bayer-corp-disabi...,"Jeffrey S. Daniel, Birmingham, AL, for Plainti...",Scarpulla v. Bayer Corp. Disability Plan,"Teresa SCARPULLA, Plaintiffs, v. BAYER CORPORA...","['514 F. Supp. 2d 1262', '2007 WL 2800369']",0,2579421,"District Court, N.D. Alabama",N.D. Ala.,alnd,...,,,[2579421],LU,Published,,,341,"\n514 F.Supp.2d 1262 (2007)\nTeresa SCARPULLA,...",10-49
2,/opinion/2578911/chazen-v-deloitte-touche-llp/,"James L. North, J. Timothy Francis, James L. N...","Chazen v. Deloitte & Touche, LLP","Stephen M. CHAZEN, Plaintiff, v. DELOITTE & TO...","['247 F. Supp. 2d 1259', '2003 U.S. Dist. LEXI...",3,2578911,"District Court, N.D. Alabama",N.D. Ala.,alnd,...,,,[2578911],LU,Published,,,341,\n247 F.Supp.2d 1259 (2003)\nStephen M. CHAZEN...,10-49
3,/opinion/2573917/state-farm-fire-casualty-co-v...,"David A. Pote, James C. Gray, III, Lloyd Gray ...",State Farm Fire & Casualty Co. v. Knoblett,"STATE FARM FIRE & CASUALTY CO., Plaintiff, v. ...","['561 F. Supp. 2d 1256', '2008 U.S. Dist. LEXI...",1,2573917,"District Court, N.D. Alabama",N.D. Ala.,alnd,...,,,[2573917],LU,Published,,,341,"\n(2008)\nSTATE FARM FIRE & CASUALTY CO., Plai...",10-49
4,/opinion/2565101/american-canoe-assn-v-white/,"Ray Vaughan, WildLaw, Montgomery, David Bookbi...",American Canoe Ass'n v. White,"AMERICAN CANOE ASSOCIATION, Et Al., Plaintiffs...","['277 F. Supp. 2d 1244', '2003 WL 21982008']",3,2565101,"District Court, N.D. Alabama",N.D. Ala.,alnd,...,,,[2565101],LU,Published,,,341,\n277 F.Supp.2d 1244 (2003)\nAMERICAN CANOE AS...,10-49


In [16]:
# Get counts of author_id by docket_count_bin
df.groupby('docket_count_bin')['author_id'].nunique().sort_values(ascending=False)

docket_count_bin
10-49        376
100-249      347
50-99        319
250-499      115
0-4          104
5-9           62
500-749       48
750-999       16
1250-1499      8
1000-1249      4
1500-1749      2
1750-1999      1
2000-2499      1
Name: author_id, dtype: int64

In [19]:
# We will select 25 random authors with more than 100 unique dockets but less than 500
# Random seed for reproducibility
random_seed = 42

# Define buckets to filter from
bucket = ['100-249', '250-499', '500-749']

# Filter authors with docket counts in the category 100-249 and 250-499
author_ids = df[df['docket_count_bin'].isin(bucket)]['author_id'].unique()

# Define number of authors to select 
num_authors = 25

# Randomly select 20 authors
selected_authors = pd.Series(author_ids).sample(num_authors, random_state=random_seed).values

# Filter the dataframe for selected authors
df_sampled = df[df['author_id'].isin(selected_authors)]

# Examine the dataframe
print(f"Size of sampled dataframe is {df_sampled.shape}")
df_sampled.head()

Size of sampled dataframe is (5913, 35)


Unnamed: 0,absolute_url,attorney,caseName,caseNameFull,citation,citeCount,cluster_id,court,court_citation_string,court_id,...,procedural_history,scdb_id,sibling_ids,source,status,suitNature,syllabus,author_id,snippet,docket_count_bin
3550,/opinion/2596484/watchorn-ex-rel-christenson-v...,"William R. Amlong, Amlong & Amlong, P.A., Fort...",Watchorn Ex Rel. Christenson v. Town of Davie,"Christina Elisabeth WATCHORN, a Minor, by and ...","['795 F. Supp. 1112', '1992 U.S. Dist. LEXIS 1...",5,2596484,"District Court, S.D. Florida",S.D. Fla.,flsd,...,,,[2596484],LU,Published,,,2490,\n795 F.Supp. 1112 (1992)\nChristina Elisabeth...,100-249
3551,/opinion/2512315/dictiomatic-inc-v-united-stat...,"John Joseph Pappas, Butler Burnette & Pappas, ...","Dictiomatic, Inc. v. United States Fidelity & ...","DICTIOMATIC, INC., a Florida Corporation, and ...","['127 F. Supp. 2d 1239', '1999 U.S. Dist. LEXI...",1,2512315,"District Court, S.D. Florida",S.D. Fla.,flsd,...,,,[2512315],LU,Published,,,2490,"\n127 F.Supp.2d 1239 (1999)\nDICTIOMATIC, INC....",100-249
3552,/opinion/2503927/neumont-v-monroe-county-florida/,"James H. Hicks, Hicks, Brams & Scher, West Pal...",Neumont v. Monroe County Florida,"Elizabeth J. NEUMONT, Et Al., Plaintiffs, v. M...","['104 F. Supp. 2d 1368', '2000 U.S. Dist. LEXI...",1,2503927,"District Court, S.D. Florida",S.D. Fla.,flsd,...,,,[2503927],LU,Published,,,2490,\n104 F.Supp.2d 1368 (2000)\nElizabeth J. NEUM...,100-249
3553,/opinion/2501892/brandt-v-weather-channel-inc/,"Robert Lamar Bell, Miami, FL, for plaintiffs.,...","Brandt v. Weather Channel, Inc.","Deborah K. BRANDT, Et Al., Plaintiff, v. the W...","['42 F. Supp. 2d 1344', '27 Media L. Rep. (BNA...",2,2501892,"District Court, S.D. Florida",S.D. Fla.,flsd,...,,,[2501892],LU,Published,,,2490,"\n42 F.Supp.2d 1344 (1999)\nDeborah K. BRANDT,...",100-249
3554,/opinion/2501376/neumont-v-monroe-county-florida/,"Harold E. Wolfe, Jr., Eric M. Grant, LeBoeuf, ...","Neumont v. Monroe County, Florida","Elizabeth J. NEUMONT, Et Al., Plaintiffs, v. M...","['280 F. Supp. 2d 1367', '2003 U.S. Dist. LEXI...",1,2501376,"District Court, S.D. Florida",S.D. Fla.,flsd,...,,,[2501376],LU,Published,,,2490,\n280 F.Supp.2d 1367 (2003)\nElizabeth J. NEUM...,100-249


In [20]:
# Get counts of author_id by docket_count_bin
df_sampled.groupby('docket_count_bin')['author_id'].nunique().sort_values(ascending=False)

docket_count_bin
100-249    18
250-499     5
500-749     2
Name: author_id, dtype: int64

In [22]:
# Save this dataframe to a parquet file
# Define the path to save the parquet file
docket_path = os.path.join(grandparent_dir, 'data', 'dockets','intermediate_dfs')
#print(docket_path)

# Save the dataframe to a parquet file
df_sampled.to_parquet(os.path.join(docket_path, 'sampled_judges.parquet'))

In [25]:
# Export the dataframe also as a csv for quick look
df_sampled.to_csv('sampled_dockets.csv', index=False)

In [24]:
# Also export the list of selected authors to a csv file
pd.DataFrame(selected_authors, columns=['author_id']).to_csv(os.path.join(docket_path, 'sampled_judge_ids.csv'), index=False)