In [12]:
# Importing packages
import os
import pandas as pd
import re

In [3]:
# Load the dockets dataframe from the project's intermediate data folder.
# The data directory is resolved relative to the current working directory:
# two levels up, then data/conflict/intermediate_dfs.
current_path = os.getcwd()
parent_dir = os.path.dirname(current_path)
# grandparent_dir is reused further down when the cleaned party names are written out
grandparent_dir = os.path.dirname(parent_dir)
dockets_path = os.path.join(grandparent_dir, 'data', 'conflict','intermediate_dfs','disclosure_judges_w_party_names.csv')
df_dockets = pd.read_csv(dockets_path)
# Preview the first rows
df_dockets.head()

Unnamed: 0,author_id,dateFiled,docketNumber,year,party1,party2
0,2986,2011-08-29,2:11-cv-00084,2011,Garcia,Nationwide Mutual Insurance
1,341,2007-09-27,CV-05-BE-1324-W,2007,Scarpulla,Bayer Corp. Disability Plan
2,341,2003-02-28,2:02-cr-00352,2003,Chazen,"Deloitte & Touche, LLP"
3,341,2008-05-29,2:08-cr-00220,2008,State Farm Fire & Casualty Co.,Knoblett
4,341,2003-08-15,CV-00-BE-1795-NE,2003,American Canoe Ass'n,White


In [28]:
# Reshape wide -> long: stack 'party1' and 'party2' into a single 'party_name'
# column, so each docket contributes one row per party.
df_dockets_party_melt = (
    df_dockets
    .melt(
        id_vars=['author_id', 'year', 'docketNumber'],
        value_vars=['party1', 'party2'],
        value_name='party_name',
    )
    # melt adds a 'variable' column recording the source column
    # (party1 / party2); it is not needed downstream.
    .drop(columns='variable')
)

# Preview the reshaped dataframe
df_dockets_party_melt.head()

Unnamed: 0,author_id,year,docketNumber,party_name
0,2986,2011,2:11-cv-00084,Garcia
1,341,2007,CV-05-BE-1324-W,Scarpulla
2,341,2003,2:02-cr-00352,Chazen
3,341,2008,2:08-cr-00220,State Farm Fire & Casualty Co.
4,341,2003,CV-00-BE-1795-NE,American Canoe Ass'n


In [49]:
# Pre-processing of party names: vocabulary and cleaning function.
# Corporate-form tokens dropped from names (compared after lowercasing).
# Dotted variants are listed because '.' is not among the stripped characters.
legal_party_stopwords = [
    'ltd', 'ltd.', 'limited', 'company',
    'inc', 'inc.', 'incorporated',
    'corporation', 'corp.', 'corp',
    'co', 'co.', 'llc', 'plc',
]
# Single characters deleted outright (not replaced by a space).
# Note that '&' and '.' are intentionally absent, so they survive cleaning.
special_characters = [
    ',', ';', ':', '!', '?',
    '(', ')', '[', ']', '{', '}',
    '-', '_', '/', '\\',
    '@', '#', '$', '%', '^', '*', '+', '=',
    '|', '<', '>', '~', '`', '"', '\'',
]

def preprocess_names(names_list):
    """
    Normalise party names for matching.

    Each string is lowercased and stripped, ASCII digits and the characters
    in ``special_characters`` are deleted, whitespace runs collapse to single
    spaces, and tokens found in ``legal_party_stopwords`` are dropped.
    Non-string entries (e.g. NaN or None) become the empty string, so the
    output always has the same length and order as the input.

    Args:
        names_list (iterable): Names to preprocess (list, pandas Series, ...).

    Returns:
        list: Cleaned names, one per input element.
    """
    def _clean_one(raw_name):
        # Anything that is not text (missing values, numbers) maps to ''
        if not isinstance(raw_name, str):
            return ''
        lowered = raw_name.lower().strip()
        # Digits go first, then every listed special character
        without_digits = re.sub(r'[0-9]', '', lowered)
        kept_chars = [ch for ch in without_digits if ch not in special_characters]
        # split() with no argument also collapses repeated whitespace
        tokens = ''.join(kept_chars).split()
        return ' '.join(tok for tok in tokens if tok not in legal_party_stopwords)

    return [_clean_one(entry) for entry in names_list]

In [50]:
# Attach the cleaned version of every party name as 'party_name_clean'
df_dockets_party_melt = df_dockets_party_melt.assign(
    party_name_clean=preprocess_names(df_dockets_party_melt['party_name'])
)

# Preview the dataframe with the new column
df_dockets_party_melt.head()

Unnamed: 0,author_id,year,docketNumber,party_name,party_name_clean
0,2986,2011,2:11-cv-00084,Garcia,garcia
1,341,2007,CV-05-BE-1324-W,Scarpulla,scarpulla
2,341,2003,2:02-cr-00352,Chazen,chazen
3,341,2008,2:08-cr-00220,State Farm Fire & Casualty Co.,state farm fire & casualty
4,341,2003,CV-00-BE-1795-NE,American Canoe Ass'n,american canoe assn


In [51]:
# Build the sorted list of unique cleaned party names.
# A name is kept only if it
#   - is longer than `length_threshold` characters, and
#   - starts with a letter.
# The length check runs first, so name[0] is never evaluated on an empty string.
length_threshold = 3
party_names_clean_list = sorted(
    name
    for name in set(df_dockets_party_melt['party_name_clean'].tolist())
    if len(name) > length_threshold and name[0].isalpha()
)

# Report the size and show the first few names
print("List of parties has length:", len(party_names_clean_list))
print("Some party names are:")
party_names_clean_list[:10]

List of parties has length: 144547
Some party names are:


['a & a enterprises',
 'a & a printing',
 'a & b beacon business machines',
 'a & b builders',
 'a & b construction',
 'a & b metal & roofing',
 'a & b sales',
 'a & b steel shearing & processing',
 'a & d interests',
 'a & d maja construction']

In [56]:
# Save the list of cleaned party names to a text file, one name per line.
party_names_clean_path = os.path.join(grandparent_dir, 'data', 'conflict', 'intermediate_dfs', 'party_names_clean.txt')
# Pin the encoding: without it, open() uses the platform's locale encoding,
# so party names with non-ASCII characters could raise UnicodeEncodeError
# or yield files that differ between machines.
with open(party_names_clean_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in party_names_clean_list)
print("Party names have been saved to", party_names_clean_path)

Party names have been saved to /Users/eshan23/eshanprashar_git_profile/judges-conflicts/data/conflict/intermediate_dfs/party_names_clean.txt
