In [2]:
# Importing libraries
import os
import pandas as pd
import jellyfish
from fuzzywuzzy import fuzz
import re
import time
from rapidfuzz.distance import Levenshtein, JaroWinkler
from collections import defaultdict

In [3]:
# Importing sample data
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)
data_dir = os.path.join(grandparent_dir, 'data/conflict/sample')
print(data_dir)

/Users/eshan23/eshanprashar_git_profile/judges-conflicts/data/conflict/sample


In [4]:
# Importing data
sample_dockets = pd.read_csv(os.path.join(data_dir, 'sampled_dockets.csv'))
print(f'The shape of the sample_dockets is {sample_dockets.shape}')
sample_investments = pd.read_csv(os.path.join(data_dir, 'sampled_investments.csv'))
print(f'The shape of the sample_investments is {sample_investments.shape}')

The shape of the sample_dockets is (5913, 10)
The shape of the sample_investments is (12242, 4)


In [5]:
# Examine dockets sample
sample_dockets.head()

Unnamed: 0,author_id,judge,docketNumber,docket_id,snippet,caseName,dateFiled,party_1,party_2,dateFiled_year
0,2490,Paine,91-6548-CIV.,691131,\n795 F.Supp. 1112 (1992)\nChristina Elisabeth...,Watchorn Ex Rel. Christenson v. Town of Davie,1992-08-07,Watchorn Ex Rel. Christenson,Town of Davie,1992
1,2490,Paine,"93-2123-CIV-PAINE, 94-1692-CIV-PAINE",2379325,"\n127 F.Supp.2d 1239 (1999)\nDICTIOMATIC, INC....","Dictiomatic, Inc. v. United States Fidelity & ...",1999-06-15,"Dictiomatic, Inc.",United States Fidelity & Guaranty Co.,1999
2,2490,Paine,99-10054-CIV,2371116,\n104 F.Supp.2d 1368 (2000)\nElizabeth J. NEUM...,Neumont v. Monroe County Florida,2000-06-21,Neumont,Monroe County Florida,2000
3,2490,Paine,98-10060-CIV.,2369251,"\n42 F.Supp.2d 1344 (1999)\nDeborah K. BRANDT,...","Brandt v. Weather Channel, Inc.",1999-03-18,Brandt,"Weather Channel, Inc.",1999
4,2490,Paine,99-10054-CIV-PAINE/VITUNAC,2368734,\n280 F.Supp.2d 1367 (2003)\nElizabeth J. NEUM...,"Neumont v. Monroe County, Florida",2003-05-21,Neumont,"Monroe County, Florida",2003


In [6]:
# From the columns above, we only need author_id, judge, docketNumber, docket_id, party_1, party_2, and dateFiled_year
columns_to_keep = ['author_id', 'judge', 'docketNumber', 'docket_id', 'caseName','party_1', 'party_2', 'dateFiled_year']
sample_dockets_reduced_v1 = sample_dockets[columns_to_keep]

# Because investment data is available for years 2002 onwards, we will only keep dockets from 2002 onwards
sample_dockets_reduced_v1 = sample_dockets_reduced_v1[sample_dockets_reduced_v1['dateFiled_year'] >= 2002]
print(f'The shape of the reduced dockets sample is {sample_dockets_reduced_v1.shape}')
sample_dockets_reduced_v1.head()

The shape of the reduced dockets sample is (2094, 8)


Unnamed: 0,author_id,judge,docketNumber,docket_id,caseName,party_1,party_2,dateFiled_year
4,2490,Paine,99-10054-CIV-PAINE/VITUNAC,2368734,"Neumont v. Monroe County, Florida",Neumont,"Monroe County, Florida",2003
5,2490,Paine,04-80543-CIV,2358425,Bazile v. Lucent Technologies,Bazile,Lucent Technologies,2005
6,2490,Paine,03-80590-CIV,2335767,Fick v. Metropolitan Life Insurance,Fick,Metropolitan Life Insurance,2004
7,2490,"Paine, Vitunac","99-10054-CIV-PAINE, 99-10054-CIV-VITUNAC",2312588,"Neumont v. Monroe County, Florida",Neumont,"Monroe County, Florida",2002
8,2490,"Paine, Lynch, Unite",0214160CIVPAINE Lynch,2297969,Wall v. Pennzoil-Quaker States Co.,Wall,Pennzoil-Quaker States Co.,2004


In [7]:
# Create a dataframe with just party names
columns_to_keep = ['party_1','party_2']
sample_dockets_reduced_v2 = sample_dockets_reduced_v1[columns_to_keep]

# Merge the two columns, drop duplicates and create a list
party_names = pd.concat([sample_dockets_reduced_v2['party_1'], sample_dockets_reduced_v2['party_2']])
party_names = party_names.drop_duplicates()
party_names = party_names.tolist()
print(f'The number of unique party names is {len(party_names)}')
print(party_names[:10])

The number of unique party names is 3002
['Neumont', 'Bazile', 'Fick', 'Wall', 'Sony Computer Entertainment Inc.', 'Information Resources, Inc.', 'Blanch', 'Dimich', 'Viacom International Inc.', 'Colgate-Palmolive Co.']


In [8]:
# Create a merged dataframe with judges and investments
# First, in the dockets file, have both party names in a single column for merging 
sample_dockets_reduced_v3 = sample_dockets_reduced_v1[['author_id','party_1','party_2','dateFiled_year']]

# Merge party names into a single column using melt
sample_dockets_reduced_v3 = sample_dockets_reduced_v1.melt(
    id_vars=['author_id','dateFiled_year'],
    value_vars=['party_1','party_2'],
    value_name='party_name')

# Reorder columns and drop duplicates
sample_dockets_reduced_v3 = sample_dockets_reduced_v3[['author_id','party_name','dateFiled_year']]
sample_dockets_reduced_v3 = sample_dockets_reduced_v3.drop_duplicates()
print(f'The shape of the melted dockets is {sample_dockets_reduced_v3.shape}')
sample_dockets_reduced_v3.head()

The shape of the melted dockets is (3706, 3)


Unnamed: 0,author_id,party_name,dateFiled_year
0,2490,Neumont,2003
1,2490,Bazile,2005
2,2490,Fick,2004
3,2490,Neumont,2002
4,2490,Wall,2004


In [9]:
# Now let's examine investments sample
sample_investments.head()

Unnamed: 0,person_id,disclosure_year,investment_id,inv_description
0,568,2015,1751139.0,Harris Bank IRA-Non Stock Account
1,568,2015,1751140.0,Harris Bank IRA-Non Stock Account
2,1816,2012,1565171.0,Bank of America
3,1816,2012,1565172.0,Bank of America
4,1816,2012,1565173.0,Bank of America


In [10]:
# Get a list of unique investment descriptions - these are strings we will try to match with
investment_descriptions = sample_investments['inv_description'].unique().tolist()
print(f'The number of unique investment descriptions is {len(investment_descriptions)}')
print(investment_descriptions)

The number of unique investment descriptions is 4102
['Harris Bank IRA-Non Stock Account', 'Bank of America', 'IRA #1 MORGAN STANLEY ("MS"): UBS FINANCIAL SERV., INC.', 'Ford Motor Credit Co. (Ser. Note)', 'General Motors ACB', 'DRU, INC', 'Caterpillar FNL SVC', 'Capital One Bank Glen Allen, VA CD', 'HSBC Fin. Corp.', 'BBTCop.T', 'GE Money Bank UT CD', 'M&T Capital Trust', 'PNC Capita) Trust E', 'Credit Suisse GU', 'J.P.Morgan Chase', 'American Express Credit', 'Capital Jumbo CD', 'Flagstone Bank CD', 'MORGAN STANLEY ACCOUNT: UBS FINANCIAL SERV. INC.', 'Duke Energy', 'Bristol Meyers Squibb', 'General Electric', 'Sunrise W & G', 'Hudson GO Purp B', 'Luzeme, COGOB', 'Texas Cap. Bk. CD', 'Kansas CY CLGS', 'DB Capiual T', 'Credit Suisse', 'Citigroup', 'Decutsche BK Cap Ill', 'UBS Bank USA', '1ll. Rural Bk Bd Rev Bond', 'Walgreen', 'Romeoville, IL GO', 'Wash. St. Higher Ed. Fac Bond', 'IM-IT Bond Fund 79 (Van Kampen)', 'TI. St. U. Rev. B', 'Rancho Mirage CAB', 'Harris Bank (checking)', 'Fed

In [11]:
# Keep relevant columns in investments data
sample_investments_reduced_v1 = sample_investments[['person_id','disclosure_year','inv_description']]
# Drop duplicates
sample_investments_reduced_v1 = sample_investments_reduced_v1.drop_duplicates()

# Rename and rearrange columns
sample_investments_reduced_v1 = sample_investments_reduced_v1.rename(columns={'person_id':'author_id','disclosure_year':'dateFiled_year'})
sample_investments_reduced_v1 = sample_investments_reduced_v1[['author_id','inv_description','dateFiled_year']]
print(f'The shape of the reduced investments is {sample_investments_reduced_v1.shape}')
sample_investments_reduced_v1.head()

The shape of the reduced investments is (8878, 3)


Unnamed: 0,author_id,inv_description,dateFiled_year
0,568,Harris Bank IRA-Non Stock Account,2015
2,1816,Bank of America,2012
6,1380,"IRA #1 MORGAN STANLEY (""MS""): UBS FINANCIAL SE...",2009
7,1380,Ford Motor Credit Co. (Ser. Note),2009
8,1380,General Motors ACB,2009


In [27]:
# Merge investments and reduced dockets data on author_id and dateFiled_year
merged_dock_inv_str_matching = pd.merge(sample_dockets_reduced_v3, sample_investments_reduced_v1, on='author_id', how='inner')
print(f'The shape of the merged data is {merged_dock_inv_str_matching.shape}')
merged_dock_inv_str_matching.head()

The shape of the merged data is (1456593, 5)


Unnamed: 0,author_id,party_name,dateFiled_year_x,inv_description,dateFiled_year_y
0,2490,Neumont,2003,BANK ACCOUNTS,2003
1,2490,Neumont,2003,Wachovia Bank,2003
2,2490,Neumont,2003,"West Palm Beach, Florida",2003
3,2490,Neumont,2003,Fidelity Fed. Bk & Tr,2003
4,2490,Neumont,2003,COMMON STOCK,2003


In [26]:
# Pipeline for string matching

stopwords = ['ltd','limited','company','inc','incorporated','corporation','corp','co','llc','plc','accounts','401K','bonds','']
special_characters = ['.',',',';',':','!','?','(',')','[',']','{','}','<','>','/','\\','|','@','#','%','^','&','*','_','+','=','-','~','`','"','\'']

def preprocess_names(names_list):
    """
    Preprocess a list of names by cleaning text (lowercase, strip spaces, remove special characters).
    
    Args:
        names_list (list): List of names to preprocess.
    
    Returns:
        list: Preprocessed list of names.
    """
    processed_list = []
    for name in names_list:
        if isinstance(name, str):  # Check if it's a string
            clean_name = name.lower().strip()
            # remove numerals and special characters defined above    
            clean_name = re.sub(r'\d+', '', clean_name)
            for char in special_characters:
                clean_name = clean_name.replace(char, '')
            # remove stopwords
            clean_name = ' '.join([word for word in clean_name.split() if word not in stopwords])
            processed_list.append(clean_name)
        else:
            processed_list.append('')
    return processed_list

In [28]:
# Preprocess party_name and inv_description in the merged dataframe
merged_dock_inv_str_matching['processed_party_name'] = preprocess_names(merged_dock_inv_str_matching['party_name'].tolist())
merged_dock_inv_str_matching['processed_inv_description'] = preprocess_names(merged_dock_inv_str_matching['inv_description'].tolist())

In [29]:
# Examine merged_dock_inv_str_matching
merged_dock_inv_str_matching.head()

Unnamed: 0,author_id,party_name,dateFiled_year_x,inv_description,dateFiled_year_y,processed_party_name,processed_inv_description
0,2490,Neumont,2003,BANK ACCOUNTS,2003,neumont,bank
1,2490,Neumont,2003,Wachovia Bank,2003,neumont,wachovia bank
2,2490,Neumont,2003,"West Palm Beach, Florida",2003,neumont,west palm beach florida
3,2490,Neumont,2003,Fidelity Fed. Bk & Tr,2003,neumont,fidelity fed bk tr
4,2490,Neumont,2003,COMMON STOCK,2003,neumont,common stock


In [33]:
# Filter out rows (names) where processed_party_name or processed_inv_description length is less than 5
merged_dock_inv_str_matching_v1 = merged_dock_inv_str_matching[
    (merged_dock_inv_str_matching['processed_party_name'].str.len() >= 5) & 
    (merged_dock_inv_str_matching['processed_inv_description'].str.len() >= 5)]

# Examine the shape of the filtered data
print(f'The shape of the filtered data is {merged_dock_inv_str_matching_v1.shape}')
print(merged_dock_inv_str_matching_v1.head())

The shape of the filtered data is (1326520, 7)
   author_id party_name  dateFiled_year_x           inv_description  \
1       2490    Neumont              2003             Wachovia Bank   
2       2490    Neumont              2003  West Palm Beach, Florida   
3       2490    Neumont              2003     Fidelity Fed. Bk & Tr   
4       2490    Neumont              2003              COMMON STOCK   
5       2490    Neumont              2003          Loch Laurel Club   

   dateFiled_year_y processed_party_name processed_inv_description  
1              2003              neumont             wachovia bank  
2              2003              neumont   west palm beach florida  
3              2003              neumont        fidelity fed bk tr  
4              2003              neumont              common stock  
5              2003              neumont          loch laurel club  


In [34]:
# Extract unique processed_party_name and processed_inv_description
unique_processed_party_names = merged_dock_inv_str_matching_v1['processed_party_name'].unique().tolist()
unique_processed_inv_descriptions = merged_dock_inv_str_matching_v1['processed_inv_description'].unique().tolist()
print(f'The number of unique processed party names is {len(unique_processed_party_names)}')
print(f'The number of unique processed investment descriptions is {len(unique_processed_inv_descriptions)}')

The number of unique processed party names is 2808
The number of unique processed investment descriptions is 3184


In [35]:
# Create a dictionary to store investment results alphabetically

def create_investment_dict(investment_descriptions):
    investment_dict = defaultdict(list)
    for desc in investment_descriptions:
        if isinstance(desc, str) and desc:
            first_letter = desc[0].lower()
            investment_dict[first_letter].append(desc)
    return investment_dict

In [36]:
# Create a dictionary of investment descriptions
processed_investment_dict = create_investment_dict(unique_processed_inv_descriptions)
processed_investment_dict

defaultdict(list,
            {'w': ['wachovia bank',
              'west palm beach florida',
              'walgreen',
              'wash mutual',
              'world com',
              'wachovia bank was first union’ common stock',
              'wachovia bank was first union common stock',
              'wells fargo common stock',
              'wells fargobanks',
              'wachovia bank meger with wells fargo',
              'wells fargo',
              'wells fargo new del',
              'walmart',
              'wachovia',
              'walgreens',
              'washington mut',
              'wyeth formerly amer home prod',
              'wyeth formerly amerhome prod',
              'walt disney',
              'walgreens oo',
              'waddell reed',
              'wachovia new',
              'walmart stores',
              'weatherford intl',
              'wal mart stores',
              'wrigley wm jr',
              'whole foods mkt',
              'washin

In [37]:
# Function to calculate similarity scores for party names and investment descriptions

def compare_similarity_scores(party_name, investment_dict):
    first_letter = party_name[0].lower()

    # Get relevant investment descriptions for the first letter
    relevant_investments = investment_dict.get(first_letter, [])

    results = []
    for investment in relevant_investments:
        lev_score = Levenshtein.normalized_similarity(party_name, investment)
        jaro_score = JaroWinkler.similarity(party_name, investment)

        results.append({
            "party_name": party_name,
            "investment": investment,
            "levenshtein_score": lev_score,
            "jaro_winkler_score": jaro_score
        })
    return pd.DataFrame(results).sort_values(by=["levenshtein_score", "jaro_winkler_score"], ascending=False)

In [38]:
# Test implementation for first 50 party names
party_names_sample = unique_processed_party_names[:50]
start_time = time.time()
results = []
for name in party_names_sample:
    result = compare_similarity_scores(name, processed_investment_dict)
    results.append(result)
end_time = time.time()
print(f'Time taken to process 50 party names is {end_time - start_time} seconds')

# Examine results
results_df = pd.concat(results)
print(results_df.head())

Time taken to process 50 party names is 0.11429715156555176 seconds
    party_name          investment  levenshtein_score  jaro_winkler_score
0      neumont             newmont           0.857143            0.923810
58     neumont         newmont mng           0.545455            0.840693
149    neumont         newmont nem           0.545455            0.840693
77     neumont               nucor           0.428571            0.676190
142    neumont  newmont mining nem           0.333333            0.784127


In [41]:
# Test the top match code
results_df['average_score'] = results_df[['levenshtein_score','jaro_winkler_score']].mean(axis=1)
top_matches = results_df.groupby('party_name').head(1)
top_matches = top_matches.sort_values(by='average_score', ascending=False)
top_matches.head()
print(top_matches.shape)

(50, 5)


In [46]:
# Implement for all party names

def get_top_match_using_lev_jarowinkler(party_list, investment_dict):
    '''
    Calculate the similarity score for party name, take the average of the two scores and return the top match
    '''
    results = []
    start_time = time.time()
    for name in party_list:
        result = compare_similarity_scores(name, investment_dict)
        results.append(result)
    results_df = pd.concat(results)
    results_df['average_score'] = (results_df['levenshtein_score'] + results_df['jaro_winkler_score']) / 2
    top_match = results_df.groupby('party_name').apply(lambda x: x.loc[x['average_score'].idxmax()])
    end_time = time.time()
    print(f'Time taken to process {len(party_list)} party names is {end_time - start_time:.2f} seconds')
    return top_match

In [47]:
# Implementation with all party names
top_matches_all = get_top_match_using_lev_jarowinkler(unique_processed_party_names, processed_investment_dict)
print(top_matches_all.shape)
print(top_matches_all)

Time taken to process 2808 party names is 1.67 seconds
(2808, 5)
                                                                party_name  \
party_name                                                                   
aaron transfer storage                              aaron transfer storage   
abbott laboratories                                    abbott laboratories   
abels                                                                abels   
abm janitorial servicesnorth central  abm janitorial servicesnorth central   
academia sagrado corazon                          academia sagrado corazon   
...                                                                    ...   
zimmerman                                                        zimmerman   
zollinger                                                        zollinger   
zurich america insurance                          zurich america insurance   
zurich american insurance                        zurich american insurance   

  top_match = results_df.groupby('party_name').apply(lambda x: x.loc[x['average_score'].idxmax()])


In [48]:
top_matches_all.to_csv('top_matches_all.csv', index=False)

In [53]:
merged_dock_inv_str_matching_v1.to_csv('merged_dock_inv_str_matching_v1.csv', index=False)

In [50]:
# Drop the party_name index to prepare for merging
top_matches_all = top_matches_all.reset_index(drop=True)

# Merge dataframes based on matching party names and best matches, and also keep = 1
df_result = pd.merge(merged_dock_inv_str_matching_v1, top_matches_all,
                     left_on=['processed_party_name', 'processed_inv_description'],
                     right_on=['party_name', 'investment'],
                     how='left')

# Examine the shape of the merged dataframe
print(df_result.shape)
df_result.head()

(1326520, 12)


Unnamed: 0,author_id,party_name_x,dateFiled_year_x,inv_description,dateFiled_year_y,processed_party_name,processed_inv_description,party_name_y,investment,levenshtein_score,jaro_winkler_score,average_score
0,2490,Neumont,2003,Wachovia Bank,2003,neumont,wachovia bank,,,,,
1,2490,Neumont,2003,"West Palm Beach, Florida",2003,neumont,west palm beach florida,,,,,
2,2490,Neumont,2003,Fidelity Fed. Bk & Tr,2003,neumont,fidelity fed bk tr,,,,,
3,2490,Neumont,2003,COMMON STOCK,2003,neumont,common stock,,,,,
4,2490,Neumont,2003,Loch Laurel Club,2003,neumont,loch laurel club,,,,,


In [51]:
# Filter df_result rows where average_score is not NaN
df_result = df_result[df_result['average_score'].notna()]
print(df_result.shape)
df_result.head()

(380, 12)


Unnamed: 0,author_id,party_name_x,dateFiled_year_x,inv_description,dateFiled_year_y,processed_party_name,processed_inv_description,party_name_y,investment,levenshtein_score,jaro_winkler_score,average_score
13,2490,Neumont,2003,Newmont,2003,neumont,newmont,neumont,newmont,0.857143,0.92381,0.890476
73,2490,Neumont,2003,Newmont,2005,neumont,newmont,neumont,newmont,0.857143,0.92381,0.890476
130,2490,Neumont,2003,Newmont,2004,neumont,newmont,neumont,newmont,0.857143,0.92381,0.890476
365,2490,Neumont,2002,Newmont,2003,neumont,newmont,neumont,newmont,0.857143,0.92381,0.890476
425,2490,Neumont,2002,Newmont,2005,neumont,newmont,neumont,newmont,0.857143,0.92381,0.890476


In [52]:
df_result.to_csv('string_matching_final_results.csv', index=False)