In [1]:
# Importing packages
import os
import pandas as pd
import re

In [2]:
# Load the investments dataframe. The CSV is looked up under
# data/conflict/intermediate_dfs, relative to the directory two levels
# above the current working directory.
current_path = os.getcwd()
parent_dir = os.path.dirname(current_path)
grandparent_dir = os.path.dirname(parent_dir)

investments_path_parts = ('data', 'conflict', 'intermediate_dfs', 'investments.csv')
investments_path = os.path.join(grandparent_dir, *investments_path_parts)

df_investments = pd.read_csv(investments_path)
df_investments.head()

Unnamed: 0,person_id,disclosure_year,investment_id,inv_description
0,2885,2014,,
1,2885,2014,1800038.0,Wells Fargo Bank - Cash Accounts
2,2885,2014,1800039.0,Nationwide Retirement Solutions Deferred Compe...
3,2885,2014,1800040.0,NW Inv Dest Mod Sve
4,2885,2014,1800041.0,Invsco VK Gr inc A


In [3]:
# Remove fully duplicated rows, then discard rows whose inv_description is NaN
df_investments = (
    df_investments
    .drop_duplicates()
    .dropna(subset=['inv_description'])
)

df_investments.head()

Unnamed: 0,person_id,disclosure_year,investment_id,inv_description
1,2885,2014,1800038.0,Wells Fargo Bank - Cash Accounts
2,2885,2014,1800039.0,Nationwide Retirement Solutions Deferred Compe...
3,2885,2014,1800040.0,NW Inv Dest Mod Sve
4,2885,2014,1800041.0,Invsco VK Gr inc A
5,2885,2014,1800042.0,AmCent Eq Gr Inv


In [4]:
# Function to pre-process investment names
# Stopwords are the same as legal party stopwords
investment_stopwords = ['ltd','ltd.','limited','company','inc','inc.','incorporated','corporation','corp.','corp','co','co.','llc','plc']
special_characters = [',',';',':','!','?','(',')','[',']','{','}','-','_','/','\\','@','#','$','%','^','*','+','=','|','<','>','~','`','"','\'']

def preprocess_names(names_list):
    """
    Clean a collection of names for downstream matching.

    Each string entry is lowercased and stripped, has the ASCII digits 0-9 and
    every character listed in ``special_characters`` deleted, and then has any
    whitespace-separated token found in ``investment_stopwords`` dropped. The
    remaining tokens are re-joined with single spaces.

    Note that special characters are deleted rather than replaced by a space,
    so ``'coca-cola'`` becomes ``'cocacola'``.

    Args:
        names_list (iterable): Names to preprocess. Entries that are not
            ``str`` (e.g. ``None`` or NaN) are mapped to an empty string.

    Returns:
        list: One cleaned string per input entry, in the same order.
    """
    # Built on every call so that edits to the module-level lists take effect.
    # A translation table and a frozenset replace per-character / per-token
    # scans of Python lists, which matters for inputs with many names.
    deletion_table = str.maketrans('', '', '0123456789' + ''.join(special_characters))
    stopword_set = frozenset(investment_stopwords)

    processed_list = []
    for name in names_list:
        if not isinstance(name, str):
            # Keep positions aligned with the input for non-string entries
            processed_list.append('')
            continue
        # Lowercase, trim, then delete digits and special characters in one pass
        clean_name = name.lower().strip().translate(deletion_table)
        # split() also collapses the whitespace runs left behind by deletions
        kept_words = [word for word in clean_name.split() if word not in stopword_set]
        processed_list.append(' '.join(kept_words))
    return processed_list

In [5]:
# Store the pre-processed descriptions in a new 'inv_clean' column
cleaned_descriptions = preprocess_names(df_investments['inv_description'])
df_investments['inv_clean'] = cleaned_descriptions

# Preview the first rows
df_investments.head()

Unnamed: 0,person_id,disclosure_year,investment_id,inv_description,inv_clean
1,2885,2014,1800038.0,Wells Fargo Bank - Cash Accounts,wells fargo bank cash accounts
2,2885,2014,1800039.0,Nationwide Retirement Solutions Deferred Compe...,nationwide retirement solutions deferred compe...
3,2885,2014,1800040.0,NW Inv Dest Mod Sve,nw inv dest mod sve
4,2885,2014,1800041.0,Invsco VK Gr inc A,invsco vk gr a
5,2885,2014,1800042.0,AmCent Eq Gr Inv,amcent eq gr inv


In [6]:
# Build the sorted list of distinct cleaned investment names.
# A name is kept only if it is longer than `length_threshold` characters
# and its first character is a letter.
length_threshold = 3
distinct_clean_names = set(df_investments['inv_clean'].tolist())
investment_names_clean_list = sorted(
    name
    for name in distinct_clean_names
    if len(name) > length_threshold and name[0].isalpha()
)

# Report the size of the list and preview the first entries
print("List of investments has length:", len(investment_names_clean_list))
print("Some investment names are:")
investment_names_clean_list[:10]

List of investments has length: 428616
Some investment names are:


['a & b properties secured note due',
 'a & b properties. ecured note due',
 'a & j equities',
 'a & llireiess s’ock se rai',
 'a & w',
 'a &j equities',
 'a &jequities',
 'a &jequities see additional information',
 'a . a wag pe',
 'a . cas& vioney vit accourf']

In [7]:
# Save the list of cleaned investment names to a text file, one name per line.
# The encoding is pinned to UTF-8: the names can contain non-ASCII characters,
# and without an explicit encoding open() falls back to a locale-dependent
# default that may fail to encode them.
investment_names_clean_path = os.path.join(grandparent_dir, 'data', 'conflict','intermediate_dfs','investment_names_clean.txt')
with open(investment_names_clean_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in investment_names_clean_list)
print("Investment names have been saved to", investment_names_clean_path)

Investment names have been saved to /Users/eshan23/eshanprashar_git_profile/judges-conflicts/data/conflict/intermediate_dfs/investment_names_clean.txt
