In [1]:
# Importing required libraries
import pandas as pd
import os
import glob

In [3]:
# Setting up the working directory
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)
fin_disc_data_dir = os.path.join(grandparent_dir, 'data/financial_disc')

In [5]:
# Function to load data from the financial disclosure directory

def load_data(data_dir):
    '''
    This function is not memory efficient, but for <100 csv files, should work fine
    For example: this approach will have to be modified for docket/PACER data
    '''
    all_files = glob.glob(data_dir + "/*.csv")
    print(f"Found {len(all_files)} files in the directory")
    li = []
    for filename in all_files:
        try:
            df = pd.read_csv(filename, index_col=None, header=0)
            li.append(df)
        except Exception as e:
            print(f"Error reading file {filename}: {e}")
    return pd.concat(li, axis=0, ignore_index=True)

In [6]:
# Loading files from the financial disclosure directory
df_fin_disc = load_data(fin_disc_data_dir)

# Examine the data
df_fin_disc.info()

Found 54 files in the directory


  df = pd.read_csv(filename, index_col=None, header=0)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2015868 entries, 0 to 2015867
Data columns (total 32 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   person_id                            int64  
 1   person_url                           object 
 2   filepath                             object 
 3   disclosure_year                      int64  
 4   notes                                object 
 5   type                                 object 
 6   date_raw                             object 
 7   parties_and_terms                    object 
 8   redacted                             bool   
 9   description                          object 
 10  income_during_reporting_period_code  object 
 11  income_during_reporting_period_type  object 
 12  gross_value_code                     object 
 13  gross_value_method                   object 
 14  transaction_during_reporting_period  object 
 15  transaction_date_raw            

In [13]:
# Let us see how many unique persons, disclosures and investments are there in the dataset
print(f"Number of unique persons in the dataset: {df_fin_disc['person_id'].nunique()}")
print(f"Number of unique disclosures in the dataset: {df_fin_disc['filepath'].nunique()}")
print(f"Number of unique investments in the dataset: {df_fin_disc['investment_id'].nunique()}")

Number of unique persons in the dataset: 3349
Number of unique disclosures in the dataset: 30243
Number of unique investments in the dataset: 1899862


In [8]:
# Create a dataframe for just positions
df_positions = df_fin_disc[df_fin_disc['type'] == 'position']
columns_to_keep = ['person_id', 'filepath', 'disclosure_year', 'notes', 'position_id', 'organization_name']
df_positions = df_positions[columns_to_keep]

# Examine the dataframe
df_positions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37050 entries, 40 to 2015835
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   person_id          37050 non-null  int64  
 1   filepath           37050 non-null  object 
 2   disclosure_year    37050 non-null  int64  
 3   notes              24635 non-null  object 
 4   position_id        37050 non-null  float64
 5   organization_name  36282 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 2.0+ MB


In [9]:
# Let us see how many unique people and positions exist in the dataset
print(f"Number of unique people: {df_positions['person_id'].nunique()}")
print(f"Number of unique positions: {df_positions['position_id'].nunique()}")

Number of unique people: 2553
Number of unique positions: 37050
