In [8]:
# Env: wang-ml-py3.11
# Pending: Adding poetry environment and dependencies
# Importing required libraries
import pandas as pd
import os
import glob
# import pyarrow.parquet as pq

In [17]:
# Setting up the working directory
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)
fin_disc_data_dir = os.path.join(grandparent_dir, 'data/financial_disc')

In [3]:
# Function to load data from the financial disclosure directory

def load_data(data_dir):
    '''
    This function is not memory efficient, but for <100 csv files, should work fine
    For example: this approach will have to be modified for docket/PACER data
    '''
    all_files = glob.glob(data_dir + "/*.csv")
    print(f"Found {len(all_files)} files in the directory")
    li = []
    for filename in all_files:
        try:
            df = pd.read_csv(filename, index_col=None, header=0)
            li.append(df)
        except Exception as e:
            print(f"Error reading file {filename}: {e}")
    return pd.concat(li, axis=0, ignore_index=True)

In [4]:
# Loading files from the financial disclosure directory
df_fin_disc = load_data(fin_disc_data_dir)

# Examine the data
df_fin_disc.info()

Found 54 files in the directory


  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.read_csv(filename, index_col=None, header=0)
  df = pd.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019751 entries, 0 to 2019750
Data columns (total 52 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   person_id                            int64  
 1   person_url                           object 
 2   filepath                             object 
 3   disclosure_year                      int64  
 4   notes                                object 
 5   date_created                         object 
 6   date_modified                        object 
 7   type                                 object 
 8   agreement_id                         float64
 9   agree_date_raw                       object 
 10  parties_and_terms                    object 
 11  agree_redacted                       object 
 12  gift_id                              float64
 13  gift_source                          object 
 14  gift_description                     object 
 15  gift_value                      

In [9]:
# Save data to a parquet file for future use
# df_fin_disc.to_parquet(os.path.join(grandparent_dir, 'data/financial_disc/financial_disc.parquet'), index=False)
# print("Data saved to parquet file in directory: {fin_disc_data_dir}")

In [5]:
# Let us see how many unique persons, disclosures and investments are there in the dataset
print(f"Number of unique persons in the dataset: {df_fin_disc['person_id'].nunique()}")
print(f"Number of unique disclosures in the dataset: {df_fin_disc['filepath'].nunique()}")
print(f"Number of unique investments in the dataset: {df_fin_disc['investment_id'].nunique()}")

Number of unique persons in the dataset: 3349
Number of unique disclosures in the dataset: 30436
Number of unique investments in the dataset: 1901720


In [12]:
# Create a dataframe for just positions
df_positions = df_fin_disc[df_fin_disc['type'] == 'position']
columns_to_keep = ['position_id','position','disclosure_year', 'organization_name','date_created','date_modified','position_redacted','person_id', 'filepath', 'notes']
df_positions = df_positions[columns_to_keep]

# Examine the dataframe
df_positions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37050 entries, 40 to 2019718
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   position_id        37050 non-null  float64
 1   position           36823 non-null  object 
 2   disclosure_year    37050 non-null  int64  
 3   organization_name  36282 non-null  object 
 4   date_created       37050 non-null  object 
 5   date_modified      37050 non-null  object 
 6   position_redacted  37050 non-null  object 
 7   person_id          37050 non-null  int64  
 8   filepath           37050 non-null  object 
 9   notes              24635 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 3.1+ MB


In [13]:
# Let us see how many unique people and positions exist in the dataset
print(f"Number of unique people: {df_positions['person_id'].nunique()}")
print(f"Number of unique positions: {df_positions['position_id'].nunique()}")

Number of unique people: 2553
Number of unique positions: 37050


In [28]:
# Importing the persons and positions data from persons_positions.csv
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)
persons_positions_dir = os.path.join(grandparent_dir, 'data/persons_positions')
df_persons_positions = pd.read_csv(os.path.join(persons_positions_dir, 'persons_positions_modified.csv'))

# Examine the data
df_persons_positions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51281 entries, 0 to 51280
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   person_id               51281 non-null  int64  
 1   name_first              51281 non-null  object 
 2   name_middle             43300 non-null  object 
 3   name_last               51281 non-null  object 
 4   political_affiliations  35456 non-null  object 
 5   race                    33919 non-null  object 
 6   position_url            51281 non-null  object 
 7   position_type           30056 non-null  object 
 8   job_title               21225 non-null  object 
 9   sector                  6764 non-null   float64
 10  organization            13248 non-null  object 
 11  date_nominated          4274 non-null   object 
 12  date_start              50412 non-null  object 
 13  date_termination        42457 non-null  object 
 14  court_resource_url      22174 non-null

In [29]:
# Join the two dataframes on person_id using outer join
df_persons_positions = df_persons_positions.merge(df_positions, on='person_id', how='outer')
df_persons_positions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216862 entries, 0 to 216861
Data columns (total 30 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   person_id               216862 non-null  int64  
 1   name_first              216862 non-null  object 
 2   name_middle             196183 non-null  object 
 3   name_last               216862 non-null  object 
 4   political_affiliations  189429 non-null  object 
 5   race                    188046 non-null  object 
 6   position_url            216862 non-null  object 
 7   position_type           73842 non-null   object 
 8   job_title               143020 non-null  object 
 9   sector                  7362 non-null    float64
 10  organization            85160 non-null   object 
 11  date_nominated          30881 non-null   object 
 12  date_start              215876 non-null  object 
 13  date_termination        163980 non-null  object 
 14  court_resource_url  

In [None]:
# Examing csv files in the directory
