# Clean Hearing Details
### Standardize hearing detail JSONs into one file

In [29]:
import pandas as pd
import json

def standardize_json(json_file_path):
  """Load one hearing-details JSON file and return it as a standardized DataFrame.

  The parsed JSON is handed to ``pd.DataFrame``; only the columns listed in
  ``selected_columns`` are kept, they are renamed to snake_case, and the
  ordinal suffix is stripped from ``congress_session`` ('117th' -> '117').

  Args:
      json_file_path: Path of the JSON file to read.

  Returns:
      pandas.DataFrame with the columns govinfo_id, title, congress_session,
      chamber, held_date, committee, members, bill_numbers and witnesses.

  Raises:
      KeyError: If one of the expected columns is missing from the file.
  """
  # Read as UTF-8 explicitly so the result does not depend on the platform's
  # default text encoding.
  with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

  df = pd.DataFrame(data)

  selected_columns = ['govinfo_id', 'Title', 'Congress', 'Congress Chamber', 'Held Date', 'Committee and Subcommittee', 'Members', 'Bill Numbers', 'Witnesses']
  new_column_names = {'Title': 'title', 'Congress': 'congress_session', 'Congress Chamber': 'chamber', 'Held Date': 'held_date', 'Committee and Subcommittee': 'committee', 'Members': 'members', 'Bill Numbers': 'bill_numbers', 'Witnesses': 'witnesses'}
  df_standardized = df[selected_columns].rename(columns=new_column_names)
  # Strip every English ordinal suffix, not only 'th', so that values such as
  # '101st', '102nd' and '103rd' are reduced to the bare number as well.
  df_standardized['congress_session'] = df_standardized['congress_session'].str.replace(r'(?:st|nd|rd|th)$', '', regex=True)

  return df_standardized

Because a committee's name can change over time, we have compiled a mapping that links the different names of the same committee to a single name. The function below uses this mapping to add a 'committee_mapped' column.

In [30]:
def map_committee_name(df, mapping_path='../data/mappings/committee_name_mapping.json'):
    """Return a copy of ``df`` with an added 'committee_mapped' column.

    The mapping file is a JSON object of the form
    ``{chamber: {committee: mapped_name}}``. A row whose chamber or committee
    is not present in the mapping keeps its original committee value.

    Args:
        df: DataFrame containing every column named in ``column_order`` below
            except 'committee_mapped'.
        mapping_path: Path of the committee-name mapping JSON file. Defaults
            to the location the notebook has always read from.

    Returns:
        A new pandas.DataFrame with the columns in ``column_order``. The
        input frame is left unmodified.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    print('map_committee_name: Mapping different committee names of the same committee to the same name')
    with open(mapping_path, 'r', encoding='utf-8') as f:
        committee_name_mapping = json.load(f)

    def get_mapped_committee_name(chamber, committee):
        # Fall back to the unmapped name when either lookup level misses.
        return committee_name_mapping.get(chamber, {}).get(committee, committee)

    # Pair the two columns positionally instead of using a row-wise
    # DataFrame.apply: this is cheaper and also works on an empty frame.
    committee_mapped = [
        get_mapped_committee_name(chamber, committee)
        for chamber, committee in zip(df['chamber'], df['committee'])
    ]
    column_order = ['govinfo_id', 'title', 'congress_session', 'chamber', 'committee', 'committee_mapped', 'held_date', 'members', 'bill_numbers', 'witnesses']
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(committee_mapped=committee_mapped)[column_order]

In [31]:
import os

# Standardize every hearing-details JSON file, then stack the results.
all_dataframes = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore the index) of the combined frame reproducible across runs.
for json_file_name in sorted(os.listdir('../data/hearing_data/hearing_details/')):
    try:
        all_dataframes.append(standardize_json(f"../data/hearing_data/hearing_details/{json_file_name}"))
    except Exception as e:
        # Best effort: skip a file that cannot be standardized, but report the
        # cause so the failure can be diagnosed.
        print(f"Getting exception at {json_file_name}: {e!r}")

df_combined = pd.concat(all_dataframes, ignore_index=True)
print('Combined all JSONs in to one dataframe, standardized each JSON')
df_mapped = map_committee_name(df_combined)
df_mapped

Combined all JSONs in to one dataframe, standardized each JSON
map_committee_name: Mapping different committee names of the same committee to the same name


Unnamed: 0,govinfo_id,title,congress_session,chamber,committee,committee_mapped,held_date,members,bill_numbers,witnesses
0,CHRG-117hhrg54192,"House Hearing, 117th Congress - Russia's Swiss...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"May 5, 2022",John Boozman (AR); Richard Blumenthal (CT); Be...,,
1,CHRG-117hhrg50674,"House Hearing, 117th Congress - No Safe Haven:...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"December 13, 2022",,,
2,CHRG-117hhrg50672,"House Hearing, 117th Congress - Demining Ukrai...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"December 8, 2022",Robert B. Aderholt (AL); John Boozman (AR); Ri...,,
3,CHRG-117hhrg51269,"House Hearing, 117th Congress - Russia's Infra...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"December 8, 2022",,,
4,CHRG-117hhrg48842,Serial No. 117-10 (House Hearing) - A Hearing ...,117,House of Representatives,Committee on Agriculture,Committee on Agriculture,"June 23, 2021",Bobby L. Rush (IL); James P. McGovern (MA); Da...,,"Kirwan, Jeff, Owner, Kirwan Farms, New Windor,..."
...,...,...,...,...,...,...,...,...,...,...
32692,CHRG-105hhrg48275,Serial No. 105-82 (House Hearing) - Joint Hear...,105,House of Representatives,"Committee on Appropriations, Committee on the ...","Committee on Appropriations, Committee on the ...","March 26, 1998",Neil Abercrombie (HI); Kevin Brady (TX); Jim B...,H.R. 2378,"Dombeck, Michael, Chief, United States Forest ..."
32693,CHRG-105hhrg53367,"House Hearing, 105th Congress - Impeachment In...",105,House of Representatives,Committee on the Judiciary,Committee on the Judiciary,"November 19, 1998",Bob Barr (GA); Thomas M. Barrett (WI); Howard ...,H. Res. 581,"Starr, Hon. Kenneth, Independent Counsel, Offi..."
32694,CHRG-105hhrg52320,"House Hearing, 105th Congress - Impeachment In...",105,House of Representatives,Committee on the Judiciary,Committee on the Judiciary,"December 8, 1998\nDecember 9, 1998",Bob Barr (GA); Thomas M. Barrett (WI); Howard ...,H. Res. 581,"Ackerman, Bruce, Sterling Professor of Law and..."
32695,CHRG-105hhrg60464,Serial No. 105-214 (House Hearing) - H.R. 3032...,105,House of Representatives,"Committee on Government Reform and Oversight, ...","Committee on Government Reform and Oversight, ...","September 11, 1998",,H.R. 3032,


### Dataset Validation

In [32]:
# Number of missing values in each column (summing down the rows).
df_mapped.isna().sum(axis=0)

govinfo_id              0
title                   0
congress_session        0
chamber                 0
committee               1
committee_mapped        1
held_date               1
members              3030
bill_numbers        17863
witnesses           12137
dtype: int64

Here we check the format of every date in the "held_date" column of the dataframe and make sure that no date is earlier than 1997. The publisher used a simple regex to detect dates, so dates that appear elsewhere in a document are sometimes included among the hearing dates. We manually corrected the dates we found in the hearing details JSON files, but we have by no means corrected every case.

In [33]:
def check_date_format(df, date_col, min_year=1997):
  """Validate the dates in ``df[date_col]`` and remove invalid ones in place.

  A date is valid when it parses with the format '%B %d, %Y'
  (e.g. 'May 5, 2022') and its year is at least ``min_year``. A cell may hold
  several dates separated by newlines. Per row:

  * missing value: skipped;
  * a single valid date, or a list whose dates are all valid: left unchanged;
  * a list containing invalid dates: the invalid ones are dropped; the cell
    becomes None when no valid date remains;
  * anything else: the cell is set to None.

  Rows are addressed by position, so the function does not depend on the
  frame having a default 0..n-1 index.

  Args:
      df: DataFrame with a 'govinfo_id' column and the column ``date_col``.
          It is modified in place.
      date_col: Name of the column holding the date strings.
      min_year: Earliest year a date may have to be considered valid.

  Returns:
      int: The number of rows that were changed.
  """
  num_incorrect = 0
  id_pos = df.columns.get_loc('govinfo_id')
  date_pos = df.columns.get_loc(date_col)

  def is_valid_date_format(date_str):
    try:
      date_obj = pd.to_datetime(date_str, format='%B %d, %Y')
      return date_obj.year >= min_year
    except (ValueError, TypeError):
      return False

  def has_multiple_dates(date_str):
    # Guard against non-string cells, for which `in` would raise TypeError.
    return isinstance(date_str, str) and '\n' in date_str

  for row_pos in range(len(df)):
    govinfo_id = df.iat[row_pos, id_pos]
    date_str = df.iat[row_pos, date_pos]
    if pd.isna(date_str):
      continue

    if is_valid_date_format(date_str):
      continue
    elif has_multiple_dates(date_str):
      dates = date_str.split('\n')
      valid_dates = [date for date in dates if is_valid_date_format(date)]

      if len(valid_dates) != len(dates):
        num_incorrect += 1
        # Store a missing value rather than an empty string when nothing
        # valid is left, so the cell is treated like any other missing date.
        df.iat[row_pos, date_pos] = '\n'.join(valid_dates) if valid_dates else None
        print(f"Removed invalid date from a list of dates for hearing {govinfo_id}.")
    else:
      num_incorrect += 1
      df.iat[row_pos, date_pos] = None
      print(f"Removed invalid date for hearing {govinfo_id}.")

  return num_incorrect

# check_date_format edits df_mapped in place and returns how many rows it changed.
num_incorrect_formats = check_date_format(df=df_mapped, date_col='held_date')

print(f"Number of rows with incorrect format: {num_incorrect_formats}")


Removed invalid date from a list of dates for hearing CHRG-111shrg57186.
Removed invalid date from a list of dates for hearing CHRG-109jhrg26434.
Removed invalid date from a list of dates for hearing CHRG-115hhrg25555.
Removed invalid date from a list of dates for hearing CHRG-117shrg49104038.
Removed invalid date from a list of dates for hearing CHRG-112shrg19104472.
Removed invalid date from a list of dates for hearing CHRG-107shrg80461.
Removed invalid date from a list of dates for hearing CHRG-109shrg26254.
Removed invalid date from a list of dates for hearing CHRG-106shrg71528.
Removed invalid date from a list of dates for hearing CHRG-106shrg59371.
Removed invalid date from a list of dates for hearing CHRG-111hhrg53253.
Removed invalid date from a list of dates for hearing CHRG-105hhrg41269.
Number of rows with incorrect format: 11


In [36]:
def show_rows_with_length_mismatch(df, column_name, string_length):
  """
  Select the rows whose value in a string column does not have the expected length.

  A one-line summary is printed in either case.

  Args:
      df (pandas.DataFrame): The DataFrame to inspect.
      column_name (str): The name of the string column whose lengths are checked.
      string_length (int): The length every value is expected to have.

  Returns:
      pandas.DataFrame or None: The rows with a different length, or None
      when every row has the expected length.
  """
  # True wherever the value's length differs from the expected one
  wrong_length = df[column_name].str.len().ne(string_length)
  mismatched_rows = df.loc[wrong_length]

  # Nothing to report: announce it and fall back to None
  if mismatched_rows.empty:
    print("All rows in", column_name, f"have a length of {string_length}.")
    return None

  print("Rows where", column_name, f"length doesn't match {string_length}:")
  return mismatched_rows

# List the hearings whose govinfo_id is not exactly 17 characters long
# (a copy is passed so the helper cannot touch df_mapped).
non_matching_df = show_rows_with_length_mismatch(df=df_mapped.copy(), column_name='govinfo_id', string_length=17)
non_matching_df

Rows where govinfo_id length doesn't match 17:


Unnamed: 0,govinfo_id,title,congress_session,chamber,committee,committee_mapped,held_date,members,bill_numbers,witnesses
31,CHRG-117hhrg49906-pt2,"Serial No. 117-27 (House Hearing) , Part 2 - A...",117,House of Representatives,Committee on Agriculture,Committee on Agriculture,"February 2, 2022\nMarch 1, 2022\nMarch 8, 2022...",Jim Costa (CA); James P. McGovern (MA); David ...,H.R. 2\nH. Con. Res. 43\nS. 313\nS. 876\nS. 11...,
194,CHRG-117hhrg44535-add4,"Serial No. 117-14 (House Hearing) , Addendum 4...",117,House of Representatives,Committee on Education and Labor,Committee on Education and Labor,"May 13, 2021",Joe Wilson (SC); Raul M. Grijalva (AZ); Virgin...,H.R. 3145,
195,CHRG-117hhrg44535-add3,"Serial No. 117-14 (House Hearing) , Addendum 3...",117,House of Representatives,Committee on Education and Labor,Committee on Education and Labor,"May 13, 2021",Joe Wilson (SC); Raul M. Grijalva (AZ); Virgin...,H.R. 3145,
196,CHRG-117hhrg44535-add2,"Serial No. 117-14 (House Hearing) , Addendum 2...",117,House of Representatives,Committee on Education and Labor,Committee on Education and Labor,"May 13, 2021",Joe Wilson (SC); Raul M. Grijalva (AZ); Virgin...,H.R. 3145,
197,CHRG-117hhrg44535-add1,"Serial No. 117-14 (House Hearing) , Addendum 1...",117,House of Representatives,Committee on Education and Labor,Committee on Education and Labor,"May 13, 2021",Joe Wilson (SC); Raul M. Grijalva (AZ); Virgin...,H.R. 3145,
...,...,...,...,...,...,...,...,...,...,...
31410,CHRG-116shrg19104924,"Senate Hearing, 116th Congress - Financial Ser...",116,Senate,Committee on Appropriations,Committee on Appropriations,"March 10, 2020",,H.R. 4998,
31411,CHRG-116shrg19104893,"Senate Hearing, 116th Congress - Departments o...",116,Senate,Committee on Appropriations,Committee on Appropriations,"July 2, 2020",,S. 4055,
31412,CHRG-116shrg29104933,"Senate Hearing, 116th Congress - State, Foreig...",116,Senate,Committee on Appropriations,Committee on Appropriations,"July 21, 2020",,,
31413,CHRG-116shrg29104932,"Senate Hearing, 116th Congress - Financial Ser...",116,Senate,Committee on Appropriations,Committee on Appropriations,"June 16, 2020",,,


In [40]:
# Rows whose id ends in '-err' are counted as errata records.
df_err = df_mapped.loc[df_mapped['govinfo_id'].str.endswith('-err')]
print(f"There are {len(df_err)} errata hearing records.")

for err_id in df_err['govinfo_id']:
    # Id without the trailing '-err'
    base_id = err_id[:-len('-err')]

    # Keep the errata row when no row with the base id exists
    if not (df_mapped['govinfo_id'] == base_id).any():
        continue

    # The base hearing is present, so drop the '-err' row
    print(f"Removing {err_id}")
    df_mapped = df_mapped.loc[df_mapped['govinfo_id'].ne(err_id)]

df_err = df_mapped.loc[df_mapped['govinfo_id'].str.endswith('-err')]
print(f"There are still {len(df_err)} errata hearing records left.")

There are 24 errata hearing records.
Removing CHRG-106hhrg59318-err
Removing CHRG-106hhrg60043-err
Removing CHRG-111shrg86304-err
Removing CHRG-108shrg91193-err
Removing CHRG-112hhrg65745-err
Removing CHRG-112shrg67632-err
Removing CHRG-107shrg83924-err
Removing CHRG-113hhrg87647-err
Removing CHRG-110shrg48281-err
Removing CHRG-110shrg43820-err
Removing CHRG-110shrg40547-err
Removing CHRG-110shrg41912-err
Removing CHRG-110shrg47451-err
Removing CHRG-110shrg41958-err
Removing CHRG-110shrg45240-err
Removing CHRG-111hhrg51840-err
Removing CHRG-111hhrg52325-err
Removing CHRG-111hhrg48126-err
Removing CHRG-111hhrg48883-err
Removing CHRG-111hhrg48413-err
Removing CHRG-114hhrg93279-err
There are still 3 errata hearing records left.


In [41]:
# Show df_mapped as it stands at this point in the notebook (pandas truncates the rendering).
df_mapped

Unnamed: 0,govinfo_id,title,congress_session,chamber,committee,committee_mapped,held_date,members,bill_numbers,witnesses
0,CHRG-117hhrg54192,"House Hearing, 117th Congress - Russia's Swiss...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"May 5, 2022",John Boozman (AR); Richard Blumenthal (CT); Be...,,
1,CHRG-117hhrg50674,"House Hearing, 117th Congress - No Safe Haven:...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"December 13, 2022",,,
2,CHRG-117hhrg50672,"House Hearing, 117th Congress - Demining Ukrai...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"December 8, 2022",Robert B. Aderholt (AL); John Boozman (AR); Ri...,,
3,CHRG-117hhrg51269,"House Hearing, 117th Congress - Russia's Infra...",117,House of Representatives,Commission on Security and Cooperation in Europe,Commission on Security and Cooperation in Europe,"December 8, 2022",,,
4,CHRG-117hhrg48842,Serial No. 117-10 (House Hearing) - A Hearing ...,117,House of Representatives,Committee on Agriculture,Committee on Agriculture,"June 23, 2021",Bobby L. Rush (IL); James P. McGovern (MA); Da...,,"Kirwan, Jeff, Owner, Kirwan Farms, New Windor,..."
...,...,...,...,...,...,...,...,...,...,...
32692,CHRG-105hhrg48275,Serial No. 105-82 (House Hearing) - Joint Hear...,105,House of Representatives,"Committee on Appropriations, Committee on the ...","Committee on Appropriations, Committee on the ...","March 26, 1998",Neil Abercrombie (HI); Kevin Brady (TX); Jim B...,H.R. 2378,"Dombeck, Michael, Chief, United States Forest ..."
32693,CHRG-105hhrg53367,"House Hearing, 105th Congress - Impeachment In...",105,House of Representatives,Committee on the Judiciary,Committee on the Judiciary,"November 19, 1998",Bob Barr (GA); Thomas M. Barrett (WI); Howard ...,H. Res. 581,"Starr, Hon. Kenneth, Independent Counsel, Offi..."
32694,CHRG-105hhrg52320,"House Hearing, 105th Congress - Impeachment In...",105,House of Representatives,Committee on the Judiciary,Committee on the Judiciary,"December 8, 1998\nDecember 9, 1998",Bob Barr (GA); Thomas M. Barrett (WI); Howard ...,H. Res. 581,"Ackerman, Bruce, Sterling Professor of Law and..."
32695,CHRG-105hhrg60464,Serial No. 105-214 (House Hearing) - H.R. 3032...,105,House of Representatives,"Committee on Government Reform and Oversight, ...","Committee on Government Reform and Oversight, ...","September 11, 1998",,H.R. 3032,


### Export df in Parquet format

In [42]:
def store_df_to_parquet(df, filename):
  """Write ``df`` to ``filename`` in Parquet format and print the outcome.

  Any exception raised while writing is caught and reported with a printed
  message instead of being propagated.

  Args:
      df: Object exposing ``to_parquet`` (a pandas.DataFrame in this notebook).
      filename: Destination path of the Parquet file.

  Returns:
      None
  """
  try:
    df.to_parquet(filename)
  except Exception as err:
    print(f"Error storing DataFrame to Parquet: {err}")
  else:
    print(f"DataFrame successfully stored to Parquet file: {filename}")

# Destination file for the cleaned hearing details
parquet_filename = "../data/hearing_data/hearing_details.parquet"

store_df_to_parquet(df=df_mapped, filename=parquet_filename)

DataFrame successfully stored to Parquet file: ../data/hearing_data/hearing_details.parquet
