# Hearing Details Cleaning
### Standardize hearing detail JSONs into one file

In [2]:
import pandas as pd
import json

def standardize_json(json_file_path):
  """
  Load one hearing-details JSON file and return it with standardized columns.

  The JSON content is turned into a DataFrame, reduced to the columns of
  interest, renamed to snake_case names, and the ordinal suffix is stripped
  from the congress session (e.g. "109th" -> "109").

  Args:
      json_file_path (str): Path of the JSON file to load.

  Returns:
      pandas.DataFrame: Frame with the columns govinfo_id, title,
      congress_session, chamber, held_date, committee, members,
      bill_numbers and witnesses.

  Raises:
      KeyError: If any of the expected source columns is missing.
  """
  # JSON is UTF-8 by specification; do not rely on the platform default encoding.
  with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

  df = pd.DataFrame(data)

  selected_columns = ['govinfo_id', 'Title', 'Congress', 'Congress Chamber', 'Held Date', 'Committee and Subcommittee', 'Members', 'Bill Numbers', 'Witnesses']
  new_column_names = {'Title': 'title', 'Congress': 'congress_session', 'Congress Chamber': 'chamber', 'Held Date': 'held_date', 'Committee and Subcommittee': 'committee', 'Members': 'members', 'Bill Numbers': 'bill_numbers', 'Witnesses': 'witnesses'}
  df_standardized = df[selected_columns].rename(columns=new_column_names)
  # Strip any English ordinal suffix, not only "th" ("103rd" -> "103", "101st" -> "101").
  df_standardized['congress_session'] = df_standardized['congress_session'].str.replace(r'(?:st|nd|rd|th)$', '', regex=True)

  return df_standardized

In [3]:
def map_committee_name(df, mapping_path='../datasets/committee_name_mapping.json'):
    """
    Add a 'committee_mapped' column holding the canonical committee name.

    The mapping file is a JSON object keyed by chamber, whose values map a
    committee name to the name it should be replaced with. Committees (or
    chambers) that are absent from the mapping keep their original name.

    The input frame is left untouched; a new frame is returned.

    Args:
        df (pandas.DataFrame): Standardized hearing details. Must contain the
            columns listed in `column_order` below (except 'committee_mapped').
        mapping_path (str): Path of the JSON mapping file. Defaults to the
            project's committee_name_mapping.json.

    Returns:
        pandas.DataFrame: Copy of `df` with 'committee_mapped' added and the
        columns arranged in a fixed order.
    """
    print('map_committee_name: Mapping different committee names of the same committee to the same name')
    with open(mapping_path, 'r', encoding='utf-8') as f:
        committee_name_mapping = json.load(f)

    def get_mapped_committee_name(chamber, committee):
        # Fall back to the original name when the chamber or committee is unmapped.
        return committee_name_mapping.get(chamber, {}).get(committee, committee)

    # Build the values as a plain list: this also works for an empty frame,
    # where DataFrame.apply(axis=1) would not yield a column-shaped result.
    committee_mapped = [
        get_mapped_committee_name(chamber, committee)
        for chamber, committee in zip(df['chamber'], df['committee'])
    ]

    column_order = ['govinfo_id', 'title', 'congress_session', 'chamber', 'committee', 'committee_mapped', 'held_date', 'members', 'bill_numbers', 'witnesses']
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    return df.assign(committee_mapped=committee_mapped)[column_order]

In [4]:
import os

# Standardize every hearing-details JSON file and stack the results into one frame.
all_dataframes = []

# os.listdir returns entries in arbitrary order; sort them so the combined
# row order is reproducible across runs and platforms.
for json_file_name in sorted(os.listdir('../datasets/hearing_details/')):
    try:
        all_dataframes.append(standardize_json(f"../datasets/hearing_details/{json_file_name}"))
    except Exception as e:
        # Best-effort load: skip the file, but report why it failed so the
        # problem can be diagnosed instead of silently dropped.
        print(f"Getting exception at {json_file_name}: {e!r}")

df_combined = pd.concat(all_dataframes, ignore_index=True)
print('Combined all JSONs in to one dataframe, standardized each JSON')
df_mapped = map_committee_name(df_combined)
df_mapped

Combined all JSONs in to one dataframe, standardized each JSON
map_committee_name: Mapping different committee names of the same committee to the same name


Unnamed: 0,govinfo_id,title,congress_session,chamber,committee,committee_mapped,held_date,members,bill_numbers,witnesses
0,CHRG-109shrg26254,"S. Hrg. 109-291 - Repeal Act of May 26, 1936, ...",109,Senate,Committee on Energy and Natural Resources,Committee on Energy and Natural Resources,"October 25, 2005",LISA MURKOWSKI of Alaska (AK); DIANNE FEINSTEI...,H.R. 59\nS. 1829\nS. 1830\nS. 1831\nS. 3378,
1,CHRG-109shrg22720,S. Hrg. 109-337 - Review the Reauthorization o...,109,Senate,"Committee on Agriculture, Nutrition, and Forestry","Committee on Agriculture, Nutrition, and Forestry","May 25, 2005",Max Baucus (MT); Saxby Chambliss (GA); Thad Co...,,"Dahl, Tom, President, American Association of ..."
2,CHRG-109shrg20612,S. Hrg. 109-089 - To Consider the Reauthorizat...,109,Senate,"Committee on Agriculture, Nutrition, and Forestry","Committee on Agriculture, Nutrition, and Forestry","March 8, 2005\nMarch 10, 2005",Max Baucus (MT); Saxby Chambliss (GA); Thad Co...,,"Brown-Hruska, Sharon, Chairman, Commodity Futu..."
3,CHRG-109shrg97781,S. Hrg. 109-012 - Nomination of Hon. Michael J...,109,Senate,"Committee on Agriculture, Nutrition, and Forestry","Committee on Agriculture, Nutrition, and Forestry","January 6, 2005",Max Baucus (MT); Saxby Chambliss (GA); Thad Co...,,"Hagel, Hon. Chuck, a U.S. Senator from Nebrask..."
4,CHRG-109shrg20839,S. Hrg. 109-274 - Nomination of Hon. Thomas C....,109,Senate,"Committee on Agriculture, Nutrition, and Forestry","Committee on Agriculture, Nutrition, and Forestry","April 27, 2005",Max Baucus (MT); Saxby Chambliss (GA); Thad Co...,,"Dorr, Hon. Thomas C., of Iowa, to be Under Sec..."
...,...,...,...,...,...,...,...,...,...,...
32690,CHRG-105shrg50137,S. Hrg. 105-628 - International Banking & Fina...,105,Senate,Special Committee on Year 2000 Technology Problem,Special Committee on Year 2000 Technology Problem,"July 6, 1998",Robert F. Bennett (UT); Jeff Bingaman (NM); Ro...,S. 22\nS. Res. 208\nS. 1518,
32691,CHRG-105shrg51128,S. Hrg. 105-770 - The Year 2000 Technology Pro...,105,Senate,Special Committee on Year 2000 Technology Problem,Special Committee on Year 2000 Technology Problem,"September 17, 1998",Robert F. Bennett (UT); Jeff Bingaman (NM); Ro...,S. Res. 208\nS. Res. 2000\nS. 2392,
32692,CHRG-105shrg51127,S. Hrg. 105-777 - Transportation After Y2k: Ca...,105,Senate,Special Committee on Year 2000 Technology Problem,Special Committee on Year 2000 Technology Problem,"September 10, 1998",Robert F. Bennett (UT); Jeff Bingaman (NM); Ro...,S. Res. 208\nS. 2392,
32693,CHRG-105shrg44791,S. Hrg. 105-376 - U.S. and Mexican Counterdrug...,105,Senate,United States Senate Caucus on International N...,United States Senate Caucus on International N...,"October 29, 1997",John Ashcroft (MO); Joseph R. Biden Jr. (DE); ...,,


### Dataset Validation

In [5]:
# Count the missing values in each column of the combined frame.
df_mapped.isna().sum()

govinfo_id              0
title                   0
congress_session        0
chamber                 0
committee               0
committee_mapped        0
held_date               0
members              3028
bill_numbers        17863
witnesses           12135
dtype: int64

Here we check the format of every date in the "held_date" column of the dataframe and make sure that no date is earlier than 1997. The publisher used a simple regular expression to detect dates, so some dates from other parts of a document were picked up as hearing dates. We manually corrected the dates in the hearing details JSON files, but we have by no means corrected all cases.

In [6]:
def check_date_format(df, date_col, min_year=1997):
  """
  Count the rows whose date column does not hold only valid dates.

  A cell is valid when it is a single date in the form "Month D, YYYY"
  (e.g. "March 8, 2005") with a year of at least `min_year`, or a string of
  several such dates separated by newlines. Every other value, including a
  missing one, is counted as invalid and reported by its govinfo_id.

  Args:
      df (pandas.DataFrame): Frame with a 'govinfo_id' column and `date_col`.
          Any index is accepted.
      date_col (str): Name of the column holding the dates.
      min_year (int): Earliest acceptable year. Defaults to 1997.

  Returns:
      int: Number of rows containing at least one invalid date.
  """
  def is_valid_date_format(date_str):
    try:
      date_obj = pd.to_datetime(date_str, format='%B %d, %Y')
    except (ValueError, TypeError):
      return False
    # Missing values parse to NaT/None rather than raising; treat them as invalid.
    if not isinstance(date_obj, pd.Timestamp):
      return False
    return date_obj.year >= min_year

  def is_valid_cell(value):
    if is_valid_date_format(value):
      return True
    # Only strings can carry several newline-separated dates.
    if isinstance(value, str) and '\n' in value:
      return all(is_valid_date_format(date) for date in value.split('\n'))
    return False

  num_incorrect = 0

  # Iterate over the values directly so the result does not depend on the
  # frame having a default 0..n-1 index.
  for govinfo_id, date_value in zip(df['govinfo_id'], df[date_col]):
    if is_valid_cell(date_value):
      continue

    num_incorrect += 1
    # Report every invalid row, not only the multi-date ones, so it can be fixed.
    print(f"Hearing {govinfo_id} has invalid date.")

  return num_incorrect

# Validate the held_date column of the mapped frame and report how many rows failed.
num_incorrect_formats = check_date_format(df_mapped, 'held_date')

print(f"Number of rows with incorrect format: {num_incorrect_formats}")


Number of rows with incorrect format: 0


In [7]:
def show_rows_with_length_mismatch(df, column_name, string_length):
  """
  Find the rows whose value in a string column does not have the expected length.

  A one-line summary is printed in either case.

  Args:
      df (pandas.DataFrame): The DataFrame containing the column.
      column_name (str): The name of the string column to check.
      string_length (int): The length every value is expected to have.

  Returns:
      pandas.DataFrame or None: The rows whose length differs from
      `string_length`, or None when every row matches.
  """
  # Compute the per-row lengths once, then keep the rows that differ.
  value_lengths = df[column_name].str.len()
  mismatched_rows = df[value_lengths != string_length]

  # Nothing to show: report it and fall through to the implicit None.
  if mismatched_rows.empty:
    print("All rows in", column_name, f"have a length of {string_length}.")
    return None

  print("Rows where", column_name, f"length doesn't match {string_length}:")
  return mismatched_rows

# Pass a copy of the mapped frame and collect the rows whose govinfo_id is not 17 characters long.
non_matching_df = show_rows_with_length_mismatch(df_mapped.copy(), 'govinfo_id', 17)
non_matching_df

Rows where govinfo_id length doesn't match 17:


Unnamed: 0,govinfo_id,title,congress_session,chamber,committee,committee_mapped,held_date,members,bill_numbers,witnesses
50,CHRG-109shrg49104184,"Senate Hearing, 109th Congress - Departments o...",109,Senate,Committee on Appropriations,Committee on Appropriations,"February 17, 2005",,,
51,CHRG-109shrg49104171,"Senate Hearing, 109th Congress - Departments o...",109,Senate,Committee on Appropriations,Committee on Appropriations,"March 2, 2005",,H.R. 1,
56,CHRG-109shrg39104136,"Senate Hearing, 109th Congress - Department of...",109,Senate,Committee on Appropriations,Committee on Appropriations,"March 2, 2005",,,
58,CHRG-109shrg49104169,"Senate Hearing, 109th Congress - Department of...",109,Senate,Committee on Appropriations,Committee on Appropriations,"March 2, 2005",,S. 352\nH.R. 418,
60,CHRG-109shrg49104179,"Senate Hearing, 109th Congress - Military Cons...",109,Senate,Committee on Appropriations,Committee on Appropriations,"March 8, 2005",,,
...,...,...,...,...,...,...,...,...,...,...
32253,CHRG-110shrg40547-err,S. Hrg. 110-328 - [ERRATA] Hearing on Pending ...,110,Senate,Committee on Veterans' Affairs,Committee on Veterans' Affairs,"October 24, 2007",Daniel K. Akaka (HI); Sherrod Brown (OH); Larr...,S. 38\nS. 2004\nS. 2142\nS. 2160\nS. 2162\nH.R...,
32261,CHRG-110shrg41912-err,S. Hrg. 110-378 - [ERRATA] Hearing on Pending ...,110,Senate,Committee on Veterans' Affairs,Committee on Veterans' Affairs,"November 14, 2007",Daniel K. Akaka (HI); Sherrod Brown (OH); Larr...,S. 2160\nS. 2162,
32333,CHRG-110shrg47451-err,"S. Hrg. 110-753 - [ERRATA] the ""Material Suppo...",110,Senate,Committee on the Judiciary,Committee on the Judiciary,"September 19, 2007",Joseph R. Biden Jr. (DE); Sam Brownback (KS); ...,,
32368,CHRG-110shrg41958-err,S. Hrg. 110-357 - [ERRATA] the Rise of Drug-Re...,110,Senate,Committee on the Judiciary,Committee on the Judiciary,"March 24, 2008",Joseph R. Biden Jr. (DE); Sam Brownback (KS); ...,S. 456,


In [8]:
# Select the records whose govinfo_id ends with '-err' and report how many there are.
df_err = df_mapped[df_mapped['govinfo_id'].str.endswith('-err')]
print(f"There are {len(df_err)} errata hearing records.")

There are 24 errata hearing records.


### Export df in Parquet format

In [9]:
def store_df_to_parquet(df, filename):
  """
  Write a DataFrame to a Parquet file and print the outcome.

  Any exception raised while writing is caught and reported on stdout
  instead of being propagated.

  Args:
      df (pandas.DataFrame): The frame to store.
      filename (str): Destination path of the Parquet file.

  Returns:
      None
  """
  try:
    df.to_parquet(filename)
  except Exception as e:
    # Best-effort export: report the failure rather than raising.
    print(f"Error storing DataFrame to Parquet: {e}")
  else:
    print(f"DataFrame successfully stored to Parquet file: {filename}")

# Destination of the exported dataset, relative to the notebook's location.
parquet_filename = "../datasets/hearing_details.parquet"

# Write the mapped frame to the Parquet file; the function prints the outcome.
store_df_to_parquet(df_mapped, parquet_filename)

DataFrame successfully stored to Parquet file: ../datasets/hearing_details.parquet
