In [1]:
import pandas as pd
import json

from common.constants.billml import BillMLConstants

# edit this to change the dataset file
DATASET_FILE_NAME = 'bill_text_us_11-10-2023_13-02-35.jsonl'

DATASET_PATH = (
    f'{BillMLConstants.BILLML_DATASETS_STORAGE_FILEPATH.value}/'
    f'{BillMLConstants.BILLML_BILL_TEXT_US_DATASET_NAME.value}/'
    f'{DATASET_FILE_NAME}'
)
chunks = pd.read_json(path_or_buf=DATASET_PATH, lines=True, chunksize=10000)
dfs = []
for chunk in chunks:
  # Note due to the large size of the dataset, we have to exclude massive text and sections columns.
  dfs.append(chunk.loc[:, ~chunk.columns.isin(['text', 'sections'])])

df = pd.concat(dfs)

In [2]:
print('Dataset top rows:')
df.head(10)

Dataset top rows:


Unnamed: 0,id,congress,bill_type,bill_number,bill_version,title,sections_length,text_length
0,107hconres470ih,107,hconres,470,ih,Supporting the goals and ideals of College Sav...,1,69
1,107hconres502ih,107,hconres,502,ih,Expressing the sense of the Congress in suppor...,1,747
2,107hconres467ih,107,hconres,467,ih,Expressing the sense of Congress that Lionel H...,1,171
3,107hconres501ih,107,hconres,501,ih,Expressing the sense of Congress that Congress...,1,186
4,107hconres497ih,107,hconres,497,ih,Supporting the goals and ideas of National Tak...,1,479
5,107hconres517ih,107,hconres,517,ih,Condemning the Democratic People’s Republic of...,1,1954
6,107hconres491ih,107,hconres,491,ih,Supporting the goals and ideals of National Sa...,1,92
7,107hconres518ih,107,hconres,518,ih,Expressing the sense of Congress that the depl...,1,447
8,107hconres515ih,107,hconres,515,ih,Expressing the sense of Congress with respect ...,1,1162
9,107hconres472ih,107,hconres,472,ih,Recognizing the 100th anniversary of the 4-H Y...,1,352


In [3]:
print('Dataset bottom rows:')
df.tail(10)

Dataset bottom rows:


Unnamed: 0,id,congress,bill_type,bill_number,bill_version,title,sections_length,text_length
166769,118sres71ats,118,sres,71,ats,Congratulating the Kansas City Chiefs on their...,1,533
166770,118sres123ats,118,sres,123,ats,Recognizing the week of March 19 through March...,1,693
166771,118sres123is,118,sres,123,is,Recognizing the week of March 19 through March...,1,693
166772,118sres277ats,118,sres,277,ats,Expressing the condolences of the Senate and h...,1,841
166773,118sres124is,118,sres,124,is,"Designating March 24th, 2023, as National Wome...",1,1809
166774,118sres215ats,118,sres,215,ats,Supporting the mission and goals of National F...,1,554
166775,118sres371is,118,sres,371,is,Supporting the designation of the week of Sept...,1,1717
166776,118sres83ats,118,sres,83,ats,"Designating the week of February 6 through 10,...",1,421
166777,118sres369ats,118,sres,369,ats,Expressing support for the designation of Sep...,1,490
166778,118sres335is,118,sres,335,is,"Designating September 23, 2023, through Octobe...",1,460


In [4]:
print('Dataset info:')
df.info()

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166779 entries, 0 to 166778
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               166779 non-null  object
 1   congress         166779 non-null  int64 
 2   bill_type        166779 non-null  object
 3   bill_number      166779 non-null  int64 
 4   bill_version     166779 non-null  object
 5   title            166101 non-null  object
 6   sections_length  166779 non-null  int64 
 7   text_length      166779 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 10.2+ MB


In [5]:
congresses = df['congress'].unique()
for congress in congresses:
    print(f'Congress: "{congress}" has "{len(df[df["congress"] == congress])}" dataset items.')
    bill_types = df[df['congress'] == congress]['bill_type'].unique()
    for bill_type in bill_types:
        print(f'    Bill type: "{bill_type}" has "{len(df[(df["congress"] == congress) & (df["bill_type"] == bill_type)])}" dataset items.')
        bill_versions = df[(df['congress'] == congress) & (df['bill_type'] == bill_type)]['bill_version'].unique()
        for bill_version in bill_versions:
            print(f'        Bill version: "{bill_version}" has "{len(df[(df["congress"] == congress) & (df["bill_type"] == bill_type) & (df["bill_version"] == bill_version)])}" dataset items.')

Congress: "107" has "246" dataset items.
    Bill type: "hconres" has "12" dataset items.
        Bill version: "ih" has "12" dataset items.
    Bill type: "hjres" has "1" dataset items.
        Bill version: "ih" has "1" dataset items.
    Bill type: "hr" has "75" dataset items.
        Bill version: "ih" has "75" dataset items.
    Bill type: "hres" has "158" dataset items.
        Bill version: "eh" has "96" dataset items.
        Bill version: "ih" has "50" dataset items.
        Bill version: "ath" has "12" dataset items.
Congress: "108" has "1924" dataset items.
    Bill type: "hconres" has "148" dataset items.
        Bill version: "ih" has "148" dataset items.
    Bill type: "hjres" has "11" dataset items.
        Bill version: "ih" has "11" dataset items.
    Bill type: "hr" has "1419" dataset items.
        Bill version: "ih" has "1414" dataset items.
        Bill version: "rh" has "5" dataset items.
    Bill type: "hres" has "248" dataset items.
        Bill version: "ih" ha

In [6]:
print('Dataset row with max text length:')
df[df['text_length']==df['text_length'].max()]

Dataset row with max text length:


Unnamed: 0,id,congress,bill_type,bill_number,bill_version,title,sections_length,text_length
118134,116hr133eah,116,hr,133,eah,,2271,5599885


In [8]:
print('Dataset row with max sections length:')
df[df['sections_length']==df['sections_length'].max()]

Dataset row with max sections length:


Unnamed: 0,id,congress,bill_type,bill_number,bill_version,title,sections_length,text_length
118134,116hr133eah,116,hr,133,eah,,2271,5599885
