In [15]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

In [1]:
from datasets import load_dataset

# Only a "train" split is published for this dataset, so it is requested
# explicitly here; a train/test split is derived from it later on.
dataset = load_dataset(
    "dreamproit/bill_committees_us",
    split="train",
)
dataset

Downloading readme:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.46G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'committees'],
    num_rows: 132142
})

In [3]:
# Convert committees to Annif CSV vocabulary

import csv

FIELDS = ('uri', 'label_en')
OUTFILE = 'committees.csv'

# Unique committee names across all bills. Sorting fixes the order that the
# numbering below is based on.
committees = sorted({name for row in dataset['committees'] for name in row})

# Maps committee name -> generated URI; filled in while the CSV is written.
committees_uri = {}

# This page lists the committees, but it is hard to map the committee names
# used in the dataset onto its URLs, so URIs are generated from an index.
# NOTE: the 1-based index is appended directly to URIBASE without any
# separator, producing e.g. https://www.congress.gov/committees1.
URIBASE = 'https://www.congress.gov/committees'

# The encoding is pinned to UTF-8 so the vocabulary file does not depend on
# the platform's default text encoding.
with open(OUTFILE, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=FIELDS)
    writer.writeheader()

    for idx, committee_name in enumerate(committees):
        uri = URIBASE + str(idx + 1)
        if idx < 10:
            # Preview only the first ten entries in the cell output.
            print(f"{uri}\t{committee_name}")
        committees_uri[committee_name] = uri
        writer.writerow({'uri': uri, 'label_en': committee_name})

    # One row is written per committee, so the mapping size is the row count.
    count = len(committees_uri)
    print(f"wrote {count} committees into {OUTFILE}")

https://www.congress.gov/committees1	Aging (Special) Committee
https://www.congress.gov/committees2	Agriculture Committee
https://www.congress.gov/committees3	Agriculture, Nutrition, and Forestry Committee
https://www.congress.gov/committees4	Appropriations Committee
https://www.congress.gov/committees5	Armed Services Committee
https://www.congress.gov/committees6	Banking, Housing, and Urban Affairs Committee
https://www.congress.gov/committees7	Budget Committee
https://www.congress.gov/committees8	Commerce, Science, and Transportation Committee
https://www.congress.gov/committees9	Commission on Security and Cooperation in Europe (U.S. Helsinki Commission)
https://www.congress.gov/committees10	Committee on House Administration
wrote 38 committees into committees.csv


In [4]:
# Random 70/30 train/test split. The seed is fixed so that the shuffle, and
# everything computed from ds_split afterwards, is the same on every
# Restart & Run All.
ds_split = dataset.train_test_split(test_size=0.3, seed=0)
ds_split

DatasetDict({
    train: Dataset({
        features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'committees'],
        num_rows: 92499
    })
    test: Dataset({
        features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'committees'],
        num_rows: 39643
    })
})

In [10]:
def flatten_committees(example):
    """Add a ``flat_committees`` key holding the committees as one string.

    The entries of ``example['committees']`` are converted with ``str`` and
    joined with commas. The record is modified in place and also returned.
    """
    labels = [str(committee) for committee in example['committees']]
    example['flat_committees'] = ','.join(labels)
    return example

# Attach the comma-joined committee string to every record.
dataset = dataset.map(flatten_committees)

df = pd.DataFrame(dataset)

# How often does each exact combination of committees occur?
counts = df['flat_committees'].value_counts()

# Combinations seen fewer than twice are pooled under a single 'other' label
# (presumably because the stratified splitter needs at least two samples per
# class -- confirm against the scikit-learn documentation).
is_rare = counts < 2
less_frequent = counts[is_rare].index
df['flat_committees'] = df['flat_committees'].replace(less_frequent, 'other')

# Stratified 70/30 split on the (pooled) combination label. Only the labels
# matter for stratification, so a zero array stands in for the features.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
placeholder_features = np.zeros(len(df))
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(placeholder_features, df['flat_committees']))
train_ds = df.iloc[train_index]
test_ds = df.iloc[test_index]

Map:   0%|          | 0/132142 [00:00<?, ? examples/s]

In [13]:
# Per-committee frequencies in the stratified train/test frames. Each bill
# carries a list of committees, so the lists are flattened first and every
# individual assignment is counted once.
train_committees = pd.Series([item for sublist in train_ds['committees'] for item in sublist])
test_committees = pd.Series([item for sublist in test_ds['committees'] for item in sublist])

train_counts = train_committees.value_counts()
test_counts = test_committees.value_counts()

# sep="\n" starts the table on its own line. Passing "...:\n" as a separate
# argument instead makes print() insert its default space separator in front
# of the first row, which misaligns it.
print("Training set counts:", train_counts, sep="\n")
print("Test set counts:", test_counts, sep="\n")

Training set counts:
 Judiciary Committee                                                            13668
Ways and Means Committee                                                       12638
Energy and Commerce Committee                                                  11589
Education and the Workforce Committee                                           6701
Finance Committee                                                               6338
Oversight and Accountability Committee                                          5868
Natural Resources Committee                                                     5440
Foreign Affairs Committee                                                       5122
Financial Services Committee                                                    4467
Health, Education, Labor, and Pensions Committee                                4341
Transportation and Infrastructure Committee                                     4331
Armed Services Committee                   

In [16]:
# Per-committee frequencies in the random (non-stratified) ds_split. Each
# bill carries a list of committees, so the lists are flattened first and
# every individual assignment is counted once.
train_committees = pd.Series([item for sublist in ds_split['train']['committees'] for item in sublist])
test_committees = pd.Series([item for sublist in ds_split['test']['committees'] for item in sublist])

train_counts = train_committees.value_counts()
test_counts = test_committees.value_counts()

# sep="\n" starts the table on its own line. Passing "...:\n" as a separate
# argument instead makes print() insert its default space separator in front
# of the first row, which misaligns it.
print("Training set counts:", train_counts, sep="\n")
print("Test set counts:", test_counts, sep="\n")

Training set counts:
 Judiciary Committee                                                            13755
Ways and Means Committee                                                       12605
Energy and Commerce Committee                                                  11543
Education and the Workforce Committee                                           6692
Finance Committee                                                               6273
Oversight and Accountability Committee                                          5788
Natural Resources Committee                                                     5388
Foreign Affairs Committee                                                       5141
Financial Services Committee                                                    4523
Health, Education, Labor, and Pensions Committee                                4364
Transportation and Infrastructure Committee                                     4326
Armed Services Committee                   

In [18]:
%%time

# Create the committees corpora from the train and test sets

import gzip

def normalize_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as well. Values that cannot
    be normalised this way (for example ``None``) yield an empty string.

    Returns:
        The normalised string, or ``''`` for non-text input.
    """
    try:
        return ' '.join(text.split())
    except (AttributeError, TypeError):
        # AttributeError: the value has no .split() (None, numbers, ...).
        # TypeError: .split() produced non-str pieces (e.g. bytes input).
        # Everything else, notably KeyboardInterrupt, is left to propagate
        # instead of being silently turned into an empty string.
        return ''

def to_committees_corpus(ds, outfile):
    """Write one corpus line per record of ``ds`` to ``outfile``.

    A line consists of the normalised title and text joined by " ¤ ", a tab,
    and then the record's committee URIs (looked up in ``committees_uri``),
    each wrapped in angle brackets and separated by single spaces.
    """
    for record in ds:
        title, text = (normalize_text(record[field]) for field in ('title', 'text'))
        uris = ' '.join(f"<{committees_uri[name]}>" for name in record['committees'])
        print(f"{title} ¤ {text}\t{uris}", file=outfile)

# Write one gzip-compressed TSV corpus per split. The encoding is pinned to
# UTF-8 because every line contains at least one non-ASCII character (the
# "¤" separator) and text-mode gzip.open otherwise uses the platform default.
for split_name in ('train', 'test'):
    with gzip.open(f'committees-{split_name}.tsv.gz', 'wt', encoding='utf-8') as outfile:
        to_committees_corpus(ds_split[split_name], outfile)

CPU times: user 7min 6s, sys: 3.23 s, total: 7min 9s
Wall time: 7min 38s
