In [1]:
! python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import numpy as np
np.__version__

'1.26.4'

In [3]:
from datasets import load_dataset

dataset = load_dataset("dreamproit/bill_committees_us", split='train') # the original dataset only has a "train" split
dataset

Dataset({
    features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'committees'],
    num_rows: 132142
})

In [4]:
# Get the unique congresses
unique_congresses = sorted(list(set(dataset['congress'])))

print("Unique congresses:\n", unique_congresses)

Unique congresses:
 [108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118]


In [5]:
NUMBER_OF_LAST_CONGRESSES = 2
# Create a list with all but the last 2 values
all_but_last_2 = unique_congresses[:-NUMBER_OF_LAST_CONGRESSES]

# Create a list with the last 2 values
last_2 = unique_congresses[-NUMBER_OF_LAST_CONGRESSES:]

print(f"All but last {NUMBER_OF_LAST_CONGRESSES} congresses:\n{all_but_last_2}")
print(f"Last 2 congresses:\n{last_2}")

All but last 2 congresses:
[108, 109, 110, 111, 112, 113, 114, 115, 116]
Last 2 congresses:
[117, 118]


In [11]:
# Filter the dataset to create two new datasets
dataset_all_but_last_2 = dataset.filter(lambda example: example['congress'] in all_but_last_2)
dataset_last_2 = dataset.filter(lambda example: example['congress'] in last_2)

print("Dataset with all but last 2 congresses:\n", dataset_all_but_last_2)
print("Dataset with last 2 congresses:\n", dataset_last_2)

Dataset with all but last 2 congresses:
 Dataset({
    features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'committees'],
    num_rows: 100981
})
Dataset with last 2 congresses:
 Dataset({
    features: ['id', 'congress', 'bill_type', 'bill_number', 'bill_version', 'title', 'sections', 'sections_length', 'text', 'text_length', 'committees'],
    num_rows: 31161
})


In [6]:
import pprint

# List of dictionaries
committees = [
    {"href":"https://www.congress.gov/committee/house-agriculture/hsag00","name":"Agriculture"},
    {"href":"https://www.congress.gov/committee/house-appropriations/hsap00","name":"Appropriations"},
    {"href":"https://www.congress.gov/committee/house-armed-services/hsas00","name":"Armed Services"},
    {"href":"https://www.congress.gov/committee/house-budget/hsbu00","name":"Budget"},
    {"href":"https://www.congress.gov/committee/house-education-and-the-workforce/hsed00","name":"Education and the Workforce"},
    {"href":"https://www.congress.gov/committee/house-energy-and-commerce/hsif00","name":"Energy and Commerce"},
    {"href":"https://www.congress.gov/committee/house-ethics/hsso00","name":"Ethics"},
    {"href":"https://www.congress.gov/committee/house-financial-services/hsba00","name":"Financial Services"},
    {"href":"https://www.congress.gov/committee/house-foreign-affairs/hsfa00","name":"Foreign Affairs"},
    {"href":"https://www.congress.gov/committee/house-homeland-security/hshm00","name":"Homeland Security"},
    {"href":"https://www.congress.gov/committee/committee-on-house-administration/hsha00","name":"House Administration"},
    {"href":"https://www.congress.gov/committee/house-judiciary/hsju00","name":"Judiciary"},
    {"href":"https://www.congress.gov/committee/house-natural-resources/hsii00","name":"Natural Resources"},
    {"href":"https://www.congress.gov/committee/house-oversight-and-accountability/hsgo00","name":"Oversight and Accountability"},
    {"href":"https://www.congress.gov/committee/house-rules/hsru00","name":"Rules"},
    {"href":"https://www.congress.gov/committee/house-science-space-and-technology/hssy00","name":"Science, Space, and Technology"},
    {"href":"https://www.congress.gov/committee/house-small-business/hssm00","name":"Small Business"},
    {"href":"https://www.congress.gov/committee/house-transportation-and-infrastructure/hspw00","name":"Transportation and Infrastructure"},
    {"href":"https://www.congress.gov/committee/house-veterans-affairs/hsvr00","name":"Veterans' Affairs"},
    {"href":"https://www.congress.gov/committee/house-ways-and-means/hswm00","name":"Ways and Means"},
    {"href":"https://www.congress.gov/committee/senate-agriculture-nutrition-and-forestry/ssaf00","name":"Agriculture, Nutrition, and Forestry"},
    {"href":"https://www.congress.gov/committee/senate-appropriations/ssap00","name":"Appropriations"},
    {"href":"https://www.congress.gov/committee/senate-armed-services/ssas00","name":"Armed Services"},
    {"href":"https://www.congress.gov/committee/senate-banking-housing-and-urban-affairs/ssbk00","name":"Banking, Housing, and Urban Affairs"},
    {"href":"https://www.congress.gov/committee/senate-budget/ssbu00","name":"Budget"},
    {"href":"https://www.congress.gov/committee/senate-commerce-science-and-transportation/sscm00","name":"Commerce, Science, and Transportation"},
    {"href":"https://www.congress.gov/committee/senate-energy-and-natural-resources/sseg00","name":"Energy and Natural Resources"},
    {"href":"https://www.congress.gov/committee/senate-environment-and-public-works/ssev00","name":"Environment and Public Works"},
    {"href":"https://www.congress.gov/committee/senate-finance/ssfi00","name":"Finance"},
    {"href":"https://www.congress.gov/committee/senate-foreign-relations/ssfr00","name":"Foreign Relations"},
    {"href":"https://www.congress.gov/committee/senate-health-education-labor-and-pensions/sshr00","name":"Health, Education, Labor, and Pensions"},
    {"href":"https://www.congress.gov/committee/senate-homeland-security-and-governmental-affairs/ssga00","name":"Homeland Security and Governmental Affairs"},
    {"href":"https://www.congress.gov/committee/senate-judiciary/ssju00","name":"Judiciary"},
    {"href":"https://www.congress.gov/committee/senate-rules-and-administration/ssra00","name":"Rules and Administration"},
    {"href":"https://www.congress.gov/committee/senate-small-business-and-entrepreneurship/sssb00","name":"Small Business and Entrepreneurship"},
    {"href":"https://www.congress.gov/committee/senate-veterans-affairs/ssva00","name":"Veterans' Affairs"},
    {"href":"https://www.congress.gov/committee/house-intelligence-permanent-select/hlig00","name":"House Permanent Select Committee on Intelligence"},
    {"href":"https://www.congress.gov/committee/house-select-committee-on-the-strategic-competition-between-the-united-states-and-the-chinese-communist-party/hlzs00","name":"Select Committee on the Strategic Competition Between the United States and the Chinese Communist Party"},
    {"href":"https://www.congress.gov/committee/senate-aging-special/spag00","name":"Aging (Special)"},
    {"href":"https://www.congress.gov/committee/senate-caucus-on-international-narcotics-control/scnc00","name":"Caucus on International Narcotics Control"},
    {"href":"https://www.congress.gov/committee/senate-ethics-select/slet00","name":"Ethics (Select)"},
    {"href":"https://www.congress.gov/committee/senate-indian-affairs/slia00","name":"Indian Affairs"},
    {"href":"https://www.congress.gov/committee/senate-intelligence-select/slin00","name":"Intelligence (Select)"},
    {"href":"https://www.congress.gov/committee/joint-committee-on-printing/jspr00%3E","name":"Joint Committee on Printing"},
    {"href":"https://www.congress.gov/committee/joint-committee-on-taxation/jstx00%3E","name":"Joint Committee on Taxation"},
    {"href":"https://www.congress.gov/committee/joint-committee-on-the-library/jslc00%3E","name":"Joint Committee on the Library"},
    {"href":"https://www.congress.gov/committee/joint-economic-committee/jsec00%3E","name":"Joint Economic Committee"},
    # {"href":"https://www.congress.gov/committee/the-helsinki-commission/jcse00","name":"Commission on Security and Cooperation in Europe"},
    # {"href":"https://www.congress.gov/committee/congressional-executive-commission-on-china/jcpk00","name":"Congressional-Executive Commission on China"},
    # {"href":"http://tlhrc.house.gov/","name":"Tom Lantos Human Rights Commission"},
    # {"href":"https://hdp.house.gov/","name":"House Democracy Partnership"},
    # {"href":"https://www.congress.gov/committee/congressional-oversight-commission/jcov00","name":"Congressional Oversight Commission"}
]

house_edge_cases = [
    {"href":"https://www.congress.gov/committee/the-helsinki-commission/jcse00","name":"Commission on Security and Cooperation in Europe"},
    {"href":"https://www.congress.gov/committee/congressional-executive-commission-on-china/jcpk00","name":"Congressional-Executive Commission on China"},
    {"href":"http://tlhrc.house.gov/","name":"Tom Lantos Human Rights Commission"},
    {"href":"https://hdp.house.gov/","name":"House Democracy Partnership"},
]
senate_edge_cases = [
    {"href":"https://www.congress.gov/committee/congressional-oversight-commission/jcov00","name":"Congressional Oversight Commission"}
]

house_committees = {committee['name']: committee['href'] for committee in committees if 'senate' not in committee['href']}
for committee in house_edge_cases:
    house_committees[committee['name']] = committee['href']
senate_committees = {committee['name']: committee['href'] for committee in committees if 'house' not in committee['href']}
for committee in senate_edge_cases:
    senate_committees[committee['name']] = committee['href']

print("House Committees: ", len(house_committees))
pprint.pprint(house_committees)
print("Senate Committees: ", len(senate_committees))
pprint.pprint(senate_committees)

House Committees:  30
{'Agriculture': 'https://www.congress.gov/committee/house-agriculture/hsag00',
 'Appropriations': 'https://www.congress.gov/committee/house-appropriations/hsap00',
 'Armed Services': 'https://www.congress.gov/committee/house-armed-services/hsas00',
 'Budget': 'https://www.congress.gov/committee/house-budget/hsbu00',
 'Commission on Security and Cooperation in Europe': 'https://www.congress.gov/committee/the-helsinki-commission/jcse00',
 'Congressional-Executive Commission on China': 'https://www.congress.gov/committee/congressional-executive-commission-on-china/jcpk00',
 'Education and the Workforce': 'https://www.congress.gov/committee/house-education-and-the-workforce/hsed00',
 'Energy and Commerce': 'https://www.congress.gov/committee/house-energy-and-commerce/hsif00',
 'Ethics': 'https://www.congress.gov/committee/house-ethics/hsso00',
 'Financial Services': 'https://www.congress.gov/committee/house-financial-services/hsba00',
 'Foreign Affairs': 'https://www.

In [12]:
# Convert committees to Annif CSV vocabulary

import csv

FIELDS = ('uri', 'label_en')
OUTFILE = 'committees.csv'
committees = sorted(set([item for row in dataset_all_but_last_2['committees'] for item in row]))
csv_items = []

for idx, committee_name in enumerate(committees):
    committee_name_first_part = committee_name.replace('Committee', '').strip()
    # print(committee_name_first_part)
    house_committee_url = house_committees.get(committee_name_first_part)
    senate_committee_url = senate_committees.get(committee_name_first_part)
    if house_committee_url:
      uri = house_committee_url
    elif senate_committee_url:
      uri = senate_committee_url
    else:
      print(f'edge cases: {committee_name}')
      if committee_name == 'Commission on Security and Cooperation in Europe (U.S. Helsinki Commission)':
        committee_name_first_part = 'Commission on Security and Cooperation in Europe'
        uri = house_committees.get(committee_name_first_part)
      elif committee_name == 'Committee on House Administration':
        committee_name_first_part = 'House Administration'
        uri = house_committees.get(committee_name_first_part)
      elif committee_name == 'Intelligence (Permanent Select) Committee':
        committee_name_first_part = 'Intelligence (Select)'
        uri = senate_committees.get(committee_name_first_part)
      elif committee_name == 'United States Senate Caucus on International Narcotics Control':
        committee_name_first_part = 'Caucus on International Narcotics Control'
        uri = senate_committees.get(committee_name_first_part)

    # uri = committees[committee_name]
    item = {
        'uri': uri,
        'label_en': committee_name,
    }
    csv_items.append(item)
print(len(csv_items))
print(csv_items)

with open(OUTFILE, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=FIELDS)
    writer.writeheader()

    for row in csv_items:
        writer.writerow(row)

edge cases: Commission on Security and Cooperation in Europe (U.S. Helsinki Commission)
edge cases: Committee on House Administration
edge cases: Intelligence (Permanent Select) Committee
edge cases: United States Senate Caucus on International Narcotics Control
38
[{'uri': 'https://www.congress.gov/committee/senate-aging-special/spag00', 'label_en': 'Aging (Special) Committee'}, {'uri': 'https://www.congress.gov/committee/house-agriculture/hsag00', 'label_en': 'Agriculture Committee'}, {'uri': 'https://www.congress.gov/committee/senate-agriculture-nutrition-and-forestry/ssaf00', 'label_en': 'Agriculture, Nutrition, and Forestry Committee'}, {'uri': 'https://www.congress.gov/committee/house-appropriations/hsap00', 'label_en': 'Appropriations Committee'}, {'uri': 'https://www.congress.gov/committee/house-armed-services/hsas00', 'label_en': 'Armed Services Committee'}, {'uri': 'https://www.congress.gov/committee/senate-banking-housing-and-urban-affairs/ssbk00', 'label_en': 'Banking, Hous

In [13]:
%%time

# create the commitees corpora from the train and test sets

import gzip

def normalize_text(text):
    try:
        return ' '.join(text.split())
    except:
        return ''

def to_committees_corpus(ds, outfile):
    for item in ds:
        title = normalize_text(item['title'])
        text = normalize_text(item['text'])
        # uris = [f"<{committees_uri[committee]}>" for committee in item['committees']]
        uris = []
        for ds_committee in item['committees']:
          for item in csv_items:
            if ds_committee == item['label_en']:
              uris.append(f'<{item["uri"]}>')
        line = f"{title} ¤ {text}\t{' '.join(uris)}"
        print(line, file=outfile)

with gzip.open('committees-train.tsv.gz', 'wt') as outfile:
    to_committees_corpus(dataset_all_but_last_2, outfile)

with gzip.open('committees-test.tsv.gz', 'wt') as outfile:
    to_committees_corpus(dataset_last_2, outfile)


CPU times: user 2min 43s, sys: 2.5 s, total: 2min 46s
Wall time: 2min 53s


In [14]:
! annif load-vocab cm committees.csv

Loading vocabulary from CSV file committees.csv...
updating existing subject index
saving vocabulary into SKOS file data/vocabs/cm/subjects.ttl


In [15]:
! annif train cm-mllm-en committees-train.tsv.gz

Backend mllm: starting train
Backend mllm: preparing training data
Backend mllm: training model
Backend mllm: saving model


In [16]:
! annif eval cm-mllm-en committees-test.tsv.gz -l 15 -t 0.15

Precision (doc avg):          	0.0961
Recall (doc avg):             	0.1617
F1 score (doc avg):           	0.1113
Precision (subj avg):         	0.3092
Recall (subj avg):            	0.1853
F1 score (subj avg):          	0.2154
Precision (weighted subj avg):	0.4253
Recall (weighted subj avg):   	0.1652
F1 score (weighted subj avg): 	0.2190
Precision (microavg):         	0.3878
Recall (microavg):            	0.1652
F1 score (microavg):          	0.2317
F1@5:                         	0.1117
NDCG:                         	0.1397
NDCG@5:                       	0.1392
NDCG@10:                      	0.1397
Precision@1:                  	0.1151
Precision@3:                  	0.0987
Precision@5:                  	0.0966
True positives:               	6530
False positives:              	10309
False negatives:              	33008
Documents evaluated:          	31161


In [2]:
! annif train cm-fasttext-en committees-train.tsv.gz

Backend fasttext: creating fastText training file
Backend fasttext: creating fastText model
Read 160M words
Number of words:  98883
Number of labels: 37
Progress: 100.0% words/sec/thread:  148861 lr:  0.000000 avg.loss:  0.818112 ETA:   0h 0m 0s  0.0% words/sec/thread:  153299 lr:  0.739694 avg.loss:  2.911276 ETA:   7h16m37s  0.4% words/sec/thread:  159779 lr:  0.737211 avg.loss:  2.531188 ETA:   6h57m30s  0.5% words/sec/thread:  160386 lr:  0.736167 avg.loss:  2.355006 ETA:   6h55m20s  0.6% words/sec/thread:  160239 lr:  0.735272 avg.loss:  2.268493 ETA:   6h55m12s  1.2% words/sec/thread:  160966 lr:  0.731371 avg.loss:  1.989954 ETA:   6h51m 8s  1.3% words/sec/thread:  161084 lr:  0.730343 avg.loss:  1.893400 ETA:   6h50m16s  1.5% words/sec/thread:  161163 lr:  0.729092 avg.loss:  1.820741 ETA:   6h49m21s 1.703055 ETA:   6h47m29s  1.9% words/sec/thread:  161183 lr:  0.726259 avg.loss:  1.687794 ETA:   6h47m43s  1.9% words/sec/thread:  161216 lr:  0.726036 avg.loss:  1.678820 ETA:   

In [3]:
! annif eval cm-fasttext-en committees-test.tsv.gz -l 15 -t 0.15

Precision (doc avg):          	0.5645
Recall (doc avg):             	0.7578
F1 score (doc avg):           	0.6146
Precision (subj avg):         	0.3650
Recall (subj avg):            	0.4934
F1 score (subj avg):          	0.4134
Precision (weighted subj avg):	0.5216
Recall (weighted subj avg):   	0.6982
F1 score (weighted subj avg): 	0.5928
Precision (microavg):         	0.5029
Recall (microavg):            	0.6982
F1 score (microavg):          	0.5847
F1@5:                         	0.6146
NDCG:                         	0.6985
NDCG@5:                       	0.6990
NDCG@10:                      	0.6985
Precision@1:                  	0.6187
Precision@3:                  	0.5646
Precision@5:                  	0.5645
True positives:               	27606
False positives:              	27290
False negatives:              	11932
Documents evaluated:          	31161


In [5]:
! annif eval cm-yake-en committees-test.tsv.gz -l 15 -t 0.15

Backend yake: Creating index
Backend yake: Created index with 38 labels
Precision (doc avg):          	0.0063
Recall (doc avg):             	0.0056
F1 score (doc avg):           	0.0058
Precision (subj avg):         	0.2074
Recall (subj avg):            	0.0079
F1 score (subj avg):          	0.0137
Precision (weighted subj avg):	0.3951
Recall (weighted subj avg):   	0.0051
F1 score (weighted subj avg): 	0.0090
Precision (microavg):         	0.6474
Recall (microavg):            	0.0051
F1 score (microavg):          	0.0101
F1@5:                         	0.0058
NDCG:                         	0.0057
NDCG@5:                       	0.0057
NDCG@10:                      	0.0057
Precision@1:                  	0.0063
Precision@3:                  	0.0063
Precision@5:                  	0.0063
True positives:               	202
False positives:              	110
False negatives:              	39336
Documents evaluated:          	31161


In [6]:
! annif train cm-stwfsa-en committees-train.tsv.gz

In [7]:
! annif eval cm-stwfsa-en committees-test.tsv.gz -l 15 -t 0.15

Precision (doc avg):          	0.0078
Recall (doc avg):             	0.0065
F1 score (doc avg):           	0.0067
Precision (subj avg):         	0.2491
Recall (subj avg):            	0.0100
F1 score (subj avg):          	0.0165
Precision (weighted subj avg):	0.4727
Recall (weighted subj avg):   	0.0065
F1 score (weighted subj avg): 	0.0112
Precision (microavg):         	0.5525
Recall (microavg):            	0.0065
F1 score (microavg):          	0.0129
F1@5:                         	0.0067
NDCG:                         	0.0067
NDCG@5:                       	0.0068
NDCG@10:                      	0.0067
Precision@1:                  	0.0079
Precision@3:                  	0.0078
Precision@5:                  	0.0078
True positives:               	258
False positives:              	209
False negatives:              	39280
Documents evaluated:          	31161


In [8]:
! annif train cm-nn-ensemble-en committees-train.tsv.gz

Backend nn_ensemble: creating NN ensemble model
Backend nn_ensemble: Initializing source projects: cm-parabel-en, cm-bonsai-en
2024-09-10T07:43:01.710Z [36mINFO [0m [omikuji::model] Loading model from data/projects/cm-parabel-en/omikuji-model...
2024-09-10T07:43:01.711Z [36mINFO [0m [omikuji::model] Loading model settings from data/projects/cm-parabel-en/omikuji-model/settings.json...
2024-09-10T07:43:01.712Z [36mINFO [0m [omikuji::model] Loaded model settings Settings { n_features: 131689, classifier_loss_type: Hinge }...
2024-09-10T07:43:01.714Z [36mINFO [0m [omikuji::model] Loading tree from data/projects/cm-parabel-en/omikuji-model/tree0.cbor...
2024-09-10T07:43:01.767Z [36mINFO [0m [omikuji::model] Loading tree from data/projects/cm-parabel-en/omikuji-model/tree1.cbor...
2024-09-10T07:43:01.833Z [36mINFO [0m [omikuji::model] Loading tree from data/projects/cm-parabel-en/omikuji-model/tree2.cbor...
2024-09-10T07:43:01.895Z [36mINFO [0m [omikuji::model] Loaded model wi

In [9]:
! annif eval cm-nn-ensemble-en committees-test.tsv.gz -l 15 -t 0.15

2024-09-10T08:46:18.522Z [36mINFO [0m [omikuji::model] Loading model from data/projects/cm-parabel-en/omikuji-model...
2024-09-10T08:46:18.522Z [36mINFO [0m [omikuji::model] Loading model settings from data/projects/cm-parabel-en/omikuji-model/settings.json...
2024-09-10T08:46:18.525Z [36mINFO [0m [omikuji::model] Loaded model settings Settings { n_features: 131689, classifier_loss_type: Hinge }...
2024-09-10T08:46:18.526Z [36mINFO [0m [omikuji::model] Loading tree from data/projects/cm-parabel-en/omikuji-model/tree0.cbor...
2024-09-10T08:46:18.566Z [36mINFO [0m [omikuji::model] Loading tree from data/projects/cm-parabel-en/omikuji-model/tree1.cbor...
2024-09-10T08:46:18.625Z [36mINFO [0m [omikuji::model] Loading tree from data/projects/cm-parabel-en/omikuji-model/tree2.cbor...
2024-09-10T08:46:18.692Z [36mINFO [0m [omikuji::model] Loaded model with 3 trees; it took 0.17s
2024-09-10T08:46:25.132Z [36mINFO [0m [omikuji::model] Loading model from data/projects/cm-bonsai-e