In [1]:
import json
import os
from pathlib import Path
import pandas as pd

## Identify document types

In [2]:
search_root = Path.cwd() / 'specific'
docket_ids = os.listdir(search_root)

In [3]:
for docket in docket_ids:
    document_dir = search_root / docket / 'raw-data' / 'documents'
    json_docs = [doc.replace("_content.htm", ".json") for doc in os.listdir(document_dir) if doc.endswith('.htm')]
    for json_path in json_docs:
        with open(document_dir / json_path, 'r') as file:
            data = json.load(file)
            attributes = data['data']['attributes']
            print(f"{json_path}: {attributes['documentType']}")
    print()

CMS-2019-0039-0001.json: Proposed Rule
CMS-2019-0039-1625.json: Rule
CMS-2019-0039-1626.json: Rule
CMS-2019-0039-0397.json: Proposed Rule

DEA-2016-0015-0003.json: Proposed Rule
DEA-2016-0015-0006.json: Proposed Rule

DEA-2024-0059-0001.json: Proposed Rule
DEA-2024-0059-42928.json: Proposed Rule

HHS-ONC-2019-0002-0257.json: Proposed Rule
HHS-ONC-2019-0002-0001.json: Proposed Rule


CMS-2022-0163-0001.json: Notice



## Create document dataframe
- document name
- docket id
- document type
- posted date

In [6]:
doc_df = {'document_id': [], 'docket_id': [], 'document_type': [], 'posted_date': [], 'file_url': []}

for docket in docket_ids:
    document_dir = search_root / docket / 'raw-data' / 'documents'
    json_docs = [doc.replace("_content.htm", ".json") for doc in os.listdir(document_dir) if doc.endswith('.htm')]
    for json_path in json_docs:
        with open(document_dir / json_path, 'r') as file:
            data = json.load(file)
            attributes = data['data']['attributes']
            doc_df['document_id'].append(data['data']['id'])
            doc_df['docket_id'].append(attributes['docketId'])
            doc_df['document_type'].append(attributes['documentType'])
            doc_df['posted_date'].append(attributes['postedDate'])
            doc_df['file_url'].append(attributes['fileFormats'])

In [7]:
doc_df = pd.DataFrame(doc_df)
doc_df.to_csv("documents.csv")

## Read in comments data and duplicate dataframes for processing

In [8]:
docs_df = doc_df.copy()
comments = pd.read_csv("CMS-2019-0039-Comments.csv",sep = "|")

In [9]:
cmts_df = comments[comments['docketId']=="CMS-2019-0039"].copy()

In [10]:
cmts_df = cmts_df.rename(columns={"postedDate":"posted_date"})

In [11]:
docs_df = doc_df[doc_df.docket_id == "CMS-2019-0039"].copy()
docs_df

Unnamed: 0,document_id,docket_id,document_type,posted_date,file_url
0,CMS-2019-0039-0001,CMS-2019-0039,Proposed Rule,2019-05-03T04:00:00Z,[{'fileUrl': 'https://downloads.regulations.go...
1,CMS-2019-0039-1625,CMS-2019-0039,Rule,2020-05-01T04:00:00Z,[{'fileUrl': 'https://downloads.regulations.go...
2,CMS-2019-0039-1626,CMS-2019-0039,Rule,2021-12-10T05:00:00Z,[{'fileUrl': 'https://downloads.regulations.go...
3,CMS-2019-0039-0397,CMS-2019-0039,Proposed Rule,2019-04-23T04:00:00Z,[{'fileUrl': 'https://downloads.regulations.go...


## Create "windows" to match comments to specific documents

In [12]:
import pandas as pd
import numpy as np

# 1)  Make sure both frames are UTC datetimes
docs_df['posted_date'] = pd.to_datetime(docs_df['posted_date'],  utc=True)
cmts_df['posted_date'] = pd.to_datetime(cmts_df['posted_date'], utc=True)

# sort by postedDate (ascending) and assign incremental order starting at 1
docs_df = docs_df.sort_values("posted_date", ascending=True).reset_index(drop=True)
docs_df["doc_order"] = docs_df.index + 1      # 1, 2, 3, …

# 2)  Sort docs in the order you want the windows to appear
docs_df = docs_df.sort_values('doc_order')

# 3)  Build the window edges ------------------------------------------
#     left edges = each doc's own postedDate
#     right edge = next doc's postedDate, or "forever" for the last one
bins   = docs_df['posted_date'].tolist() + [pd.Timestamp.max.tz_localize('UTC')]
labels = docs_df['doc_order'].tolist()                 # what we'll assign

# 4)  Slot every comment timestamp into those windows ------------------
cmts_df['cmt_doc_order'] = pd.cut(
    cmts_df['posted_date'],
    bins=bins,
    labels=labels,
    right=False           # makes the interval [left, right)
).astype('Int64')         # optional: convert to nullable integer

In [14]:
cmts_df

Unnamed: 0.1,Unnamed: 0,docketId,commentOnDocumentId,comment_id,modifyDate,posted_date,receiveDate,comment,comment_text,cmt_doc_order
0,0,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-0002,2019-03-06T16:15:36Z,2019-03-06 05:00:00+00:00,2019-03-06T05:00:00Z,"In order to promote the secure, electronic exc...",,
1,1,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-0003,2019-03-06T16:15:38Z,2019-03-06 05:00:00+00:00,2019-03-05T05:00:00Z,file code CMS-9115-P<br/><br/>Please assist th...,,
2,2,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-0004,2019-03-06T16:15:39Z,2019-03-06 05:00:00+00:00,2019-03-05T05:00:00Z,The efficiency and accuracy of patient health ...,,
3,3,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-0005,2019-03-06T16:15:40Z,2019-03-06 05:00:00+00:00,2019-03-05T05:00:00Z,2015 CEHRT call for requirements and standards...,,
4,4,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-0006,2019-03-06T16:15:41Z,2019-03-06 05:00:00+00:00,2019-03-05T05:00:00Z,I am submitting a comment that calls for inclu...,"QUA INC \n\nPeg Graham, MBA, MPH \nFounder \nT...",
...,...,...,...,...,...,...,...,...,...,...
1616,1616,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-1619,2019-06-11T13:00:15Z,2019-06-11 04:00:00+00:00,2019-05-29T04:00:00Z,"Thank you for the opportunity to comment, plea...","May 29, 2019 \n\nSeema Verma \nAdministrator, ...",2
1617,1617,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-1620,2019-06-11T13:00:17Z,2019-06-11 04:00:00+00:00,2019-05-28T04:00:00Z,There are those concerned about liabilities wi...,,2
1618,1618,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-1621,2019-06-11T13:00:19Z,2019-06-11 04:00:00+00:00,2019-05-28T04:00:00Z,See attached file(s),"Via Electronic Submission \n\nApril 11, 2019 \...",2
1619,1619,CMS-2019-0039,CMS-2019-0039-0001,CMS-2019-0039-1622,2019-06-11T13:00:20Z,2019-06-11 04:00:00+00:00,2019-05-27T04:00:00Z,Honorable Seema Verma<br/>Administrator<br/>Ce...,,2


## Pull document text from file url

In [15]:
docs_df['downloadUrl'] = docs_df.file_url[1][0]['fileUrl']

In [16]:
# Minimal: add doc_text to existing df using its downloadUrl column
import requests, tempfile, os
from pathlib import Path
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text
from tqdm.notebook import tqdm

# ---------- tiny helpers ----------
def _html_to_text(content: bytes) -> str:
    return BeautifulSoup(content, "html.parser").get_text("\n")

def _pdf_to_text(content: bytes) -> str:
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(content); tmp.flush()
        text = extract_text(tmp.name)
    os.unlink(tmp.name)
    return text

def fetch_and_extract(url: str | None) -> str:
    if not url or pd.isna(url):
        return ""
    try:
        r = requests.get(url, timeout=30); r.raise_for_status()
        ext = Path(url).suffix.lower()
        if ext in (".htm", ".html"):
            return _html_to_text(r.content)
        if ext == ".txt":
            return r.content.decode("utf‑8", "replace")
        if ext == ".pdf":
            return _pdf_to_text(r.content)
    except Exception as e:
        print(" !", url, e)
    return ""

# ---------- pull the files ----------
docs_df["doc_text"] = [
    fetch_and_extract(u) for u in tqdm(docs_df["downloadUrl"], desc="Downloading", unit="file")
]

docs_df[["downloadUrl", "doc_text"]].head()

Downloading:   0%|          | 0/4 [00:00<?, ?file/s]

Unnamed: 0,downloadUrl,doc_text
0,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4..."
1,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4..."
2,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4..."
3,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4..."


## Match comments to document windows

In [17]:
# docs_df
import re

import pandas as pd
import numpy as np

def add_comm_ref(df: pd.DataFrame, text_col: str = "text") -> pd.DataFrame:
    """
    Pull the slice between the first and last occurrence of the word
    'commenter' (case‑insensitive) in each cell of *text_col* and store it
    in a new column called 'comm_ref'.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe you want to enrich.
    text_col : str, default 'text'
        Column that contains the raw text.

    Returns
    -------
    pd.DataFrame
        Same dataframe with an extra 'comm_ref' column.
    """
    word = "commenter"
    wlen = len(word)

    def _extract(txt: str):
        if not isinstance(txt, str):
            return np.nan                     # non‑string cells
        lower = txt.lower()
        first = lower.find(word)
        last  = lower.rfind(word)
        if first == -1 or last == -1 or last <= first:
            return np.nan                     # < 2 occurrences
        return txt[first:last + wlen]         # inclusive slice

    df = df.copy()
    df["comm_ref"] = df[text_col].apply(_extract)
    return df
docs_df = add_comm_ref(docs_df, "doc_text")


In [18]:
docs_df

Unnamed: 0,document_id,docket_id,document_type,posted_date,file_url,doc_order,downloadUrl,doc_text,comm_ref
0,CMS-2019-0039-0397,CMS-2019-0039,Proposed Rule,2019-04-23 04:00:00+00:00,[{'fileUrl': 'https://downloads.regulations.go...,1,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4...",commenters recommended aligning \nexisting sta...
1,CMS-2019-0039-0001,CMS-2019-0039,Proposed Rule,2019-05-03 04:00:00+00:00,[{'fileUrl': 'https://downloads.regulations.go...,2,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4...",commenters recommended aligning \nexisting sta...
2,CMS-2019-0039-1625,CMS-2019-0039,Rule,2020-05-01 04:00:00+00:00,[{'fileUrl': 'https://downloads.regulations.go...,3,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4...",commenters recommended aligning \nexisting sta...
3,CMS-2019-0039-1626,CMS-2019-0039,Rule,2021-12-10 05:00:00+00:00,[{'fileUrl': 'https://downloads.regulations.go...,4,https://downloads.regulations.gov/CMS-2019-003...,"7610 \n\nFederal Register / Vol. 84, No. 4...",commenters recommended aligning \nexisting sta...


In [19]:
merged = pd.merge(left = docs_df, right = cmts_df, left_on = "doc_order", right_on = "cmt_doc_order", how = "inner")

In [20]:
docs_df.to_csv("docs_df.csv")
cmts_df.to_csv("cmts_df.csv")

In [21]:
# merged.to_csv("merged.csv", index = False)