In [1]:
import os
import numpy as np
import pandas as pd
import pyarrow
import requests
from rdflib import Graph

# Seed NumPy's global random generator so NumPy-based randomness is repeatable
SEED = 42
np.random.seed(SEED)

# Output locations, relative to the current working directory
DATA_DIR = "./data"
MODEL_DIR = "./models"
RESULTS_DIR = "./results"

# Create every output directory up front; directories that already exist are left untouched
for directory in (DATA_DIR, MODEL_DIR, RESULTS_DIR):
    os.makedirs(directory, exist_ok=True)

## OHSUMED Articles data
The data is available on the Hugging Face community datasets hub ([link](https://huggingface.co/datasets/community-datasets/ohsumed/viewer/ohsumed)); however, it does not contain the MESH category labels used for categorization or the relevance judgements from querying. The data contains the MESH terms associated with each text, the title and abstract (which are used in the paper as the basis for prediction), the publication type, the author and the source. Because these labels are missing from the original data, they are extracted from other sources, as explained in the next sections.

In [3]:
# Read the two OHSUMED parquet shards
train = pd.read_parquet('../data/raw/ohsumed/train-00000-of-00001.parquet')
test = pd.read_parquet('../data/raw/ohsumed/test-00000-of-00001.parquet')

# Stack both shards into a single frame. ignore_index=True gives every row a
# unique label; without it the index labels of `train` and `test` are both kept,
# so labels repeat whenever the two shards use overlapping indices (presumably
# both start at 0), which makes label-based selection and alignment ambiguous.
df = pd.concat([train, test], ignore_index=True)
df.head()

Unnamed: 0,seq_id,medline_ui,mesh_terms,title,publication_type,abstract,author,source
0,1,87049087,Allied Health Personnel/*; Electric Countersho...,Refibrillation managed by EMT-Ds: incidence an...,JOURNAL ARTICLE.,Some patients converted from ventricular fibri...,Stults KR; Brown DD.,Am J Emerg Med 8703; 4(6):491-5
1,1,87049088,"Antidepressive Agents, Tricyclic/*PO; Arrhythm...",Tricyclic antidepressant overdose: emergency d...,JOURNAL ARTICLE.,There is controversy regarding the appropriate...,Foulke GE; Albertson TE; Walby WF.,Am J Emerg Med 8703; 4(6):496-500
2,1,87049089,Adult; Aircraft/*; Altitude/*; Blood Gas Monit...,Transconjunctival oxygen monitoring as a predi...,JOURNAL ARTICLE.,As the use of helicopters for air transport of...,Shufflebarger C; Jehle D; Cottington E; Martin M.,Am J Emerg Med 8703; 4(6):501-3
3,1,87049090,Adolescence; Adult; Aged; Blood Glucose/*ME; D...,Serum glucose changes after administration of ...,JOURNAL ARTICLE.,A prospective clinical trial was conducted to ...,Adler PM.,Am J Emerg Med 8703; 4(6):504-6
4,1,87049092,"Aged; Aged, 80 and over; Case Report; Female; ...",Nasogastric intubation: morbidity in an asympt...,JOURNAL ARTICLE.,An unusual case of a misdirected nasogastric t...,Gough D; Rust D.,Am J Emerg Med 8703; 4(6):511-3


## Creating sensitivity labels

The MESH labels used in the paper are said to be C12 and C13, which can be found on the following site: [link](https://meshb-prev.nlm.nih.gov/treeView). The corresponding 2025 labels for these categories are C12.050 for Female Urogenital Diseases and Pregnancy Complications, and C12.200 for Male Urogenital Diseases. There are several ways to obtain this data, such as the MESH API or direct RDF querying (see: [link](https://id.nlm.nih.gov/mesh/)); however, due to errors, the label terms are extracted here as plain text and then filtered using Python code.

According to the paper on this dataset specifically done by the same authors and referenced in the paper we are trying to reproduce ([link](https://doi.org/10.1145/3331184.3331256)), if any of the MESH terms are found within the MESH terms column, that article is considered to be sensitive. 

In [4]:
import re

# Non-greedy match for any "[...]" span on a line
# (presumably the bracketed tree numbers of the plain-text export — confirm against c12.txt)
pattern = r'\[.*?\]'

with open('../data/raw/ohsumed/c12.txt', 'r') as file:
    lines = file.readlines()

# Remove the bracketed spans, trim surrounding whitespace and keep only non-empty results
c12 = [term for term in (re.sub(pattern, '', line).strip() for line in lines) if term != '']

print(c12)

['Female Urogenital Diseases and Pregnancy Complications', 'Female Urogenital Diseases', 'Genital Diseases, Female', 'Adnexal Diseases', 'Fallopian Tube Diseases', 'Fallopian Tube Neoplasms', 'Salpingitis', 'Ovarian Diseases', 'Anovulation', 'Menopause, Premature', 'Oophoritis', 'Ovarian Cysts', 'Polycystic Ovary Syndrome', 'Ovarian Hyperstimulation Syndrome', 'Ovarian Neoplasms', 'Brenner Tumor', 'Carcinoma, Endometrioid', 'Carcinoma, Ovarian Epithelial', 'Granulosa Cell Tumor', 'Hereditary Breast and Ovarian Cancer Syndrome', 'Luteoma', 'Meigs Syndrome', 'Sertoli-Leydig Cell Tumor', 'Thecoma', 'Ovarian Torsion', 'Primary Ovarian Insufficiency', 'Pelvic Inflammatory Disease', 'Endometritis', 'Oophoritis', 'Parametritis', 'Salpingitis', 'Dyspareunia', 'Endometriosis', 'Genital Neoplasms, Female', 'Gynatresia', 'Herpes Genitalis', 'Infertility, Female', 'Reproductive Tract Infections', 'Tuberculosis, Female Genital', 'Uterine Diseases', 'Adenomyosis', 'Endometrial Hyperplasia', 'Endomet

In [5]:
# Non-greedy match for any "[...]" span on a line
pattern = r'\[.*?\]'

with open('../data/raw/ohsumed/c13.txt', 'r') as file:
    lines = file.readlines()

# Same clean-up as for the C12 list: strip bracketed spans and whitespace, drop empty lines
c13 = [term for term in (re.sub(pattern, '', line).strip() for line in lines) if term != '']

print(c13)

['Male Urogenital Diseases', 'Genital Diseases, Male', 'Dyspareunia', 'Ejaculatory Dysfunction', 'Premature Ejaculation', 'Retrograde Ejaculation', 'Epididymitis', 'Erectile Dysfunction', 'Impotence, Vasculogenic', 'Fournier Gangrene', 'Genital Neoplasms, Male', 'Penile Neoplasms', 'Prostatic Neoplasms', 'Prostatic Neoplasms, Castration-Resistant', 'Testicular Neoplasms', 'Sertoli-Leydig Cell Tumor', 'Leydig Cell Tumor', 'Sertoli Cell Tumor', 'Hematocele', 'Hemospermia', 'Herpes Genitalis', 'Infertility, Male', 'Aspermia', 'Asthenozoospermia', 'Azoospermia', 'Oligospermia', 'Sertoli Cell-Only Syndrome', 'Teratozoospermia', 'Penile Diseases', 'Balanitis', 'Balanitis Xerotica Obliterans', 'Hypospadias', 'Penile Induration', 'Penile Neoplasms', 'Phimosis', 'Paraphimosis', 'Priapism', 'Prostatic Diseases', 'Prostatic Hyperplasia', 'Prostatic Neoplasms', 'Prostatic Neoplasms, Castration-Resistant', 'Prostatitis', 'Reproductive Tract Infections', 'Spermatic Cord Torsion', 'Spermatocele', 'Te

In [7]:
# Combined list of sensitive MeSH headings (C12 + C13)
terms = c12 + c13

# Several headings occur more than once (within c12 and across c12/c13);
# de-duplicate so the regex alternation stays as small as possible.
unique_terms = sorted(set(terms))

if unique_terms:
    # Flag a row only when a term is a COMPLETE heading inside `mesh_terms`:
    #   - it starts at the beginning of the field or right after a ';' separator
    #   - it is followed by '/' (qualifier), ';' (next heading) or the end of the
    #     field (optionally after a trailing '.' — presumably how the field ends; confirm).
    # A plain substring search also fires inside longer, unrelated headings
    # (e.g. a term such as "Cystitis" would match "Cholecystitis").
    alternation = '|'.join(re.escape(term) for term in unique_terms)
    pattern = rf'(?:^|;)\s*(?:{alternation})\s*(?:/|;|\.?\s*$)'
    df['sensitive'] = df['mesh_terms'].str.contains(pattern, case=False, na=False).astype(int)
else:
    # An empty alternation would match every row; with no terms nothing is sensitive.
    df['sensitive'] = 0

#df.to_csv('../data/raw/ohsumed/ohsumed.csv', index=False)

In [9]:
# Preview the first 20 rows, including the `sensitive` flag column
df.head(20)

Unnamed: 0,seq_id,medline_ui,mesh_terms,title,publication_type,abstract,author,source,sensitive
0,1,87049087,Allied Health Personnel/*; Electric Countersho...,Refibrillation managed by EMT-Ds: incidence an...,JOURNAL ARTICLE.,Some patients converted from ventricular fibri...,Stults KR; Brown DD.,Am J Emerg Med 8703; 4(6):491-5,0
1,1,87049088,"Antidepressive Agents, Tricyclic/*PO; Arrhythm...",Tricyclic antidepressant overdose: emergency d...,JOURNAL ARTICLE.,There is controversy regarding the appropriate...,Foulke GE; Albertson TE; Walby WF.,Am J Emerg Med 8703; 4(6):496-500,0
2,1,87049089,Adult; Aircraft/*; Altitude/*; Blood Gas Monit...,Transconjunctival oxygen monitoring as a predi...,JOURNAL ARTICLE.,As the use of helicopters for air transport of...,Shufflebarger C; Jehle D; Cottington E; Martin M.,Am J Emerg Med 8703; 4(6):501-3,0
3,1,87049090,Adolescence; Adult; Aged; Blood Glucose/*ME; D...,Serum glucose changes after administration of ...,JOURNAL ARTICLE.,A prospective clinical trial was conducted to ...,Adler PM.,Am J Emerg Med 8703; 4(6):504-6,0
4,1,87049092,"Aged; Aged, 80 and over; Case Report; Female; ...",Nasogastric intubation: morbidity in an asympt...,JOURNAL ARTICLE.,An unusual case of a misdirected nasogastric t...,Gough D; Rust D.,Am J Emerg Med 8703; 4(6):511-3,0
5,1,87049093,"Abdominal Injuries/ET; Accidents, Occupational...",Massive transfusion without major complication...,JOURNAL ARTICLE.,A case of massive degloving injury of the trun...,Brotman S; Lamonica C; Cowley RA.,Am J Emerg Med 8703; 4(6):514-5,0
6,1,87049094,Adult; Carbon Monoxide Poisoning/CO/*TH; Femal...,Acute carbon monoxide poisoning during pregnancy.,JOURNAL ARTICLE.,The course of a pregnant patient at term who w...,Margulies JL.,Am J Emerg Med 8703; 4(6):516-9,1
7,1,87049096,"Case Report; Child, Preschool; Drug Administra...",Intraosseous infusion of phenytoin.,JOURNAL ARTICLE.,"In the critically ill child, administration of...",Walsh-Kelly CM; Berens RJ; Glaeser PW; Losek JD.,Am J Emerg Med 8703; 4(6):523-4,0
8,1,87049098,Alcohol Drinking; Case Report; Emergencies; Es...,Boerhaave's syndrome: an elusive diagnosis.,JOURNAL ARTICLE.,Boerhaave's syndrome represents a diagnostic d...,Schwartz JA; Turnbull TL; Dymowski J; Uehara DT.,Am J Emerg Med 8703; 4(6):532-6,0
9,1,87049099,Adult; Case Report; Electrocardiography/*; Ext...,Cases in electrocardiography.,JOURNAL ARTICLE.,Boerhaave's syndrome represents a diagnostic d...,Zimmers T.,Am J Emerg Med 8703; 4(6):537-9,0


## Add relevance annotations

For the relevance annotations, we use the original repository containing the raw data ([link](https://dmice.ohsu.edu/hersh/ohsumed/)), which is also used by Hugging Face to build the parquet files used here as base data. According to the aforementioned paper ([link](https://doi.org/10.1145/3331184.3331256)), there are 16140 total articles with relevance judgements. For that purpose, we use the judged file from the repository to extract any judgements made within all iterations of experiments. Articles that are not judged are left with relevance None, which can be used to separate the train/valid and test splits.

In [10]:
# Column layout assigned to the tab-separated, header-less `judged` file
judged_columns = ['query_id', 'document_ui', 'document_i', 'relevance1', 'relevance2', 'relevance3']
judged = pd.read_csv('../data/raw/ohsumed/judged', sep='\t', header=None, names=judged_columns)
judged

def get_relevance(row):
    """Collapse the three judgement columns of one row into a single label.

    Reads ``row['relevance1']``, ``row['relevance2']`` and ``row['relevance3']``.

    Returns:
        'relevant'      if any of the three values is 'd' or 'p';
        'not_relevant'  otherwise, if any of the three values is 'n';
        'unjudged'      otherwise (e.g. all three values are missing).
    """
    votes = tuple(row[column] for column in ('relevance1', 'relevance2', 'relevance3'))

    # A single positive vote wins, no matter which of the three columns holds it
    if any(vote in ('d', 'p') for vote in votes):
        return 'relevant'

    # No positive vote: one explicit 'n' is enough to call the row not relevant
    if any(vote == 'n' for vote in votes):
        return 'not_relevant'

    # Neither a positive nor a negative vote was recorded
    return 'unjudged'

# Apply get_relevance row by row (axis=1) and store the consolidated label in a new column
judged['relevance_status'] = judged.apply(get_relevance, axis=1)
judged.head()

Unnamed: 0,query_id,document_ui,document_i,relevance1,relevance2,relevance3,relevance_status
0,1,87097544,40626,d,,d,relevant
1,1,87153566,11852,n,,n,not_relevant
2,1,87157536,12693,d,,,relevant
3,1,87157537,12694,d,,,relevant
4,1,87184723,15450,n,,,not_relevant


In [11]:
# Keep only the join key and the consolidated label
judgements = judged.loc[:, ['document_ui', 'relevance_status']]

# Left join: every article is kept and articles without a judgement get a missing
# relevance_status. The helper key `document_ui` is removed afterwards.
# NOTE(review): if a document appears in several `judged` rows it yields one merged
# row per judgement, and `query_id` is not carried over — confirm this is intended.
df_merged = (
    df.merge(judgements, how='left', left_on='medline_ui', right_on='document_ui')
      .drop(columns=['document_ui'])
)
#df_merged[~df_merged['relevance_status'].isna()]

In [12]:
# Preview the merged frame (articles plus `sensitive` and `relevance_status`)
df_merged.head()

Unnamed: 0,seq_id,medline_ui,mesh_terms,title,publication_type,abstract,author,source,sensitive,relevance_status
0,1,87049087,Allied Health Personnel/*; Electric Countersho...,Refibrillation managed by EMT-Ds: incidence an...,JOURNAL ARTICLE.,Some patients converted from ventricular fibri...,Stults KR; Brown DD.,Am J Emerg Med 8703; 4(6):491-5,0,
1,1,87049088,"Antidepressive Agents, Tricyclic/*PO; Arrhythm...",Tricyclic antidepressant overdose: emergency d...,JOURNAL ARTICLE.,There is controversy regarding the appropriate...,Foulke GE; Albertson TE; Walby WF.,Am J Emerg Med 8703; 4(6):496-500,0,
2,1,87049089,Adult; Aircraft/*; Altitude/*; Blood Gas Monit...,Transconjunctival oxygen monitoring as a predi...,JOURNAL ARTICLE.,As the use of helicopters for air transport of...,Shufflebarger C; Jehle D; Cottington E; Martin M.,Am J Emerg Med 8703; 4(6):501-3,0,
3,1,87049090,Adolescence; Adult; Aged; Blood Glucose/*ME; D...,Serum glucose changes after administration of ...,JOURNAL ARTICLE.,A prospective clinical trial was conducted to ...,Adler PM.,Am J Emerg Med 8703; 4(6):504-6,0,
4,1,87049092,"Aged; Aged, 80 and over; Case Report; Female; ...",Nasogastric intubation: morbidity in an asympt...,JOURNAL ARTICLE.,An unusual case of a misdirected nasogastric t...,Gough D; Rust D.,Am J Emerg Med 8703; 4(6):511-3,0,


## Next steps

The aforementioned paper ([link](https://doi.org/10.1145/3331184.3331256)) explains that the steps to preprocess the data for modelling include a tf-idf transformation of the titles + abstracts of every article. I could not find any information on whether they removed common words such as 'a', 'the' and so on.

Furthermore, in our paper, the data is separated into train, valid and test sets such that all of the articles containing both sensitivity and relevance annotations are used as the test set (16140 in total), whereas those without relevance annotations are used for train and validation with an 85-15 split. The split between train/valid and test based on the relevance should be done with the relevance_status column (None means no relevance -> goes to train/valid split).

All of these steps should be done within the preprocessing of the data.

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk

In [14]:
# Fetch the NLTK stop-word corpus into the local nltk_data folder
# (the recorded run reports the package as already up to date)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gerhardkarbeutz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Model input text = title + " " + abstract.
# Missing values are replaced by '' first: NaN + str yields NaN, which would make
# the whole text NaN and break any later string processing such as `.split()`.
df_merged['text'] = df_merged['title'].fillna('') + " " + df_merged['abstract'].fillna('')


In [16]:
# English stop-word list from NLTK, held as a set for fast membership tests
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with stop words removed.

    The text is split on whitespace; a token is dropped when its lowercase form
    is in `stop_words`. Surviving tokens keep their original casing and any
    attached punctuation, and are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


df_merged['text_cleaned'] = df_merged['text'].apply(remove_stopwords)

In [17]:
# Step 3: TF-IDF Transformation
# Builds a sparse document-term matrix from the stop-word-filtered text, with the
# vocabulary capped at 5000 features.
# NOTE(review): the vectorizer is fitted on every row of df_merged, i.e. before the
# train/valid/test split made further down, so the vocabulary and IDF weights are
# also derived from rows that later become the test set — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(df_merged['text_cleaned'])

In [18]:
# Preview the frame with the new `text` and `text_cleaned` columns
df_merged.head()

Unnamed: 0,seq_id,medline_ui,mesh_terms,title,publication_type,abstract,author,source,sensitive,relevance_status,text,text_cleaned
0,1,87049087,Allied Health Personnel/*; Electric Countersho...,Refibrillation managed by EMT-Ds: incidence an...,JOURNAL ARTICLE.,Some patients converted from ventricular fibri...,Stults KR; Brown DD.,Am J Emerg Med 8703; 4(6):491-5,0,,Refibrillation managed by EMT-Ds: incidence an...,Refibrillation managed EMT-Ds: incidence outco...
1,1,87049088,"Antidepressive Agents, Tricyclic/*PO; Arrhythm...",Tricyclic antidepressant overdose: emergency d...,JOURNAL ARTICLE.,There is controversy regarding the appropriate...,Foulke GE; Albertson TE; Walby WF.,Am J Emerg Med 8703; 4(6):496-500,0,,Tricyclic antidepressant overdose: emergency d...,Tricyclic antidepressant overdose: emergency d...
2,1,87049089,Adult; Aircraft/*; Altitude/*; Blood Gas Monit...,Transconjunctival oxygen monitoring as a predi...,JOURNAL ARTICLE.,As the use of helicopters for air transport of...,Shufflebarger C; Jehle D; Cottington E; Martin M.,Am J Emerg Med 8703; 4(6):501-3,0,,Transconjunctival oxygen monitoring as a predi...,Transconjunctival oxygen monitoring predictor ...
3,1,87049090,Adolescence; Adult; Aged; Blood Glucose/*ME; D...,Serum glucose changes after administration of ...,JOURNAL ARTICLE.,A prospective clinical trial was conducted to ...,Adler PM.,Am J Emerg Med 8703; 4(6):504-6,0,,Serum glucose changes after administration of ...,Serum glucose changes administration 50% dextr...
4,1,87049092,"Aged; Aged, 80 and over; Case Report; Female; ...",Nasogastric intubation: morbidity in an asympt...,JOURNAL ARTICLE.,An unusual case of a misdirected nasogastric t...,Gough D; Rust D.,Am J Emerg Med 8703; 4(6):511-3,0,,Nasogastric intubation: morbidity in an asympt...,Nasogastric intubation: morbidity asymptomatic...


In [19]:
# Drop the raw text columns now that `text_cleaned` exists.
# errors="ignore" keeps the cell idempotent: re-running it after the columns are
# already gone no longer raises a KeyError.
df_merged = df_merged.drop(columns=["title", "abstract", "text"], errors="ignore")

In [20]:
# Rows that carry a relevance label form the test set; all remaining rows
# are kept for the train/validation split
is_judged = df_merged['relevance_status'].notna()
test_set = df_merged[is_judged]
train_valid_set = df_merged[~is_judged]

In [21]:
# 85/15 train/validation split of the un-judged rows. The shuffle is seeded with
# the notebook-wide SEED constant instead of a second hard-coded 42, so changing
# the seed in the config cell affects this split as well.
train_set, valid_set = train_test_split(train_valid_set, test_size=0.15, random_state=SEED)


In [22]:
# Display sizes of the splits
split_shapes = {
    "Train Set Size:": train_set.shape,
    "Validation Set Size:": valid_set.shape,
    "Test Set Size:": test_set.shape,
}
for caption, shape in split_shapes.items():
    print(caption, shape)

Train Set Size: (284013, 9)
Validation Set Size: (50121, 9)
Test Set Size: (16140, 9)


In [33]:
import scipy.sparse

# The config cell only creates ./data, ./models and ./results; make sure the
# folder actually written to exists so the saves below cannot fail on a fresh checkout.
os.makedirs('../data/processed', exist_ok=True)

# Step 5: Save the splits for modeling
# The DataFrame index is written as the first CSV column (to_csv default).
# NOTE(review): presumably this index is what maps a row back to its row in the
# TF-IDF matrix saved below — confirm before relying on it.
train_set.to_csv('../data/processed/train_set.csv')
valid_set.to_csv('../data/processed/valid_set.csv')
test_set.to_csv('../data/processed/test_set.csv')

# Step 6: Save TF-IDF features (if required for later use)
scipy.sparse.save_npz('../data/processed/tfidf_features.npz', X_tfidf)


In [34]:
# Report the dimensions and the concrete class of the TF-IDF matrix
for caption, value in (("Shape of the matrix:", X_tfidf.shape), ("Type of the matrix:", type(X_tfidf))):
    print(caption, value)


Shape of the matrix: (350274, 5000)
Type of the matrix: <class 'scipy.sparse._csr.csr_matrix'>


In [36]:
# Read the sparse TF-IDF matrix back from disk, replacing the in-memory X_tfidf
X_tfidf = scipy.sparse.load_npz('../data/processed/tfidf_features.npz')


In [35]:
# Load the .npz file
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open; using it as a context manager closes the handle once the keys are listed.
with np.load('../data/processed/tfidf_features.npz') as data:
    # List all keys in the .npz file
    print("Keys in the .npz file:", data.keys())

Keys in the .npz file: KeysView(NpzFile '../data/processed/tfidf_features.npz' with keys: indices, indptr, format, shape, data)


## Tests

In [5]:
# load rdf
# g = Graph()
# g.parse("../data/raw/ohsumed/mesh/mesh2025.nt/filtered.nt", format="nt") 

<Graph identifier=N9b144e94a7a0473780dba4f8dd538955 (<class 'rdflib.graph.Graph'>)>

In [7]:
"""

query = "
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>

SELECT DISTINCT ?descriptor ?label ?treeNum
WHERE {
  
  ?descriptor meshv:treeNumber ?treeNum .
  ?descriptor rdfs:label ?label .
  
}
ORDER BY ?treeNum
"
# ?descriptor rdf:type meshv:TopicalDescriptor .
# FILTER (STRSTARTS(?treeNum, "C12.050") || STRSTARTS(?treeNum, "C12.200"))

results = g.query(query)
print(results)

for row in results:
    print(f"Tree Number: {row['treeNum']}, Label: {row['label']}")

    """

<rdflib.plugins.sparql.processor.SPARQLResult object at 0x000001A57A3A8A40>


In [13]:
"""
# Base URL for SPARQL
sparql_url = "https://id.nlm.nih.gov/mesh/sparql"

# SPARQL query to get all descriptors under C12.050
sparql_query = "
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
PREFIX mesh2025: <http://id.nlm.nih.gov/mesh/2025/>
PREFIX mesh2024: <http://id.nlm.nih.gov/mesh/2024/>
PREFIX mesh2023: <http://id.nlm.nih.gov/mesh/2023/>

SELECT DISTINCT ?descriptor ?label ?treeNum
WHERE {
  ?descriptor rdf:type meshv:TopicalDescriptor .
  ?descriptor meshv:treeNumber ?treeNum .
  ?descriptor rdfs:label ?label .
}
"

# FILTER (CONTAINS(?treeNum, "C12.050") || CONTAINS(?treeNum, "C12.200"))
# Function to query SPARQL endpoint
def query_sparql(query):
    params = {'query': query, 'format': 'application/json'}
    response = requests.get(sparql_url, params=params)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed SPARQL query: {response.status_code}")
        return None

# Execute the query
sparql_data = query_sparql(sparql_query)

# Process and print results
if sparql_data:
    for result in sparql_data['results']['bindings']:
        descriptor = result['descriptor']['value']
        label = result['label']['value']
        print(f"Descriptor: {label} ({descriptor})")

"""

'\n# Base URL for SPARQL\nsparql_url = "https://id.nlm.nih.gov/mesh/sparql"\n\n# SPARQL query to get all descriptors under C12.050\nsparql_query = "\nPREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\nPREFIX owl: <http://www.w3.org/2002/07/owl#>\nPREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>\nPREFIX mesh: <http://id.nlm.nih.gov/mesh/>\nPREFIX mesh2025: <http://id.nlm.nih.gov/mesh/2025/>\nPREFIX mesh2024: <http://id.nlm.nih.gov/mesh/2024/>\nPREFIX mesh2023: <http://id.nlm.nih.gov/mesh/2023/>\n\nSELECT DISTINCT ?descriptor ?label ?treeNum\nWHERE {\n  ?descriptor rdf:type meshv:TopicalDescriptor .\n  ?descriptor meshv:treeNumber ?treeNum .\n  ?descriptor rdfs:label ?label .\n}\n"\n\n# FILTER (CONTAINS(?treeNum, "C12.050") || CONTAINS(?treeNum, "C12.200"))\n# Function to query SPARQL endpoint\ndef query_sparql(query):\n    params = {\'query\': query, \'format\': \'application/