In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import spacy
import torch
from radgraph import RadGraph
from rich import print
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Spacy's English tokenizer
nlp = spacy.load("en_core_web_sm")

# Initialize RadGraph
radgraph = RadGraph()

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


model_type not provided, defaulting to radgraph-xl


  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


In [2]:
text_data_path = (
    "C:/Users/DryLab/Desktop/ViLLA/RPN_MIMIC/mimic-cxr-reports-preprocessed"
)

In [3]:
import nltk

nltk.download("punkt")
nltk.download("punkt_tab")


def filter_sentences_by_absent_tokens(radgraph_output):
    remaining_sentences = []

    radgraph_output = radgraph(radgraph_output)

    # Iterate over RadGraph output
    for _, description in radgraph_output.items():
        text = description.get("text", "")
        entities = description.get("entities", {})

        # Step 1: Split the text into sentences
        sentences = nltk.tokenize.sent_tokenize(text)

        # Step 2: Create a mapping of word indices to sentence indices
        word_to_sentence = {}
        current_word_idx = 0

        for i, sentence in enumerate(sentences):
            sentence_words = nltk.tokenize.word_tokenize(sentence)
            for _ in sentence_words:
                word_to_sentence[current_word_idx] = i
                current_word_idx += 1

        # Step 3: Create a flag to keep or discard each sentence
        sentence_flags = [True] * len(sentences)

        # Step 4: Iterate over entities and check for tokens labeled "Observation::definitely absent"
        for _, entity_info in entities.items():
            label = entity_info.get("label")
            start_ix = entity_info.get("start_ix")

            # Check if the entity is labeled as "Observation::definitely absent"
            if label == "Observation::definitely absent":
                # Step 5: Find which sentence contains the token based on start and end indices
                sentence_idx = word_to_sentence.get(start_ix, None)
                if sentence_idx is not None:
                    sentence_flags[sentence_idx] = False

        # Step 6: Collect the remaining sentences
        for i, sentence in enumerate(sentences):
            if sentence_flags[i]:
                remaining_sentences.append(sentence)

    return remaining_sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DryLab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DryLab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
def process_directory(main_dir):
    data = []

    for subdir, _, files in os.walk(main_dir):
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(subdir, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read()
                    data.append(
                        {
                            "parent_folder": os.path.basename(os.path.dirname(subdir)),
                            "subfolder": os.path.splitext(file)[0],
                            "text_path": os.path.relpath(file_path, start=main_dir),
                            "text_content": text,
                        }
                    )

    df = pd.DataFrame(data)
    return df

In [5]:
df = process_directory(text_data_path)

In [6]:
df

Unnamed: 0,parent_folder,subfolder,text_path,text_content
0,p10,s50414267,files\p10\p10000032\s50414267.txt,"There is no focal consolidation, pleural effus..."
1,p10,s53189527,files\p10\p10000032\s53189527.txt,"The cardiac, mediastinal and hilar contours ar..."
2,p10,s53911762,files\p10\p10000032\s53911762.txt,Single frontal view of the chest provided. Th...
3,p10,s56699142,files\p10\p10000032\s56699142.txt,"The lungs are clear of focal consolidation, pl..."
4,p10,s57375967,files\p10\p10000764\s57375967.txt,PA and lateral views of the chest provided. ...
...,...,...,...,...
11698,p10,s56459556,files\p10\p10999395\s56459556.txt,The tip of the endotracheal tube projects over...
11699,p10,s57060480,files\p10\p10999395\s57060480.txt,"There is right PICC line, an right IJ central ..."
11700,p10,s52257272,files\p10\p10999512\s52257272.txt,The lungs are well expanded and clear. There ...
11701,p10,s52341872,files\p10\p10999737\s52341872.txt,PA and lateral views of the chest provided. L...


In [7]:
df["new_text_content"] = df["text_content"].apply(filter_sentences_by_absent_tokens)

In [8]:
df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,new_text_content
0,p10,s50414267,files\p10\p10000032\s50414267.txt,"There is no focal consolidation, pleural effus...",[Bilateral nodular opacities that most likely ...
1,p10,s53189527,files\p10\p10000032\s53189527.txt,"The cardiac, mediastinal and hilar contours ar...","[The cardiac , mediastinal and hilar contours ..."
2,p10,s53911762,files\p10\p10000032\s53911762.txt,Single frontal view of the chest provided. Th...,"[Single frontal view of the chest provided ., ..."
3,p10,s56699142,files\p10\p10000032\s56699142.txt,"The lungs are clear of focal consolidation, pl...","[The heart size is normal ., The mediastinal c..."
4,p10,s57375967,files\p10\p10000764\s57375967.txt,PA and lateral views of the chest provided. ...,"[PA and lateral views of the chest provided .,..."
...,...,...,...,...,...
11698,p10,s56459556,files\p10\p10999395\s56459556.txt,The tip of the endotracheal tube projects over...,[The tip of the endotracheal tube projects ove...
11699,p10,s57060480,files\p10\p10999395\s57060480.txt,"There is right PICC line, an right IJ central ...","[There is right PICC line , an right IJ centra..."
11700,p10,s52257272,files\p10\p10999512\s52257272.txt,The lungs are well expanded and clear. There ...,"[The lungs are well expanded and clear ., The ..."
11701,p10,s52341872,files\p10\p10999737\s52341872.txt,PA and lateral views of the chest provided. L...,"[PA and lateral views of the chest provided .,..."


In [9]:
import nltk

# Ensure you have the NLTK sentence tokenizer models
nltk.download("punkt")


def preprocess_sentences(data):
    preprocessed_sentences = []

    for sentence in data:
        # Convert to lowercase
        sentence = sentence.lower()

        # Tokenize into sentences (in this case, each string is already a sentence, so this is a safeguard)
        sentences = nltk.tokenize.sent_tokenize(sentence)

        # Append the processed sentences to the list
        preprocessed_sentences.extend(sentences)

    text = " ".join(preprocessed_sentences)

    return text


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DryLab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
df["new_text_content"] = df["new_text_content"].apply(preprocess_sentences)

In [11]:
df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,new_text_content
0,p10,s50414267,files\p10\p10000032\s50414267.txt,"There is no focal consolidation, pleural effus...",bilateral nodular opacities that most likely r...
1,p10,s53189527,files\p10\p10000032\s53189527.txt,"The cardiac, mediastinal and hilar contours ar...","the cardiac , mediastinal and hilar contours a..."
2,p10,s53911762,files\p10\p10000032\s53911762.txt,Single frontal view of the chest provided. Th...,single frontal view of the chest provided . th...
3,p10,s56699142,files\p10\p10000032\s56699142.txt,"The lungs are clear of focal consolidation, pl...",the heart size is normal . the mediastinal con...
4,p10,s57375967,files\p10\p10000764\s57375967.txt,PA and lateral views of the chest provided. ...,pa and lateral views of the chest provided . t...
...,...,...,...,...,...
11698,p10,s56459556,files\p10\p10999395\s56459556.txt,The tip of the endotracheal tube projects over...,the tip of the endotracheal tube projects over...
11699,p10,s57060480,files\p10\p10999395\s57060480.txt,"There is right PICC line, an right IJ central ...","there is right picc line , an right ij central..."
11700,p10,s52257272,files\p10\p10999512\s52257272.txt,The lungs are well expanded and clear. There ...,the lungs are well expanded and clear . the ca...
11701,p10,s52341872,files\p10\p10999737\s52341872.txt,PA and lateral views of the chest provided. L...,pa and lateral views of the chest provided . l...


In [12]:
df = df[
    df["new_text_content"].apply(lambda x: isinstance(x, str) and len(x.split()) >= 2)
]

In [13]:
def save_preprocessed_text(df, output_dir):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        # Construct the output file path
        subfolder_path = os.path.join(output_dir, row["parent_folder"])
        os.makedirs(subfolder_path, exist_ok=True)
        output_file_path = os.path.join(subfolder_path, f"{row['subfolder']}.txt")

        # Write the CONTENT to the new text file
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(row["new_text_content"])


output_text_directory = "./mimic-cxr-reports-preprocessed-villa/p10"
save_preprocessed_text(df, output_text_directory)

In [15]:
index = 0
findings = df.iloc[index]["new_text_content"]
annotation = radgraph([findings])
print(f"Findings: {findings}")

In [16]:
annotation

{'0': {'text': 'bilateral nodular opacities that most likely represent nipple shadows . the cardiomediastinal silhouette is normal . clips project over the left lung , potentially within the breast . the imaged upper abdomen is unremarkable . chronic deformity of the posterior left sixth and seventh ribs are noted .',
  'entities': {'1': {'tokens': 'bilateral',
    'label': 'Anatomy::definitely present',
    'start_ix': 0,
    'end_ix': 0,
    'relations': []},
   '2': {'tokens': 'nodular',
    'label': 'Observation::definitely present',
    'start_ix': 1,
    'end_ix': 1,
    'relations': [['modify', '3']]},
   '3': {'tokens': 'opacities',
    'label': 'Observation::definitely present',
    'start_ix': 2,
    'end_ix': 2,
    'relations': [['located_at', '1']]},
   '4': {'tokens': 'nipple',
    'label': 'Anatomy::definitely present',
    'start_ix': 7,
    'end_ix': 7,
    'relations': []},
   '5': {'tokens': 'shadows',
    'label': 'Anatomy::definitely present',
    'start_ix': 8,
  

In [17]:
def extract_entities(textual_descriptions):
    extracted_entities = []

    textual_descriptions = radgraph(textual_descriptions)

    for _, description in textual_descriptions.items():
        # Extract entities using RadGraph
        entities = description.get("entities", {})

        # Filter entities that are 'Definitely Present'
        for _, entities_label in entities.items():
            label = entities_label.get("label")
            tokens = entities_label.get("tokens")
            if label == "Observation::definitely present":
                extracted_entities.append(tokens)

    return extracted_entities


textual_descriptions = findings
entities = extract_entities(textual_descriptions)

In [18]:
entities

['nodular',
 'opacities',
 'normal',
 'clips',
 'within',
 'unremarkable',
 'chronic',
 'deformity']

In [19]:
def filter_nouns(entities):
    noun_entities = []

    for entity in entities:
        # Tokenize the entity using Spacy
        doc = nlp(entity)
        for token in doc:
            # Filter out tokens that are nouns
            if token.pos_ == "NOUN":
                # Append the noun to the list of noun entities
                noun_entities.append(token.text)

    return noun_entities


noun_entities = filter_nouns(entities)

In [20]:
noun_entities

['opacities', 'clips', 'deformity']

In [21]:
df["tokens"] = df["new_text_content"].apply(extract_entities)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tokens"] = df["new_text_content"].apply(extract_entities)


In [22]:
df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,new_text_content,tokens
0,p10,s50414267,files\p10\p10000032\s50414267.txt,"There is no focal consolidation, pleural effus...",bilateral nodular opacities that most likely r...,"[nodular, opacities, normal, clips, within, un..."
1,p10,s53189527,files\p10\p10000032\s53189527.txt,"The cardiac, mediastinal and hilar contours ar...","the cardiac , mediastinal and hilar contours a...","[normal, normal, clear, multiple, clips, remot..."
2,p10,s53911762,files\p10\p10000032\s53911762.txt,Single frontal view of the chest provided. Th...,single frontal view of the chest provided . th...,"[normal, multiple, clips, remote, fractures]"
3,p10,s56699142,files\p10\p10000032\s56699142.txt,"The lungs are clear of focal consolidation, pl...",the heart size is normal . the mediastinal con...,"[normal, normal, multiple surgical clips, old,..."
4,p10,s57375967,files\p10\p10000764\s57375967.txt,PA and lateral views of the chest provided. ...,pa and lateral views of the chest provided . t...,"[adequately, aerated, focal, consolidation, mi..."
...,...,...,...,...,...,...
11698,p10,s56459556,files\p10\p10999395\s56459556.txt,The tip of the endotracheal tube projects over...,the tip of the endotracheal tube projects over...,"[tip, endotracheal tube, two tubes, low, diffu..."
11699,p10,s57060480,files\p10\p10999395\s57060480.txt,"There is right PICC line, an right IJ central ...","there is right picc line , an right ij central...","[picc line, central line, tips, shallow inspir..."
11700,p10,s52257272,files\p10\p10999512\s52257272.txt,The lungs are well expanded and clear. There ...,the lungs are well expanded and clear . the ca...,"[well, expanded, clear, unremarkable]"
11701,p10,s52341872,files\p10\p10999737\s52341872.txt,PA and lateral views of the chest provided. L...,pa and lateral views of the chest provided . l...,"[clear, normal, stable]"


In [23]:
df["noun_tokens"] = df["tokens"].apply(filter_nouns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["noun_tokens"] = df["tokens"].apply(filter_nouns)


In [24]:
df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,new_text_content,tokens,noun_tokens
0,p10,s50414267,files\p10\p10000032\s50414267.txt,"There is no focal consolidation, pleural effus...",bilateral nodular opacities that most likely r...,"[nodular, opacities, normal, clips, within, un...","[opacities, clips, deformity]"
1,p10,s53189527,files\p10\p10000032\s53189527.txt,"The cardiac, mediastinal and hilar contours ar...","the cardiac , mediastinal and hilar contours a...","[normal, normal, clear, multiple, clips, remot...",[clips]
2,p10,s53911762,files\p10\p10000032\s53911762.txt,Single frontal view of the chest provided. Th...,single frontal view of the chest provided . th...,"[normal, multiple, clips, remote, fractures]",[clips]
3,p10,s56699142,files\p10\p10000032\s56699142.txt,"The lungs are clear of focal consolidation, pl...",the heart size is normal . the mediastinal con...,"[normal, normal, multiple surgical clips, old,...",[clips]
4,p10,s57375967,files\p10\p10000764\s57375967.txt,PA and lateral views of the chest provided. ...,pa and lateral views of the chest provided . t...,"[adequately, aerated, focal, consolidation, mi...","[consolidation, engorgement, calcifications, c..."
...,...,...,...,...,...,...,...
11698,p10,s56459556,files\p10\p10999395\s56459556.txt,The tip of the endotracheal tube projects over...,the tip of the endotracheal tube projects over...,"[tip, endotracheal tube, two tubes, low, diffu...","[tip, tube, tubes, opacities, opacity, opacifi..."
11699,p10,s57060480,files\p10\p10999395\s57060480.txt,"There is right PICC line, an right IJ central ...","there is right picc line , an right ij central...","[picc line, central line, tips, shallow inspir...","[line, line, tips, inspiration, opacities, fin..."
11700,p10,s52257272,files\p10\p10999512\s52257272.txt,The lungs are well expanded and clear. There ...,the lungs are well expanded and clear . the ca...,"[well, expanded, clear, unremarkable]",[]
11701,p10,s52341872,files\p10\p10999737\s52341872.txt,PA and lateral views of the chest provided. L...,pa and lateral views of the chest provided . l...,"[clear, normal, stable]",[]


In [25]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, shuffle=True)

val_df, test_df = train_test_split(
    temp_df, test_size=0.6667, random_state=42, shuffle=True
)

print(
    f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}"
)

In [26]:
train_df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,new_text_content,tokens,noun_tokens
1619,p10,s55439095,files\p10\p10144083\s55439095.txt,Apparent widening of the mediastinum is likely...,apparent widening of the mediastinum is likely...,"[apparent, decreased, accentuating, mild, atel...",[atelectasis]
2750,p10,s51097181,files\p10\p10242587\s51097181.txt,Frontal supine portable radiograph of the ches...,frontal supine portable radiograph of the ches...,"[mild, enlargement, stable, atelectasis]","[enlargement, atelectasis]"
3471,p10,s59925089,files\p10\p10292285\s59925089.txt,There is mild bibasilar atelectasis. The hear...,there is mild bibasilar atelectasis . the hear...,"[mild, atelectasis, mildly, enlarged, tube, co...","[atelectasis, tube, coils, tip, endotracheal, ..."
5148,p10,s55101999,files\p10\p10439374\s55101999.txt,Portable semi-erect chest radiograph ___ at 04...,portable semi - erect chest radiograph ___ at ...,"[picc, line, unchanged, position, stably, enla...","[picc, line, position]"
3525,p10,s53942022,files\p10\p10296357\s53942022.txt,The heart size is top normal with tortuosity o...,the heart size is top normal with tortuosity o...,"[top normal, tortuosity, stable, s - shaped, s...","[tortuosity, scoliosis, scoliosis]"
...,...,...,...,...,...,...,...
11352,p10,s55130237,files\p10\p10971699\s55130237.txt,"Since ___, there has been interval placement o...","since ___ , there has been interval placement ...","[dual - chamber, icd, leads, intact, severe, c...","[chamber, chamber]"
5219,p10,s58107350,files\p10\p10446182\s58107350.txt,The lungs are clear without consolidation or e...,the cardiomediastinal silhouette is normal . a...,"[normal, picc, picc]","[picc, picc]"
5423,p10,s59081890,files\p10\p10461137\s59081890.txt,Airspace opacity is seen overlying the right u...,airspace opacity is seen overlying the right u...,"[opacity, prominence, edema, low, blunting, re...","[opacity, prominence, edema, opacity, patient,..."
861,p10,s54213212,files\p10\p10069423\s54213212.txt,The patient is status post intubation with an ...,the patient is status post intubation with an ...,"[endotracheal tube, ng tube, large, dense, con...","[tube, tube, consolidation, consolidation, opa..."


In [27]:
val_df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,new_text_content,tokens,noun_tokens
8693,p10,s55797364,files\p10\p10745635\s55797364.txt,Frontal and lateral views of the chest. Heart...,frontal and lateral views of the chest . heart...,"[normal, opacity, small, bochdalek, hernia, in...","[opacity, bochdalek, bochdalek]"
5172,p10,s58092349,files\p10\p10440642\s58092349.txt,Frontal and lateral views of the chest were ob...,frontal and lateral views of the chest were ob...,"[persistent, opacity, blunted, enlarged, calci...","[opacity, riding, changes, disease, changes, o..."
7183,p10,s52666436,files\p10\p10611684\s52666436.txt,There is slight elevation of the right hemidia...,there is slight elevation of the right hemidia...,"[slight, elevation, unremarkable, slight, elev...","[elevation, elevation]"
3592,p10,s53038472,files\p10\p10301415\s53038472.txt,New right-sided PICC line with the tip in the ...,new right - sided picc line with the tip in th...,"[new, picc line, tip, opacity, decreased in ex...","[line, tip, opacity, extent, asymmetric, edema..."
3725,p10,s53981461,files\p10\p10312715\s53981461.txt,Frontal and lateral views of the chest are obt...,frontal and lateral views of the chest are obt...,"[prominent, shadow, small rounded opacity, sub...","[shadow, opacity, opacity, opacity]"
...,...,...,...,...,...,...,...
2301,p10,s52654331,files\p10\p10203607\s52654331.txt,The heart size is normal. The hilar and media...,the heart size is normal . the hilar and media...,"[normal, normal, unremarkable]",[]
169,p10,s50435883,files\p10\p10011607\s50435883.txt,Cardiac silhouette size is mildly enlarged. Th...,cardiac silhouette size is mildly enlarged . t...,"[mildly, enlarged, mildly, tortuous, unchanged...","[limits, loss, height]"
10899,p10,s54345016,files\p10\p10932783\s54345016.txt,Heart size remains mildly enlarged. Mediastin...,heart size remains mildly enlarged . mediastin...,"[mildly, enlarged, normal, normal, clear]",[]
2700,p10,s57018667,files\p10\p10233650\s57018667.txt,The lungs are clear. The cardiomediastinal sil...,the lungs are clear . the cardiomediastinal si...,"[clear, normal]",[]


In [28]:
test_df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,new_text_content,tokens,noun_tokens
3728,p10,s59478788,files\p10\p10312715\s59478788.txt,The lungs are clear. The heart size is normal...,the lungs are clear . the heart size is normal...,"[clear, normal, slightly, increased, normal]",[]
9059,p10,s55866250,files\p10\p10773382\s55866250.txt,Patient's condition required examination in si...,patient ' s condition required examination in ...,"[multiple surgical clips, bypass surgery, mild...","[clips, bypass, surgery, enlargement, position..."
767,p10,s53765730,files\p10\p10060204\s53765730.txt,Dual lead left-sided AICD is stable in positio...,dual lead left - sided aicd is stable in posit...,"[dual lead, aicd, stable, prominence, markings...","[lead, prominence, markings, prominence, promi..."
11428,p10,s50313341,files\p10\p10978636\s50313341.txt,Nasoenteric tube is seen in the region of the ...,nasoenteric tube is seen in the region of the ...,"[nasoenteric, tube, normal, minimal, congestio...","[tube, congestion, tube, tip]"
11585,p10,s56501444,files\p10\p10992808\s56501444.txt,Mild prominence of the cardiac silhouette is l...,mild prominence of the cardiac silhouette is l...,"[mild, prominence, low, unremarkable]",[prominence]
...,...,...,...,...,...,...,...
10682,p10,s56348856,files\p10\p10914703\s56348856.txt,Again seen is a consolidation in the right mid...,again seen is a consolidation in the right mid...,"[consolidation, unchanged, pneumonia, pneumonia]","[consolidation, pneumonia, pneumonia]"
1196,p10,s56033097,files\p10\p10104732\s56033097.txt,"Cardiac, mediastinal and hilar contours are no...","cardiac , mediastinal and hilar contours are n...","[normal, normal]",[]
6140,p10,s50443876,files\p10\p10529674\s50443876.txt,PA and lateral views of the chest were reviewe...,pa and lateral views of the chest were reviewe...,"[normal, mild, thickening, stable, hazy, opaci...","[opacification, opacity]"
9427,p10,s54790585,files\p10\p10803276\s54790585.txt,Previous small right-sided pleural effusion is...,previous small right - sided pleural effusion ...,"[small, effusion, trace, clear, unremarkable, ...","[effusion, trace, effusion]"


In [29]:
all_noun_tokens = [token for sublist in train_df["noun_tokens"] for token in sublist]

In [30]:
noun_token_counts = Counter(all_noun_tokens)

In [31]:
token = noun_token_counts.most_common(50)

In [32]:
attributes = []

for i in range(0, 50):
    attributes.append(token[i][0])

In [33]:
attributes

['atelectasis',
 'opacity',
 'effusion',
 'tube',
 'opacities',
 'edema',
 'tip',
 'effusions',
 'limits',
 'congestion',
 'position',
 'changes',
 'clips',
 'line',
 'sternotomy',
 'opacification',
 'pneumonia',
 'port',
 'calcifications',
 'consolidation',
 'pneumothorax',
 'wires',
 'picc',
 'endotracheal',
 'improvement',
 'prominence',
 'elevation',
 'markings',
 'increase',
 'enlargement',
 'density',
 'size',
 'pacemaker',
 'cath',
 'loss',
 'interval',
 'disease',
 'collapse',
 'deformity',
 'place',
 'compression',
 'air',
 'calcification',
 'hardware',
 'catheter',
 'trace',
 'device',
 'et',
 'lead',
 'aeration']

In [282]:
# Attributes got from running text_embeddings.py (same code as the cells above)

attributes = []

attributes = np.loadtxt(
    "C:/Users/DryLab/Desktop/ViLLA/RPN_MIMIC/top_50_attributes_final.txt",
    delimiter=",",
    dtype=str,
)

In [36]:
# et = Endotracheal Tube

attributes

['atelectasis',
 'opacity',
 'effusion',
 'tube',
 'opacities',
 'edema',
 'tip',
 'effusions',
 'limits',
 'congestion',
 'position',
 'changes',
 'clips',
 'line',
 'sternotomy',
 'opacification',
 'pneumonia',
 'port',
 'calcifications',
 'consolidation',
 'pneumothorax',
 'wires',
 'picc',
 'endotracheal',
 'improvement',
 'prominence',
 'elevation',
 'markings',
 'increase',
 'enlargement',
 'density',
 'size',
 'pacemaker',
 'cath',
 'loss',
 'interval',
 'disease',
 'collapse',
 'deformity',
 'place',
 'compression',
 'air',
 'calcification',
 'hardware',
 'catheter',
 'trace',
 'device',
 'et',
 'lead',
 'aeration']

In [34]:
def split_into_sentences(text):
    # Tokenize the text into sentences
    return nltk.tokenize.sent_tokenize(text)

In [36]:
train_df["sentences"] = train_df["new_text_content"].apply(split_into_sentences)

In [48]:
train_df

Unnamed: 0,parent_folder,subfolder,text_path,text_content,tokens,noun_tokens,sentences
1619,p10,s55439095,p10\p10144083\s55439095.txt,apparent widening of the mediastinum is likely...,"[apparent, decreased, accentuating, mild, atel...",[atelectasis],[apparent widening of the mediastinum is likel...
2750,p10,s51097181,p10\p10242587\s51097181.txt,frontal supine portable radiograph of the ches...,"[mild, enlargement, stable, atelectasis]","[enlargement, atelectasis]",[frontal supine portable radiograph of the che...
3471,p10,s59925089,p10\p10292285\s59925089.txt,there is mild bibasilar atelectasis . the hear...,"[mild, atelectasis, mildly, enlarged, tube, co...","[atelectasis, tube, coils, tip, endotracheal, ...","[there is mild bibasilar atelectasis ., the he..."
5148,p10,s55101999,p10\p10439374\s55101999.txt,portable semi - erect chest radiograph ___ at ...,"[picc, line, unchanged, position, stably, enla...","[picc, line, position]",[portable semi - erect chest radiograph ___ at...
3525,p10,s53942022,p10\p10296357\s53942022.txt,the heart size is top normal with tortuosity o...,"[top normal, tortuosity, stable, s - shaped, s...","[tortuosity, scoliosis, scoliosis]",[the heart size is top normal with tortuosity ...
...,...,...,...,...,...,...,...
11352,p10,s55130237,p10\p10971699\s55130237.txt,"since ___ , there has been interval placement ...","[dual - chamber, icd, leads, intact, severe, c...","[chamber, chamber]","[since ___ , there has been interval placement..."
5219,p10,s58107350,p10\p10446182\s58107350.txt,the cardiomediastinal silhouette is normal . a...,"[normal, picc, picc]","[picc, picc]","[the cardiomediastinal silhouette is normal .,..."
5423,p10,s59081890,p10\p10461137\s59081890.txt,airspace opacity is seen overlying the right u...,"[opacity, prominence, edema, low, blunting, re...","[opacity, prominence, edema, opacity, patient,...",[airspace opacity is seen overlying the right ...
861,p10,s54213212,p10\p10069423\s54213212.txt,the patient is status post intubation with an ...,"[endotracheal tube, ng tube, large, dense, con...","[tube, tube, consolidation, consolidation, opa...",[the patient is status post intubation with an...


In [37]:
from collections import Counter

for attribute in attributes:
    # Identify sentences that contain the attribute from the "sentences" column
    sentences_with_attributes = []

    for sentence_list in train_df["sentences"]:
        if isinstance(sentence_list, list):
            for sentence in sentence_list:
                if attribute in sentence:
                    sentences_with_attributes.append(sentence)

    # Count the frequency of each sentence containing the attribute
    sentence_counter = Counter(sentences_with_attributes)

    # Get the 200 most frequent sentences
    most_common_sentences = [
        sentence for sentence, _ in sentence_counter.most_common(200)
    ]

    # Print the attribute and the two most common sentences for debugging
    print(f"Attribute: {attribute}")
    print(sentence_counter.most_common(5))

In [46]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

attribute_embeddings = []

model = SentenceTransformer("all-mpnet-base-v2", device=device)

for attribute in attributes:
    # Identify sentences that contain the attribute from the "sentences" column
    sentences_with_attributes = []

    for sentence_list in train_df["sentences"]:
        if isinstance(sentence_list, list):
            for sentence in sentence_list:
                if attribute in sentence:
                    sentences_with_attributes.append(sentence)

    # Count the frequency of each sentence containing the attribute
    sentence_counter = Counter(sentences_with_attributes)

    # Get the 200 most frequent sentences
    most_common_sentences = [
        sentence for sentence, _ in sentence_counter.most_common(200)
    ]

    with torch.inference_mode():
        if most_common_sentences:
            # Compute sentence embeddings
            sentence_embeddings = model.encode(
                most_common_sentences, show_progress_bar=False
            )

            # Average the embeddings
            average_embedding = sentence_embeddings.mean(axis=0)
            attribute_embeddings.append(average_embedding)
        else:
            print(f"No sentences found for attribute: {attribute}")

attribute_embeddings = np.stack(attribute_embeddings)
attribute_embeddings = torch.tensor(attribute_embeddings).to(device)
print(attribute_embeddings.shape)
attribute_embeddings = dict(zip(attributes, attribute_embeddings))

# Output the averaged embeddings
for attribute, embedding in attribute_embeddings.items():
    print(f"Shape: {embedding.shape}")
    print(f"Attribute: {attribute}")
    print(f"Embedding: {embedding[:10]}, dtype: {embedding.dtype}")

In [None]:
# import torch
# from transformers import AutoModel, AutoTokenizer

# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")


# # model = SentenceTransformer("all-MiniLM-L6-v2", device=device)


# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[
#         0
#     ]  # First element of model_output contains all token embeddings
#     input_mask_expanded = (
#         attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     )
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
#         input_mask_expanded.sum(1), min=1e-9
#     )


# attribute_embeddings = []

# for attribute in attributes:
#     # Identify sentences that contain the attribute
#     sentences_with_attributes = df[df["text_content"].str.contains(attribute)][
#         "text_content"
#     ].tolist()

#     # Filter the 200 most frequent sentences
#     sentence_counter = Counter(sentences_with_attributes)
#     most_common_sentences = [
#         sentence for sentence, count in sentence_counter.most_common(200)
#     ]

#     # Load model from HuggingFace Hub
#     tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
#     model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2").to(
#         device
#     )

#     # Tokenize sentences
#     encoded_input = tokenizer(
#         most_common_sentences,
#         padding=True,
#         truncation=True,
#         return_tensors="pt",
#     )

#     with torch.inference_mode():
#         if most_common_sentences:
#             # Compute sentence embeddings
#             sentence_embeddings = model(**encoded_input.to(device))

#             # Average the embeddings
#             average_embedding = mean_pooling(
#                 sentence_embeddings, encoded_input["attention_mask"]
#             )
#             # attribute_embeddings[attribute] = average_embedding
#             # sentence_embeddings = sentence_embeddings.mean(dim=0, keepdim=True)
#             # sentence_embeddings /= sentence_embeddings.norm(dim=-1, keepdim=True)
#             attribute_embeddings.append(average_embedding)
#         else:
#             print(f"No sentences found for attribute: {attribute}")

# attribute_embeddings = (
#     torch.stack(attribute_embeddings).squeeze().detach().cpu().numpy()
# )
# print(attribute_embeddings.shape)
# attribute_embeddings = dict(zip(attributes, attribute_embeddings))