# GPU Config

In [1]:
import torch

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"GPU is available. Using {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("GPU is not available. Using CPU.")

GPU is available. Using Tesla P100-PCIE-16GB


# Load mimic-cxr

In [3]:
cxr_file = '/kaggle/input/mimic-cxr-data/cleaned_data.csv'

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv(cxr_file)

# Get keywords from impression

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
def extract_keywords_from_impression(df, column_name='Impression', top_n=10):
    """
    Extracts important keywords from the specified column in the DataFrame using TF-IDF.

    Parameters:
    - df: DataFrame containing the text data.
    - column_name: The name of the column that contains the text (default is 'Impression').
    - top_n: The number of top keywords to extract per document (default is 10).

    Returns:
    - A list of keywords for each document in the column.
    """
    # Ensure there are no NaN values in the column
    df = df.dropna(subset=[column_name])

    # Get the stop words as a list
    stop_words = list(stopwords.words('english'))

    # Initialize the TF-IDF Vectorizer using the list of stop words
    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)

    # Fit and transform the 'Impression' column
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[column_name])

    # Get the feature names (i.e., words)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Function to extract top keywords for each document
    def extract_top_keywords(row):
        # Get the row as a dense array
        row_dense = row.toarray().flatten()
        # Get the indices of top N values (highest TF-IDF scores)
        top_indices = row_dense.argsort()[-top_n:][::-1]
        # Return the corresponding words
        return [feature_names[i] for i in top_indices]

    # Apply the function to each row in the TF-IDF matrix
    df['keywords'] = [extract_top_keywords(row) for row in tfidf_matrix]

    return df[['Impression', 'keywords']]

In [9]:
result_df = extract_keywords_from_impression(df)

In [10]:
result_df.head()

Unnamed: 0,Impression,keywords
0,No acute cardiopulmonary process.,"[process, cardiopulmonary, acute, enteric, ent..."
1,No acute cardiopulmonary abnormality.,"[abnormality, cardiopulmonary, acute, equivoca..."
2,No acute intrathoracic process.,"[intrathoracic, process, acute, emphysematous,..."
3,No acute cardiopulmonary process.,"[process, cardiopulmonary, acute, enteric, ent..."
4,"Focal consolidation at the left lung base, pos...","[representing, engorgement, possibly, central,..."


In [11]:
def get_unique_keywords(df, keywords_column='keywords'):
    """
    Extracts a list of unique keywords from the 'keywords' column in the DataFrame.

    Parameters:
    - df: DataFrame containing the keywords column.
    - keywords_column: The name of the column that contains the keywords list (default is 'keywords').

    Returns:
    - A list of unique keywords.
    """
    # Flatten the list of keywords from all rows into a single list
    all_keywords = [keyword for sublist in df[keywords_column] for keyword in sublist]
    
    # Remove duplicates by converting the list to a set and then back to a list
    unique_keywords = list(set(all_keywords))
    
    return unique_keywords

In [12]:
unique_keywords_list = get_unique_keywords(result_df)

In [13]:
len(unique_keywords_list)

1000

In [14]:
import re

In [15]:
filtered_keywords = [word for word in unique_keywords_list if len(word) >= 3 and not re.search(r'\d', word)]

In [16]:
len(filtered_keywords)

968

# Load mrconso

In [17]:
mrconso_path = '/kaggle/input/mrconso/MRCONSO.RRF'

In [18]:
columns = ['CUI', 'LAT', 'TS', 'LUI', 'STT', 'SUI', 'ISPREF', 'AUI', 'SAUI', 'SCUI', 
           'SDUI', 'SAB', 'TTY', 'CODE', 'STR', 'SRL', 'SUPPRESS', 'CVF']

In [19]:
mrconso_df = pd.read_csv(mrconso_path, sep='|', names=columns, header=None, dtype=str, index_col=False)

In [20]:
mrconso_df.head()

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,C0000005,ENG,P,L0000005,PF,S0007492,Y,A26634265,,M0019694,D012711,MSH,PEP,D012711,(131)I-Macroaggregated Albumin,0,N,256.0
1,C0000005,ENG,S,L0270109,PF,S0007491,Y,A26634266,,M0019694,D012711,MSH,ET,D012711,(131)I-MAA,0,N,256.0
2,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0
3,C0000039,ENG,P,L0000039,PF,S17175117,Y,A28572604,,,,MTH,PN,NOCODE,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0
4,C0000039,ENG,P,L0000039,VC,S0007564,Y,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",0,N,


In [21]:
mrconso_df.shape

(7048963, 18)

In [22]:
mrconso_df_filtered = mrconso_df[(mrconso_df['LAT'] == 'ENG') & (mrconso_df['SUPPRESS'] == 'N')]

In [23]:
mrconso_df_filtered.head()

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,C0000005,ENG,P,L0000005,PF,S0007492,Y,A26634265,,M0019694,D012711,MSH,PEP,D012711,(131)I-Macroaggregated Albumin,0,N,256.0
1,C0000005,ENG,S,L0270109,PF,S0007491,Y,A26634266,,M0019694,D012711,MSH,ET,D012711,(131)I-MAA,0,N,256.0
2,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0
3,C0000039,ENG,P,L0000039,PF,S17175117,Y,A28572604,,,,MTH,PN,NOCODE,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0
4,C0000039,ENG,P,L0000039,VC,S0007564,Y,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",0,N,


In [24]:
mrconso_df_filtered.shape

(5370585, 18)

In [25]:
ner_data = mrconso_df_filtered[['CUI', 'STR', 'SAB', 'TTY']]

In [26]:
ner_data.head()

Unnamed: 0,CUI,STR,SAB,TTY
0,C0000005,(131)I-Macroaggregated Albumin,MSH,PEP
1,C0000005,(131)I-MAA,MSH,ET
2,C0000039,"1,2-dipalmitoylphosphatidylcholine",RXNORM,IN
3,C0000039,"1,2-dipalmitoylphosphatidylcholine",MTH,PN
4,C0000039,"1,2-Dipalmitoylphosphatidylcholine",MSH,MH


In [27]:
ner_data.to_csv('ner_data.csv', index=False)

# Find rows with keywords

In [None]:
from tqdm import tqdm

In [None]:
tqdm.pandas()

In [None]:
import numpy as np

In [None]:
def find_first_keyword(text, keywords):
    for keyword in keywords:
        if re.search(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE):  # Word boundary to match whole words
            return keyword
    return None

In [None]:
fraction = 0.01
filtered_data = ner_data.sample(frac=fraction, random_state=1).copy()  # random_state for reproducibility

In [None]:
filtered_data['matched_keyword'] = filtered_data['STR'].progress_apply(lambda text: find_first_keyword(text, filtered_keywords))

In [None]:
filtered_data = filtered_data[filtered_data['matched_keyword'].notna()]

In [None]:
filtered_data.shape

In [None]:
filtered_data.head()

In [None]:
filtered_data.to_csv('filtered_data.csv', index=False)

In [30]:
filtered_data = pd.read_csv('/kaggle/input/filtered-data/filtered_data v2.csv')

In [31]:
filtered_data.head()

Unnamed: 0,CUI,STR,SAB,TTY,matched_keyword
0,C4482898,Complications during rehabilitation stay:Findi...,LNC,MTH_LN,finding
1,C3893828,down-regulation of proteolysis involved in cel...,GO,SY,process
2,C3808632,Irregular ruffled or jagged appearance at derm...,OMIM,PTCS,appearance
3,C3166082,PhenX measure - perceived social support - con...,LNC,LPDN,support
4,C3899334,Dual-Energy Contrast-Enhanced Digital Subtract...,NCI,SY,dual


# Model

In [28]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [29]:
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [30]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# Preprocess data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
filtered_data['input_text'] = "ner: " + filtered_data['STR']
filtered_data['target_text'] = filtered_data['TTY']

In [None]:
filtered_data.head()

In [None]:
filtered_data = filtered_data[['input_text', 'target_text']].dropna()

In [None]:
filtered_data.shape

In [None]:
train_df, val_df = train_test_split(filtered_data, test_size=0.1)

In [None]:
from datasets import Dataset

In [None]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [None]:
def tokenize_data(examples):
    # Ensure input_text and target_text are lists of strings
    input_text = examples['input_text'] if isinstance(examples['input_text'], list) else [examples['input_text']]
    target_text = examples['target_text'] if isinstance(examples['target_text'], list) else [examples['target_text']]
    
    # Tokenize inputs and labels
    input_encodings = tokenizer(input_text, truncation=True, padding='max_length', max_length=64)
    target_encodings = tokenizer(target_text, truncation=True, padding='max_length', max_length=64)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }


In [None]:
train_dataset = train_dataset.map(tokenize_data, batched=True)
val_dataset = val_dataset.map(tokenize_data, batched=True)

# Train

In [None]:
output_dir="/kaggle/working"

In [None]:
import os

In [None]:
model_output = os.path.join(output_dir, 'model')

In [None]:
os.mkdir(model_output)

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
training_args = TrainingArguments(
    output_dir=model_output,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    learning_rate=2e-3,
    warmup_steps=0,
    weight_decay=0.0,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    fp16=True,  # Enable mixed-precision
    save_steps=1000
)

In [None]:
for param in model.parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
trainer.train()

TrainOutput(global_step=1644, training_loss=0.06707047891787224, metrics={'train_runtime': 817.7436, 'train_samples_per_second': 16.08, 'train_steps_per_second': 2.01, 'total_flos': 222451168444416.0, 'train_loss': 0.06707047891787224, 'epoch': 1.0})

In [None]:
model.save_pretrained(model_output)

# Reload model

In [31]:
model_path = '/kaggle/input/ner-t5-v1/transformers/default/1'

In [32]:
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [33]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [34]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [35]:
def generate_tags(sentences):
    tags = []
    for sentence in sentences:
        # Prepare the input by formatting it as needed
        input_text = f"Generate tags : {sentence}"  # Customize the prompt if needed
        inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

        # Generate outputs
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=50)  # Adjust max_length based on your needs

        # Decode the generated tokens to text
        decoded_tags = tokenizer.decode(outputs[0], skip_special_tokens=True)
        tags.append(decoded_tags)

    return tags

In [36]:
example_sentences = [
    "The patient exhibited significant senile plaques in the cerebral cortex, indicative of Alzheimer's disease.",
    "A thoracentesis was performed to analyze the pleural fluid for any abnormalities.",
    "The count of mononuclear cells in the pleural fluid was higher than normal, suggesting infection.",
    "An elevated level of lactate dehydrogenase in the pleural fluid can indicate malignancy.",
    "The patient was diagnosed with Stage IV Pleural Malignant Mesothelioma, requiring urgent treatment.",
    "Testing for Aspergillus sp antibodies in the pleural fluid was done to rule out fungal infections.",
    "The doctor recommended the excision of a pleural lesion due to the patient's persistent symptoms.",
    "The child was hospitalized with severe pneumonia, requiring immediate medical intervention.",
    "The diagnosis of secondary organizing pneumonia was confirmed through imaging studies.",
    "Pneumocystis pneumonia is a serious condition often seen in immunocompromised patients.",
    "Recent outbreaks of mycoplasma pneumonia have been reported in pig farms.",
    "The patient’s symptoms aligned with idiopathic interstitial pneumonia, necessitating further evaluation.",
    "Exposure to certain allergens led to the development of allergic pneumonitis in the patient.",
    "The treatment plan was adjusted due to the development of Grade 3 pneumonitis as per CTCAE guidelines.",
    "Artificial pneumoperitoneum was induced during the laparoscopic procedure for better visibility."
]


In [37]:
related_tags = generate_tags(example_sentences)

In [38]:
for example_sentence, tag in zip(example_sentences, related_tags):
    print(f"Sentence: {example_sentence}\nTags: {tag}\n")

Sentence: The patient exhibited significant senile plaques in the cerebral cortex, indicative of Alzheimer's disease.
Tags: PTCS

Sentence: A thoracentesis was performed to analyze the pleural fluid for any abnormalities.
Tags: LPDN

Sentence: The count of mononuclear cells in the pleural fluid was higher than normal, suggesting infection.
Tags: PT

Sentence: An elevated level of lactate dehydrogenase in the pleural fluid can indicate malignancy.
Tags: PTCS

Sentence: The patient was diagnosed with Stage IV Pleural Malignant Mesothelioma, requiring urgent treatment.
Tags: PT

Sentence: Testing for Aspergillus sp antibodies in the pleural fluid was done to rule out fungal infections.
Tags: CN

Sentence: The doctor recommended the excision of a pleural lesion due to the patient's persistent symptoms.
Tags: LC

Sentence: The child was hospitalized with severe pneumonia, requiring immediate medical intervention.
Tags: PN

Sentence: The diagnosis of secondary organizing pneumonia was confir

In [None]:
df.head()

In [39]:
from tqdm import tqdm

In [40]:
fraction = 0.1
sampled_df = df.sample(frac=fraction, random_state=1).copy()

In [41]:
sampled_df['tags'] = ''

# Process each finding in the DataFrame
for index, row in tqdm(sampled_df.iterrows(), total=sampled_df.shape[0], desc="Processing Findings"):
    findings_text = row['Findings']
    
    # Split findings into sentences (assuming sentences are separated by periods)
    sentences = findings_text.split('. ')
    
    tags = []
    for i, sentence in enumerate(sentences):
        # Generate tags for the current sentence
        sentence_tags = generate_tags([sentence])
        
        # If tags are found, format them as line_number:tag
        if sentence_tags:
            tags.append(f"{i+1}:{', '.join(sentence_tags)}")
    
    # Combine tags into a single string and update the 'tags' column
    if tags:
        sampled_df.at[index, 'tags'] = ' '.join(tags)

Processing Findings: 100%|██████████| 12198/12198 [42:23<00:00,  4.80it/s]


In [42]:
sampled_df.head()

Unnamed: 0,Filename,Findings,Impression,tags
58700,s56524853.txt,PA and lateral chest radiographs were provided...,No evidence of intrathoracic metastatic diseae.,1:LC 2:PN 3:LPDN 4:CN
13635,s57070959.txt,AP upright and lateral views of the chest prov...,"Mild edema, mild to moderate bilateral pleural...",1:PN 2:PTCS 3:LPDN 4:LPDN 5:LPDN 6:CN
97812,s50400928.txt,"Since ___, extensive pulmonary edema is mildly...",1. Extensive pulmonary edema is mildly improve...,1:PTCS 2:LC 3:LPDN 4:LPDN 5:LPN 6:PT 7:LC 8:PT...
82980,s59658037.txt,The lung volumes are low. There is no pleural...,Prominence the right paratracheal stripe may b...,1:LPDN 2:PN 3:PTCS 4:PT
66438,s57503870.txt,Two views were obtained of the chest. The lun...,No acute intrathoracic process to explain the ...,1:LC 2:LPDN 3:LPDN 4:PT 5:CN


In [43]:
sampled_df.to_csv('sampled_findings_with_ner_tags')