# Data Preparation

In [1]:
import gcsfs
import json

gcs_file_system = gcsfs.GCSFileSystem(project="capstone")
gcs_json_path = "gs://radiology-data/report_CT.json"
with gcs_file_system.open(gcs_json_path) as f:
    data = json.load(f)

In [2]:
len(data)

246824

In [41]:
# Access specific information in the data
report_id = data['Report_42']['report_id']
modality = data['Report_42']['modality']
clinical_information = data['Report_42']['clinical_information']['clinical_information']
findings_group = data['Report_42']['findings_group']['findings']
impression_group = data['Report_42']['impression_group']['impression']

# Print the extracted information
print(f"Report ID: {report_id}",'\n')
print(f"Modality: {modality}",'\n')
print(f"Clinical Information: {clinical_information}",'\n')
print(f"Findings: {findings_group}",'\n')
print(f"Impression: {impression_group}",'\n')

Report ID: 42 

Modality: CT 

Clinical Information: . Evaluate for stricturing crohns disease in patient with ileal crohns disease status post resection in 2005 history: abdominal pain, decreased appetite, nausea. 

Findings: . No significant abnormality noted. Hepatic steatosis suggested. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. Right sided abdominal postsurgical sequela related to prior partial ileal resection. At level of postsurgical anastomosis, axial images 111 to 115 series 4/coronal image 58, mild to moderate distal ileal luminal narrowing and mild wall enhancement seen. However, no significant proximal bowel dilatation noted and appearance is not significantly changed from prior 2010 examination. May reflect underdistention or chronic changes or postsurgical sequela. Some areas of jejunum not well distended, making evaluation suboptimal. No signific

In [81]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

paragraph = data['Report_42']['findings_group']['findings']

# Tokenize the paragraph into sentences
sentences = sent_tokenize(paragraph)

# Create a set to store distinct sentences
distinct_sentences = set()

# Add distinct sentences to the set
for sentence in sentences:
    distinct_sentences.add(sentence.strip())

# Print the distinct sentences
distinct_sentences


[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'.',
 'At level of postsurgical anastomosis, axial images 111 to 115 series 4/coronal image 58, mild to moderate distal ileal luminal narrowing and mild wall enhancement seen.',
 'Hepatic steatosis suggested.',
 'However, no significant proximal bowel dilatation noted and appearance is not significantly changed from prior 2010 examination.',
 'May reflect underdistention or chronic changes or postsurgical sequela.',
 'Minimal fat stranding in ischioanal fossae alluded to on earlier exam without significant change.',
 'No significant abnormality noted.',
 'Right sided abdominal postsurgical sequela related to prior partial ileal resection.',
 'Small fat containing periumbilical hernia.',
 'Some areas of jejunum not well distended, making evaluation suboptimal.',
 'Visualized osseous structures without significant change.'}

In [32]:
text = 'history of lung cancer.. Progression of radiation changes and scarring of the right perihilar region. Increased small right pleural effusion. The reference right upper lobe nodule is partially obscured by the adjacent pleural effusion but measures approximately 1.9 x 1.4 cm , compared to 1.2 x 1.0 cm previously. Additional right lung nodules are increased in size. Stable left lower lobe interstitial opacity. The dominant right peritracheal mass measures 10.3 x 5.3 cm , compared to 8.5 x 4.9 cm previously. The mass encases the left brachiocephalic vein and narrows the SVC which is reconstituted distally. There is also mass effect and mild narrowing of the trachea. Right-sided chest port catheter tip terminates at the cavoatrial junction. The reference right cardiophrenic angle hypoattenuating mass measures 5.8 x 4.3 cm , compared to 6.0 x 4.9 cm previously. Right internal mammary lymphadenopathy. Scattered mediastinal lymphadenopathy is stable to increased in size. Severe. No axillary lymphadenopathy. Degenerative changes of the spine. Reference right hepatic lobe hypodense lesion measures 1.0 cm , compared to 1.6 cm previously. Additional hepatic hypodensities are again noted. The vessels are patent. Normally distended gallbladder. No biliary ductal dilatation. No significant abnormality noted. No significant abnormality noted. The reference peripancreatic lymph node measures 3.6 x 3.1 cm compared to 3.7 x 3.1 cm previously. Right adrenal multilobulated mass measures 2.5 x 2.2 cm compared to 2.3 x 1.6 cm previously. Nonobstructive right renal collecting system calculi. Stable perinephric fluid. Left pararenal space soft tissue nodule. Periduodenal mass measuring 2.1 x 2.0 cm , increased in size. Abdominal aortic aneurysm status post aortobiiliac stent placement. The excluded aneurysmal sac measures approximately 5.8 x 6.6 cm , compared to 5.9 x 6.8 cm previously. Severe. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. No significant abnormality noted. Left common iliac aneurysm, similar to prior. No significant abnormality noted'
# Split the text into sentences
sentences = text.split('. ')

# Create a dictionary to store sentence counts
sentence_counts = {}

# Find repeated sentences and update the counts
for sentence in sentences:
    if sentence in sentence_counts:
        sentence_counts[sentence] += 1
    else:
        sentence_counts[sentence] = 1

# Print the text with repeated parts highlighted
for sentence in sentences:
    if sentence_counts[sentence] > 1:
        highlighted_sentence = f"\033[1m{sentence}\033[0m"  # Highlight using ANSI escape codes
        print(highlighted_sentence)
    else:
        print(sentence)


history of lung cancer.
Progression of radiation changes and scarring of the right perihilar region
Increased small right pleural effusion
The reference right upper lobe nodule is partially obscured by the adjacent pleural effusion but measures approximately 1.9 x 1.4 cm , compared to 1.2 x 1.0 cm previously
Additional right lung nodules are increased in size
Stable left lower lobe interstitial opacity
The dominant right peritracheal mass measures 10.3 x 5.3 cm , compared to 8.5 x 4.9 cm previously
The mass encases the left brachiocephalic vein and narrows the SVC which is reconstituted distally
There is also mass effect and mild narrowing of the trachea
Right-sided chest port catheter tip terminates at the cavoatrial junction
The reference right cardiophrenic angle hypoattenuating mass measures 5.8 x 4.3 cm , compared to 6.0 x 4.9 cm previously
Right internal mammary lymphadenopathy
Scattered mediastinal lymphadenopathy is stable to increased in size
[1mSevere[0m
No axillary lymphad

## Create Dataframe

In [89]:
full_report = []
for key, value in data.items():
    report = dict()
    report_id = value['report_id']
    try:
        clinical_information = value['clinical_information']['clinical_information']
    except KeyError:
        clinical_information = ""
    clinical_information = clinical_information.replace('empty.','')
    clinical_information = clinical_information.lstrip('. ')
    findings_group = value['findings_group']['findings']
    findings_group  = findings_group.lstrip('. ')
    impression_group = value['impression_group']['impression']

    report['report_id'] = report_id
    report['clinical_information'] = clinical_information
    report['findings'] = findings_group
    report['impression'] = impression_group
    
    full_report.append(report)

In [90]:
import pandas as pd
CT_report = pd.DataFrame(full_report)

In [91]:
CT_report.head(15)

Unnamed: 0,report_id,clinical_information,findings,impression
0,6,locally recurrent oral tongue squamous cell ca...,There are post-treatment findings in the neck ...,Postoperative findings with evidence of recurr...
1,7,64 years old male with history of left humerus...,"Scattered pulmonary micronodules, some which a...",No evidence of metastatic disease.
2,13,"female, 57 years old, with subarachnoid hemorr...",A large coil mass is redemonstrated in the reg...,Redemonstration of a large coil mass situated ...
3,14,"male, 66 years old, status post subdural hemor...",Findings are redemonstrated compatible with su...,No significant change in the size of bilateral...
4,15,,The ventricles and sulci are within normal lim...,No acute intracranial hemorrhage.
5,16,,There is a very subtle focal hypoattenuating a...,No acute intracranial hemorrhage. Very subtle ...
6,17,"male, 84 years old, status post subdural hemor...","Since the prior examination, two burr holes ha...",Expected findings status post evacuation of a ...
7,18,,The ventricles and sulci are within normal lim...,No acute intracranial abnormality. Dental cari...
8,22,Evaluate for acute intraabdominal process hist...,No significant abnormality noted. Hepatic subc...,Unremarkable exam. Normal appendix. If there i...
9,26,,The frontal sinus and frontoethmoidal recesses...,Mild interval improvement in extensive paranas...


## Preprocessing

### Import NLTK methods to preprocess our texts

In [92]:
import sys
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
#nltk.download('punkt')
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [122]:
def keep_unique_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s', text)
    unique_sentences = set()
    result = []

    for sentence in sentences:
        cleaned_sentence = sentence.strip('.!?')
        if cleaned_sentence not in unique_sentences:
            unique_sentences.add(cleaned_sentence)
            result.append(sentence)

    return ' '.join(result)

### Build dataset with background & findings tokens + impression sentences

In [123]:
testing_report = CT_report[:8000].copy()
testing_report_preprocessed = pd.DataFrame()
testing_report_preprocessed['clinical_information_findings'] = testing_report['clinical_information'] + ' ' + testing_report['findings']
testing_report_preprocessed['clinical_information_findings'] = testing_report_preprocessed['clinical_information_findings'].apply(keep_unique_sentences)

testing_report_preprocessed['impression'] = testing_report['impression']
testing_report_preprocessed.head(10)

Unnamed: 0,clinical_information_findings,impression
0,locally recurrent oral tongue squamous cell ca...,Postoperative findings with evidence of recurr...
1,64 years old male with history of left humerus...,No evidence of metastatic disease.
2,"female, 57 years old, with subarachnoid hemorr...",Redemonstration of a large coil mass situated ...
3,"male, 66 years old, status post subdural hemor...",No significant change in the size of bilateral...
4,The ventricles and sulci are within normal li...,No acute intracranial hemorrhage.
5,There is a very subtle focal hypoattenuating ...,No acute intracranial hemorrhage. Very subtle ...
6,"male, 84 years old, status post subdural hemor...",Expected findings status post evacuation of a ...
7,The ventricles and sulci are within normal li...,No acute intracranial abnormality. Dental cari...
8,Evaluate for acute intraabdominal process hist...,Unremarkable exam. Normal appendix. If there i...
9,The frontal sinus and frontoethmoidal recesse...,Mild interval improvement in extensive paranas...


In [124]:
instruction_tuning_dataset = [
    {
        "instruction": "Generate impression based on clinical information and findings.",
        "input": row['clinical_information_findings'],
        "output": row['impression']
    }
    for row in testing_report_preprocessed.to_dict(orient="records")
]

In [125]:
len(instruction_tuning_dataset)

8000

## Save instruction tuning data as json file

In [127]:
with open('CT_InstructionTuning.json', 'w') as outfile:
    for obj in instruction_tuning_dataset:
        json.dump(obj, outfile)
        outfile.write('\n')