In [2]:
import os
import json
import random
import time
import re
import pickle
import traceback
from tqdm import tqdm
import pandas as pd
import time
import gzip
import numpy as np

In [3]:
# Search prompts, one per topic; each is later used as a query string for GzipSearch.
queries = [
    "Identify the sentence discussing stage.",
    "Identify the sentence discussing histology.",
    "Identify the sentence discussing tumor feature.",
    "Identify the sentence discussing recurrence.",
]

In [4]:
# Show each query followed by its encoded bytes (str.encode() defaults to UTF-8).
for prompt in queries:
    print(prompt, prompt.encode(), sep="\n")

Identify the sentence discussing stage.
b'Identify the sentence discussing stage.'
Identify the sentence discussing histology.
b'Identify the sentence discussing histology.'
Identify the sentence discussing tumor feature.
b'Identify the sentence discussing tumor feature.'
Identify the sentence discussing recurrence.
b'Identify the sentence discussing recurrence.'


In [5]:
## Functions to process the data
def _split_note_sections(text, non_header_chars='0123456789-#[]'):
    """Split one raw note into parallel lists of section titles and section bodies.

    A line opens a new section when it is non-empty, contains ':', and neither
    its first character nor the character just before its first ':' is in
    ``non_header_chars`` (this keeps times such as "10:30" and list items such
    as "1. foo:" from being treated as headers). Text appearing before the
    first header is discarded.
    """
    cleaned = text.replace('\n\n\n\n','\n').replace('\n\n\n','\n').replace('     ','\n')

    sections, current = [], ' '
    for paragraph in cleaned.split('\n'):
        line = paragraph.strip()
        is_header = (
            len(line) > 0
            and ':' in line
            and line[line.find(':') - 1] not in non_header_chars
            and line[0] not in non_header_chars
        )
        if is_header:
            sections.append(current.strip())
            current = paragraph + ' '
        else:
            current += paragraph + ' '
    # Strip the final section like all the others so its title carries no stray whitespace.
    sections.append(current.strip())
    sections.pop(0)  # whatever preceded the first header

    titles, bodies = [], []
    for section in sections:
        colon = section.find(':')
        titles.append(section[:colon])
        bodies.append(section[colon + 1:].strip())
    return titles, bodies


def _load_label_map(labels_path):
    """Read the labels file into {lower-cased keyword: 1-based line number}.

    Several keywords may share one line, separated by '/'.
    """
    label_ids = {}
    # Context manager so the handle is released even if parsing raises.
    with open(labels_path, 'r') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            for keyword in raw_line.strip('\n').lower().split('/'):
                label_ids[keyword] = line_number
    return label_ids


def _assign_categories(titles, label_ids, default='begin_title'):
    """Map each section title to the first keyword it contains.

    Titles of two characters or fewer, and titles matching no keyword, inherit
    the category of the previous section (``default`` at the start of a note).
    """
    categories, current = [], default
    for title in titles:
        lowered = title.lower()
        if len(lowered) > 2:
            for keyword in label_ids:
                if keyword in lowered:
                    current = keyword
                    break
        categories.append(current)
    return categories


def _merge_by_category(titles, bodies, categories):
    """Collapse consecutive sections sharing a category into a single row."""
    merged_titles, merged_text, merged_labels = [], [], []
    for title, body, category in zip(titles, bodies, categories):
        if not merged_titles or category != merged_titles[-1]:
            merged_titles.append(category)
            merged_labels.append(title)
            merged_text.append(body)
        else:
            # Same category as the previous row: fold header and body into its text.
            merged_text[-1] = merged_text[-1] + ' ' + title + ' ' + body
    return pd.DataFrame({'title': merged_titles, 'text': merged_text, 'label': merged_labels})


def DataClean(data,labels_path,data_path=None,delete=None):
    """Split clinical notes into labelled sections.

    Parameters
    ----------
    data : pandas.DataFrame or None
        Notes with a 'note_id' and a 'text' column. When None or empty the
        notes are read from ``data_path`` instead.
    labels_path : str
        Text file with one category per line; aliases on a line are separated
        by '/'. Matching against section headers is case-insensitive substring
        matching, first keyword in file order wins.
    data_path : str, optional
        CSV file used when ``data`` is None or empty.
    delete : list-like, optional
        Original section headers (values of the 'label' column) to drop from
        every note.

    Returns
    -------
    dict
        ``note_id -> DataFrame`` with columns 'title' (matched keyword, or
        'begin_title' when nothing has matched yet), 'text' (merged section
        text) and 'label' (header of the first section in each merged group).

    Raises
    ------
    ValueError
        If ``data`` is None/empty and no ``data_path`` is given.
    """
    print('\n\nImporting and filtering database...')

    if data is not None and not data.empty:
        notes = data
    elif data_path is not None:
        notes = pd.read_csv(data_path)
    else:
        raise ValueError("DataClean needs a non-empty `data` frame or a `data_path` to read from")

    print('\n\nSplitting each note into sections:\n\n')

    # Positional access (.iloc) so frames with a non-default index are handled too.
    split_notes = {}
    for row in tqdm(range(notes.shape[0])):
        note_id = notes['note_id'].iloc[row]
        split_notes[note_id] = _split_note_sections(notes['text'].iloc[row])

    label_ids = _load_label_map(labels_path)

    # Categories are kept in plain lists: no chained DataFrame assignment, and no
    # lookup of a numeric label for 'begin_title' (which may be absent from the file).
    categories_by_note = {}
    for note_id in tqdm(list(split_notes.keys())):
        titles, _ = split_notes[note_id]
        categories_by_note[note_id] = _assign_categories(titles, label_ids)

    notes_sections = {}
    for note_id in tqdm(split_notes.keys()):
        titles, bodies = split_notes[note_id]
        notes_sections[note_id] = _merge_by_category(titles, bodies, categories_by_note[note_id])

    if delete is not None:
        notes_sections = {
            note_id: frame[~frame['label'].isin(delete)]
            for note_id, frame in notes_sections.items()
        }

    return notes_sections

In [6]:
# Keep only the first ten rows of the CSV for this run.
df = pd.read_csv("malignant_neoplasm_updated_first_100_rows.csv").head(10)

In [7]:
# Section headers whose rows are removed from every note by DataClean's `delete` filter.
dropped_headers = ['Name','Admission Date','Discharge Date','Date of Birth','Followup Instructions']
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
labels_file = "/Users/sinianzhang/Desktop/LLM/ColBert/ColBERT/labels.txt"

NotesSections = DataClean(data=df, labels_path=labels_file, delete=dropped_headers)



Importing and filtering database...


Splitting each note into sections:




100%|██████████| 10/10 [00:00<00:00, 1382.75it/s]


100%|██████████| 10/10 [00:00<00:00, 413.79it/s]
100%|██████████| 10/10 [00:00<00:00, 1188.76it/s]


In [8]:
def GzipSearch(query,sentences,k):
    """Return the ``k`` sentences with the smallest gzip NCD to ``query``.

    For each sentence the Normalized Compression Distance is
    ``(C(query + " " + sentence) - min(C(query), C(sentence))) / max(C(query), C(sentence))``
    where ``C`` is the byte length of the gzip-compressed, encoded text.
    Sentences are returned in ascending distance order; fewer than ``k`` are
    returned when ``sentences`` is shorter than ``k``.
    """
    def compressed_size(text):
        # Length in bytes of the gzip-compressed encoding of `text`.
        return len(gzip.compress(text.encode()))

    query_size = compressed_size(query)

    scores = []
    for candidate in sentences:
        candidate_size = compressed_size(candidate)
        joint_size = compressed_size(query + " " + candidate)
        smaller, larger = sorted((query_size, candidate_size))
        scores.append((joint_size - smaller) / larger)

    # Indices of the candidates ordered from smallest to largest distance.
    ranking = np.argsort(np.array(scores))

    return [sentences[position] for position in ranking[:k]]


In [9]:
# For every note: split its sections into sentences, then record the five
# lowest-distance sentences returned by GzipSearch for each query.
sentence_splitter = re.compile(r'(?<!\w\.\w.)(?<![A-Z]\.)(?<![A-Z][a-z]\.)(?<! [a-z]\.)(?<![A-Z][a-z][a-z]\.)(?<=\.|\?|\!)\"*\s*\s*(?:\W*)(?<![A-Z])')

Data = {}

for key in NotesSections:

    section_texts = NotesSections[key]['text'].tolist()

    # Skip empty sections and drop fragments of two characters or fewer.
    collection = []
    for para in section_texts:
        if para == "":
            continue
        collection.extend(piece for piece in sentence_splitter.split(para) if len(piece) > 2)

    per_query = []
    for query in queries:
        print(f"#> {query}")

        # Find the top-k passages for this query
        results = GzipSearch(query, collection, 5)
        print(results)

        per_query.append({query: results})

    # NOTE(review): sentences are concatenated with no separator, so 'content'
    # has no whitespace between sentences — confirm this is intended.
    Data[key] = {'content': ''.join(collection), 'qas': per_query}


#> Identify the sentence discussing stage.
['The adrenals and spleen are unremarkable.', 'The pancreas is within normal limits.', 'The left kidney is within normal limits.', 'The patient is status post supracervical hysterectomy.', 'No destructive osseous lesions.']
#> Identify the sentence discussing histology.
['The adrenals and spleen are unremarkable.', 'The left kidney is within normal limits.', 'Coronal and sagittal reformations were provided.', 'Multiple pulmonary and hepatic metastases.', 'The patient is status post supracervical hysterectomy.']
#> Identify the sentence discussing tumor feature.
['The adrenals and spleen are unremarkable.', 'Coronal and sagittal reformations were provided.', 'No retroperitoneal or mesenteric adenopathy.', 'New diagnosis of metastatic cancer with unknown primary.', 'The left kidney is within normal limits.']
#> Identify the sentence discussing recurrence.
['The adrenals and spleen are unremarkable.', 'No retroperitoneal or mesenteric adenopathy.

In [11]:
# Write the per-note retrieval results to disk as indented JSON.
file_path = 'gzip.json'

with open(file_path, 'w') as out_file:
    json.dump(Data, out_file, indent=4)