In [12]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer


In [13]:
# Every input/output folder of this notebook lives under one results root.
_extract_dialogue_root = 'results/extract_dialogue'

# Inputs: per-case CSVs read by the run_* functions below.
aligned_transcriptions_dir = f'{_extract_dialogue_root}/aligned_transcriptions/'
aligned_fb_detection_dir = f'{_extract_dialogue_root}/aligned_fb_detection/'
component_classification_dir = f'{_extract_dialogue_root}/component_classification/'

# Output: embeddings (.npy) and their annotation CSVs are written below this folder.
rag_embeddings_dir = f'{_extract_dialogue_root}/rag_embeddings/'

# A phrase without human annotations is only kept as a "no_fb" example when its
# text is strictly longer than this many characters.
no_fb_min_str_len = 20

# Phrase Only

In [14]:
def get_phrase_embeddings(transcriptions):
    """Split transcription rows into "fb" / "no_fb" phrases and embed both groups.

    A row counts as "fb" when its `human_annotations` cell evaluates to a
    non-empty container. Any other row counts as "no_fb" only if its
    transcription text is longer than `no_fb_min_str_len` characters; shorter
    un-annotated rows are dropped.

    Args:
        transcriptions: DataFrame with `transcription`, `human_annotations`,
            `start` and `end` columns. Rows are addressed with
            `.loc[0 .. len-1]`, so the index labels must be 0..n-1.
            `start`/`end` are treated as seconds when building the time stamp.

    Returns:
        Tuple `(fb_phrases, no_fb_phrases, fb_times, no_fb_times,
        fb_embeddings, no_fb_embeddings)`. The time entries are
        `"HH:MM:SS-HH:MM:SS"` strings; the embeddings are whatever
        `SentenceTransformer('all-MiniLM-L6-v2').encode` returns for each
        phrase list.
    """
    def _hms(seconds):
        # Zero-padded HH:MM:SS; fractional seconds are truncated by int().
        return f'{int(seconds//3600):02d}:{int((seconds%3600)//60):02d}:{int(seconds%60):02d}'

    def _collect(row_labels):
        # Gather the raw phrase and its "start-end" stamp for each selected row.
        phrases, times = [], []
        for label in row_labels:
            span_start = transcriptions.loc[label, 'start']
            span_end = transcriptions.loc[label, 'end']
            phrases.append(transcriptions.loc[label, 'transcription'])
            times.append(f"{_hms(span_start)}-{_hms(span_end)}")
        return phrases, times

    fb_idxs, no_fb_idxs = [], []
    for row in range(len(transcriptions)):
        text = str(transcriptions.loc[row, 'transcription'])
        # NOTE(review): eval() executes whatever text is stored in the CSV cell.
        # Only safe for trusted files; consider ast.literal_eval if the column
        # always holds plain Python literals.
        annotations = eval(transcriptions.loc[row, 'human_annotations'])
        if len(annotations) > 0:
            fb_idxs.append(row)
        elif len(text) > no_fb_min_str_len:
            no_fb_idxs.append(row)

    fb_phrases, fb_times = _collect(fb_idxs)
    no_fb_phrases, no_fb_times = _collect(no_fb_idxs)

    # The model is instantiated on every call.
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    fb_embeddings = encoder.encode(fb_phrases)
    no_fb_embeddings = encoder.encode(no_fb_phrases)

    return fb_phrases, no_fb_phrases, fb_times, no_fb_times, fb_embeddings, no_fb_embeddings

def run_phrase(case_id):
    """Embed every phrase of one case and write the embeddings plus an index CSV.

    Reads `LFB{case_id}_full.csv` from `aligned_transcriptions_dir`, obtains
    the fb / no_fb phrases and embeddings from `get_phrase_embeddings`, saves
    one `.npy` file per phrase under `<rag_embeddings_dir>/phrase_only/fb` or
    `.../no_fb`, and writes `<rag_embeddings_dir>/phrase_only/annotations/
    LFB{case_id}_full.csv` with one row per saved embedding.

    Args:
        case_id: Case number used in the input and output file names.

    Returns:
        None. All results are written to disk.
    """
    transcriptions = pd.read_csv(os.path.join(aligned_transcriptions_dir, f'LFB{case_id}_full.csv'), index_col=0)

    dir_ = os.path.join(rag_embeddings_dir, 'phrase_only')
    annotations_dir = os.path.join(dir_, 'annotations')
    fb_embeddings_dir = os.path.join(dir_, 'fb')
    no_fb_embeddings_dir = os.path.join(dir_, 'no_fb')
    for out_dir in (annotations_dir, fb_embeddings_dir, no_fb_embeddings_dir):
        os.makedirs(out_dir, exist_ok=True)

    fb_phrases, no_fb_phrases, fb_times, no_fb_times, fb_embeddings, no_fb_embeddings = get_phrase_embeddings(transcriptions)

    # fb rows first, then no_fb rows.
    groups = (
        (fb_embeddings, fb_times, fb_phrases, True, fb_embeddings_dir),
        (no_fb_embeddings, no_fb_times, no_fb_phrases, False, no_fb_embeddings_dir),
    )

    # Collect rows in a plain list and build the DataFrame once, instead of
    # enlarging it row by row (which re-allocates on every insert).
    rows = []
    for embeddings, times, phrases, is_fb, out_dir in groups:
        for embedding, time, phrase in zip(embeddings, times, phrases):
            # File name is keyed by case and time span only; two phrases with the
            # same second-resolution span would share (and overwrite) one file.
            path = os.path.join(out_dir, f'LFB{case_id}_{time}.npy')
            np.save(path, embedding)
            rows.append([time, case_id, phrase, is_fb, path])

    df = pd.DataFrame(rows, columns=['time', 'case', 'transcription', 'fb_instance', 'embedding_path'])
    df.to_csv(os.path.join(annotations_dir, f'LFB{case_id}_full.csv'))

In [15]:
# for i in range(26, 34):
#     print(f'Processing LFB{i}')
#     run_phrase(i)
#     print()

In [16]:
# Scratch cell: repeats the row selection and fb time-stamp formatting of
# get_phrase_embeddings on case 26, without computing any embeddings.
transcriptions = pd.read_csv('results/extract_dialogue/aligned_transcriptions/LFB26_full.csv')


def _seconds_to_hms(seconds):
    """Format a number of seconds as zero-padded HH:MM:SS (fraction truncated)."""
    return f'{int(seconds//3600):02d}:{int((seconds%3600)//60):02d}:{int(seconds%60):02d}'


fb_idxs, no_fb_idxs = [], []
for i in range(len(transcriptions)):
    transcription = str(transcriptions.loc[i, 'transcription'])
    # NOTE(review): eval() executes the cell text; only safe for trusted CSVs.
    has_annotations = len(eval(transcriptions.loc[i, 'human_annotations'])) > 0
    if has_annotations:
        fb_idxs.append(i)
    elif len(transcription) > no_fb_min_str_len:
        no_fb_idxs.append(i)

fb_phrases, fb_times = [], []
# Initialised but never filled in this cell.
no_fb_phrases, no_fb_times = [], []

for i in fb_idxs:
    phrase, start, end = (transcriptions.loc[i, col] for col in ('transcription', 'start', 'end'))
    start_hms, end_hms = _seconds_to_hms(start), _seconds_to_hms(end)
    time = f"{start_hms}-{end_hms}"

    fb_phrases.append(phrase)
    fb_times.append(time)

# Context + Phrase

In [17]:
def get_context_phrase_embeddings_fb(aligned_fb_detection):
    """Embed the `context_dialogue` text of fb and non-fb rows separately.

    Args:
        aligned_fb_detection: DataFrame with a `true_fb_instance` flag column
            and a `context_dialogue` text column.

    Returns:
        Tuple `(fb_embeddings, no_fb_embeddings, fb_df, no_fb_df)`: the
        encoder output for each subset, followed by the subsets themselves
        (rows where the flag equals True, then rows where it equals False).
        Rows whose flag is neither value end up in neither subset.
    """
    flag = aligned_fb_detection['true_fb_instance']
    fb_df = aligned_fb_detection[flag.eq(True)]
    no_fb_df = aligned_fb_detection[flag.eq(False)]

    # The model is instantiated on every call.
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    fb_embeddings, no_fb_embeddings = (
        encoder.encode(subset['context_dialogue'].to_list())
        for subset in (fb_df, no_fb_df)
    )

    return fb_embeddings, no_fb_embeddings, fb_df, no_fb_df

def get_context_phrase_embeddings_component(component_classification):
    """Embed `context_dialogue` for the anatomic, procedural and technical rows.

    The three subsets are selected independently, so a row flagged in several
    component columns appears in each matching subset.

    Args:
        component_classification: DataFrame with boolean-like columns
            `true_f_anatomic`, `true_f_procedural`, `true_f_technical` and a
            `context_dialogue` text column.

    Returns:
        Tuple `(anatomic_embeddings, procedural_embeddings,
        technical_embeddings, anatomic_df, procedural_df, technical_df)`.
    """
    flag_columns = ('true_f_anatomic', 'true_f_procedural', 'true_f_technical')
    subsets = [
        component_classification[component_classification[column] == True]
        for column in flag_columns
    ]

    # The model is instantiated on every call.
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = [encoder.encode(subset['context_dialogue'].to_list()) for subset in subsets]

    return (*embeddings, *subsets)
    
def run_context_phrase(case_id):
    """Embed context+phrase dialogue for one case and write embeddings and index CSVs.

    Reads `LFB{case_id}_full 'all phrases'.csv` from both
    `aligned_fb_detection_dir` and `component_classification_dir`, then under
    `<rag_embeddings_dir>/context+phrase/` writes:

    * `fb/` and `no_fb/`: one `.npy` embedding per row of the fb-detection
      file, named `LFB{case_id}_{time}.npy`.
    * `annotations_fb/LFB{case_id}_full.csv`: one row per fb / no_fb embedding.
    * `annotations_component/LFB{case_id}_full.csv`: one row per distinct time
      stamp among the anatomic / procedural / technical rows.

    The time stamp is taken from characters 1..17 of the `phrase` column.
    Component embeddings are saved into the same `fb/` folder with the same
    naming scheme, so they overwrite any fb embedding that has the same time
    stamp.

    Args:
        case_id: Case number used in the input and output file names.

    Returns:
        Tuple `(annotations_fb, annotations_component)` of the two DataFrames
        that were written to CSV.

    Raises:
        FileNotFoundError: If either input CSV for the case is missing.
    """
    source_name = f"LFB{case_id}_full 'all phrases'.csv"
    aligned_fb_detection = pd.read_csv(os.path.join(aligned_fb_detection_dir, source_name))
    component_classification = pd.read_csv(os.path.join(component_classification_dir, source_name))

    dir_ = os.path.join(rag_embeddings_dir, 'context+phrase')
    annotations_fb_dir = os.path.join(dir_, 'annotations_fb')
    annotations_component_dir = os.path.join(dir_, 'annotations_component')
    fb_embeddings_dir = os.path.join(dir_, 'fb')
    no_fb_embeddings_dir = os.path.join(dir_, 'no_fb')
    for out_dir in (annotations_fb_dir, annotations_component_dir, fb_embeddings_dir, no_fb_embeddings_dir):
        os.makedirs(out_dir, exist_ok=True)

    fb_embeddings, no_fb_embeddings, fb_df, no_fb_df = get_context_phrase_embeddings_fb(aligned_fb_detection)
    (f_anatomic_embeddings, f_procedural_embeddings, f_technical_embeddings,
     f_anatomic_df, f_procedural_df, f_technical_df) = get_context_phrase_embeddings_component(component_classification)

    def _save_embedding(embedding, time, out_dir):
        # Persist one embedding under the case/time naming scheme and return its path.
        path = os.path.join(out_dir, f'LFB{case_id}_{time}.npy')
        np.save(path, embedding)
        return path

    # --- fb / no_fb annotations ------------------------------------------------
    fb_rows = []
    fb_groups = (
        (fb_embeddings, fb_df, True, fb_embeddings_dir),
        (no_fb_embeddings, no_fb_df, False, no_fb_embeddings_dir),
    )
    for embeddings, subset, is_fb, out_dir in fb_groups:
        for embedding, phrase, context_dialogue in zip(embeddings, subset['phrase'], subset['context_dialogue']):
            time = phrase[1:18]
            path = _save_embedding(embedding, time, out_dir)
            fb_rows.append([time, case_id, context_dialogue, is_fb, path])

    annotations_fb = pd.DataFrame(
        fb_rows,
        columns=['time', 'case_id', 'context_dialogue', 'fb_instance', 'embedding_path'],
    )
    annotations_fb.to_csv(os.path.join(annotations_fb_dir, f'LFB{case_id}_full.csv'))

    # --- component annotations -------------------------------------------------
    # A phrase flagged in several component columns appears in several subsets.
    # Track the time stamps already recorded so it is listed only once. The
    # previous `time not in annotations_component['time']` check compared against
    # the Series' index labels rather than its values, so it never filtered
    # anything out.
    component_rows = []
    seen_times = set()
    component_groups = (
        (f_anatomic_embeddings, f_anatomic_df),
        (f_procedural_embeddings, f_procedural_df),
        (f_technical_embeddings, f_technical_df),
    )
    for embeddings, subset in component_groups:
        records = zip(
            embeddings,
            subset['phrase'],
            subset['context_dialogue'],
            subset['true_f_anatomic'],
            subset['true_f_procedural'],
            subset['true_f_technical'],
        )
        for embedding, phrase, context_dialogue, f_anatomic, f_procedural, f_technical in records:
            time = phrase[1:18]
            # Saved unconditionally, as before: a repeated time stamp rewrites the same file.
            path = _save_embedding(embedding, time, fb_embeddings_dir)
            if time in seen_times:
                continue
            seen_times.add(time)
            component_rows.append([time, case_id, context_dialogue, f_anatomic, f_procedural, f_technical, path])

    annotations_component = pd.DataFrame(
        component_rows,
        columns=['time', 'case_id', 'context_dialogue', 'f_anatomic', 'f_procedural', 'f_technical', 'embedding_path'],
    )
    annotations_component.to_csv(os.path.join(annotations_component_dir, f'LFB{case_id}_full.csv'))

    return annotations_fb, annotations_component

In [18]:
# Sanity check on case 1: characters 1..17 of the first `phrase` value are the
# time stamp that run_context_phrase uses in its file names.
aligned_fb_detection = pd.read_csv(
    os.path.join(aligned_fb_detection_dir, f"LFB{1}_full 'all phrases'.csv")
)
aligned_fb_detection['phrase'].iloc[0][1:18]

'00:35:37-00:35:40'

In [19]:
# Cases to process. A failure for one case (e.g. a missing input CSV) is printed
# and the loop moves on to the next case.
case_ids = [1, 2, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33]

for i in case_ids:
    print(f'Processing LFB{i}')
    try:
        run_context_phrase(i)
    except Exception as e:
        # Best-effort batch run: report the error instead of aborting.
        print(f"Error: {e}")
    print()

Processing LFB1





Processing LFB2





Processing LFB6
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB6_full 'all phrases'.csv"

Processing LFB8
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB8_full 'all phrases'.csv"

Processing LFB9





Processing LFB10





Processing LFB11
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB11_full 'all phrases'.csv"

Processing LFB12
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB12_full 'all phrases'.csv"

Processing LFB13
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB13_full 'all phrases'.csv"

Processing LFB15
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB15_full 'all phrases'.csv"

Processing LFB16
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB16_full 'all phrases'.csv"

Processing LFB17
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB17_full 'all phrases'.csv"

Processing LFB18





Processing LFB19
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB19_full 'all phrases'.csv"

Processing LFB20
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB20_full 'all phrases'.csv"

Processing LFB21
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB21_full 'all phrases'.csv"

Processing LFB22
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB22_full 'all phrases'.csv"

Processing LFB23
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB23_full 'all phrases'.csv"

Processing LFB24
Error: [Errno 2] No such file or directory: "results/extract_dialogue/aligned_fb_detection/LFB24_full 'all phrases'.csv"

Processing LFB25
Error: [Errno 2] No such file or directory: "results/extract_dialogue/component_classification/LFB25_full 'all phrases'.csv"

Pr