In [5]:
import webvtt
import spacy
import pandas as pd

# Load spaCy for NLP processing (using a medium model for better similarity)
nlp = spacy.load("en_core_web_md")

# Function to calculate semantic similarity between two text snippets
def calculate_similarity(text1, text2):
    # Process the texts with spaCy
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    
    # Calculate and return the similarity score (between 0 and 1)
    return doc1.similarity(doc2)

# Function to split caption and calculate internal similarity between parts
def split_caption_and_calculate_similarity(caption_text):
    # Split the caption into parts (you can adjust the splitting strategy here)
    sentences = caption_text.split(".")  # Simple split by period (or use sentence tokenization)
    
    if len(sentences) >= 2:
        part1 = sentences[0].strip()  # First part of the caption (before the period)
        part2 = sentences[1].strip()  # Second part of the caption (after the period)
        similarity = calculate_similarity(part1, part2)  # Calculate similarity between the parts
        return similarity
    else:
        return 1.0  # If there's only one part, similarity with itself is 1.0

# Function to parse VTT file and calculate similarity
def analyze_vtt_with_similarity(file_path):
    data = []  # List to store rows of data
    captions = []  # List to store all captions for the conversation

    previous_caption_text = None  # To store the previous caption text for similarity calculation

    # Read all captions and store them
    for caption in webvtt.read(file_path):
        speaker_text = caption.text.strip()  # The speaker's text is assumed to be here
        captions.append(speaker_text)  # Collect all captions for later comparison
        
        # Calculate semantic similarity with the previous caption (if available)
        similarity = None
        if previous_caption_text:
            similarity = calculate_similarity(previous_caption_text, speaker_text)
        
        # Update the previous caption text for the next iteration
        previous_caption_text = speaker_text

        # Append data for this caption (but without full conversation similarity yet)
        data.append({
            "File Name": file_path,
            "Start Time": caption.start,
            "End Time": caption.end,
            "Caption Text": speaker_text,
            "Semantic Similarity with Previous": similarity if similarity is not None else "N/A",
            "Internal Semantic Similarity": split_caption_and_calculate_similarity(speaker_text)  # Add internal similarity
        })

    # Combine all captions into one string representing the full conversation
    full_conversation = " ".join(captions)

    # Now calculate similarity of each caption with the full conversation
    for row in data:
        caption_text = row["Caption Text"]
        conversation_similarity = calculate_similarity(caption_text, full_conversation)
        row["Similarity with Full Conversation"] = conversation_similarity

    # Create DataFrame from the collected data
    df = pd.DataFrame(data)
    return df

# Function to handle multiple VTT files and compare captions
def analyze_multiple_vtt_with_similarity(files):
    all_data = []  # List to accumulate data from all files

    # Process each VTT file
    for file_path in files:
        file_data = analyze_vtt_with_similarity(file_path)  # Get DataFrame for the current file
        all_data.append(file_data)

    # Concatenate all DataFrames into one
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

# Example usage
vtt_files = ["HAKA3_copy.vtt", "meeting1_copy.vtt", "meeting2_copy.vtt"]  # List of VTT files
similarity_df = analyze_multiple_vtt_with_similarity(vtt_files)

# Display the resulting DataFrame with semantic similarity
similarity_df


  return doc1.similarity(doc2)


Unnamed: 0,File Name,Start Time,End Time,Caption Text,Semantic Similarity with Previous,Internal Semantic Similarity,Similarity with Full Conversation
0,HAKA3_copy.vtt,00:00:04.000,00:00:08.000,"Unknown Speaker: Hey, it's very good. How abou...",,0.870865,0.976718
1,HAKA3_copy.vtt,00:00:08.000,00:00:12.000,Speaker 1: I'm good. I'm good. Where are you f...,0.931798,0.857582,0.959142
2,HAKA3_copy.vtt,00:00:16.000,00:00:19.000,Unknown Speaker: Interesting. Are you an under...,0.854675,0.519242,0.925572
3,HAKA3_copy.vtt,00:00:19.000,00:00:26.000,"Speaker 1: No, I'm a grad student at BU. Which...",0.902818,0.753104,0.963707
4,HAKA3_copy.vtt,00:00:26.000,00:00:30.000,"Unknown Speaker: What? Excuse me, what's your ...",0.934009,1.0,0.967547
5,HAKA3_copy.vtt,00:00:30.000,00:00:32.000,"Speaker 1: University of Illinois, Urbana Cham...",0.827755,0.0,0.797841
6,HAKA3_copy.vtt,00:00:32.000,00:00:38.000,"Speaker 1: All right, got it. So what are you ...",0.778459,0.87361,0.981088
7,HAKA3_copy.vtt,00:00:38.000,00:00:41.000,Speaker 2: I'm studying psychology with a mino...,0.829053,0.0,0.891161
8,HAKA3_copy.vtt,00:00:41.000,00:00:51.000,"Speaker 1: Minor in biology. Well, that's quit...",0.888286,0.546214,0.982803
9,HAKA3_copy.vtt,00:00:51.000,00:00:55.000,"Speaker 2: Yes, still bad. Well, yeah.",0.95883,0.817236,0.973988
