In [1]:
import json
import pandas as pd
import re
from rapidfuzz.fuzz import token_set_ratio

# ----------- Normalization Functions -----------

def normalize_text(text):
    """Return ``text`` lowercased, stripped of punctuation and outer whitespace.

    Word characters (letters, digits, underscore), whitespace and apostrophes
    are kept, so contractions such as "don't" survive; every other character
    is removed.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r"[^\w\s']", '', lowered)
    return without_punctuation.strip()

# ----------- Fuzzy Matching Function -----------

def refined_fuzzy_match(words_list, utterance_text, threshold=85, max_extra_words=5):
    """Locate an utterance inside a word-level transcript and return its timing.

    The transcript is scanned with sliding windows of ``n`` to
    ``n + max_extra_words`` words (``n`` = number of words in the normalized
    utterance). Each window is scored against the utterance with rapidfuzz's
    ``token_set_ratio``; the first window reaching the highest score that is
    also ``>= threshold`` wins. Inside that window an exact, in-order match of
    the utterance words is then looked for to tighten the boundaries.

    Args:
        words_list: Sequence of dicts, each providing the keys ``"word"``,
            ``"start"`` and ``"end"``.
        utterance_text: The utterance to locate. Non-string values (e.g. the
            float NaN pandas produces for an empty CSV cell) yield "no match".
        threshold: Minimum fuzzy score (0-100) a window needs to be accepted.
        max_extra_words: How many words longer than the utterance a candidate
            window may be. The default of 5 reproduces the original
            ``range(n, n + 6)`` search.

    Returns:
        A ``(start, end, score)`` tuple:
        * exact in-order match inside the best window ->
          ``(start of first word, end of last word, 100)``;
        * otherwise the timing of the whole best window with its fuzzy score;
        * ``(None, None, 0)`` when nothing reaches ``threshold`` or the input
          is empty / not a string.
    """
    # A missing utterance arrives from pandas as a float, which has no
    # ``.lower()``; report it as unmatched instead of raising.
    if not isinstance(utterance_text, str):
        return None, None, 0

    utterance_clean = normalize_text(utterance_text)
    utterance_words = utterance_clean.split()
    n = len(utterance_words)

    # Nothing to search for (utterance was only punctuation/whitespace) or
    # nothing to search in: skip the scan entirely.
    if n == 0 or not words_list:
        return None, None, 0

    norm_words = [normalize_text(w["word"]) for w in words_list]

    best_score = 0
    best_i = None
    best_size = 0

    # Step 1: find the best fuzzy-matching window. Smaller windows are tried
    # first and the comparison is strict, so ties keep the earliest, shortest
    # window.
    for window_size in range(n, n + max_extra_words + 1):
        for i in range(len(norm_words) - window_size + 1):
            window_text = ' '.join(norm_words[i:i + window_size])
            score = token_set_ratio(window_text, utterance_clean)
            if score > best_score and score >= threshold:
                best_score = score
                best_i = i
                best_size = window_size
                # 100 is the maximum score; no later window can replace it.
                if best_score >= 100:
                    break
        if best_score >= 100:
            break

    if best_i is None:
        return None, None, 0

    # Step 2: slide inside the best window looking for an exact, in-order
    # match of the utterance words to get tight start/end boundaries.
    for i in range(best_i, best_i + best_size - n + 1):
        if norm_words[i:i + n] == utterance_words:
            return words_list[i]["start"], words_list[i + n - 1]["end"], 100  # perfect alignment

    # Fallback: use the timing of the whole fuzzy window.
    return words_list[best_i]["start"], words_list[best_i + best_size - 1]["end"], best_score

# ----------- Load Dataset A (Transcript JSON) -----------

# Word-level speech-to-text output for the episode segment.
transcript_path = "/home/cleode5a7/Desktop/analyses_friends_annotations/friends_annotations/annotation_results/Speech2Text/s1/friends_s01e01a_aa.json"
with open(transcript_path, "r") as transcript_file:
    transcript_json = json.load(transcript_file)

# Word entries of the first alternative on the first channel.
first_channel = transcript_json["results"]["channels"][0]
words_list = first_channel["alternatives"][0]["words"]

# ----------- Load Dataset B (Utterance CSV) -----------

df_utterances = pd.read_csv("../meld_data/utterrance_ep1.csv")

# ----------- Apply Fuzzy Matching to Each Utterance -----------

# One (start, end, score) tuple per utterance, in row order.
results = [
    refined_fuzzy_match(words_list, utterance)
    for utterance in df_utterances["Utterance"]
]

# Attach the match results as three new columns, aligned on the existing index.
match_columns = ["start", "end", "match_score"]
df_utterances[match_columns] = pd.DataFrame(results, index=df_utterances.index)

# ----------- Save or Display -----------



In [2]:
# Display the utterance table, which now carries the "start", "end" and
# "match_score" columns added by the fuzzy-matching step.
df_utterances

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,start,end,match_score
0,5550,"Alright, so I'm back in high school, I'm stand...",Chandler,neutral,neutral,559,0,1,1,0 days 00:01:19.037000,0 days 00:01:25.417000,53.87,60.51,98.989899
1,5551,"Oh, yeah. Had that dream.",All,neutral,neutral,559,1,1,1,0 days 00:01:25.627000,0 days 00:01:27.169000,,,0.0
2,5552,"Then I look down, and I realize there's a phon...",Chandler,surprise,negative,559,2,1,1,0 days 00:01:27.378000,0 days 00:01:31.465000,62.69,65.84,96.551724
3,5553,Instead of...?,Joey,surprise,negative,559,3,1,1,0 days 00:01:34.928000,0 days 00:01:35.600000,71.35,72.082,100.0
4,5554,That's right.,Chandler,neutral,neutral,559,4,1,1,0 days 00:01:35.600000,0 days 00:01:36.892000,72.136,72.98,100.0
5,5555,Never had that dream.,Joey,neutral,neutral,559,5,1,1,0 days 00:01:37.055000,0 days 00:01:37.973000,,,0.0
6,5556,No.,Phoebe,neutral,neutral,559,6,1,1,0 days 00:01:37.973000,0 days 00:01:38.629000,133.43,133.902,100.0
7,5557,"All of a sudden, the phone starts to ring.",Chandler,neutral,neutral,559,7,1,1,0 days 00:01:38.723000,0 days 00:01:42.851000,75.03,78.93,100.0
8,7646,"I don't want to be single, okay? I just... I j...",Ross,sadness,negative,769,0,1,1,0 days 00:03:22.660000,0 days 00:03:25.829000,209.252,219.554,96.296296
9,7647,And I just want a million dollars!,Chandler,fear,negative,769,1,1,1,0 days 00:03:29.959000,0 days 00:03:33.086000,218.47,220.9,100.0
