In [16]:
import glob
import json
import os

import librosa
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import soundfile as sf 

import utils
import plots_improved as plots

# listen to audios
import IPython.display as ipd

In [17]:
# Parse the CSV data
df = pd.read_json("../data/input_data/muchomusic_musiccaps.json")

def is_single_sounding_event(prompt, answers):
    prompt_lower = prompt.lower()

    # Questions that typically require longer-term understanding
    long_term_keywords = [
        'genre', 'mood', 'atmosphere', 'tempo', 'style', 'overall', 'primary', 'mainly',
        'primarily', 'convey', 'purpose', 'suitable for', 'intended for', 'typically heard',
        'likely performed', 'cultural', 'influence', 'era', 'origin', 'recording quality',
        'emotion', 'feeling', 'vocal style', 'vocal range', 'harmonic structure',
        'rhythmic pattern', 'time signature', 'lyrical content', 'language'
    ]
    
    # Questions that are typically about single events
    single_event_keywords = [
        'instrument plays', 'which instrument', 'sound effect', 'vocal technique',
        'specific technique', 'at the beginning', 'at the end', 'interrupts',
        'distinctive characteristic', 'unique element', 'particular sound',
        'specific moment', 'brief appearance', 'sound at start', 'sound at end'
    ]
    
    # Check for long-term understanding keywords
    for keyword in long_term_keywords:
        if keyword in prompt_lower:
            return False
    
    # Check for single event keywords
    for keyword in single_event_keywords:
        if keyword in prompt_lower:
            return True
    
    # Default to False for ambiguous cases (requires longer understanding)
    return False

# Apply the function to each row
df['question'] = df.apply(lambda row: row['prompt'].split("Question: ")[-1], axis=1) # skip in-context example
df['single_sounding_event'] = df.apply(lambda row: is_single_sounding_event(row['question'], row['answers']), axis=1)

In [18]:
# after manual inspection, this is what we got
single_sounding_events = [337, 375, 396, 400, 425, 427, 428,482, 499, 514,655, 661, 662, 666, 719, 828, 833, 869,980,995,1028,1075,1137,1176,]

In [19]:
# dataset_path = "/media/gigibs/DD02EEEC68459F17/datasets"
# dataset_path = "/scratch/gv2167/datasets/"
dataset_path = "/home/gigibs/Documents/datasets"

In [20]:
qfs = pd.read_json("../data/output_data/qwen_fs.json")
qzs = pd.read_json("../data/output_data/qwen_zs.json")
mfs = pd.read_json("../data/output_data/mu_fs.json")
mzs = pd.read_json("../data/output_data/mu_zs.json")

qfs = utils.parse_df(qfs, experiment_name="qwen_fs")
qzs = utils.parse_df(qzs, experiment_name="qwen_zs")
mfs = utils.parse_df(mfs, experiment_name="mu_fs").fillna(0)
mzs = utils.parse_df(mzs, experiment_name="mu_zs").fillna(0)

  text_score = text_contrib / (text_contrib + audio_contrib)# + 1e-5)
  audio_score = audio_contrib / (text_contrib + audio_contrib)# + 1e-5)
  text_score = text_contrib / (text_contrib + audio_contrib)# + 1e-5)
  audio_score = audio_contrib / (text_contrib + audio_contrib)# + 1e-5)


In [21]:
def get_shapley_values(row):
    question_id =  row.name
    
    data = f"../{row['output_folder']}/{question_id}_info.npz"
    data = np.load(data)
    tokens = row["input_ids"]
    audio_tokens = np.where(tokens < 0)[-1]
    question_tokens = np.where(tokens >= 0)[-1]
    
    all_shapley_values = data["shapley_values"].squeeze(0).squeeze(0)
    audio_shapley_values = all_shapley_values[audio_tokens]
    question_shapley_values = all_shapley_values[question_tokens]

    return all_shapley_values, audio_shapley_values, question_shapley_values

def weight_important_features(audio, audio_shapley_values):
    """
    given an audio and its shapley values, apply the magnitude to the audio windows and 
    return the processed audio
    """
    # Calculate segment length
    total_length = len(audio)
    num_segments = len(audio_shapley_values)
    segment_length = total_length // num_segments
    
    processed_audio = np.zeros_like(audio)
    
    for i, magnitude in enumerate(audio_shapley_values):
        start_idx = i * segment_length
        end_idx = (i + 1) * segment_length
        
        # For the last segment, include any remaining samples
        if i == num_segments - 1:
            end_idx = total_length
        
        processed_audio[start_idx:end_idx] = audio[start_idx:end_idx] * magnitude

    return processed_audio

def save_qual_results(sample, img_format="pdf", gt_start=None, gt_end=None):
    if sample["experiment"].startswith("qwen"):
        sample_rate = 16000
    else:
        sample_rate = 24000

    x, fs = librosa.load(os.path.join(dataset_path, sample.audio_path), sr=sample_rate)
    all_shapley_values, audio_shapley_values, question_shapley_values = get_shapley_values(sample)

    
    output_folder = f"{sample['experiment']}_{sample.name}"
    os.makedirs(output_folder, exist_ok=True)
    print("saving data in ", output_folder)
    
    # generate plot per whole output
    plots.visualize_shapley_analysis(
        text_shapley_values=question_shapley_values,
        question_tokens=sample["input_tokens"],
        answer_tokens=sample["output_tokens"],
        audio_signal=x,
        audio_shapley_values=audio_shapley_values,
        sample_rate=sample_rate,
        gt_start=gt_start,
        gt_end=gt_end,
        idx=None,
        figsize=(12,7),
        colormap="binary",
        threshold=0.8, 
        save_folder=output_folder,
        show_image=False,
        output_format=img_format
    )

    positive_contrib = np.clip(audio_shapley_values, a_min=0, a_max=None).sum(axis=1)
    neg_contrib = np.clip(audio_shapley_values, a_min=None, a_max=0).sum(axis=1)
    agg_contrib = audio_shapley_values.sum(axis=1)
   
    pos_wav = weight_important_features(x, positive_contrib)
    neg_wav = weight_important_features(x, neg_contrib)
    # agg_wav =  weight_important_features(x, agg_contrib)
    sf.write(os.path.join(output_folder, "agg_positive.wav"),  pos_wav, sample_rate)
    sf.write(os.path.join(output_folder, "agg_negative.wav"), neg_wav, sample_rate)
    # sf.write(os.path.join(output_folder, "agg_abs.wav"), agg_wav, sample_rate)

    # generate plots per token
    for i, t in enumerate(sample["output_tokens"]):
        highlighted_text = plots.visualize_shapley_analysis(
                text_shapley_values=question_shapley_values,
                question_tokens=sample["input_tokens"],
                answer_tokens=sample["output_tokens"],
                audio_signal=x,
                audio_shapley_values=audio_shapley_values,
                sample_rate=sample_rate,
                gt_start=gt_start,
                gt_end=gt_end,
                idx=i,
                figsize=(12,7),
                colormap="binary",
                threshold=0.3, 
                save_folder=output_folder,
                show_image=False,
                output_format=img_format
            )

        positive_contrib = np.clip(audio_shapley_values[:, i], a_min=0, a_max=None)
        neg_contrib = np.clip(audio_shapley_values[:, i], a_min=None, a_max=0)
        agg_contrib = audio_shapley_values[:, i]
        pos_wav = weight_important_features(x, positive_contrib)
        neg_wav = weight_important_features(x, neg_contrib)
        # agg_wav =  weight_important_features(x, agg_contrib)
        
        sf.write(os.path.join(output_folder, f"{i}_{t}_positive.wav"), pos_wav, sample_rate)
        sf.write(os.path.join(output_folder, f"{i}_{t}_negative.wav"), neg_wav, sample_rate)
        # sf.write(os.path.join(output_folder, f"{i}_{t}_abs.wav"), agg_wav, sample_rate)

    with open(os.path.join(output_folder, f"{output_folder}.txt"), "w") as f:
        f.write(sample["model_output"])
        
    return

In [None]:
sse = pd.read_csv("single_sounding_events.csv")

sse.set_index("question_id", inplace=True)

In [24]:
sse

Unnamed: 0_level_0,gt_start,gt_end
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1
337,5,10
396,0,3
400,8,10
425,0,3
427,3,5
482,8,10
499,4,6
655,0,4
661,6,10
719,6,7


In [None]:
# generate all figures w approx ground truth
for exp in [mzs, mfs, qzs, qfs]:
    for i in single_sounding_events:
        try:
            save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])
        except Exception as e:
            print("could not process track", i)

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_zs_337
text_max 42.45125325520833, text_min -30.605326334635418, text_median -0.47770182291666674
audio_max 34.761383056640625, audio_min -29.008656819661457, audio_median -0.27577718098958337
max_abs_value 42.45125325520833
text_max 3.5916341145833335, text_min -3.896484375, text_median -0.14420572916666666
audio_max 2.6204427083333335, audio_min -3.1939493815104165, audio_median 0.06380208333333333
max_abs_value 6.566324869791667
text_max 6.179158528645833, text_min -4.107401529947917, text_median -0.056315104166666664
audio_max 2.9174702962239585, audio_min -2.7297770182291665, audio_median 0.026204427083333332
max_abs_value 6.566324869791667
text_max 3.3898111979166665, text_min -3.0590006510416665, text_median -0.11966959635416667
audio_max 2.0861002604166665, audio_min -2.1305338541666665, audio_median 0.11124674479166667
max_abs_value 6.566324869791667
text_max 2.46337890625, text_min -2.57666015625, text_median 0.03173828125
audio_max 2.2698567708333335, audi

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 3.5120442708333335, text_min -3.6966145833333335, text_median 0.21923828125
audio_max 3.9954427083333335, audio_min -3.5787760416666665, audio_median 0.044270833333333336
max_abs_value 7.908668518066406
text_max 2.8070170084635415, text_min -3.1527328491210938, text_median 0.35231526692708337
audio_max 2.3931070963541665, audio_min -3.28680419921875, audio_median -0.1943359375
max_abs_value 7.908668518066406
text_max 2.3331705729166665, text_min -2.4481608072916665, text_median 0.18007405598958334
audio_max 2.6909993489583335, audio_min -3.5216471354166665, audio_median -0.0913848876953125
max_abs_value 7.908668518066406
text_max 3.205322265625, text_min -3.4685872395833335, text_median 0.3836212158203125
audio_max 2.9664713541666665, audio_min -4.01318359375, audio_median -0.10410563151041666
max_abs_value 7.908668518066406


  fig = plt.figure(figsize=figsize)


text_max 5.766133626302083, text_min -3.2342122395833335, text_median 0.22298177083333331
audio_max 3.4903971354166665, audio_min -7.449544270833333, audio_median -0.3160400390625
max_abs_value 7.908668518066406
text_max 3.2681884765625, text_min -3.8179524739583335, text_median 0.10461997985839844
audio_max 2.5320231119791665, audio_min -3.3774820963541665, audio_median -0.11485799153645833
max_abs_value 7.908668518066406
text_max 2.95050048828125, text_min -2.1877848307291665, text_median 0.13008626302083331
audio_max 2.5473225911458335, audio_min -3.1843668619791665, audio_median -0.11263020833333333
max_abs_value 7.908668518066406
text_max 3.1262410481770835, text_min -2.0150553385416665, text_median 0.220703125
audio_max 2.5904947916666665, audio_min -3.15234375, audio_median -0.15321985880533856
max_abs_value 7.908668518066406
text_max 3.976318359375, text_min -2.9545135498046875, text_median 0.11279296875
audio_max 2.155359903971354, audio_min -3.1503499348958335, audio_median -

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_zs_400
text_max 35.522705078125, text_min -26.32091776529948, text_median -0.6960042317708333
audio_max 24.403727213541668, audio_min -20.276285807291668, audio_median -0.6331380208333331
max_abs_value 35.522705078125
text_max 3.9039713541666665, text_min -2.01953125, text_median 0.3180338541666667
audio_max 5.013020833333333, audio_min -3.9788411458333335, audio_median -0.16861979166666666
max_abs_value 7.6689453125
text_max 3.77783203125, text_min -2.4509684244791665, text_median -0.013671875
audio_max 3.8438720703125, audio_min -3.8179117838541665, audio_median 0.029703776041666668
max_abs_value 7.6689453125
text_max 3.5531005859375, text_min -3.1887613932291665, text_median 0.14601643880208334
audio_max 3.4188639322916665, audio_min -2.7369791666666665, audio_median -0.1396484375
max_abs_value 7.6689453125
text_max 3.54931640625, text_min -1.9544270833333333, text_median -0.09505208333333333
audio_max 2.58740234375, audio_min -3.453125, audio_median -0.1336263020

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_zs_425
text_max 14.6181640625, text_min -12.795471191406248, text_median -0.035730997721354185
audio_max 11.818440755208334, audio_min -8.950358072916666, audio_median 0.515096028645833
max_abs_value 14.6181640625
text_max 4.304850260416667, text_min -2.6502278645833335, text_median -0.0048828125
audio_max 2.9244791666666665, audio_min -3.9755859375, audio_median 0.0
max_abs_value 5.479329427083333
text_max 2.5359700520833335, text_min -2.5388081868489585, text_median 0.06929524739583333
audio_max 1.9031575520833333, audio_min -1.8473307291666667, audio_median -0.020670572916666668
max_abs_value 5.479329427083333
text_max 2.4856770833333335, text_min -2.6481119791666665, text_median -0.15911865234375
audio_max 1.8995157877604167, audio_min -2.6720377604166665, audio_median 0.040384928385416664
max_abs_value 5.479329427083333
text_max 2.66796875, text_min -2.3413899739583335, text_median -0.21708170572916666
audio_max 2.7493896484375, audio_min -1.6435343424479167, au

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_zs_427
text_max 25.826459248860672, text_min -14.40142822265625, text_median 0.8717753092447915
audio_max 19.47196706136068, audio_min -17.871734619140625, audio_median -0.5784708658854167
max_abs_value 25.826459248860672
text_max 6.16015625, text_min -5.276529947916667, text_median 0.18115234375
audio_max 3.12890625, audio_min -2.5709635416666665, audio_median -0.108154296875
max_abs_value 8.012369791666666
text_max 4.33740234375, text_min -3.177734375, text_median 0.3741353352864583
audio_max 2.910400390625, audio_min -3.7367757161458335, audio_median -0.10215250651041666
max_abs_value 8.012369791666666
text_max 3.6273600260416665, text_min -2.4378255208333335, text_median 0.19745699564615884
audio_max 3.0426432291666665, audio_min -4.3760325113932295, audio_median -0.27775065104166663
max_abs_value 8.012369791666666
text_max 3.8873418172200522, text_min -2.4652506510416665, text_median 0.16106160481770834
audio_max 2.5065104166666665, audio_min -2.33758544921875, 

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 4.760579427083333, text_min -2.9098307291666665, text_median 0.06380208333333333
audio_max 1.8323567708333333, audio_min -1.9791666666666667, audio_median 0.041015625
max_abs_value 7.021321614583333
text_max 2.15655517578125, text_min -2.55157470703125, text_median -0.019453684488932293
audio_max 2.71923828125, audio_min -1.6112467447916667, audio_median 0.07460530598958333
max_abs_value 7.021321614583333
text_max 4.900594075520833, text_min -5.8295338948567705, text_median -0.2879231770833333
audio_max 2.5319620768229165, audio_min -2.1998291015625, audio_median 0.2596028645833333
max_abs_value 7.021321614583333
text_max 2.2794392903645835, text_min -3.697265625, text_median -0.23079427083333334
audio_max 3.7194010416666665, audio_min -2.5550130208333335, audio_median 0.049153645833333336
max_abs_value 7.021321614583333
text_max 3.5201619466145835, text_min -5.233083089192708, text_median -0.22355143229166666
audio_max 3.1510009765625, audio_min -2.497314453125, audio_median 

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_zs_499
text_max 37.46070861816406, text_min -43.803385416666664, text_median 2.809858957926432
audio_max 48.29247029622396, audio_min -49.357625325520836, audio_median 1.4046669006347656
max_abs_value 49.357625325520836
text_max 3.98828125, text_min -4.060221354166667, text_median 0.34505208333333337
audio_max 3.232421875, audio_min -2.6881510416666665, audio_median 0.09037272135416667
max_abs_value 5.647216796875
text_max 2.166290283203125, text_min -2.9716796875, text_median 0.2894159952799479
audio_max 2.3607584635416665, audio_min -3.101318359375, audio_median 0.021570841471354168
max_abs_value 5.647216796875
text_max 2.0998331705729165, text_min -2.4736429850260415, text_median -0.019816080729166668
audio_max 2.933349609375, audio_min -3.238494873046875, audio_median 0.08174641927083334
max_abs_value 5.647216796875
text_max 3.8837432861328125, text_min -2.1979166666666665, text_median 0.12150065104166666
audio_max 3.3948567708333335, audio_min -4.038126627604167

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 3.5421549479166665, text_min -2.38909912109375, text_median -0.142578125
audio_max 2.8841145833333335, audio_min -2.6697591145833335, audio_median -0.06640625
max_abs_value 4.73828125
text_max 4.536376953125, text_min -2.6759847005208335, text_median -0.08219401041666667
audio_max 2.2765299479166665, audio_min -2.6370442708333335, audio_median 0.17333984375
max_abs_value 4.73828125
text_max 3.8075459798177085, text_min -2.1521097819010415, text_median 0.09716796875
audio_max 1.9861653645833333, audio_min -2.6831766764322915, audio_median -0.0458984375
max_abs_value 4.73828125
text_max 3.1436767578125, text_min -1.7578125, text_median -0.10929361979166667
audio_max 2.5208333333333335, audio_min -1.9046223958333333, audio_median 0.010986328125
max_abs_value 4.73828125
text_max 2.9447428385416665, text_min -2.4544270833333335, text_median -0.12589518229166666
audio_max 2.4508158365885415, audio_min -2.5398763020833335, audio_median 0.08833821614583333
max_abs_value 4.73828125
tex

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_zs_661
text_max 25.736531575520832, text_min -24.969228108723957, text_median 1.8321965535481772
audio_max 20.90480550130208, audio_min -37.750795682271324, audio_median -0.11129506429036451
max_abs_value 37.750795682271324
text_max 3.2815755208333335, text_min -2.5354817708333335, text_median 0.2913411458333333
audio_max 2.0986328125, audio_min -2.6253255208333335, audio_median 0.11149088541666667
max_abs_value 7.971354166666667
text_max 3.4840494791666665, text_min -2.6625162760416665, text_median 0.10506184895833334
audio_max 1.950439453125, audio_min -2.9226888020833335, audio_median -0.10475667317708334
max_abs_value 7.971354166666667
text_max 3.3248697916666665, text_min -2.0089518229166665, text_median 0.033976236979166664
audio_max 2.278076171875, audio_min -3.1630678176879883, audio_median 0.0859375
max_abs_value 7.971354166666667
text_max 2.8043619791666665, text_min -4.0209808349609375, text_median 0.09851582845052084
audio_max 2.245361328125, audio_min -2

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 5.881184895833333, text_min -4.214192708333333, text_median -0.06380208333333333
audio_max 2.990234375, audio_min -2.375, audio_median 0.10416666666666667
max_abs_value 5.881184895833333
text_max 2.854248046875, text_min -2.2621256510416665, text_median 0.07666015625
audio_max 2.4132486979166665, audio_min -2.1042887369791665, audio_median -0.12898763020833334
max_abs_value 5.881184895833333
text_max 3.1774088541666665, text_min -2.366943359375, text_median 0.026448567708333332
audio_max 2.5509440104166665, audio_min -1.72705078125, audio_median 0.10237630208333333
max_abs_value 5.881184895833333
text_max 3.8748372395833335, text_min -2.7142740885416665, text_median 0.0361328125
audio_max 2.5703125, audio_min -2.8346354166666665, audio_median 0.08740234375
max_abs_value 5.881184895833333
text_max 3.0909423828125, text_min -3.0243326822916665, text_median 0.027018229166666668
audio_max 2.3390909830729165, audio_min -2.814453125, audio_median 0.14774576822916666
max_abs_value 5.

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 4.8056640625, text_min -4.23681640625, text_median 0.061767578125
audio_max 5.06640625, audio_min -4.69677734375, audio_median 0.1474609375
max_abs_value 8.01708984375
text_max 2.60797119140625, text_min -3.698486328125, text_median -0.033050537109375
audio_max 4.3828125, audio_min -3.465576171875, audio_median 0.064453125
max_abs_value 8.01708984375
text_max 3.5048828125, text_min -3.6630859375, text_median 0.1041259765625
audio_max 4.6980133056640625, audio_min -3.743408203125, audio_median -0.17724609375
max_abs_value 8.01708984375
text_max 3.45556640625, text_min -5.1728515625, text_median 0.09429931640625
audio_max 6.41162109375, audio_min -3.6890716552734375, audio_median -0.1182708740234375
max_abs_value 8.01708984375
text_max 4.0301055908203125, text_min -3.822509765625, text_median 0.03104400634765625
audio_max 4.5111083984375, audio_min -3.3485107421875, audio_median -0.1314697265625
max_abs_value 8.01708984375
text_max 3.117431640625, text_min -4.2237548828125, text

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_zs_869
text_max 34.54249827067058, text_min -39.18535359700521, text_median 0.13989766438802098
audio_max 29.74558512369792, audio_min -30.22911580403646, audio_median 1.2307535807291667
max_abs_value 39.18535359700521
text_max 3.4830729166666665, text_min -3.6988932291666665, text_median 0.008951822916666666
audio_max 2.1256510416666665, audio_min -3.4420572916666665, audio_median 0.146484375
max_abs_value 6.750518798828125
text_max 3.5516459147135415, text_min -5.16357421875, text_median -0.143280029296875
audio_max 3.4322916666666665, audio_min -3.0719401041666665, audio_median 0.025309244791666668
max_abs_value 6.750518798828125
text_max 3.171875, text_min -3.6783854166666665, text_median 0.0029296875
audio_max 1.4217529296875, audio_min -2.2460123697916665, audio_median -0.024251302083333332
max_abs_value 6.750518798828125
text_max 3.1964950561523438, text_min -3.7977701822916665, text_median 0.16853841145833334
audio_max 2.1624348958333335, audio_min -1.7503865

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 4.326171875, text_min -2.6595052083333335, text_median 0.13297526041666666
audio_max 2.4710286458333335, audio_min -2.6710611979166665, audio_median -0.13427734375
max_abs_value 4.990234375
text_max 2.6324869791666665, text_min -3.8539225260416665, text_median 0.050252278645833336
audio_max 3.1879069010416665, audio_min -1.8260091145833333, audio_median -0.11404673258463541
max_abs_value 4.990234375
text_max 2.6202799479166665, text_min -1.983154296875, text_median 0.06984456380208333
audio_max 2.2028401692708335, audio_min -2.7204793294270835, audio_median -0.18868001302083331
max_abs_value 4.990234375
text_max 2.6416015625, text_min -2.2649739583333335, text_median 0.06441243489583333
audio_max 2.0634256998697915, audio_min -2.8298441569010415, audio_median -0.29728190104166663
max_abs_value 4.990234375
text_max 3.5602213541666665, text_min -2.118896484375, text_median 0.13672892252604166
audio_max 2.7224527994791665, audio_min -2.9233805338541665, audio_median -0.1447703043

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 6.58984375, text_min -6.4248046875, text_median -0.0546875
audio_max 5.462890625, audio_min -4.859375, audio_median 0.0
max_abs_value 15.8203125
text_max 5.1318359375, text_min -6.140625, text_median 0.0419921875
audio_max 7.1494140625, audio_min -6.3675537109375, audio_median 0.0
max_abs_value 15.8203125
text_max 6.544921875, text_min -5.63623046875, text_median -0.00390625
audio_max 10.9375, audio_min -10.48828125, audio_median 0.0
max_abs_value 15.8203125
text_max 15.41162109375, text_min -5.062744140625, text_median -0.04052734375
audio_max 4.3753662109375, audio_min -15.8203125, audio_median 0.0
max_abs_value 15.8203125
text_max 6.320556640625, text_min -6.266845703125, text_median -0.0191650390625
audio_max 7.048652648925781, audio_min -4.04437255859375, audio_median 0.0
max_abs_value 15.8203125
text_max 5.4293212890625, text_min -5.7590179443359375, text_median 0.021240234375
audio_max 4.486053466796875, audio_min -5.323944091796875, audio_median 0.0
max_abs_value 15.82

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 9.1728515625, text_min -5.3046875, text_median 0.15234375
audio_max 5.8974609375, audio_min -8.99609375, audio_median 0.0
max_abs_value 12.48828125
text_max 9.21484375, text_min -8.72509765625, text_median -0.087890625
audio_max 6.88427734375, audio_min -7.41766357421875, audio_median -0.001708984375
max_abs_value 12.48828125
text_max 7.185546875, text_min -6.48974609375, text_median -0.056243896484375
audio_max 6.05078125, audio_min -5.85498046875, audio_median 0.0
max_abs_value 12.48828125
text_max 9.14501953125, text_min -8.88671875, text_median 0.076171875
audio_max 12.42041015625, audio_min -8.33984375, audio_median 0.0
max_abs_value 12.48828125
text_max 9.24267578125, text_min -8.6103515625, text_median 0.0408935546875
audio_max 10.92333984375, audio_min -6.9404296875, audio_median 0.0
max_abs_value 12.48828125
text_max 5.8780517578125, text_min -7.5986328125, text_median 0.26416015625
audio_max 6.8251953125, audio_min -7.69140625, audio_median 0.0
max_abs_value 12.48828

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_fs_400
text_max 41.15386962890625, text_min -45.658935546875, text_median 0.5277099609375
audio_max 46.0242919921875, audio_min -65.822021484375, audio_median 0.0
max_abs_value 65.822021484375
text_max 6.4921875, text_min -5.2080078125, text_median 0.1015625
audio_max 5.873046875, audio_min -6.994140625, audio_median 0.0
max_abs_value 17.47235107421875
text_max 12.19329833984375, text_min -4.326171875, text_median 0.0877685546875
audio_max 5.21337890625, audio_min -17.47235107421875, audio_median 0.0
max_abs_value 17.47235107421875
text_max 7.5771484375, text_min -5.443359375, text_median -0.00146484375
audio_max 7.1259765625, audio_min -9.41845703125, audio_median 0.0
max_abs_value 17.47235107421875
text_max 7.81103515625, text_min -5.83203125, text_median -0.037109375
audio_max 5.31640625, audio_min -7.08984375, audio_median 0.0146484375
max_abs_value 17.47235107421875
text_max 6.30322265625, text_min -5.710693359375, text_median 0.07177734375
audio_max 5.587280273

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_fs_425
text_max 30.50341796875, text_min -26.1640625, text_median -0.237060546875
audio_max 33.39501953125, audio_min -32.18505859375, audio_median 0.0
max_abs_value 33.39501953125
text_max 6.15625, text_min -5.3173828125, text_median 0.1474609375
audio_max 7.576171875, audio_min -12.073486328125, audio_median 0.0
max_abs_value 12.955078125
text_max 6.78759765625, text_min -5.657958984375, text_median 0.10791015625
audio_max 6.9404296875, audio_min -5.91888427734375, audio_median 0.0
max_abs_value 12.955078125
text_max 6.14013671875, text_min -5.120361328125, text_median 0.04833984375
audio_max 6.23046875, audio_min -5.01513671875, audio_median 0.0
max_abs_value 12.955078125
text_max 6.17041015625, text_min -7.22412109375, text_median 0.0947265625
audio_max 7.982421875, audio_min -6.17822265625, audio_median 0.0
max_abs_value 12.955078125
text_max 5.116943359375, text_min -4.810546875, text_median -0.0050048828125
audio_max 10.45556640625, audio_min -12.955078125, au

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_fs_427
text_max 39.162109375, text_min -56.3134765625, text_median 0.39361572265625
audio_max 40.39697265625, audio_min -39.338134765625, audio_median -0.23614501953125
max_abs_value 56.3134765625
text_max 9.6123046875, text_min -5.580078125, text_median -0.02197265625
audio_max 5.888671875, audio_min -6.29638671875, audio_median -0.0009765625
max_abs_value 18.666015625
text_max 14.01171875, text_min -6.16015625, text_median 0.23291015625
audio_max 5.32421875, audio_min -13.2783203125, audio_median -0.169921875
max_abs_value 18.666015625
text_max 6.4541015625, text_min -5.18798828125, text_median 0.13623046875
audio_max 6.2333984375, audio_min -4.955078125, audio_median -0.0771484375
max_abs_value 18.666015625
text_max 7.39697265625, text_min -7.7861328125, text_median -0.027252197265625
audio_max 7.354248046875, audio_min -6.92333984375, audio_median 0.0
max_abs_value 18.666015625
text_max 7.80841064453125, text_min -6.712890625, text_median 0.083251953125
audio_max

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 5.33984375, text_min -6.6640625, text_median 0.05859375
audio_max 4.767578125, audio_min -5.134765625, audio_median -0.02734375
max_abs_value 16.03515625
text_max 4.6343994140625, text_min -6.61767578125, text_median 0.0693359375
audio_max 3.91845703125, audio_min -4.232421875, audio_median -0.13134765625
max_abs_value 16.03515625
text_max 10.0006103515625, text_min -9.812744140625, text_median 0.254638671875
audio_max 8.0947265625, audio_min -7.314453125, audio_median -0.08984375
max_abs_value 16.03515625
text_max 8.3134765625, text_min -5.82568359375, text_median -0.01171875
audio_max 4.078125, audio_min -7.5078125, audio_median -0.109375
max_abs_value 16.03515625
text_max 8.4503173828125, text_min -8.2978515625, text_median 0.30126953125
audio_max 5.575927734375, audio_min -6.9395751953125, audio_median -0.0322265625
max_abs_value 16.03515625
text_max 13.0673828125, text_min -6.0458984375, text_median 0.0888671875
audio_max 7.445556640625, audio_min -11.6064453125, audio_me

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_fs_499
text_max 62.416259765625, text_min -75.3116455078125, text_median -0.693267822265625
audio_max 69.19464111328125, audio_min -46.65025329589844, audio_median -1.8040008544921875
max_abs_value 75.3116455078125
text_max 7.63671875, text_min -7.1572265625, text_median 0.052734375
audio_max 5.9375, audio_min -5.359375, audio_median -0.15625
max_abs_value 13.7958984375
text_max 5.948974609375, text_min -5.462890625, text_median 0.142913818359375
audio_max 5.068359375, audio_min -5.84765625, audio_median 0.0
max_abs_value 13.7958984375
text_max 6.72021484375, text_min -6.3896484375, text_median 0.0888671875
audio_max 5.5595703125, audio_min -6.14697265625, audio_median -0.0234375
max_abs_value 13.7958984375
text_max 4.44775390625, text_min -5.99560546875, text_median 0.07470703125
audio_max 4.5419921875, audio_min -5.169921875, audio_median -0.124755859375
max_abs_value 13.7958984375
text_max 5.98681640625, text_min -6.51513671875, text_median 0.06512451171875
audio_

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 7.4765625, text_min -6.486328125, text_median 0.1875
audio_max 6.904296875, audio_min -6.025390625, audio_median 0.12109375
max_abs_value 13.373046875
text_max 4.784423828125, text_min -4.970703125, text_median -0.05126953125
audio_max 6.41259765625, audio_min -5.951171875, audio_median -0.0205078125
max_abs_value 13.373046875
text_max 6.08740234375, text_min -5.732666015625, text_median -0.0126953125
audio_max 6.896484375, audio_min -5.5926513671875, audio_median 0.0234375
max_abs_value 13.373046875
text_max 7.6279296875, text_min -7.857421875, text_median 0.02099609375
audio_max 12.804443359375, audio_min -12.96337890625, audio_median -0.11328125
max_abs_value 13.373046875
text_max 7.49560546875, text_min -7.4921875, text_median -0.0458984375
audio_max 8.16650390625, audio_min -11.8935546875, audio_median 0.01953125
max_abs_value 13.373046875
text_max 6.0107421875, text_min -5.43017578125, text_median 0.01513671875
audio_max 6.565673828125, audio_min -5.900390625, audio_medi

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  mu_fs_661
text_max 27.86376953125, text_min -26.84014892578125, text_median -0.16839599609375
audio_max 36.7806396484375, audio_min -35.079345703125, audio_median 0.0
max_abs_value 36.7806396484375
text_max 7.8876953125, text_min -8.40234375, text_median 0.0654296875
audio_max 7.365234375, audio_min -6.134765625, audio_median -0.001953125
max_abs_value 15.466796875
text_max 4.283203125, text_min -5.66796875, text_median 0.1346435546875
audio_max 4.978515625, audio_min -6.357421875, audio_median 0.0
max_abs_value 15.466796875
text_max 5.40234375, text_min -4.7412109375, text_median -0.037109375
audio_max 5.36279296875, audio_min -7.0634765625, audio_median -0.019775390625
max_abs_value 15.466796875
text_max 5.90771484375, text_min -5.306640625, text_median -0.029296875
audio_max 6.3603515625, audio_min -6.544921875, audio_median -0.18115234375
max_abs_value 15.466796875
text_max 5.01953125, text_min -4.9296875, text_median -0.093994140625
audio_max 6.365234375, audio_min

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 6.826171875, text_min -7.68359375, text_median 0.02734375
audio_max 4.951171875, audio_min -5.1328125, audio_median 0.0
max_abs_value 16.587890625
text_max 4.9888916015625, text_min -8.302734375, text_median 0.041015625
audio_max 5.07421875, audio_min -4.7529296875, audio_median 0.0
max_abs_value 16.587890625
text_max 6.53173828125, text_min -7.5693359375, text_median -0.1322021484375
audio_max 5.62939453125, audio_min -7.1201171875, audio_median 0.0
max_abs_value 16.587890625
text_max 7.3125, text_min -6.1474609375, text_median -0.099609375
audio_max 6.6640625, audio_min -6.9453125, audio_median 0.0
max_abs_value 16.587890625
text_max 10.2734375, text_min -7.487548828125, text_median -0.06005859375
audio_max 6.7900390625, audio_min -9.1162109375, audio_median 0.0
max_abs_value 16.587890625
text_max 9.8740234375, text_min -13.17578125, text_median -0.084228515625
audio_max 5.7060546875, audio_min -6.68359375, audio_median 0.0
max_abs_value 16.587890625
text_max 6.9423828125, t

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 7.12109375, text_min -6.50634765625, text_median 0.0
audio_max 6.73486328125, audio_min -8.240234375, audio_median 0.244140625
max_abs_value 14.176025390625
text_max 7.0956878662109375, text_min -7.9145355224609375, text_median -0.060546875
audio_max 7.029296875, audio_min -5.732421875, audio_median -0.01953125
max_abs_value 14.176025390625
text_max 6.1767578125, text_min -5.935546875, text_median 0.070068359375
audio_max 6.7626953125, audio_min -6.3048095703125, audio_median -0.017578125
max_abs_value 14.176025390625
text_max 7.7978515625, text_min -6.27880859375, text_median 0.155517578125
audio_max 6.294189453125, audio_min -7.7294921875, audio_median -0.10546875
max_abs_value 14.176025390625
text_max 6.53515625, text_min -5.76806640625, text_median 0.052490234375
audio_max 5.08349609375, audio_min -6.078125, audio_median 0.015625
max_abs_value 14.176025390625
text_max 8.71923828125, text_min -7.2314453125, text_median 0.140625
audio_max 7.611328125, audio_min -7.556640625,

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 1.0625, text_min -0.421875, text_median 0.0625
audio_max 0.203125, audio_min -0.125, audio_median 0.0
max_abs_value 5.830078125
text_max 1.328125, text_min -0.37890625, text_median 0.0546875
audio_max 0.20703125, audio_min -0.28515625, audio_median 0.015625
max_abs_value 5.830078125
text_max 1.1044921875, text_min -0.669921875, text_median 0.02099609375
audio_max 0.2861328125, audio_min -0.26953125, audio_median 0.0224609375
max_abs_value 5.830078125
text_max 0.8447265625, text_min -0.791015625, text_median 0.01953125
audio_max 0.37890625, audio_min -0.41015625, audio_median 0.0078125
max_abs_value 5.830078125
text_max 3.0501708984375, text_min -1.04296875, text_median -0.04296875
audio_max 0.32666015625, audio_min -0.1826171875, audio_median 0.013671875
max_abs_value 5.830078125
text_max 3.900390625, text_min -2.671875, text_median 0.01171875
audio_max 0.27197265625, audio_min -0.18914794921875, audio_median -0.0009765625
max_abs_value 5.830078125
text_max 1.73486328125, text

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.8854166666666666, text_min -0.4375, text_median 0.0625
audio_max 0.07291666666666667, audio_min -0.19791666666666666, audio_median -0.010416666666666666
max_abs_value 5.0390625
text_max 1.0393880208333333, text_min -0.71240234375, text_median 0.061197916666666664
audio_max 0.24869791666666666, audio_min -0.19889322916666666, audio_median -0.005859375
max_abs_value 5.0390625
text_max 3.857421875, text_min -2.5767008463541665, text_median 0.020843505859375
audio_max 0.8717447916666666, audio_min -0.3086140950520833, audio_median 0.0
max_abs_value 5.0390625
text_max 3.2224934895833335, text_min -0.76220703125, text_median 0.0185546875
audio_max 0.23893229166666666, audio_min -0.130859375, audio_median 0.007486979166666667
max_abs_value 5.0390625
text_max 2.51123046875, text_min -0.6888020833333334, text_median 0.10286458333333333
audio_max 0.21484375, audio_min -0.14583333333333334, audio_median -0.005208333333333333
max_abs_value 5.0390625
text_max 1.3421223958333333, text_min

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_zs_400
text_max 7.465240478515625, text_min -2.738037109375, text_median 1.0245513916015625
audio_max 0.9830474853515625, audio_min -4.86865234375, audio_median -0.1190185546875
max_abs_value 7.465240478515625
text_max 1.09375, text_min -0.96875, text_median 0.03125
audio_max 0.265625, audio_min -0.09375, audio_median 0.015625
max_abs_value 6.87890625
text_max 4.509765625, text_min -1.3828125, text_median -0.03125
audio_max 0.431640625, audio_min -0.2109375, audio_median 0.001953125
max_abs_value 6.87890625
text_max 0.953125, text_min -0.46875, text_median 0.00390625
audio_max 0.06640625, audio_min -0.3984375, audio_median -0.013671875
max_abs_value 6.87890625
text_max 3.505859375, text_min -0.49462890625, text_median 0.0595703125
audio_max 0.13671875, audio_min -0.29296875, audio_median -0.00128173828125
max_abs_value 6.87890625
text_max 1.390625, text_min -0.685546875, text_median 0.046875
audio_max 0.09375, audio_min -0.43359375, audio_median -0.03125
max_abs_va

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_zs_425
text_max 8.2724609375, text_min -2.179606119791667, text_median 0.7840983072916667
audio_max 6.987263997395833, audio_min -1.7612304687499998, audio_median 0.08247884114583334
max_abs_value 8.2724609375
text_max 0.71875, text_min -0.6041666666666666, text_median 0.08333333333333334
audio_max 0.125, audio_min -0.19791666666666666, audio_median -0.005208333333333333
max_abs_value 5.616861979166667
text_max 3.8971354166666665, text_min -1.6106770833333333, text_median 0.046875
audio_max 0.22526041666666666, audio_min -0.15576171875, audio_median 0.011067708333333332
max_abs_value 5.616861979166667
text_max 1.92578125, text_min -0.632080078125, text_median 0.109375
audio_max 1.0885213216145833, audio_min -0.2643229166666667, audio_median 0.0
max_abs_value 5.616861979166667
text_max 0.740234375, text_min -0.3919270833333333, text_median 0.005533854166666667
audio_max 0.19205729166666666, audio_min -0.3216145833333333, audio_median 0.0
max_abs_value 5.616861979166

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_zs_427
text_max 6.674072265625, text_min -4.4482421875, text_median 0.74786376953125
audio_max 4.50341796875, audio_min -3.6728515625, audio_median 0.043366432189941406
max_abs_value 6.674072265625
text_max 0.703125, text_min -0.609375, text_median 0.078125
audio_max 0.15625, audio_min -0.203125, audio_median 0.015625
max_abs_value 3.9443359375
text_max 2.78125, text_min -0.58203125, text_median 0.03125
audio_max 0.76171875, audio_min -0.61328125, audio_median 0.025390625
max_abs_value 3.9443359375
text_max 0.9736328125, text_min -0.36627197265625, text_median 0.048828125
audio_max 0.373046875, audio_min -0.548828125, audio_median 0.0166015625
max_abs_value 3.9443359375
text_max 3.544921875, text_min -0.630859375, text_median 0.0107421875
audio_max 1.34375, audio_min -0.2744140625, audio_median -0.0146484375
max_abs_value 3.9443359375
text_max 2.330078125, text_min -1.494140625, text_median 0.1529541015625
audio_max 0.51171875, audio_min -0.4375, audio_median -0.02

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 1.4270833333333333, text_min -0.5104166666666666, text_median 0.03125
audio_max 0.19791666666666666, audio_min -0.15625, audio_median 0.010416666666666666
max_abs_value 5.2115478515625
text_max 5.2115478515625, text_min -1.00390625, text_median 0.01947021484375
audio_max 0.21354166666666666, audio_min -0.4970703125, audio_median 0.0155029296875
max_abs_value 5.2115478515625
text_max 1.9007161458333333, text_min -0.8108723958333334, text_median 0.06770833333333334
audio_max 0.21516927083333334, audio_min -0.4788411458333333, audio_median -0.007080078125
max_abs_value 5.2115478515625
text_max 2.69921875, text_min -1.357421875, text_median -0.032267252604166664
audio_max 0.22395833333333334, audio_min -0.5348307291666666, audio_median 0.001953125
max_abs_value 5.2115478515625
text_max 4.452921549479167, text_min -2.2867838541666665, text_median 0.023691813151041664
audio_max 0.10493787129720052, audio_min -0.2994791666666667, audio_median -0.0048427581787109375
max_abs_value 5.21

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_zs_499
text_max 12.547760009765625, text_min -4.060089111328125, text_median 0.88885498046875
audio_max 0.8885917663574219, audio_min -3.417388916015625, audio_median -0.07221221923828125
max_abs_value 12.547760009765625
text_max 0.984375, text_min -0.21875, text_median 0.046875
audio_max 0.234375, audio_min -0.109375, audio_median 0.0
max_abs_value 6.84375
text_max 3.095703125, text_min -0.412109375, text_median 0.0048828125
audio_max 0.40234375, audio_min -0.25, audio_median -0.0068359375
max_abs_value 6.84375
text_max 0.82421875, text_min -0.54296875, text_median 0.0517578125
audio_max 0.11328125, audio_min -0.267578125, audio_median -0.01953125
max_abs_value 6.84375
text_max 1.517578125, text_min -0.50390625, text_median 0.03125
audio_max 0.442138671875, audio_min -0.265625, audio_median -0.0126953125
max_abs_value 6.84375
text_max 1.01171875, text_min -0.5859375, text_median 0.0341796875
audio_max 0.21484375, audio_min -0.43359375, audio_median -0.0224609375
m

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 1.1458333333333333, text_min -0.6875, text_median 0.11458333333333333
audio_max 0.08333333333333333, audio_min -0.21875, audio_median -0.015625
max_abs_value 6.053385416666667
text_max 1.3619791666666667, text_min -1.2122395833333333, text_median 0.09212239583333333
audio_max 0.3951822916666667, audio_min -0.22493489583333334, audio_median -0.019287109375
max_abs_value 6.053385416666667
text_max 6.053385416666667, text_min -2.6897786458333335, text_median 0.03385416666666667
audio_max 0.8893229166666666, audio_min -0.14322916666666666, audio_median 0.0
max_abs_value 6.053385416666667
text_max 1.529296875, text_min -0.5371907552083334, text_median 0.0860595703125
audio_max 0.2574869791666667, audio_min -0.13564046223958334, audio_median -0.01171875
max_abs_value 6.053385416666667
text_max 2.0846354166666665, text_min -0.4745279947916667, text_median 0.092041015625
audio_max 0.2265625, audio_min -0.2942708333333333, audio_median 0.023333231608072914
max_abs_value 6.0533854166666

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_zs_661
text_max 5.433738708496094, text_min -2.9765625, text_median 0.768310546875
audio_max 1.659912109375, audio_min -3.8873291015625, audio_median -0.05267333984375
max_abs_value 5.433738708496094
text_max 0.6875, text_min -0.609375, text_median 0.046875
audio_max 0.15625, audio_min -0.15625, audio_median -0.015625
max_abs_value 5.3408203125
text_max 2.140625, text_min -2.099609375, text_median 0.07421875
audio_max 0.21728515625, audio_min -0.12548828125, audio_median -0.00390625
max_abs_value 5.3408203125
text_max 2.07568359375, text_min -0.58203125, text_median 0.125
audio_max 0.24609375, audio_min -0.21484375, audio_median -0.0078125
max_abs_value 5.3408203125
text_max 2.173828125, text_min -1.36328125, text_median 0.025390625
audio_max 0.146484375, audio_min -0.51953125, audio_median 0.01171875
max_abs_value 5.3408203125
text_max 0.4140625, text_min -0.796875, text_median 0.0390625
audio_max 0.265625, audio_min -0.1875, audio_median 0.0
max_abs_value 5.34082

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.796875, text_min -0.359375, text_median 0.09375
audio_max 0.125, audio_min -0.203125, audio_median 0.015625
max_abs_value 4.7060546875
text_max 1.181640625, text_min -0.66064453125, text_median 0.00390625
audio_max 0.515625, audio_min -0.08984375, audio_median 0.0234375
max_abs_value 4.7060546875
text_max 4.40625, text_min -2.052734375, text_median -0.037109375
audio_max 0.76953125, audio_min -0.2705078125, audio_median 0.03125
max_abs_value 4.7060546875
text_max 0.896484375, text_min -0.724609375, text_median 0.05859375
audio_max 0.451171875, audio_min -0.1982421875, audio_median 0.03125
max_abs_value 4.7060546875
text_max 3.04248046875, text_min -1.09765625, text_median 0.0322265625
audio_max 0.27392578125, audio_min -0.32763671875, audio_median 0.00390625
max_abs_value 4.7060546875
text_max 1.486328125, text_min -2.556640625, text_median 0.0078125
audio_max 0.361328125, audio_min -0.21484375, audio_median 0.016357421875
max_abs_value 4.7060546875
text_max 1.220703125, tex

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.78125, text_min -0.328125, text_median 0.046875
audio_max 0.3125, audio_min -0.3125, audio_median 0.0
max_abs_value 4.847412109375
text_max 2.23193359375, text_min -1.186767578125, text_median 0.0203857421875
audio_max 0.240966796875, audio_min -0.14453125, audio_median 0.0051727294921875
max_abs_value 4.847412109375
text_max 2.80078125, text_min -0.720703125, text_median 0.0458984375
audio_max 0.291015625, audio_min -0.17578125, audio_median 0.001953125
max_abs_value 4.847412109375
text_max 0.9013671875, text_min -0.9423828125, text_median 0.0557861328125
audio_max 0.24169921875, audio_min -0.95947265625, audio_median 0.0113525390625
max_abs_value 4.847412109375
text_max 0.5703125, text_min -0.6015625, text_median 0.0078125
audio_max 0.15625, audio_min -0.15625, audio_median 0.01171875
max_abs_value 4.847412109375
text_max 2.3955078125, text_min -0.5390625, text_median 0.1044921875
audio_max 0.3828125, audio_min -0.5390625, audio_median 0.005859375
max_abs_value 4.847412109

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_zs_869
text_max 8.787689208984375, text_min -2.567352294921875, text_median 1.059814453125
audio_max 0.896484375, audio_min -1.5723953247070312, audio_median -0.0115814208984375
max_abs_value 8.787689208984375
text_max 1.53125, text_min -0.484375, text_median 0.09375
audio_max 0.09375, audio_min -0.09375, audio_median 0.0
max_abs_value 6.0076904296875
text_max 0.974609375, text_min -0.4638671875, text_median 0.076171875
audio_max 0.357421875, audio_min -0.265625, audio_median -0.03125
max_abs_value 6.0076904296875
text_max 3.135223388671875, text_min -1.752197265625, text_median 0.0244140625
audio_max 0.451171875, audio_min -0.279541015625, audio_median -0.0097198486328125
max_abs_value 6.0076904296875
text_max 3.099609375, text_min -0.94873046875, text_median 0.01171875
audio_max 0.193359375, audio_min -0.7624893188476562, audio_median -0.009765625
max_abs_value 6.0076904296875
text_max 2.6363525390625, text_min -0.76171875, text_median 0.138671875
audio_max 0.120

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 1.015625, text_min -0.71875, text_median 0.09375
audio_max 0.140625, audio_min -0.109375, audio_median -0.03125
max_abs_value 7.607421875
text_max 0.8125, text_min -0.3359375, text_median 0.0361328125
audio_max 0.37890625, audio_min -0.1875, audio_median 0.01171875
max_abs_value 7.607421875
text_max 4.1376953125, text_min -3.40380859375, text_median 0.0450439453125
audio_max 0.5302734375, audio_min -0.38671875, audio_median 0.0048828125
max_abs_value 7.607421875
text_max 0.943359375, text_min -0.5234375, text_median 0.0439453125
audio_max 0.220703125, audio_min -0.2578125, audio_median 0.000244140625
max_abs_value 7.607421875
text_max 1.9951171875, text_min -0.43212890625, text_median 0.00537109375
audio_max 0.4208984375, audio_min -0.26953125, audio_median 0.017578125
max_abs_value 7.607421875
text_max 7.607421875, text_min -0.45703125, text_median 0.0751953125
audio_max 0.51171875, audio_min -0.59765625, audio_median 0.0
max_abs_value 7.607421875
text_max 1.47314453125, text

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.9375, text_min -0.65625, text_median 0.0
audio_max 0.125, audio_min -0.53125, audio_median 0.0
max_abs_value 3.4609375
text_max 1.6875, text_min -0.9453125, text_median 0.03125
audio_max 0.7265625, audio_min -0.265625, audio_median 0.0
max_abs_value 3.4609375
text_max 0.86328125, text_min -0.5205078125, text_median 0.0078125
audio_max 0.23046875, audio_min -0.77734375, audio_median 0.0078125
max_abs_value 3.4609375
text_max 1.91015625, text_min -0.50775146484375, text_median 0.0078125
audio_max 0.1796875, audio_min -0.671875, audio_median -0.01171875
max_abs_value 3.4609375
text_max 1.9609375, text_min -0.5390625, text_median 0.009765625
audio_max 0.2125244140625, audio_min -0.1171875, audio_median -0.0048828125
max_abs_value 3.4609375
text_max 3.4609375, text_min -1.1806640625, text_median -0.015625
audio_max 0.382568359375, audio_min -0.51953125, audio_median -0.00390625
max_abs_value 3.4609375
text_max 0.734375, text_min -0.751953125, text_median -0.009765625
audio_max 0.

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.875, text_min -0.65625, text_median 0.0
audio_max 0.1875, audio_min -0.40625, audio_median 0.0
max_abs_value 4.857421875
text_max 1.61328125, text_min -1.12890625, text_median 0.01953125
audio_max 0.25, audio_min -0.265625, audio_median 0.00732421875
max_abs_value 4.857421875
text_max 4.65625, text_min -0.8125, text_median 0.015625
audio_max 0.171875, audio_min -0.3515625, audio_median 0.0
max_abs_value 4.857421875
text_max 0.71484375, text_min -0.56353759765625, text_median 0.0302734375
audio_max 0.380859375, audio_min -0.7890625, audio_median 0.0
max_abs_value 4.857421875
text_max 0.640625, text_min -0.90625, text_median 0.03125
audio_max 0.1875, audio_min -0.421875, audio_median 0.0
max_abs_value 4.857421875
text_max 0.97265625, text_min -0.904296875, text_median 0.03125
audio_max 0.4140625, audio_min -0.5546875, audio_median 0.00213623046875
max_abs_value 4.857421875
text_max 0.328125, text_min -0.3984375, text_median 0.025390625
audio_max 0.2890625, audio_min -0.2734375

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_fs_400
text_max 5.82080078125, text_min -5.505889892578125, text_median 0.266082763671875
audio_max 1.6513671875, audio_min -3.8603515625, audio_median 0.0068359375
max_abs_value 5.82080078125
text_max 1.4375, text_min -2.25, text_median 0.015625
audio_max 0.34375, audio_min -0.15625, audio_median 0.0
max_abs_value 3.1171875
text_max 3.1171875, text_min -1.25, text_median 0.0302734375
audio_max 0.2265625, audio_min -0.34375, audio_median -0.0078125
max_abs_value 3.1171875
text_max 2.51953125, text_min -0.8671875, text_median 0.01171875
audio_max 0.3046875, audio_min -0.1171875, audio_median 0.015625
max_abs_value 3.1171875
text_max 1.53515625, text_min -1.01171875, text_median 0.0078125
audio_max 0.234375, audio_min -1.765625, audio_median -0.0146484375
max_abs_value 3.1171875
text_max 0.625, text_min -1.1953125, text_median 0.03125
audio_max 0.109375, audio_min -0.328125, audio_median 0.0
max_abs_value 3.1171875
text_max 0.828125, text_min -0.96875, text_median -0

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_fs_425
text_max 6.0068359375, text_min -2.98583984375, text_median 0.2373046875
audio_max 3.853271484375, audio_min -0.83984375, audio_median 0.01904296875
max_abs_value 6.0068359375
text_max 2.3125, text_min -1.96875, text_median 0.03125
audio_max 0.15625, audio_min -0.59375, audio_median 0.0
max_abs_value 3.9736328125
text_max 2.65625, text_min -1.1875, text_median 0.015625
audio_max 0.484375, audio_min -0.203125, audio_median 0.0
max_abs_value 3.9736328125
text_max 2.265625, text_min -0.77734375, text_median 0.0390625
audio_max 0.62109375, audio_min -0.46875, audio_median 0.0
max_abs_value 3.9736328125
text_max 1.169921875, text_min -0.8056640625, text_median 0.0185546875
audio_max 0.3125, audio_min -0.345703125, audio_median 0.0
max_abs_value 3.9736328125
text_max 1.15625, text_min -1.03125, text_median 0.0078125
audio_max 0.21875, audio_min -0.21875, audio_median 0.0
max_abs_value 3.9736328125
text_max 1.30078125, text_min -0.953125, text_median 0.015625
audio

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_fs_427
text_max 6.642578125, text_min -2.981201171875, text_median 0.1376953125
audio_max 3.14453125, audio_min -6.294921875, audio_median 0.080413818359375
max_abs_value 6.642578125
text_max 1.59375, text_min -0.4375, text_median 0.03125
audio_max 0.125, audio_min -0.625, audio_median 0.0
max_abs_value 3.41015625
text_max 2.484375, text_min -0.59375, text_median 0.0
audio_max 1.3203125, audio_min -1.015625, audio_median 0.015625
max_abs_value 3.41015625
text_max 1.0146484375, text_min -0.56396484375, text_median 0.0120849609375
audio_max 0.33203125, audio_min -0.978515625, audio_median 0.00634765625
max_abs_value 3.41015625
text_max 3.41015625, text_min -0.7578125, text_median -0.0078125
audio_max 0.921875, audio_min -0.3359375, audio_median 0.015625
max_abs_value 3.41015625
text_max 2.0390625, text_min -0.71875, text_median 0.0078125
audio_max 0.3671875, audio_min -0.5234375, audio_median 0.0
max_abs_value 3.41015625
text_max 0.9921875, text_min -0.8515625, text_

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.96875, text_min -0.875, text_median 0.0
audio_max 0.25, audio_min -0.21875, audio_median 0.0
max_abs_value 3.3515625
text_max 2.8828125, text_min -1.31640625, text_median 0.03515625
audio_max 0.203125, audio_min -0.4140625, audio_median 0.0078125
max_abs_value 3.3515625
text_max 2.765625, text_min -1.1875, text_median -0.00390625
audio_max 0.6484375, audio_min -0.328125, audio_median 0.01171875
max_abs_value 3.3515625
text_max 0.80712890625, text_min -1.20947265625, text_median 0.005859375
audio_max 0.25, audio_min -1.109375, audio_median 0.0
max_abs_value 3.3515625
text_max 0.65625, text_min -1.0, text_median 0.015625
audio_max 0.265625, audio_min -0.15625, audio_median 0.0
max_abs_value 3.3515625
text_max 2.0390625, text_min -0.8984375, text_median 0.013671875
audio_max 0.578125, audio_min -0.7734375, audio_median 0.0048828125
max_abs_value 3.3515625
text_max 0.921875, text_min -0.765625, text_median -0.015625
audio_max 0.15625, audio_min -0.625, audio_median 0.0078125
max

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_fs_499
text_max 6.19775390625, text_min -4.042236328125, text_median 0.0439453125
audio_max 1.12841796875, audio_min -2.9013671875, audio_median -0.0263671875
max_abs_value 6.19775390625
text_max 0.8125, text_min -0.625, text_median 0.015625
audio_max 0.53125, audio_min -0.34375, audio_median 0.0
max_abs_value 4.783203125
text_max 4.783203125, text_min -1.11328125, text_median -0.0234375
audio_max 0.21337890625, audio_min -0.96875, audio_median 0.001953125
max_abs_value 4.783203125
text_max 2.001953125, text_min -1.8125, text_median 0.0
audio_max 0.232421875, audio_min -2.6875, audio_median 0.0078125
max_abs_value 4.783203125
text_max 2.2734375, text_min -1.5908203125, text_median -0.0009765625
audio_max 0.7802734375, audio_min -0.243408203125, audio_median -0.00634765625
max_abs_value 4.783203125
text_max 0.782470703125, text_min -0.439453125, text_median 0.0107421875
audio_max 0.9755859375, audio_min -0.63909912109375, audio_median 0.00439453125
max_abs_value 4.7

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.90625, text_min -1.84375, text_median 0.03125
audio_max 0.21875, audio_min -0.28125, audio_median 0.0
max_abs_value 5.7890625
text_max 1.849609375, text_min -0.80078125, text_median 0.0224609375
audio_max 0.421875, audio_min -0.37109375, audio_median 0.0
max_abs_value 5.7890625
text_max 5.7890625, text_min -3.3671875, text_median 0.0
audio_max 0.6826171875, audio_min -0.369140625, audio_median 0.0
max_abs_value 5.7890625
text_max 1.76171875, text_min -0.89453125, text_median 0.05224609375
audio_max 0.25, audio_min -0.3984375, audio_median 0.0009765625
max_abs_value 5.7890625
text_max 1.2734375, text_min -0.80078125, text_median 0.01171875
audio_max 0.22265625, audio_min -0.2890625, audio_median 0.0
max_abs_value 5.7890625
text_max 3.0859375, text_min -0.77734375, text_median 0.0
audio_max 0.22265625, audio_min -0.44921875, audio_median 0.0
max_abs_value 5.7890625
text_max 0.7265625, text_min -1.1640625, text_median 0.01171875
audio_max 0.203125, audio_min -0.21875, audio_med

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_fs_661
text_max 4.989501953125, text_min -2.448394775390625, text_median 0.2327880859375
audio_max 2.896240234375, audio_min -2.95361328125, audio_median 0.02099609375
max_abs_value 4.989501953125
text_max 0.96875, text_min -1.28125, text_median 0.03125
audio_max 0.21875, audio_min -0.15625, audio_median 0.0
max_abs_value 4.5546875
text_max 2.2177734375, text_min -1.796875, text_median 0.0234375
audio_max 1.01171875, audio_min -0.5859375, audio_median 0.0
max_abs_value 4.5546875
text_max 3.2734375, text_min -0.6484375, text_median 0.0
audio_max 0.609375, audio_min -0.5703125, audio_median 0.0
max_abs_value 4.5546875
text_max 0.953125, text_min -1.091796875, text_median 0.010498046875
audio_max 0.361328125, audio_min -1.58154296875, audio_median 0.00048828125
max_abs_value 4.5546875
text_max 0.7109375, text_min -0.9921875, text_median 0.0
audio_max 0.390625, audio_min -0.375, audio_median 0.015625
max_abs_value 4.5546875
text_max 0.897705078125, text_min -1.61083984

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


text_max 0.90625, text_min -0.59375, text_median 0.0
audio_max 0.4375, audio_min -0.6875, audio_median 0.0
max_abs_value 4.0390625
text_max 1.5234375, text_min -0.6171875, text_median 0.0244140625
audio_max 0.625, audio_min -0.1484375, audio_median 0.0078125
max_abs_value 4.0390625
text_max 3.8095703125, text_min -1.5703125, text_median 0.01171875
audio_max 1.0703125, audio_min -0.833984375, audio_median 0.0126953125
max_abs_value 4.0390625
text_max 0.71875, text_min -1.54296875, text_median 0.0234375
audio_max 0.3828125, audio_min -0.34375, audio_median 0.01171875
max_abs_value 4.0390625
text_max 1.421875, text_min -1.490234375, text_median 0.02734375
audio_max 0.15234375, audio_min -0.446044921875, audio_median 0.013278961181640625
max_abs_value 4.0390625
text_max 1.46875, text_min -1.01171875, text_median 0.0078125
audio_max 0.6484375, audio_min -0.484375, audio_median 0.0078125
max_abs_value 4.0390625
text_max 1.26171875, text_min -0.271484375, text_median 0.03125
audio_max 0.32519

  save_qual_results(exp.loc[i], gt_start=sse.loc[i][0], gt_end=sse.loc[i][1])


saving data in  qwen_fs_833
text_max 25.83050537109375, text_min -16.44873046875, text_median 0.32763671875
audio_max 8.627197265625, audio_min -20.4794921875, audio_median 0.030029296875
max_abs_value 25.83050537109375
