In [35]:
import os
import librosa
import numpy as np
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm

# Locations of the NOTSOFAR index CSVs (absolute paths on the shared filesystem).
EVAL_CSV_PATH = '/ceph/dpandya/notsofar/eval_set/240825.1_eval_full_with_GT/MTG/eval.csv'
TRAIN_CSV_PATH = '/ceph/dpandya/notsofar/train_set/240825.1_train/train.csv'

# One DataFrame per split; later cells read the 'gt_transcription_files'
# and 'sc_files' columns of train_df.
eval_df = pd.read_csv(EVAL_CSV_PATH)
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [62]:
def _group_utterances(df, min_duration, max_duration):
    """Merge consecutive transcript rows into (start_ms, stop_ms, text) segments.

    Rows are consumed in the order given (the caller sorts by 'start_time').
    A row joins the open segment while the span from the segment's start to
    the row's end is at most ``max_duration`` seconds; otherwise the open
    segment is closed and the row opens a new one.

    Args:
        df: DataFrame with 'start_time', 'end_time' and 'text' columns.
            Times are multiplied by 1000 to get milliseconds, so they are
            assumed to be in seconds.
        min_duration: Minimum summed utterance duration (seconds) required
            for the trailing segment to be kept.
        max_duration: Maximum span (seconds) of a segment.

    Returns:
        List of ``(start_ms, stop_ms, text)`` tuples in transcript order.
    """
    min_ms = min_duration * 1000
    max_ms = max_duration * 1000

    groups = []
    segment_start = None
    segment_stop = None
    segment_text = ""
    speech_ms = 0  # sum of utterance durations in the open segment

    for _, row in df.iterrows():
        start = row['start_time'] * 1000  # Convert to milliseconds
        stop = row['end_time'] * 1000     # Convert to milliseconds
        text = row['text']

        if segment_start is not None and (stop - segment_start) <= max_ms:
            # Row fits: extend the open segment. Utterances may overlap, so a
            # later row can end before an earlier one; never move the end of
            # the segment backwards or audio for already-included text is lost.
            speech_ms += stop - start
            segment_stop = max(segment_stop, stop)
            segment_text += " " + text
            continue

        if segment_start is not None:
            # Row does not fit: close the open segment (kept regardless of
            # min_duration, as only the trailing segment is length-filtered).
            groups.append((segment_start, segment_stop, segment_text.strip()))

        # Open a new segment with this row, including its end time so that a
        # single-utterance segment has a valid stop boundary.
        segment_start, segment_stop = start, stop
        segment_text = text
        speech_ms = stop - start

    # Keep the trailing segment only if it holds enough speech.
    if segment_start is not None and speech_ms >= min_ms:
        groups.append((segment_start, segment_stop, segment_text.strip()))

    return groups


def segment_multiple_audios(audio_files, csv_files, output_dir, output_csv, min_duration=10, max_duration=25):
    """Cut recordings into transcript-aligned WAV segments and write a metadata CSV.

    For each (audio, transcript) pair, consecutive utterances are merged into
    segments of at most ``max_duration`` seconds, each segment is exported as a
    WAV file into ``output_dir``, and one metadata row per segment is collected.
    All rows are written to ``output_csv`` at the end.

    Args:
        audio_files: Paths of the recordings, loaded with ``AudioSegment.from_file``.
        csv_files: Paths of the transcripts, paired positionally with
            ``audio_files`` (``zip`` stops at the shorter list). Despite the
            name they are parsed with ``pd.read_json`` and must provide
            'start_time', 'end_time' and 'text'. The second-to-last
            '/'-separated path component is used as the meeting name in the
            segment file names.
        output_dir: Directory for the exported WAV files (created if missing).
        output_csv: Path of the consolidated metadata CSV.
        min_duration: Minimum summed utterance duration, in seconds, for the
            last segment of a recording to be kept.
        max_duration: Maximum span of a segment, in seconds.

    Returns:
        None. The CSV columns are 'original_audio_file',
        'segmented_audio_file', 'segmented_text', 'start' and 'stop', where
        'start'/'stop' are the segment boundaries in milliseconds.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Metadata rows for every segment of every recording
    all_segments = []

    for audio_file, csv_file in zip(audio_files, csv_files):
        audio = AudioSegment.from_file(audio_file)
        df = pd.read_json(csv_file).sort_values(by='start_time', ascending=True)

        meeting = csv_file.split('/')[-2]
        audio_stem = os.path.splitext(os.path.basename(audio_file))[0]

        groups = _group_utterances(df, min_duration, max_duration)
        for index, (segment_start, segment_stop, segment_text) in enumerate(groups, start=1):
            segment_filename = f"{meeting}_{audio_stem}_segment_{index}.wav"

            # The metadata and the exported clip use the same boundaries.
            all_segments.append({
                "original_audio_file": audio_file,
                "segmented_audio_file": segment_filename,
                "segmented_text": segment_text,
                "start": segment_start,
                "stop": segment_stop
            })

            # AudioSegment slices are indexed in milliseconds
            segment = audio[segment_start:segment_stop]
            segment.export(os.path.join(output_dir, segment_filename), format="wav")

    # Save the consolidated CSV file; explicit columns keep the header even
    # when no segment was produced.
    final_df = pd.DataFrame(
        all_segments,
        columns=["original_audio_file", "segmented_audio_file", "segmented_text", "start", "stop"],
    )
    final_df.to_csv(output_csv, index=False)

    print(f"Processed {len(all_segments)} segments across all files.")
    print(f"All segments saved in: {output_dir}")
    print(f"Consolidated metadata saved in: {output_csv}")

In [65]:
import random

# Dedicated, seeded generator so the per-transcript audio pick is identical on
# every Restart & Run All and does not depend on (or disturb) the global
# `random` state.
AUDIO_PICK_SEED = 0
audio_picker = random.Random(AUDIO_PICK_SEED)

# One entry per distinct ground-truth transcription file, in first-seen order.
transcript_files = list(train_df['gt_transcription_files'].unique())

# Several 'sc_files' rows can share one transcription file; pick exactly one of
# them per transcript so audio_files[i] pairs with transcript_files[i].
audio_files = [
    audio_picker.choice(
        list(train_df.loc[train_df['gt_transcription_files'] == gtFile, 'sc_files'])
    )
    for gtFile in transcript_files
]

In [66]:
# Segment the selected training recordings using the function's default
# duration limits; WAV clips go to the train/ folder and the metadata to
# train_segments.csv next to it.
segment_multiple_audios(
    audio_files=audio_files,
    csv_files=transcript_files,
    output_dir='/ceph/dpandya/notsofar/nsfd_adap_segments/train/',
    output_csv='/ceph/dpandya/notsofar/nsfd_adap_segments/train_segments.csv',
)

Processed 1148 segments across all files.
All segments saved in: /ceph/dpandya/notsofar/nsfd_adap_segments/train/
Consolidated metadata saved in: /ceph/dpandya/notsofar/nsfd_adap_segments/train_segments.csv
