<a href="https://colab.research.google.com/github/10udCryp7/TV-command-synthesis/blob/main/src_prototype/Phase3_ForceAlighment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install both Whisper backends used in this notebook: openai-whisper
# (imported as `whisper` by SpeechCleaning) and faster-whisper (used by
# OptimizedSpeechCleaning).
!pip install -q -U openai-whisper
!pip install -q -U faster-whisper

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [9

In [None]:
import json
import os
import itertools
from faster_whisper import WhisperModel
from pydub import AudioSegment
import numpy as np
import re
from difflib import SequenceMatcher
import ast
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count
import threading
from functools import lru_cache
import torch
import gc
from typing import Optional, Tuple, List, Dict, Any
import time
from pathlib import Path


class OptimizedSpeechCleaning:
    """Trim speech recordings to the span that matches their reference text.

    Each recording is transcribed with faster-whisper (word timestamps
    enabled), the recognised words are locally aligned against the reference
    text with a fuzzy Smith-Waterman pass, and the audio is cut to the aligned
    span. Every file or segment that cannot be processed is recorded in
    ``cant_clean_list`` as a ``(file_name, file_type, reason)`` tuple.
    """

    def __init__(self, cant_clean_list=None, max_workers=None, use_gpu=True, model_size="medium"):
        """Load the faster-whisper model and set up worker/bookkeeping state.

        Args:
            cant_clean_list: Optional list that failure tuples are appended to.
                A new list is created when ``None`` is given.
            max_workers: Thread count for per-file processing. Defaults to
                ``min(cpu_count(), 8)``.
            use_gpu: Use CUDA when ``torch`` reports it as available.
            model_size: Model name/size passed to ``WhisperModel``.
        """
        # Pick the device; the compute type is left at the library default.
        if use_gpu and torch.cuda.is_available():
            self.device = "cuda"
            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        else:
            self.device = "cpu"
            print("Using CPU")

        # Load the faster-whisper model.
        self.model = WhisperModel(
            model_size,
            device=self.device,
            cpu_threads=cpu_count() if self.device == "cpu" else 4
        )

        # `is None` (not truthiness) so a caller-supplied empty list is shared
        # with the caller instead of being silently replaced.
        self.cant_clean_list = [] if cant_clean_list is None else cant_clean_list
        self.max_workers = max_workers or min(cpu_count(), 8)  # cap the thread count

        # Kept for backward compatibility; the caches live on the static helpers.
        self._normalize_cache = {}
        # Guards cant_clean_list, which is appended to from worker threads.
        self._lock = threading.Lock()

        print(f"Initialized with {self.max_workers} workers on {self.device}")

    # The caches are attached to static helpers rather than to the instance
    # methods: lru_cache on a bound method keys on `self` and would keep every
    # cleaner instance (and its loaded model) alive for the cache's lifetime.
    @staticmethod
    @lru_cache(maxsize=10000)
    def _normalize(text: str) -> str:
        """Lower-case ``text``, drop punctuation and strip outer whitespace."""
        return re.sub(r'[^\w\s]', '', text.lower()).strip()

    @staticmethod
    @lru_cache(maxsize=50000)
    def _similarity(w1: str, w2: str) -> float:
        """Return the ``SequenceMatcher`` ratio of two words."""
        return SequenceMatcher(None, w1, w2).ratio()

    def normalize_text_cached(self, text: str) -> str:
        """Return the normalised form of ``text`` (cached)."""
        return self._normalize(text)

    def _record_failure(self, file_name: str, file_type: str, reason: str) -> None:
        """Append a failure tuple to ``cant_clean_list`` under the lock."""
        with self._lock:
            self.cant_clean_list.append((file_name, file_type, reason))

    def clean_pipeline(self, export_dir: str, data: pd.DataFrame, speech_folder: str, padding=None, batch_size=4):
        """Trim every file described by ``data`` and write results to ``export_dir``.

        Rows are processed in batches of ``batch_size``; each batch is run on
        a thread pool and followed by a memory cleanup.

        Args:
            export_dir: Output root directory (created if missing).
            data: One row per sample; rows are read for ``type``, ``id`` and,
                for mix types, ``num_segments``.
            speech_folder: Root folder holding one sub-folder per sample.
            padding: Optional seconds added on both sides of the aligned span.
            batch_size: Number of rows submitted per batch.
        """
        os.makedirs(export_dir, exist_ok=True)

        # Split the data into batches to avoid overloading the GPU.
        batches = [data.iloc[i:i+batch_size] for i in range(0, len(data), batch_size)]

        total_processed = 0
        start_time = time.time()

        for batch_idx, batch in enumerate(batches):
            print(f"Processing batch {batch_idx + 1}/{len(batches)}")

            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                futures = [
                    executor.submit(
                        self._process_single_file,
                        export_dir, row, speech_folder, padding
                    )
                    for _, row in batch.iterrows()
                ]

                # Collect results
                for future in futures:
                    try:
                        future.result(timeout=300)  # 5 minutes timeout
                        total_processed += 1
                    except Exception as e:
                        print(f"Error processing file: {e}")

            # Memory cleanup after each batch
            if self.device == "cuda":
                torch.cuda.empty_cache()
            gc.collect()

        elapsed = time.time() - start_time
        print(f"Processed {total_processed} files in {elapsed:.2f} seconds")
        # Guard the average: nothing may have been processed (empty data or
        # every file failed), which previously raised ZeroDivisionError.
        if total_processed:
            print(f"Average: {elapsed / total_processed:.2f} seconds per file")

    def _process_single_file(self, export_dir: str, row: pd.Series, speech_folder: str, padding):
        """Process one sample: its "full" recording plus, for mix types, each segment."""
        file_name = f"{row['type']}_{row['id']}"

        # Process full file
        self._process_single_segment(
            export_dir, file_name, "full", speech_folder, padding, row
        )

        # Process segments if applicable
        if row['type'] in ['single_mix', 'chain_mix']:
            # int() because the value may arrive as a float when the column
            # was upcast by pandas.
            segment_tasks = [
                (export_dir, file_name, f"seg_{i}", speech_folder, padding, row)
                for i in range(int(row['num_segments']))
            ]

            # Segments run in parallel too (nested pool with fewer threads).
            with ThreadPoolExecutor(max_workers=2) as executor:
                futures = [
                    executor.submit(self._process_single_segment, *task)
                    for task in segment_tasks
                ]

                for future in futures:
                    try:
                        future.result()
                    except Exception as e:
                        print(f"Error processing segment: {e}")

    def _process_single_segment(self, export_dir: str, file_name: str, file_type: str,
                               speech_folder: str, padding, row: pd.Series):
        """Trim one recording and export it as ``<file_name>_<file_type>_trimmed.wav``.

        Nothing is exported when no clean range could be determined. Any
        exception is caught, printed and recorded in ``cant_clean_list``.
        """
        try:
            start, end = self.get_clean_range(
                file_name=file_name,
                file_type=file_type,
                padding=padding,
                speech_folder=speech_folder
            )

            if start is not None and end is not None:
                trimmed_audio = self.trim_audio(
                    file_name=file_name,
                    file_type=file_type,
                    start=start,
                    end=end,
                    speech_folder=speech_folder
                )

                # Create output directory
                output_dir = Path(export_dir) / file_name
                output_dir.mkdir(parents=True, exist_ok=True)

                out_path = output_dir / f"{file_name}_{file_type}_trimmed.wav"
                trimmed_audio.export(str(out_path), format="wav")

        except Exception as e:
            print(f"Error processing {file_name}_{file_type}: {e}")
            self._record_failure(file_name, file_type, str(e))

    def get_clean_range(self, file_name: str, file_type: str, speech_folder: str, padding=None):
        """Return ``(start, end)`` in seconds of the reference text inside the recording.

        Args:
            file_name: Sample folder / file stem.
            file_type: ``"full"`` or ``"seg_<k>"``.
            speech_folder: Root folder holding one sub-folder per sample.
            padding: Optional seconds subtracted from start (clamped at 0) and
                added to end.

        Returns:
            ``(start, end)``, or ``(None, None)`` when the file is missing,
            transcription yields no words, the reference text cannot be
            loaded, or alignment fails. Every failure is printed and recorded
            in ``cant_clean_list``.
        """
        file_path = os.path.join(speech_folder, file_name, f"{file_name}_{file_type}.wav")

        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            self._record_failure(file_name, file_type, "file not found")
            return None, None

        try:
            segments, info = self.model.transcribe(
                file_path,
                word_timestamps=True,
                vad_filter=True,  # voice activity detection
                vad_parameters=dict(min_silence_duration_ms=500),
                beam_size=1,  # smaller beam for speed
                language="vi" if "vietnamese" in file_path.lower() else None  # else auto-detect
            )

            # Iterating `segments` is what actually runs the transcription.
            whisper_words = []
            for segment in segments:
                for word in segment.words:
                    whisper_words.append({
                        'word': word.word,
                        'start': word.start,
                        'end': word.end
                    })

            if not whisper_words:
                print(f"{file_name} {file_type} - No words transcribed")
                self._record_failure(file_name, file_type, "no words transcribed")
                return None, None

        except Exception as e:
            print(f"{file_name} {file_type} - Can't transcribe: {e}")
            self._record_failure(file_name, file_type, "cant transcribe")
            return None, None

        # Load reference text
        try:
            meta_path = os.path.join(speech_folder, file_name, f"{file_name}.json")
            with open(meta_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            if file_type == "full":
                ref_text = data['command']
            else:
                # The segment number is used as a string key, as in the original code.
                num_seg = file_type.split("_")[1]
                ref_text = data['text_segments'][num_seg]

        except Exception as e:
            print(f"Error loading reference text for {file_name}: {e}")
            self._record_failure(file_name, file_type, "cant load reference text")
            return None, None

        try:
            start, end = self.get_start_end_from_alignment(
                ref_text=ref_text,
                whisper_words=whisper_words
            )

            if padding:
                start = max(0, start - padding)
                end = end + padding

            return start, end

        except Exception as e:
            print(f"{file_name} {file_type} - Can't get start/end: {e}")
            self._record_failure(file_name, file_type, "cant get start end")
            return None, None

    def trim_audio(self, file_name: str, file_type: str, start: float, end: float, speech_folder: str):
        """Load the WAV file and return the ``[start, end]`` slice (seconds).

        Returns the whole audio when either bound is ``None`` and an empty
        ``AudioSegment`` when loading or slicing fails (the error is printed).
        """
        file_path = os.path.join(speech_folder, file_name, f"{file_name}_{file_type}.wav")

        try:
            audio = AudioSegment.from_wav(file_path)
            if start is None or end is None:
                return audio
            # Slicing is done in milliseconds.
            start_ms = int(start * 1000)
            end_ms = int(end * 1000)
            return audio[start_ms:end_ms]
        except Exception as e:
            print(f"Error trimming audio {file_name}_{file_type}: {e}")
            return AudioSegment.empty()

    def get_start_end_from_alignment(self, ref_text: str, whisper_words: List[Dict]):
        """Align ``ref_text`` to the transcribed words and return the span times.

        Args:
            ref_text: Reference sentence.
            whisper_words: Dicts with ``'word'``, ``'start'`` and ``'end'`` keys.

        Returns:
            ``(start_time, end_time)`` taken from the first and last aligned words.

        Raises:
            ValueError: If either word list is empty, no alignment is found,
                or the alignment indices fall outside ``whisper_words``.
        """
        ref_words = self.normalize_text_cached(ref_text).split()
        hypo_words = [self.normalize_text_cached(w['word']) for w in whisper_words]

        if not ref_words or not hypo_words:
            raise ValueError("Empty reference or hypothesis words")

        start_idx, end_idx = self.smith_waterman_fuzzy_optimized(ref_words, hypo_words)

        if start_idx >= len(whisper_words) or end_idx >= len(whisper_words):
            raise ValueError("Alignment indices out of bounds")

        start_time = whisper_words[start_idx]['start']
        end_time = whisper_words[end_idx]['end']

        return start_time, end_time

    def smith_waterman_fuzzy_optimized(self, ref_words: List[str], hypo_words: List[str],
                                     match_score=2, fuzzy_score=1, mismatch=-1, gap=-2):
        """Locally align ``ref_words`` against ``hypo_words`` (fuzzy Smith-Waterman).

        A word pair scores ``match_score`` when its similarity ratio is 1,
        ``fuzzy_score`` when it is at least 0.8, and ``mismatch`` otherwise;
        ``gap`` is charged for skipping a word on either side.

        Returns:
            ``(start_j, end_j)``: inclusive 0-based indices into ``hypo_words``
            of the best local alignment.

        Raises:
            ValueError: If no cell gets a positive score.
        """
        m, n = len(ref_words), len(hypo_words)

        # Keep all arithmetic in float32 so the equality checks in the
        # traceback compare exactly the values that were stored.
        gap_cost = np.float32(gap)
        score = np.zeros((m+1, n+1), dtype=np.float32)

        # Pre-compute the substitution score of every word pair.
        subst = np.zeros((m, n), dtype=np.float32)
        for i in range(m):
            for j in range(n):
                sim = self.word_similarity_fast(ref_words[i], hypo_words[j])
                if sim == 1:
                    subst[i, j] = match_score
                elif sim >= 0.8:
                    subst[i, j] = fuzzy_score
                else:
                    subst[i, j] = mismatch

        max_score = 0
        max_pos = None
        for i in range(1, m+1):
            for j in range(1, n+1):
                diag = score[i-1, j-1] + subst[i-1, j-1]
                delete = score[i-1, j] + gap_cost
                insert = score[i, j-1] + gap_cost
                score[i, j] = max(0, diag, delete, insert)

                if score[i, j] > max_score:
                    max_score = score[i, j]
                    max_pos = (i, j)

        if max_pos is None:
            raise ValueError("No alignment found")

        # Traceback: follow the move that actually produced each cell. Deciding
        # by similarity alone sends an aligned mismatch (e.g. "on" vs "of")
        # sideways and cuts the span short.
        i, j = max_pos
        end_j = j - 1

        while i > 0 and j > 0 and score[i, j] > 0:
            current = score[i, j]
            if current == score[i-1, j-1] + subst[i-1, j-1]:
                i -= 1
                j -= 1
            elif current == score[i-1, j] + gap_cost:
                i -= 1
            else:
                j -= 1

        start_j = j
        return start_j, end_j

    def word_similarity_fast(self, w1: str, w2: str) -> float:
        """Return the similarity ratio of two words (cached)."""
        return self._similarity(w1, w2)

    def cleanup(self):
        """Release the underlying model and free cached GPU/Python memory."""
        if hasattr(self.model, 'model'):
            del self.model.model
        if self.device == "cuda":
            torch.cuda.empty_cache()
        gc.collect()

    def get_performance_stats(self) -> Dict[str, Any]:
        """Return device, worker, failure-count and GPU memory figures."""
        gpu_available = torch.cuda.is_available()
        return {
            "device": self.device,
            "max_workers": self.max_workers,
            "failed_files": len(self.cant_clean_list),
            "gpu_available": gpu_available,
            "gpu_memory_allocated": torch.cuda.memory_allocated() if gpu_available else 0,
            "gpu_memory_reserved": torch.cuda.memory_reserved() if gpu_available else 0
        }

    # Backward compatibility methods
    def str2list(self, list_str: str):
        """Parse a Python literal (e.g. a list written as a string)."""
        return ast.literal_eval(list_str)

    def word_similarity(self, w1: str, w2: str) -> float:
        """Alias of ``word_similarity_fast``."""
        return self.word_similarity_fast(w1, w2)

    def normalize_text(self, text: str) -> str:
        """Alias of ``normalize_text_cached``."""
        return self.normalize_text_cached(text)


# Utility function để benchmark performance
def benchmark_processing(cleaner: "OptimizedSpeechCleaning", test_files: List[str]) -> Dict[str, float]:
    """Time a full transcription of each file with the cleaner's model.

    Args:
        cleaner: Object whose ``model.transcribe(path, word_timestamps=True)``
            returns a ``(segments, info)`` pair.
        test_files: Audio file paths to benchmark.

    Returns:
        Mapping of file path to elapsed seconds; ``float('inf')`` for files
        whose transcription raised (the error is printed).
    """
    results = {}

    for file_path in test_files:
        start_time = time.perf_counter()

        try:
            segments, _info = cleaner.model.transcribe(file_path, word_timestamps=True)
            # faster-whisper returns a lazy generator: decoding only happens
            # while iterating, so the segments must be consumed for the timing
            # to include the actual transcription work.
            for _segment in segments:
                pass
            results[file_path] = time.perf_counter() - start_time
        except Exception as e:
            results[file_path] = float('inf')
            print(f"Error benchmarking {file_path}: {e}")

    return results

In [None]:
import os
import json
import pandas as pd

def load_json_folders(root_dir: str) -> pd.DataFrame:
    """
    Collect every JSON file found one level below ``root_dir`` into a DataFrame.

    Each immediate sub-directory of ``root_dir`` is scanned for files ending
    in ".json"; every file that parses contributes one record. Files that
    fail to parse are reported on stdout and skipped.

    Parameters
    ----------
    root_dir : str
        Root directory containing the sub-directories.

    Returns
    -------
    pd.DataFrame
        DataFrame built from all successfully loaded JSON objects.
    """
    records = []
    for entry in os.listdir(root_dir):
        folder = os.path.join(root_dir, entry)
        if not os.path.isdir(folder):
            continue
        # Only the JSON files directly inside the sub-directory are read.
        json_names = [name for name in os.listdir(folder) if name.endswith(".json")]
        for name in json_names:
            fpath = os.path.join(folder, name)
            with open(fpath, "r", encoding="utf-8") as handle:
                try:
                    records.append(json.load(handle))
                except Exception as e:
                    print(f"⚠️ Lỗi đọc {fpath}: {e}")
    return pd.DataFrame(records)


In [None]:
# Download the dataset archive from Google Drive by its file id.
!gdown 1--F60ZQLlD5EKJ32On6MB5CHXW6V6ncJ

Downloading...
From (original): https://drive.google.com/uc?id=1--F60ZQLlD5EKJ32On6MB5CHXW6V6ncJ
From (redirected): https://drive.google.com/uc?id=1--F60ZQLlD5EKJ32On6MB5CHXW6V6ncJ&confirm=t&uuid=d9b156c0-ca8b-4ce1-b044-dabf8691b11f
To: /kaggle/working/sample-5000-speech-synthesis-zip.zip
100%|██████████████████████████████████████| 1.21G/1.21G [00:19<00:00, 62.2MB/s]


In [None]:
# Extract the downloaded archive quietly into the working directory.
!unzip -q sample-5000-speech-synthesis-zip.zip

In [None]:
import os

# Map every dataset part folder to a DataFrame of its per-sample JSON metadata.
dataset_root = 'sample-5000-speech-synthesis-zip'
df_dict = {}
for path in os.listdir(dataset_root):
    root_folder = os.path.join(dataset_root, path)
    # Skip stray files in the dataset root: load_json_folders lists its
    # argument as a directory and would raise on a regular file.
    if not os.path.isdir(root_folder):
        continue
    df = load_json_folders(root_folder)
    df_dict[root_folder] = df

In [None]:
from pathlib import Path

# Run the cleaning pipeline over every dataset part and gather the failures.
cant_clean_list = []
if __name__ == "__main__":
    for speech_folder, folder_df in df_dict.items():
        # A fresh cleaner (and Whisper model) is created for each folder.
        cleaner = OptimizedSpeechCleaning(
            model_size="medium",  # or "large-v2" for higher accuracy
            use_gpu=True,
            max_workers=8,
        )
        try:
            name = Path(speech_folder).name
            export_dir = f"trimmed_speech/{name}"

            # Time the whole folder.
            start_time = time.time()
            cleaner.clean_pipeline(
                export_dir=export_dir,
                data=folder_df,
                speech_folder=speech_folder,
                padding=0.3,
                batch_size=4,
            )
            total_time = time.time() - start_time
            print(f"Total processing time: {total_time:.2f} seconds")

            # Report device / worker / failure statistics for this folder.
            stats = cleaner.get_performance_stats()
            print("Performance Statistics:")
            for key, value in stats.items():
                print(f"  {key}: {value}")

            # One failure list per folder (flattened in a later cell).
            cant_clean_list.append(cleaner.cant_clean_list)
        finally:
            # Always release the model, even when the pipeline raised.
            cleaner.cleanup()

Using GPU: Tesla T4
Initialized with 8 workers on cuda
Processing batch 1/25
Processing batch 2/25
single_active_3ab1ad53 full - No words transcribed
chain_mix_703ad329 seg_0 - No words transcribed
chain_mix_703ad329 seg_2 - No words transcribed
chain_mix_703ad329 seg_1 - Can't get start/end: No alignment found
chain_mix_703ad329 seg_3 - No words transcribed
Processing batch 3/25
single_active_4927df2c full - No words transcribed
single_active_993460a7 full - Can't get start/end: No alignment found
Processing batch 4/25
single_mix_065210b2 seg_0 - No words transcribed
Processing batch 5/25
Processing batch 6/25
single_active_a95053ca full - No words transcribed
single_active_a64c34dd full - No words transcribed
non_active_ca780a7e full - Can't get start/end: No alignment found
single_mix_a2792a9d seg_0 - Can't get start/end: No alignment found
Processing batch 7/25
single_mix_a2b30150 seg_0 - Can't get start/end: No alignment found
Processing batch 8/25
Processing batch 9/25
Processing

In [None]:
# Recursively archive the trimmed audio tree into trimmed_speech.zip.
!zip trimmed_speech.zip trimmed_speech -r

  adding: trimmed_speech/ (stored 0%)
  adding: trimmed_speech/synthesis_command_part_43/ (stored 0%)
  adding: trimmed_speech/synthesis_command_part_43/single_mix_71c4eae5/ (stored 0%)
  adding: trimmed_speech/synthesis_command_part_43/single_mix_71c4eae5/single_mix_71c4eae5_seg_0_trimmed.wav (deflated 24%)
  adding: trimmed_speech/synthesis_command_part_43/single_mix_71c4eae5/single_mix_71c4eae5_seg_1_trimmed.wav (deflated 8%)
  adding: trimmed_speech/synthesis_command_part_43/single_mix_71c4eae5/single_mix_71c4eae5_full_trimmed.wav (deflated 18%)
  adding: trimmed_speech/synthesis_command_part_43/non_active_27ddd9f1/ (stored 0%)
  adding: trimmed_speech/synthesis_command_part_43/non_active_27ddd9f1/non_active_27ddd9f1_full_trimmed.wav (deflated 10%)
  adding: trimmed_speech/synthesis_command_part_43/single_mix_70faf1a0/ (stored 0%)
  adding: trimmed_speech/synthesis_command_part_43/single_mix_70faf1a0/single_mix_70faf1a0_full_trimmed.wav (deflated 7%)
  adding: trimmed_speech/synthe

In [None]:
# Display the collected failures: one list of (file_name, file_type, reason)
# tuples per processed folder.
cant_clean_list

In [None]:
# Alias the nested failure lists under the name the next cell flattens.
data = cant_clean_list

In [None]:
import pandas as pd

# Merge the per-folder failure lists into one flat list of tuples.
flat_data = []
for sublist in data:
    flat_data.extend(sublist)

# One row per failed file/segment; the third column holds the failure reason.
df = pd.DataFrame(flat_data, columns=["id", "segment", "transcription"])

# Persist the table as CSV, then show it.
df.to_csv("output.csv", index=False, encoding="utf-8")

print(df)


                          id segment         transcription
0     single_active_3ab1ad53    full  no words transcribed
1         chain_mix_703ad329   seg_0  no words transcribed
2         chain_mix_703ad329   seg_2  no words transcribed
3         chain_mix_703ad329   seg_1    cant get start end
4         chain_mix_703ad329   seg_3  no words transcribed
...                      ...     ...                   ...
2363   chain_active_3513ddcf    full    cant get start end
2364     single_mix_134da7b3   seg_1    cant get start end
2365     single_mix_134da7b3   seg_2  no words transcribed
2366     single_mix_134da7b3   seg_0    cant get start end
2367      chain_mix_d226a6e4   seg_0    cant get start end

[2368 rows x 3 columns]


In [None]:
# Number of distinct sample ids with at least one failed file or segment.
len(set(df['id'].values))

1575

In [None]:
import json
import os
import itertools
import whisper
from pydub import AudioSegment
import numpy as np
import re
from difflib import SequenceMatcher
import ast
import pandas as pd


class SpeechCleaning:
    """Sequential speech cleaner built on openai-whisper.

    For each recording the audio is transcribed with word timestamps, the
    recognised words are locally aligned against the reference text (fuzzy
    Smith-Waterman), and the audio is trimmed to the aligned span. Failures
    are recorded in ``cant_clean_list`` as ``(file_name, file_type, reason)``.
    """

    def __init__(self, cant_clean_list = None):
        """Load the Whisper "medium" model.

        Args:
            cant_clean_list: Optional list that failure tuples are appended to;
                a new list is created when ``None`` is given.
        """
        self.model = whisper.load_model("medium")
        # `is None` so that a caller-supplied empty list is kept and shared.
        self.cant_clean_list = [] if cant_clean_list is None else cant_clean_list

    def clean_pipeline(self, export_dir: str, data: pd.DataFrame, speech_folder: str, padding = None):
        """Trim and export every recording described by ``data``.

        Args:
            export_dir: Output root directory (created if missing).
            data: One row per sample; rows are read for ``type``, ``id`` and,
                for mix types, ``num_segments``.
            speech_folder: Root folder holding one sub-folder per sample.
            padding: Optional seconds added on both sides of the aligned span.
        """
        os.makedirs(export_dir, exist_ok = True)
        for _, row in data.iterrows():
            file_name = f"{row['type']}_{row['id']}"
            self._clean_and_export(export_dir, file_name, "full", speech_folder, padding)

            if row['type'] in ['single_mix', 'chain_mix']:
                # int() because the value may arrive as a float when the
                # column was upcast by pandas.
                for i in range(int(row['num_segments'])):
                    self._clean_and_export(export_dir, file_name, f"seg_{i}", speech_folder, padding)

    def _clean_and_export(self, export_dir, file_name, file_type, speech_folder, padding):
        """Trim one recording and write ``<file_name>_<file_type>_trimmed.wav``.

        When no clean range is found the untrimmed audio is exported.
        """
        start, end = self.get_clean_range(file_name = file_name, file_type = file_type,
                                          padding = padding, speech_folder = speech_folder)
        trimmed_audio = self.trim_audio(file_name = file_name, file_type = file_type,
                                        start = start, end = end, speech_folder = speech_folder)

        os.makedirs(os.path.join(export_dir, file_name), exist_ok = True)
        out_path = os.path.join(export_dir, file_name, f"{file_name}_{file_type}_trimmed.wav")
        trimmed_audio.export(out_path, format="wav")

    def get_clean_range(self, file_name: str, file_type: str, speech_folder: str, padding = None):
        """Return ``(start, end)`` in seconds of the reference text inside the recording.

        Returns ``(None, None)`` when transcription or alignment fails; the
        failure is printed and recorded in ``cant_clean_list``.
        """
        # prepare whisper words
        file_path = os.path.join(speech_folder, file_name, f"{file_name}_{file_type}.wav")

        try:
            # Transcription itself is inside the try so a decoding error is
            # recorded as "cant transcribe" instead of aborting the pipeline.
            transcribe = self.model.transcribe(file_path, word_timestamps=True)
            whisper_words = [speaker['words'] for speaker in transcribe['segments']]
            whisper_words = list(itertools.chain.from_iterable(whisper_words))
        except Exception as e:
            print(file_name + " " + file_type + " " + 'cant transcribe')
            self.cant_clean_list.append((file_name, file_type, "cant transcribe"))
            return None, None

        # prepare reference text
        meta_path = os.path.join(speech_folder, file_name, f"{file_name}.json")

        with open(meta_path, "r", encoding = "utf-8") as f:
            data = json.load(f)

        if file_type == "full":
            ref_text = data['command']
        else:
            # The segment number is used as a string key, as in the original code.
            num_seg = file_type.split("_")[1]
            ref_text = data['text_segments'][num_seg]

        try:
            start, end = self.get_start_end_from_alignment(ref_text = ref_text,
                                                  whisper_words = whisper_words)
            if padding:
                start = max(0, start - padding)
                end = end + padding
        except Exception as e:
            # Problems with long synthesis, should use only with segment
            print(file_name + " " + file_type + " " + 'cant get start end')
            print(e)
            print(f'ref_text: {ref_text}')
            print(f'whisper_words: {whisper_words}')
            self.cant_clean_list.append((file_name, file_type, "cant get start end"))
            return None, None
        return start, end

    def trim_audio(self, file_name, file_type, start, end, speech_folder):
        """Load the WAV file and return the ``[start, end]`` slice (seconds).

        The whole audio is returned when either bound is ``None``.
        """
        file_path = os.path.join(speech_folder, file_name, f"{file_name}_{file_type}.wav")

        audio = AudioSegment.from_wav(file_path)
        # Compare against None: a start of 0.0 is a valid bound and must not
        # disable trimming.
        if start is None or end is None:
            return audio
        # Slicing is done in milliseconds.
        return audio[int(start * 1000) : int(end * 1000)]

    def get_start_end_from_alignment(self, ref_text, whisper_words):
        """Align ``ref_text`` to the transcribed words and return the span times.

        Raises:
            ValueError: If no alignment is found.
        """
        ref_words = self.normalize_text(ref_text).split()
        hypo_words = [self.normalize_text(w['word']) for w in whisper_words]

        start_idx, end_idx = self.smith_waterman_fuzzy(ref_words, hypo_words)
        start_time = whisper_words[start_idx]['start']
        end_time = whisper_words[end_idx]['end']

        return start_time, end_time

    def smith_waterman_fuzzy(self, ref_words, hypo_words, match_score=2, fuzzy_score=1, mismatch=-1, gap=-2):
        """Locally align ``ref_words`` against ``hypo_words`` (fuzzy Smith-Waterman).

        A word pair scores ``match_score`` when its similarity ratio is 1,
        ``fuzzy_score`` when it is at least 0.8, and ``mismatch`` otherwise;
        ``gap`` is charged for skipping a word on either side.

        Returns:
            ``(start_j, end_j)``: inclusive 0-based indices into ``hypo_words``
            of the best local alignment.

        Raises:
            ValueError: If no cell gets a positive score.
        """
        m, n = len(ref_words), len(hypo_words)
        score = np.zeros((m+1, n+1))
        # Substitution score of each word pair, kept for the traceback.
        subst = np.zeros((m, n))
        max_score = 0
        max_pos = None

        for i in range(1, m+1):
            for j in range(1, n+1):
                sim = self.word_similarity(ref_words[i-1], hypo_words[j-1])
                if sim == 1:
                    s = match_score
                elif sim >= 0.8:
                    s = fuzzy_score
                else:
                    s = mismatch
                subst[i-1, j-1] = s

                diag = score[i-1, j-1] + subst[i-1, j-1]
                delete = score[i-1, j] + gap
                insert = score[i, j-1] + gap
                score[i, j] = max(0, diag, delete, insert)

                if score[i, j] > max_score:
                    max_score = score[i, j]
                    max_pos = (i, j)

        # Without this check an unalignable pair crashed with
        # "cannot unpack non-iterable NoneType object".
        if max_pos is None:
            raise ValueError("No alignment found")

        # Traceback: follow the move that actually produced each cell. Deciding
        # by similarity alone sends an aligned mismatch sideways and cuts the
        # span short.
        i, j = max_pos
        end_j = j - 1
        while i > 0 and j > 0 and score[i, j] > 0:
            current = score[i, j]
            if current == score[i-1, j-1] + subst[i-1, j-1]:
                i -= 1
                j -= 1
            elif current == score[i-1, j] + gap:
                i -= 1
            else:
                j -= 1
        start_j = j

        return start_j, end_j

    def str2list(self, list_str: str):
        """Parse a Python literal (e.g. a list written as a string)."""
        return ast.literal_eval(list_str)

    def word_similarity(self, w1, w2):
        """Return the ``SequenceMatcher`` ratio of two words."""
        return SequenceMatcher(None, w1, w2).ratio()

    def normalize_text(self, text):
        """Lower-case ``text``, drop punctuation and strip outer whitespace."""
        return re.sub(r'[^\w\s]', '', text.lower()).strip()



In [None]:
# Instantiate the sequential cleaner; its constructor loads the Whisper
# "medium" model.
sc = SpeechCleaning()

In [None]:
# Load the sample table, using the first CSV column as the index.
df = pd.read_csv("/kaggle/working/100-samples.csv", index_col = 0)

In [None]:
# Trim every sample listed in df, padding each aligned span by 0.3 s on both
# sides, and write the results under trimmed_speech/.
sc.clean_pipeline(export_dir = 'trimmed_speech', speech_folder = '/kaggle/working/synthesis_command', data = df, padding = 0.3)

single_active_4ff7f40d full cant get start end
cannot unpack non-iterable NoneType object
ref_text: show me Cartoon Network
whisper_words: [{'word': ' What', 'start': 0.44000000000000006, 'end': 0.88, 'probability': 0.2911490797996521}, {'word': ' a', 'start': 0.88, 'end': 1.08, 'probability': 0.19421282410621643}, {'word': ' pleasant', 'start': 1.08, 'end': 1.3, 'probability': 0.23018227517604828}, {'word': ' tome,', 'start': 1.3, 'end': 1.7, 'probability': 0.5129103735089302}, {'word': " you're", 'start': 1.76, 'end': 1.94, 'probability': 0.4445706903934479}, {'word': ' a', 'start': 1.94, 'end': 2.0, 'probability': 0.19106259942054749}, {'word': ' first', 'start': 2.0, 'end': 2.26, 'probability': 0.5872543454170227}, {'word': '-gen', 'start': 2.26, 'end': 2.72, 'probability': 0.4452144652605057}, {'word': ' dab,', 'start': 2.72, 'end': 3.14, 'probability': 0.5577821135520935}, {'word': ' and', 'start': 3.46, 'end': 3.64, 'probability': 0.920242965221405}, {'word': ' here', 'start': 3

In [None]:
# Display the (file_name, file_type, reason) tuples recorded for failures.
sc.cant_clean_list

[('single_active_4ff7f40d', 'full', 'cant get start end'),
 ('single_mix_d337321c', 'seg_0', 'cant get start end'),
 ('chain_mix_79b56df5', 'full', 'cant get start end'),
 ('single_mix_5143fc18', 'seg_1', 'cant get start end'),
 ('single_mix_5143fc18', 'seg_3', 'cant get start end'),
 ('single_mix_5143fc18', 'seg_5', 'cant get start end'),
 ('single_mix_5143fc18', 'seg_9', 'cant get start end'),
 ('single_mix_5143fc18', 'seg_10', 'cant get start end'),
 ('single_mix_966300b9', 'seg_0', 'cant get start end'),
 ('single_active_7aa90624', 'full', 'cant get start end'),
 ('single_mix_c6252315', 'seg_0', 'cant get start end'),
 ('single_mix_c6252315', 'seg_1', 'cant get start end'),
 ('single_mix_c6252315', 'seg_2', 'cant get start end'),
 ('single_mix_c6252315', 'seg_3', 'cant get start end'),
 ('single_mix_c6252315', 'seg_4', 'cant get start end'),
 ('single_mix_c6252315', 'seg_5', 'cant get start end'),
 ('single_mix_c6252315', 'seg_7', 'cant get start end'),
 ('single_mix_c6252315', 'se