In [4]:
import os
from langdetect import detect_langs, DetectorFactory

DetectorFactory.seed = 0  # Ensures consistent language detection results

def detect_languages(text):
    try:
        return detect_langs(text)  # Returns a list of detected languages with probabilities
    except:
        return []

def process_text_files(folder_path, output_file):
    count = 0
    video_ids = []
    
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):  # Process only text files
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
                detected_languages = detect_languages(content)
                
                # Filter out files that contain only English
                if any("en" not in str(lang) for lang in detected_languages):
                    print(f"\n- Video id: {file_name}")
                    print(f"- Detected languages: {', '.join(str(lang) for lang in detected_languages)}")
                    video_ids.append(file_name)
                    count += 1
    
    with open(output_file, 'w', encoding='utf-8') as out_file:
        out_file.write("\n".join(video_ids))
    
    print(f"\nTotal files with non-English content: {count}")
    print(f"Video IDs saved to {output_file}")

# Example usage
folder_path = "final_transcripts/"  # Replace with the actual folder path
output_file = "non_english_video_ids.txt"
process_text_files(folder_path, output_file)


- Video id: 0gBZLqvonWw.txt
- Detected languages: en:0.8571427894244699, tl:0.14285593002515537

- Video id: 4VT_EhS1IT4.txt
- Detected languages: en:0.7142815788522352, ko:0.285714059042601

- Video id: 57xXQjC83zA.txt
- Detected languages: tl:0.57142985797158, en:0.4285695925683672

- Video id: 61QLtwDaN-w.txt
- Detected languages: en:0.7142829811217422, tl:0.2857161337496648

- Video id: 6yuZbctPGUI.txt
- Detected languages: en:0.8571408059720683, tl:0.1428568546056798

- Video id: 7rHX9QbO_hg.txt
- Detected languages: en:0.8571409255252933, tl:0.1428575772102677

- Video id: 7ZK_w2w9obo.txt
- Detected languages: en:0.8571393101219136, ja:0.1428571426558753

- Video id: 86ZDNvmGZC8.txt
- Detected languages: en:0.714284804865909, et:0.14285694132051302, ko:0.1428563452732496

- Video id: bBET4ReZWME.txt
- Detected languages: tl:0.571429636277132, en:0.42856975846225553

- Video id: ER6HL3A5c8M.txt
- Detected languages: en:0.8571397605731148, tl:0.1428577506023195

- Video id: euOfRk

In [None]:
import os
import subprocess
import googleapiclient.discovery
from pytube import YouTube

# Set up YouTube Data API key
API_KEY = "replace" 
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

def load_video_ids(file_path):
    """Loads video IDs from a text file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def get_video_url(video_id):
    """Fetches video details using YouTube Data API."""
    try:
        youtube = googleapiclient.discovery.build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)
        request = youtube.videos().list(part="snippet", id=video_id)
        response = request.execute()

        if "items" in response and response["items"]:
            return f"https://www.youtube.com/watch?v={video_id}"
        else:
            print(f"⚠️ Video {video_id} not found or unavailable.")
            return None
    except googleapiclient.errors.HttpError as e:
        print(f"❌ YouTube API Error: {e}")
        return None


import yt_dlp

def download_audio(video_id, output_folder):
    url = f"https://www.youtube.com/watch?v={video_id}"
    file_path = os.path.join(output_folder, f"{video_id}.mp4")

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': file_path,
        'quiet': True
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    return file_path


def transcribe_audio(file_path, model="medium"):
    """Transcribes audio using Whisper CLI with English/Tagalog only."""
    command = f"whisper '{file_path}' --model {model} --language en --task transcribe"
    subprocess.run(command, shell=True)

def process_videos(video_ids, output_folder, model="medium"):
    """Downloads and transcribes videos using Whisper CLI with English/Tagalog only."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for video_id in video_ids:
        print(f"Processing {video_id}...")
        audio_path = download_audio(video_id, output_folder)
        if audio_path:
            transcribe_audio(audio_path, model)
            print(f"Finished transcribing {video_id}")


video_ids_file = "non_english_video_ids.txt"
output_folder = "youtube_transcripts"

video_ids = load_video_ids(video_ids_file)
process_videos(video_ids, output_folder, model="medium")


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
