In [1]:
pip install youtube-transcript-api google-api-python-client pandas

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.0.3


In [2]:
import os
import time

import pandas as pd
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

In [3]:
# Read the YouTube Data API key from the environment instead of embedding it
# in the notebook: a key stored in a shared or committed notebook is exposed
# to every reader. Any key that was previously pasted here should be revoked
# and replaced.
API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Export it in the environment that "
        "launches the notebook before running this cell."
    )

# Client for the YouTube Data API v3, shared by the cells below.
youtube = build('youtube', 'v3', developerKey=API_KEY)


In [4]:
# Search for videos matching a query
def search_videos(query, max_results=10):
    """Return up to ``max_results`` YouTube video IDs matching ``query``.

    The search runs through the module-level ``youtube`` client, is restricted
    to videos (``type='video'``) and passes ``relevanceLanguage='kn'``.

    The API serves a limited number of results per page, so requests larger
    than one page are satisfied by following the pagination links until
    enough IDs are collected or the results run out.

    Args:
        query: Free-text search string.
        max_results: Maximum number of video IDs to return. Values <= 0
            yield an empty list without calling the API.

    Returns:
        list[str]: Video IDs in the order the API returned them; may be
        shorter than ``max_results`` when fewer matches exist.
    """
    if max_results <= 0:
        return []

    page_size_limit = 50  # largest maxResults value search.list accepts
    video_ids = []
    request = youtube.search().list(
        q=query,
        part='snippet',
        type='video',
        relevanceLanguage='kn',
        maxResults=min(max_results, page_size_limit)
    )

    while request is not None and len(video_ids) < max_results:
        response = request.execute()
        for item in response.get('items', []):
            # Be defensive about result shape: skip anything lacking a videoId.
            video_id = item.get('id', {}).get('videoId')
            if video_id:
                video_ids.append(video_id)
        # list_next returns None once there is no further page.
        request = youtube.search().list_next(request, response)

    return video_ids[:max_results]



In [5]:
# Fetch subtitles (if available) for a video
def get_subtitles(video_id, languages=('kn',)):
    """Return the transcript of ``video_id`` as one space-joined string.

    Works with both API styles of ``youtube-transcript-api``: releases that
    expose the instance method ``fetch`` use it, and older releases fall back
    to the static ``get_transcript``. Without this, an unpinned install of a
    release that no longer ships ``get_transcript`` would make every call
    fall into the generic error branch and silently produce no data.

    Args:
        video_id: YouTube video ID.
        languages: Language codes to try, in priority order. Defaults to
            Kannada (``'kn'``).

    Returns:
        str | None: The transcript text, or ``None`` when transcripts are
        disabled, no transcript exists in the requested languages, or any
        other error occurs (that error is printed, not raised).
    """
    try:
        wanted = list(languages)
        if hasattr(YouTubeTranscriptApi, 'fetch'):
            # Newer API: instance method returning an object convertible to
            # the same list-of-dicts shape as the old static call.
            transcript = YouTubeTranscriptApi().fetch(video_id, languages=wanted).to_raw_data()
        else:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=wanted)
        lines = [entry['text'] for entry in transcript]
        return ' '.join(lines)
    except (TranscriptsDisabled, NoTranscriptFound):
        # Expected for many videos; treated as "no subtitles" without noise.
        return None
    except Exception as e:
        # Best effort: report and continue so one bad video does not abort a
        # whole scraping run.
        print(f"Error getting subtitles for {video_id}: {e}")
        return None

In [6]:
# Collect subtitles for all videos under a dialect label
def collect_subtitles_by_topic(topic, label, max_videos=5, delay=1):
    """Search for ``topic`` and return cleaned subtitle records tagged ``label``.

    For each video found by ``search_videos``, the transcript is fetched with
    ``get_subtitles`` and cleaned with ``preprocess_text``. Videos without a
    transcript, or whose transcript is empty after cleaning, are skipped so
    the result never contains empty-text rows.

    Args:
        topic: Search query passed to ``search_videos``.
        label: Value stored under the ``'label'`` key of every record.
        max_videos: Maximum number of videos to search for.
        delay: Seconds to pause after each video; values <= 0 disable the
            pause.

    Returns:
        list[dict]: Records of the form ``{'text': ..., 'label': ...}``.
    """
    video_ids = search_videos(topic, max_videos)
    all_data = []

    for vid in video_ids:
        print(f"Fetching subtitles from: {vid}")
        subs = get_subtitles(vid)
        if subs:
            clean_text = preprocess_text(subs)
            # Cleaning can strip everything (e.g. a transcript with no
            # Kannada characters); keep only rows that still have text.
            if clean_text:
                all_data.append({'text': clean_text, 'label': label})
        if delay > 0:
            # Throttle successive transcript requests.
            time.sleep(delay)

    return all_data

In [7]:
# Basic text cleaner
import re
def preprocess_text(text):
    """Strip URLs and non-Kannada characters, then normalise whitespace.

    The substitutions run in order:
      1. remove every ``http...`` token up to the next whitespace,
      2. remove every character that is neither whitespace nor inside the
         Unicode range U+0C80-U+0CFF,
      3. collapse each run of whitespace into a single space.
    Leading and trailing whitespace is trimmed from the result.

    Args:
        text: Raw string to clean.

    Returns:
        str: The cleaned string (possibly empty).
    """
    substitutions = (
        (r'http\S+', ''),
        (r'[^\u0C80-\u0CFF\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [8]:
# Dialect label -> YouTube search query used to find videos for that dialect
dialect_topics = dict([
    ("Mysuru", "Mysuru Kannada"),
    ("Dharwad", "Dharwad Kannada"),
    ("Bengaluru", "Bengaluru Kannada"),
    ("Mangaluru", "Mangaluru Kannada"),
])

In [9]:
# Gather subtitle records for every dialect into one flat list
all_subs = []
for label in dialect_topics:
    topic = dialect_topics[label]
    # Up to 20 videos are searched per dialect.
    data = collect_subtitles_by_topic(topic=topic, label=label, max_videos=20)
    all_subs += data



Fetching subtitles from: Eqay-eFu2qg
Fetching subtitles from: G7OMhy1PsQQ
Error getting subtitles for G7OMhy1PsQQ: no element found: line 1, column 0
Fetching subtitles from: K1rxNAuzkiM
Fetching subtitles from: CIYgEe3VdtY
Fetching subtitles from: tjU01oRqk4o
Fetching subtitles from: ZeJRAaO71cc
Fetching subtitles from: sVeGZqr5n64
Error getting subtitles for sVeGZqr5n64: no element found: line 1, column 0
Fetching subtitles from: YnKJMcsmlfs
Fetching subtitles from: v5A2Xc8rwlU
Fetching subtitles from: ZipSeR-z_1k
Fetching subtitles from: MSSnEx-cgLE
Fetching subtitles from: 6J5g_-uo6zU
Fetching subtitles from: Y542NJps214
Error getting subtitles for Y542NJps214: no element found: line 1, column 0
Fetching subtitles from: s4PYBPlLniQ
Fetching subtitles from: HF6-sJHX8U0
Fetching subtitles from: LRLIW8KWMbM
Fetching subtitles from: Z7ZF1LzptbI
Error getting subtitles for Z7ZF1LzptbI: no element found: line 1, column 0
Fetching subtitles from: Rm-DcGu0My0
Fetching subtitles from: -O7NL

In [12]:
# Save to CSV
# Naming the columns explicitly keeps the `text,label` header row in the file
# even when no subtitles were collected; with data present the output is the
# same as before.
df = pd.DataFrame(all_subs, columns=['text', 'label'])
# 'utf-8-sig' prefixes the file with a UTF-8 byte-order mark.
df.to_csv("kannada_dialect_subtitles.csv", index=False, encoding='utf-8-sig')
print("✅ Subtitles saved to kannada_dialect_subtitles.csv")

✅ Subtitles saved to kannada_dialect_subtitles.csv
