In [None]:
pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.0.3


In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd

In [None]:
# --- Step 1: Define your speaker segments ---
# One row per turn: (speaker, start_time, end_time), times written as "HH:MM:SS".
# NOTE(review): the rows are not contiguous (e.g. nothing between 00:14:56 and
# 00:56:19), and "Andimuthu Raja" ends at 04:18:21 while the next row starts at
# 04:18:20 — confirm these timestamps against the video.
speaker_segments = [
    {"speaker": name, "start_time": begin, "end_time": finish}
    for name, begin, finish in [
        ("Poonamben Maadam", "00:00:00", "00:01:15"),
        ("Ashwini Vaishnaw", "00:01:15", "00:07:25"),
        ("Rajeshbhai Chudasama", "00:07:25", "00:08:45"),
        ("Ashwini Vaishnaw", "00:08:45", "00:09:26"),
        ("Rajeshbhai Chudasama", "00:09:30", "00:10:25"),
        ("Ashwini Vaishnaw", "00:10:25", "00:11:06"),
        ("Dr. T Sumathy Alias Thamizhachi Thangapandian", "00:11:06", "00:12:48"),
        ("Ashwini Vaishnaw", "00:12:59", "00:14:56"),
        ("Kiren Rijiju", "00:56:19", "01:52:19"),
        ("Gaurav Gogoi", "01:55:29", "02:24:49"),
        ("Ravi Shankar Prasad", "02:25:08", "02:49:32"),
        ("Akhilesh Yadav", "02:49:45", "03:10:24"),
        ("Kalyan Banerjee", "03:10:40", "03:45:50"),
        ("Andimuthu Raja", "03:46:06", "04:18:21"),
        ("Krishna Prasad Tenneti", "04:18:20", "04:29:23"),
        ("Rajiv Ranjan Singh", "04:29:46", "04:47:14"),
    ]
]

In [None]:
# --- Step 2: Helper function to convert HH:MM:SS to seconds ---
def time_to_seconds(time_str):
    """Convert a colon-separated timestamp into a total number of seconds.

    Accepts "HH:MM:SS", and also the shorter "MM:SS" and "SS" forms, so
    timestamps copied from short videos do not need zero-padding by hand.

    Args:
        time_str: Timestamp string made of one to three integer fields
            separated by ":" (e.g. "01:52:19", "07:25", "42").

    Returns:
        The timestamp expressed as an integer number of seconds.

    Raises:
        ValueError: If there are more than three fields or any field is not
            an integer.
    """
    parts = time_str.strip().split(":")
    if len(parts) > 3:
        raise ValueError(
            f"Expected 'HH:MM:SS', 'MM:SS' or 'SS', got {time_str!r}"
        )

    # Fold left-to-right in base 60: each field is worth 60x the one after it.
    total = 0
    for part in parts:
        total = total * 60 + int(part)
    return total

In [None]:
# --- Step 3: Fetch the Hindi transcript ---
# Uses the instance-based API of youtube-transcript-api 1.x; the static
# YouTubeTranscriptApi.get_transcript() helper is deprecated there.
# to_raw_data() converts the fetched transcript into a plain list of dicts
# ('text', 'start', 'duration'), which is the shape the later cells index into.
# NOTE(review): languages=['hi'] selects by language code only; whether the
# returned track is auto-generated or manually created depends on the video.
video_id = 'AwchAgK9YBw'
transcript_hi = YouTubeTranscriptApi().fetch(video_id, languages=['hi']).to_raw_data()

In [None]:
# --- Step 4: Match segments to speakers ---
def get_speaker_transcripts(transcript, speaker_segments):
    """Group transcript text by the speaker segment each entry starts in.

    Args:
        transcript: Iterable of dicts, each with a numeric 'start' (seconds)
            and a 'text' string.
        speaker_segments: Iterable of dicts with 'speaker', 'start_time' and
            'end_time' keys; the times are parsed with time_to_seconds().

    Returns:
        A list of {"speaker": ..., "text": ...} dicts, one per segment that
        captured any text, in the order the segments were given. Segments
        with no matching text are omitted.
    """
    speaker_transcripts = []

    for segment in speaker_segments:
        start_sec = time_to_seconds(segment["start_time"])
        end_sec = time_to_seconds(segment["end_time"])

        # Half-open window [start, end): back-to-back segments share a
        # boundary timestamp, and an entry starting exactly on it must go to
        # the following speaker only, not to both.
        pieces = [
            entry["text"]
            for entry in transcript
            if start_sec <= entry["start"] < end_sec
        ]
        speaker_text = " ".join(pieces).strip()

        if speaker_text:  # Only add if text exists
            speaker_transcripts.append({
                "speaker": segment["speaker"],
                "text": speaker_text,
            })

    return speaker_transcripts

In [None]:
# --- Step 5: Create structured output ---
structured_data = get_speaker_transcripts(transcript_hi, speaker_segments)
# Name the columns explicitly so the frame (and the CSV written from it) keeps
# its "speaker"/"text" header even if no segment captured any text.
df = pd.DataFrame(structured_data, columns=["speaker", "text"])

In [None]:
# --- Step 6 (optional): write the speaker/text table to disk as CSV ---
df.to_csv(
    path_or_buf="structured_waqf_speeches.csv",
    encoding='utf-8-sig',  # UTF-8 with a leading byte-order mark
    index=False,           # leave the DataFrame index out of the file
)

In [None]:
# --- Display Result: preview the first five rows as plain text ---
print(df.head(n=5))

                speaker                                               text
0      Poonamben Maadam  करूंगी कि मेरे क्षेत्र में बहुत सारे काम व डबल...
1      Ashwini Vaishnaw  महोदय ने बहुत ही एक महत्त्वपूर्ण विषय उठाया है...
2  Rajeshbhai Chudasama  अध्यक्ष जी मैं देश की जनता के की ओर से माननीय ...
3      Ashwini Vaishnaw  जी सोमनाथ एक ऐसी हमारी सांस्कृतिक धरोहर है एक ...
4  Rajeshbhai Chudasama  बताऊंगा नल मेंबर माननीय अध्यक्ष जी मेरे संसदीय...
