# YouTube Health Tutorial Scraper

This project automates the extraction of transcripts from health-related YouTube videos. It helps researchers and content creators analyze spoken content efficiently. The script retrieves subtitles using the `youtube_transcript_api` and saves them as text files. This allows for easy data processing and further natural language analysis. With minimal setup, users can extract and store transcripts from multiple videos at once.


In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import re

# Videos whose transcripts will be downloaded. Both the short
# (youtu.be/<id>) and the full (youtube.com/watch?v=<id>) link forms appear.
video_urls = [
    "https://youtu.be/s-HThHRV4uo",
    "https://youtu.be/RSu1who-Bk8",
    "https://youtu.be/yxonJTWhBJQ",
    "https://youtu.be/PykxvdNNSK8",
    "https://youtu.be/tXKGasTUxNI",
    "https://www.youtube.com/watch?v=QRLVrc70KjI",
    "https://youtu.be/MWUM7LIXDeA",
    "https://youtu.be/7PhmyNBWGik",
    "https://youtu.be/q3wtIuJG0oc",
]

def get_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    The ID is searched for directly after ``v=`` (``watch?v=<id>`` links)
    or after a ``/`` (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``).

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is not a string
        or contains no such ID.
    """
    if not isinstance(url, str):
        return None
    # The negative lookahead forces the ID to end after exactly 11
    # characters. Without it, the first 11 characters of any longer path
    # segment (e.g. a channel ID) would be returned as a bogus video ID.
    match = re.search(r"(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", url)
    return match.group(1) if match else None

def fetch_transcript(video_id, languages=("en",)):
    """Fetch a video's transcript and flatten it into a single string.

    Args:
        video_id: The YouTube video ID to look up.
        languages: Language codes to try, in order of preference. The
            default requests English only; pass e.g. ``("en", "no")`` for
            videos whose captions exist only in another language.

    Returns:
        The text of all transcript entries joined by single spaces, or
        ``None`` if the transcript could not be retrieved (the error is
        printed rather than raised so one bad video does not stop a batch).
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id, languages=languages
        )
        return " ".join(entry["text"] for entry in transcript)
    except Exception as e:
        # Deliberately broad: any failure for one video is reported and
        # turned into None so the caller can skip it.
        print(f"Error fetching transcript for {video_id}: {e}")
        return None

if __name__ == "__main__":
    import os

    # Create the output directory up front; open() below would otherwise
    # raise FileNotFoundError when no data/ folder exists yet.
    os.makedirs("data", exist_ok=True)

    # Download each listed video's transcript to data/<video_id>.txt.
    for url in video_urls:
        video_id = get_video_id(url)
        if not video_id:
            print(f"Invalid YouTube URL: {url}")
            continue

        transcript = fetch_transcript(video_id)
        if not transcript:
            # Nothing to save: the fetch failed or the transcript was empty.
            continue

        filename = f"data/{video_id}.txt"
        with open(filename, "w", encoding="utf-8") as file:
            file.write(transcript)
        print(f"Transcript saved: {filename}")


Error fetching transcript for s-HThHRV4uo: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=s-HThHRV4uo! This is most likely caused by:

No transcripts were found for any of the requested language codes: ('en',)

For this video (s-HThHRV4uo) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - no ("Norwegian (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - ab ("Abkhazian")
 - aa ("Afar")
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - ba ("Bashkir")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - br ("Breton")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Corsican")
 - hr ("Croatian")
 - cs ("Czech")
 - da ("Danish")
 - dv ("Dive