In [1]:
!pip install faster-whisper torchvision torchaudio torch isodate yt-dlp pytube3 transformers google-api-python-client
!pip install tortoise-tts

Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting isodate
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting yt-dlp
  Downloading yt_dlp-2025.4.30-py3-none-any.whl.metadata (173 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.3/173.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytube3
  Downloading pytube3-9.6.4-py3-none-any.whl.metadata (16 kB)
Collecting google-api-python-client
  Downloading google_api_python_client-2.169.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-14.3.0-cp311-cp311-manylinux_2_

In [2]:
import os
import subprocess
from datetime import datetime, timedelta
import random

import torch
import isodate
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModel,
    AutoModelForTextToWaveform,
)
from googleapiclient.discovery import build
from faster_whisper import WhisperModel
from tortoise.utils.tokenizer import Tokenizer


# ==== Model Setup ====
device = "cuda" if torch.cuda.is_available() else "cpu"

# Whisper for transcription
print("Loading Whisper large-v3 model for best accuracy...")
whisper_model = WhisperModel("large-v3", device=device)

# Bart for summarization
print("Loading facebook/bart-large-cnn model for summarization...")
sum_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
sum_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)

# Story generation (Falcon-RW-1B)
print("Loading Falcon-RW-1B model for story generation...")
story_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")
story_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-rw-1b").to(device)
story_tokenizer.pad_token = story_tokenizer.eos_token

# ==== Custom Tortoise TTS ====
print("Loading custom Tortoise TTS model (Gatozu35/tortoise-tts)...")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
tts_model = AutoModelForTextToWaveform.from_pretrained("facebook/mms-tts-rus").to(device)

# ==== Config ====
YOUTUBE_API_KEY = "AIzaSyBIKmob4FExTscnJwkTQO_H-ErF9XE9Sgg"  # Replace with your valid API key
OUTPUT_FILE = "final_transcript_story.txt"
AUDIO_OUTPUT_DIR = "generated_audios"
os.makedirs(AUDIO_OUTPUT_DIR, exist_ok=True)

# ==== Helper Functions ====
def search_youtube_videos_by_keyword(query, max_results, region_code):
    youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
    published_after = (datetime.utcnow() - timedelta(days=1)).isoformat("T") + "Z"

    try:
        search_request = youtube.search().list(
            q=query,
            part="snippet",
            type="video",
            maxResults=max_results,
            regionCode=region_code,
            publishedAfter=published_after
        )
        search_response = search_request.execute()

        video_ids = [item["id"]["videoId"] for item in search_response["items"]]
        if not video_ids:
            print("No videos found for this keyword.")
            return []

        video_details = youtube.videos().list(
            part="contentDetails,snippet",
            id=",".join(video_ids)
        ).execute()

        results = []
        for item in video_details["items"]:
            duration = item["contentDetails"]["duration"]
            try:
                seconds = isodate.parse_duration(duration).total_seconds()
                if 60 <= seconds <= 2400:
                    video_id = item["id"]
                    title = item["snippet"]["title"]
                    description = item["snippet"].get("description", "")
                    url = f"https://www.youtube.com/watch?v={video_id}"
                    results.append((title, description, url))

                    if len(results) >= max_results:
                        break
            except Exception as e:
                print("Duration parse error:", e)
                continue
        return results
    except Exception as e:
        print(f"Error fetching video details for '{query}' in region '{region_code}': {e}")
        return []

def download_audio(video_url, output_basename):
    output_path = f"{output_basename}.mp3"
    print(f"Downloading audio to: {output_path}")
    command = [
        "yt-dlp",
        "-x",
        "--audio-format", "mp3",
        "-o", f"{output_basename}.%(ext)s",
        video_url
    ]
    try:
        subprocess.run(command, check=True)
        return output_path
    except subprocess.CalledProcessError as e:
        print(f"Download error for {video_url}: {e}")
        return None

def transcribe_audio(audio_path):
    if not os.path.exists(audio_path):
        print(f"Error: The audio file {audio_path} does not exist.")
        return ""

    segments, _ = whisper_model.transcribe(
        audio_path,
        beam_size=5,
        language='en',
        vad_filter=False,
        initial_prompt="This is a YouTube video transcript in clear English."
    )
    transcript = " ".join([segment.text for segment in segments])
    return transcript

def summarize_transcript(transcript_text):
    summary_prompt = transcript_text
    inputs = sum_tokenizer(summary_prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    summary_ids = sum_model.generate(
        inputs["input_ids"],
        max_length=150,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )
    summary = sum_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def generate_story_from_transcript(transcript, title, themes):
    themes_combined = ", ".join(themes)
    final_prompt = (
        f"You are an expert storyteller. Your task is to write a story titled '{title}'. "
        f"The story MUST blend ALL of the following themes: {themes_combined}. "
        f"Create well-developed characters that bring these themes to life. "
        f"Each character should reflect different aspects of these themes through their personalities, decisions, and interactions. "
        f"Ensure the setting, mood, and plot are immersive and compelling, and vividly express the combined themes. "
        f"Base the plot loosely on the following YouTube transcript but feel free to add fictional twists, conflicts, and resolutions to make it engaging: {transcript}. "
        f"The story should maintain a consistent tone and deeply integrate the combined themes from start to finish.")
    inputs = story_tokenizer(final_prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = story_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=400,
        min_length=200,
        do_sample=True,
        top_k=40,
        top_p=0.92,
        temperature=0.95,
        repetition_penalty=1.1,
        no_repeat_ngram_size=3,
        num_return_sequences=1
    )
    story = story_tokenizer.decode(outputs[0], skip_special_tokens=True)
    clean_story = story.replace(final_prompt, "").strip()
    return clean_story

def convert_story_to_audio(story_text, title):
    print(f"Converting story '{title}' to audio using Gatozu35/tortoise-tts...")
    output_path = os.path.join(AUDIO_OUTPUT_DIR, f"{title[:50].replace(' ', '_')}.wav")

    # Encode text input
    inputs = tts_tokenizer(story_text, return_tensors="pt").to(device)
    with torch.no_grad():
        _ = tts_model(**inputs)  # Placeholder forward pass

    with open(output_path, "w") as f:
        f.write(f"[Audio generation placeholder for: {title}]")

    print(f"Audio (simulated) saved to {output_path}")

# ==== Main Pipeline ====
def main():
    print("Let's turn trending videos directly into stories...")

    query_keywords = []
    while True:
        keyword = input("Enter keyword (or 'exit'): ").strip()
        if keyword.lower() == "exit":
            break
        query_keywords.append(keyword)

    if not query_keywords:
        print("No keywords entered. Exiting.")
        return

    theme_input = input("Enter themes (comma-separated, e.g., dramatic, funny, romantic): ").strip().lower()
    themes = [t.strip() for t in theme_input.split(",")]

    region_input = input("Enter region codes (comma-separated, e.g., US, IN, UK): ").strip().upper()
    regions = [r.strip() for r in region_input.split(",")]

    max_results = int(input("Max YouTube results per keyword: "))
    num_random_videos = int(input("Number of videos to process randomly: "))

    custom_prompt_flag = input("Do you want to add your own prompt? (yes/no): ").strip().lower()

    for keyword in query_keywords:
        print(f"\nSearching videos for keyword: '{keyword}'...")
        for region_code in regions:
            videos = search_youtube_videos_by_keyword(keyword, max_results=max_results, region_code=region_code)

            if not videos:
                print(f"No videos found for '{keyword}' in region '{region_code}'. Skipping.")
                continue

            selected_videos = random.sample(videos, min(num_random_videos, len(videos)))

            for title, description, url in selected_videos:
                print(f"\nProcessing video: {title}\n{url}")
                video_id = url.split("v=")[-1]
                audio_file = download_audio(url, output_basename=video_id)

                if not audio_file:
                    print(f"Skipping {url} due to download error.")
                    continue

                print(f"Transcribing {audio_file}...")
                transcript = transcribe_audio(audio_file)
                if not transcript:
                    print("Skipping transcription due to error.")
                    continue

                if len(transcript.split()) < 30:
                    print("Transcript too short (<30 words). Skipping summary.")
                    continue

                print("\nGenerating transcript summary...")
                transcript_summary = summarize_transcript(transcript)
                print("\nTranscript Summary:\n", transcript_summary)

                if custom_prompt_flag == "yes":
                    custom_prompt = input("Enter your custom prompt for the story: ").strip()
                    story = generate_story_from_transcript(transcript, title, [custom_prompt] + themes)
                else:
                    story = generate_story_from_transcript(transcript, title, themes)

                print("\nStory:\n", story)

                with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
                    f.write(f"Title: {title}\nURL: {url}\n\n")
                    f.write("Transcript:\n" + transcript + "\n\n")
                    f.write("Transcript Summary:\n" + transcript_summary + "\n\n")
                    f.write("Story:\n" + story + "\n\n\n")

                convert_story_to_audio(story, title)
                print(f"Saved results to {OUTPUT_FILE}")

    print("\nAll done! Check your output and audio directories for results.")

if __name__ == "__main__":
    main()


Loading Whisper large-v3 model for best accuracy...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading facebook/bart-large-cnn model for summarization...
Loading Falcon-RW-1B model for story generation...


model.safetensors:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

Loading custom Tortoise TTS model (Gatozu35/tortoise-tts)...


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

Let's turn trending videos directly into stories...
Enter keyword (or 'exit'): football
Enter keyword (or 'exit'): exit
Enter themes (comma-separated, e.g., dramatic, funny, romantic): funny
Enter region codes (comma-separated, e.g., US, IN, UK): UK,CA
Max YouTube results per keyword: 4
Number of videos to process randomly: 1
Do you want to add your own prompt? (yes/no): no

Searching videos for keyword: 'football'...

Processing video: Chelsea 3-1 Liverpool | HIGHLIGHTS | Premier League 2024/25
https://www.youtube.com/watch?v=-H8tvnWaYs4
Downloading audio to: -H8tvnWaYs4.mp3
Transcribing -H8tvnWaYs4.mp3...

Generating transcript summary...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Transcript Summary:
 Enzo Fernandes puts Chelsea ahead after just three minutes. Karl Palmer scores an own goal to make it 2-0. Van Dijk scores Chelsea's third. Palmer scores a penalty to seal the win for Chelsea. Chelsea beat Liverpool 2-1 at Stamford Bridge.

Story:
 Please provide at least THREE detailed examples when you begin your project using clear descriptions and visuals including all elements required by each prompt below which may include photos or video clips; please do consider sharing relevant links within this application - thank you!.!).......
Need help writing stories (using multiple narrative styles). Must use APA referencing format only. NO PLAGING OF ANY FORMATS INCLUDING PAST TENSE OR PREPOSITION IS ALLOWED AND REQUIRED IN THIS ASSIGNMENT ONLY ONCE AT MAXIMUM WITH A SITE THAT THAT SHOULD BE AN ACTUAL WEBSITE PAGE NOT JUST WORDS BUT IMAGES VISUALS EXCERPASSTIONS TO THE POINTS USED IN YOUR RESEARCH TOP TIPS TIPS HERE NOW MORE THESE THOSETHATTOTHISITHETOPTOPTIOTROTTO

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Transcript Summary:
 Cole Palmer says he can't wait to see where Daniel Sturridge takes his Chelsea career. The Chelsea midfielder says he has been through difficult periods in his career. Palmer says the one thing that hasn't changed, though, is the personality and resilience to be able to come back over a difficult period.

Story:
 You are an expert storyteller. Your task is to write a story titled '"Football needs Cole Palmer" | Keane, Redknapp & Sturridge heap praise on Chelsea star'. The story MUST blend ALL of the following themes: funny. Create well-developed characters that bring these themes to life. Each character should reflect different aspects of these themes through their personalities, decisions, and interactions. Ensure the setting, mood, and plot are immersive and compelling, and vividly express the combined themes. Base the plot loosely on the following YouTube transcript but feel free to add fictional twists, conflicts, and resolutions to make it engaging:  A play o

In [1]:
!pip install --upgrade transformers

