In [None]:
from pytube import YouTube
import os
import yt_dlp
import os
import unicodedata
from moviepy import VideoFileClip
import whisper

# BAIXA O VÍDEO

In [None]:
# --- Download configuration ---
# Directory that receives the downloaded videos.
DOWNLOAD_FOLDER = "videos"
# Text file listing the YouTube Shorts URLs, one URL per line.
URL_LIST_PATH = "shorts_list.txt"

def download_video(url, output_path):
    """Download one video with pytube, reporting progress via ``print``.

    The progressive MP4 streams are ordered by resolution, descending, and
    the first one is downloaded.

    Args:
        url: Address of the video to fetch.
        output_path: Directory handed to the stream's ``download`` call.

    Any exception raised along the way is caught and printed; nothing is
    returned or re-raised.
    """
    try:
        yt = YouTube(url)
        # Narrow to progressive MP4 streams, then take the top of the
        # resolution-descending ordering.
        candidates = yt.streams.filter(progressive=True, file_extension='mp4')
        stream = candidates.order_by('resolution').desc().first()
        if not stream:
            print(f"No suitable stream found for {url}")
            return
        print(f"Downloading: {yt.title}")
        stream.download(output_path=output_path)
        print(f"Downloaded: {yt.title}\n")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

In [None]:
def download_with_ytdlp(url, DOWNLOAD_FOLDER):
    """Download a single URL with yt-dlp.

    Args:
        url: Address passed to yt-dlp's ``download`` call.
        DOWNLOAD_FOLDER: Directory prefix of the output template; the file
            name part of the template is ``%(title)s.%(ext)s``.
    """
    output_template = '{}/%(title)s.%(ext)s'.format(DOWNLOAD_FOLDER)
    ydl_opts = dict(outtmpl=output_template, format='mp4', quiet=False)
    with yt_dlp.YoutubeDL(ydl_opts) as downloader:
        downloader.download([url])

In [None]:
# Read the URL list (one URL per line, blank lines skipped) and download
# every entry with yt-dlp.
try:
    with open(URL_LIST_PATH, 'r') as file:
        urls = [line.strip() for line in file if line.strip()]
except FileNotFoundError:
    # Bind `urls` even on failure: otherwise the loop below would raise
    # NameError on a fresh kernel, or silently re-use a stale list left
    # over from an earlier run of this cell.
    urls = []
    print(f"File not found: {URL_LIST_PATH}")
else:
    # Only download (and report completion) when the list was actually read.
    for url in urls:
        download_with_ytdlp(url, DOWNLOAD_FOLDER)

    print("Batch download complete.")

# TRANSFORMA O VÍDEO EM ÁUDIO

In [None]:
# Folder layout for the audio-extraction / transcription stage.
VIDEOS_FOLDER = 'videos'            # input: video files
TRANSCRIPTS_FOLDER = 'transcription'
AUDIO_FOLDER = 'temp_audio'         # output: extracted .wav files

# Create both output folders up front; folders that already exist are left alone.
for _output_dir in (TRANSCRIPTS_FOLDER, AUDIO_FOLDER):
    os.makedirs(_output_dir, exist_ok=True)

# Whisper "base" model.
model = whisper.load_model("base")

def sanitize_filename(name):
    """Return an ASCII-only, filesystem-friendly version of ``name``.

    Accented characters are decomposed (NFKD) and reduced to their ASCII
    base letters. Every character that is not alphanumeric, a space, '.',
    '_' or '-' is dropped, and trailing whitespace is stripped.

    If nothing survives the filtering (for example a title written entirely
    in a non-Latin script or in emoji), a deterministic ``untitled_<hash>``
    name derived from the original text is returned instead of an empty
    string. This keeps callers from building a bare ".wav" path and makes
    it unlikely that two such titles map to the same file.

    Args:
        name: Raw file stem to clean up.

    Returns:
        A non-empty string made only of ASCII letters, digits, spaces,
        '.', '_' and '-'.
    """
    import hashlib  # local import: only needed for the fallback name

    allowed_extra = (' ', '.', '_', '-')
    # Remove accents and any character that cannot be expressed in ASCII.
    ascii_name = unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
    cleaned = ''.join(c for c in ascii_name if c.isalnum() or c in allowed_extra).rstrip()
    if cleaned:
        return cleaned

    # Fallback: hash the original text so different inputs get different names.
    # 'backslashreplace' keeps encoding from failing on lone surrogates.
    digest = hashlib.sha256(name.encode('utf-8', 'backslashreplace')).hexdigest()[:8]
    return f'untitled_{digest}'

# Collect the entries of VIDEOS_FOLDER whose name ends in a video extension
# (case-insensitive).
video_files = [f for f in os.listdir(VIDEOS_FOLDER) if f.lower().endswith(('.mp4', '.mov', '.webm', '.mkv'))]

# Extract each video's audio track to a .wav file in AUDIO_FOLDER.
for video_file in video_files:
    input_path = os.path.join(VIDEOS_FOLDER, video_file)

    # The listing filter only looks at the name, so skip anything that is
    # not a regular file (e.g. a directory named "x.mp4").
    if not os.path.isfile(input_path):
        print(f"❌ File not found: {input_path}")
        continue

    safe_name = sanitize_filename(os.path.splitext(video_file)[0])
    audio_path = os.path.join(AUDIO_FOLDER, safe_name + '.wav')
    # Computed here but not consumed anywhere in this cell.
    transcript_path = os.path.join(TRANSCRIPTS_FOLDER, safe_name + '.txt')

    try:
        print(f"🔊 Extracting audio from: {video_file}")
        clip = VideoFileClip(input_path)
        try:
            if clip.audio is None:
                print(f"❌ No audio track found in {video_file}. Skipping.")
                continue
            clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
        finally:
            # Always close the clip, including on the "no audio" skip and
            # when writing fails; previously both paths left it open.
            clip.close()
    except Exception as e:
        print(f"❌ Failed to extract audio: {e}")
        continue

# FAZ O ÁUDIO VIRAR UM TXT

In [None]:
# Whisper model used for transcription ("tiny", "small", etc. also work).
model = whisper.load_model("base")

folder_path = "temp_audio"

# Transcribe every .wav file in the folder and save the text beside it.
for filename in os.listdir(folder_path):
    # Anything that is not a .wav file (case-insensitive) is ignored.
    if not filename.lower().endswith('.wav'):
        continue

    filepath = os.path.join(folder_path, filename)
    print(f"Transcribing {filename}...")
    result = model.transcribe(filepath)

    # Same path as the audio file, with the extension swapped for .txt.
    stem = os.path.splitext(filepath)[0]
    text_path = stem + ".txt"
    with open(text_path, "w", encoding="utf-8") as out_file:
        out_file.write(result["text"])
    print(f"Saved: {text_path}")