# Import Libs

In [1]:
# Transcript
from youtube_transcript_api import YouTubeTranscriptApi
import os
# Summarization
from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
import os
import random

## Create Transcript Functions

In [2]:
def fetch_transcript(video_url):
    try:
        # Extract the video ID from the URL
        video_id = video_url.split("v=")[1]
        # Fetch the transcript for the video
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Process the transcript data
        text_transcript = "\n".join([entry['text'] for entry in transcript])
        return text_transcript
    except Exception as e:
        return str(e)

import re 
def clean_transcript(transcript):
    # Remove non-speech elements (e.g., laughter, background noises)
    transcript = re.sub(r'\[.*?\]', '', transcript)

    # Correct spelling and grammar (you can use libraries like NLTK or spaCy for this)
    # Example:
    # import nltk
    # transcript = ' '.join(nltk.word_tokenize(transcript))

    # Normalize punctuation and formatting
    transcript = transcript.replace('\n', ' ')  # Remove line breaks
    transcript = re.sub(r'\s+', ' ', transcript)  # Remove extra whitespaces

    # Remove timestamps and annotations
    transcript = re.sub(r'\[\d+:\d+:\d+\]', '', transcript)

    # Handle speaker identification (if present)
    # Example: transcript = re.sub(r'Speaker\d+:', '', transcript)

    # Remove filler words and phrases
    filler_words = ['like', 'you know', 'sort of']  # Add more as needed
    for word in filler_words:
        transcript = transcript.replace(word, '')
    
    # Replace common contractions with their expanded forms
    transcript = transcript.replace("won't", "will not")
    transcript = transcript.replace("can't", "cannot")
    transcript = transcript.replace("n't", " not")
    transcript = transcript.replace("'ll", " will")
    transcript = transcript.replace("'ve", " have")
    transcript = transcript.replace("'re", " are")
    transcript = transcript.replace("'d", " would")
    transcript = transcript.replace("'s", " is")

    return transcript.strip()  # Trim leading/trailing whitespaces

## Create Summarization LLM 

In [3]:
def chat(llama_pipeline, system_prompt, text):
    """
    It is not a good practice to load the model again and again,
    but for the sake of simlicity for demo, let's keep as it is
    """



    # Format the input text with special tokens for the model
    text = f"""
    <s>[INST] <<SYS>>
    {system_prompt}
    <</SYS>>
    {text}[/INST]
    """

    # Generate sequences using the pipeline with specified parameters
    sequences = llama_pipeline(
        text,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=32000
    )

    # Extract the generated text from the sequences
    generated_text = sequences[0]["generated_text"]
    # Trim the generated text to remove the instruction part
    generated_text = generated_text[generated_text.find('[/INST]')+len('[/INST]'):]

    # Return the processed generated text
    return generated_text

def summarize(text, llama_pipeline):
    # Define the maximum input length for each iteration of summarization
    input_len = 10000

    # Start an infinite loop to repeatedly summarize the text
    while True:
        # Print the current length of the text
        print(len(text))
        # Call the chat function to summarize the text. Only the first 'input_len' characters are considered for summarization
        summary = chat(llama_pipeline, "", "Summarize the following: " + text[0:input_len])

        if len(text) < input_len:
            return summary
        
        # Concatenate the current summary with the remaining part of the text for the next iteration
        text = summary + " " + text[input_len:]

## Run Pipeline

In [4]:
    # Define the model name to be used for the chat function
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pipeline_llama2 = pipeline(
        "text-generation", #task
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        # max_length=max_token_length,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
transcript = clean_transcript(fetch_transcript("https://www.youtube.com/watch?v=5t1vTLU7s40"))


In [9]:
with open('YannLecun_LexPodcast.txt', 'w', encoding='utf-8') as file:
            file.write(transcript)

In [13]:
transcript = clean_transcript(fetch_transcript("https://www.youtube.com/watch?v=_ZJpU43NA0c"))
podcast_summary = summarize(transcript, pipeline_llama2)
print(podcast_summary)
with open('notes\\HowtoGetAheadof99ofPeople(Starting Today).txt', 'w', encoding='utf-8') as file:
            file.write(podcast_summary)

11819
2688

     The speaker argues that the conventional wisdom on success, such as waking up early, removing distractions, and having goals, is overrated and not the key to success. Instead, he suggests that being a correct contrarian and having unpopular beliefs is more important. He cites examples of successful people throughout history who had contrarian ideas that were initially ridiculed but ultimately proved to be correct. The speaker also acknowledges that executing on those ideas is important, but it's not sufficient on its own. He suggests that finding one correct contrarian idea can have a significant impact on a person's career, more than any other factor. However, he also notes that extreme success can have negative consequences, such as a lack of validation and approval from others, and that it may not improve relationships that don't need improvement.

The speaker emphasizes that being correctly contrarian is not a guarantee of success, and that most contrarian beliefs 