# Import Libs

In [1]:
# Transcript
from youtube_transcript_api import YouTubeTranscriptApi
import os
# Summarization
from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
import os
import random

## Create Transcript Functions

In [2]:
def fetch_transcript(video_url):
    try:
        # Extract the video ID from the URL
        video_id = video_url.split("v=")[1]
        # Fetch the transcript for the video
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Process the transcript data
        text_transcript = "\n".join([entry['text'] for entry in transcript])
        return text_transcript
    except Exception as e:
        return str(e)

import re 
def clean_transcript(transcript):
    # Remove non-speech elements (e.g., laughter, background noises)
    transcript = re.sub(r'\[.*?\]', '', transcript)

    # Correct spelling and grammar (you can use libraries like NLTK or spaCy for this)
    # Example:
    # import nltk
    # transcript = ' '.join(nltk.word_tokenize(transcript))

    # Normalize punctuation and formatting
    transcript = transcript.replace('\n', ' ')  # Remove line breaks
    transcript = re.sub(r'\s+', ' ', transcript)  # Remove extra whitespaces

    # Remove timestamps and annotations
    transcript = re.sub(r'\[\d+:\d+:\d+\]', '', transcript)

    # Handle speaker identification (if present)
    # Example: transcript = re.sub(r'Speaker\d+:', '', transcript)

    # Remove filler words and phrases
    filler_words = ['like', 'you know', 'sort of']  # Add more as needed
    for word in filler_words:
        transcript = transcript.replace(word, '')
    
    # Replace common contractions with their expanded forms
    transcript = transcript.replace("won't", "will not")
    transcript = transcript.replace("can't", "cannot")
    transcript = transcript.replace("n't", " not")
    transcript = transcript.replace("'ll", " will")
    transcript = transcript.replace("'ve", " have")
    transcript = transcript.replace("'re", " are")
    transcript = transcript.replace("'d", " would")
    transcript = transcript.replace("'s", " is")

    return transcript.strip()  # Trim leading/trailing whitespaces

## Create Summarization LLM 

In [7]:
def chat(system_prompt, text):
    """
    It is not a good practice to load the model again and again,
    but for the sake of simlicity for demo, let's keep as it is
    """

    # Define the model name to be used for the chat function
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    # Authentication token for Hugging Face API
    # token = os.environ['HUGGINGFACE_TOKEN']

    # # Configure the model to load in a quantized 8-bit format for efficiency
    # bnb_config = BitsAndBytesConfig(
    #     load_in_8bit=True
    # )

    # # Set the device map to load the model on GPU 0
    # device_map = {"": 0}
    # # Load the model from Hugging Face with the specified configuration
    # model = AutoModelForCausalLM.from_pretrained(
    #     model_name,
    #     quantization_config=bnb_config,
    #     device_map=device_map)

    # # Load the tokenizer for the model
    # tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)

    # # Create a text-generation pipeline with the loaded model and tokenizer
    # llama_pipeline = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    llama_pipeline = pipeline(
        "text-generation", #task
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        # max_length=max_token_length,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id
    )

    # Format the input text with special tokens for the model
    text = f"""
    <s>[INST] <<SYS>>
    {system_prompt}
    <</SYS>>
    {text}[/INST]
    """

    # Generate sequences using the pipeline with specified parameters
    sequences = llama_pipeline(
        text,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=32000
    )

    # Extract the generated text from the sequences
    generated_text = sequences[0]["generated_text"]
    # Trim the generated text to remove the instruction part
    generated_text = generated_text[generated_text.find('[/INST]')+len('[/INST]'):]

    # Return the processed generated text
    return generated_text

def summarize(text):
    # Define the maximum input length for each iteration of summarization
    input_len = 10000

    # Start an infinite loop to repeatedly summarize the text
    while True:
        # Print the current length of the text
        print(len(text))
        # Call the chat function to summarize the text. Only the first 'input_len' characters are considered for summarization
        summary = chat("", "Summarize the following: " + text[0:input_len])

        if len(text) < input_len:
            return summary
        
        # Concatenate the current summary with the remaining part of the text for the next iteration
        text = summary + " " + text[input_len:]

## Run Pipeline

In [5]:
transcript_text_cleaned = clean_transcript(fetch_transcript("https://www.youtube.com/watch?v=48jlHaxZnig"))

In [8]:
podcast_summary = summarize(transcript_text_cleaned)
print(podcast_summary)



7234


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



     The article discusses the findings from the addiction literature that can help overcome procrastination. The key is to put oneself in a more painful state than the current state of procrastination. This can be achieved by engaging in activities that are harder or more uncomfortable than the current state, such as taking a cold shower or immersion. The idea is that by increasing the friction or discomfort, it can help rebound the dopamine trough more quickly, thus motivating the individual to take action. The article emphasizes the importance of finding a tool that is safe and uncomfortable, but not damaging, to help steep the trough and bring the individual back to their baseline level of dopamine more quickly.
