# Import Libs

In [1]:
# Transcript
from youtube_transcript_api import YouTubeTranscriptApi
import os
# Summarization
from transformers import (
    pipeline,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch

## Create Transcript Functions

In [2]:
def fetch_transcript(video_url):
    try:
        # Extract the video ID from the URL
        video_id = video_url.split("v=")[1]
        # Fetch the transcript for the video
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Process the transcript data
        text_transcript = "\n".join([entry['text'] for entry in transcript])
        return text_transcript
    except Exception as e:
        return str(e)

import re 
def clean_transcript(transcript):
    # Remove non-speech elements (e.g., laughter, background noises)
    transcript = re.sub(r'\[.*?\]', '', transcript)

    # Correct spelling and grammar (you can use libraries like NLTK or spaCy for this)
    # Example:
    # import nltk
    # transcript = ' '.join(nltk.word_tokenize(transcript))

    # Normalize punctuation and formatting
    transcript = transcript.replace('\n', ' ')  # Remove line breaks
    transcript = re.sub(r'\s+', ' ', transcript)  # Remove extra whitespaces

    # Remove timestamps and annotations
    transcript = re.sub(r'\[\d+:\d+:\d+\]', '', transcript)

    # Handle speaker identification (if present)
    # Example: transcript = re.sub(r'Speaker\d+:', '', transcript)

    # Remove filler words and phrases
    filler_words = ['like', 'you know', 'sort of']  # Add more as needed
    for word in filler_words:
        transcript = transcript.replace(word, '')
    
    # Replace common contractions with their expanded forms
    transcript = transcript.replace("won't", "will not")
    transcript = transcript.replace("can't", "cannot")
    transcript = transcript.replace("n't", " not")
    transcript = transcript.replace("'ll", " will")
    transcript = transcript.replace("'ve", " have")
    transcript = transcript.replace("'re", " are")
    transcript = transcript.replace("'d", " would")
    transcript = transcript.replace("'s", " is")

    return transcript.strip()  # Trim leading/trailing whitespaces

## Create Summarization LLM 

In [3]:
def summarize_transcript(text, llama_pipeline):
    def summarize_text(llama_pipeline, system_prompt, text):
        # Format the input text with special tokens for the model
        text = f"""
        <s>[INST] <<SYS>>
        {system_prompt}
        <</SYS>>
        {text}[/INST]
        """
        # Generate sequences using the pipeline with specified parameters
        sequences = llama_pipeline(text)
        # Extract the generated text from the sequences
        generated_text = sequences[0]["generated_text"]
        # Trim the generated text to remove the instruction part
        generated_text = generated_text[generated_text.find('[/INST]')+len('[/INST]'):]
        # Return the processed generated text
        return generated_text
    # Define the maximum input length for each iteration of summarization
    input_len = 1000
    # Start an infinite loop to repeatedly summarize the text
    while True:
        # Print the current length of the text
        print(len(text))
        # Call the chat function to summarize the text. Only the first 'input_len' characters are considered for summarization
        summary = summarize_text(llama_pipeline, "", "Summarize the following: " + text[0:input_len])
        if len(summary) < input_len:
            return summary
        # Concatenate the current summary with the remaining part of the text for the next iteration
        text = summary + " " + text[input_len:]

## Run Pipeline

In [4]:
    # Define the model name to be used for the chat function
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pipeline_llama2 = pipeline(
        "text-generation", #task
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        # max_length=max_token_length,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
transcript = clean_transcript(fetch_transcript("https://www.youtube.com/watch?v=Vst889H1V2I"))
podcast_summary = summarize_transcript(transcript, pipeline_llama2)
print(podcast_summary)
with open('transcript.txt', 'w', encoding='utf-8') as file:
            file.write(podcast_summary)

3134


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



     Summary:

The initial state of a reseted Simics system has several important settings:

1. CR0: Mostly all zeros, with the exception of the protection enable bit, which is set to zero, indicating that the system is in real mode.
2. CR3: Zero.
3. CR4: Zero.
4. CR2: Zero.
5. CR1: Zero.

The bits seen in the architecture 2001 and the PE are also present in the initial state.

The cache disable and non-write through bits are set to ones, indicating that the cache should not be used and that reads should be performed directly from the flash.

It is important to note that the initial state of a virtual machine may not be the same as a physical machine, and that certain attacks have been demonstrated in the past where an attacker could write into the virtual BIOS and persist across reset. Therefore, it is important to always validate the cache and ensure that the system is in a known good state after reset.
