In [None]:
# Import necessary libraries
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re

# Helper function to convert HH:MM:SS to total seconds
def hms_to_seconds(time_str):
    """Convert HH:MM:SS to total seconds."""
    parts = time_str.split(':')
    if len(parts) != 3:
        raise ValueError(f"Invalid time format: {time_str}")
    hours, minutes, seconds = map(int, parts)
    return hours * 3600 + minutes * 60 + seconds

# Define the list of chapters with their timestamps and titles
chapters_raw = [
    "00:00:00 intro: Let’s reproduce GPT-2 (124M)",
    "00:03:39 exploring the GPT-2 (124M) OpenAI checkpoint",
    "00:13:47 SECTION 1: implementing the GPT-2 nn.Module",
    "00:28:08 loading the huggingface/GPT-2 parameters",
    "00:31:00 implementing the forward pass to get logits",
    "00:33:31 sampling init, prefix tokens, tokenization",
    "00:37:02 sampling loop",
    "00:41:47 sample, auto-detect the device",
    "00:45:50 let’s train: data batches (B,T) → logits (B,T,C)",
    "00:52:53 cross entropy loss",
    "00:56:42 optimization loop: overfit a single batch",
    "01:02:00 data loader lite",
    "01:06:14 parameter sharing wte and lm_head",
    "01:13:47 model initialization: std 0.02, residual init",
    "01:22:18 SECTION 2: Let’s make it fast. GPUs, mixed precision, 1000ms",
    "01:28:14 Tensor Cores, timing the code, TF32 precision, 333ms",
    "01:39:38 float16, gradient scalers, bfloat16, 300ms",
    "01:48:15 torch.compile, Python overhead, kernel fusion, 130ms",
    "02:00:18 flash attention, 96ms",
    "02:06:54 nice/ugly numbers. vocab size 50257 → 50304, 93ms",
    "02:14:55 SECTION 3: hyperpamaters, AdamW, gradient clipping",
    "02:21:06 learning rate scheduler: warmup + cosine decay",
    "02:26:21 batch size schedule, weight decay, FusedAdamW, 90ms",
    "02:34:09 gradient accumulation",
    "02:46:52 distributed data parallel (DDP)",
    "03:10:21 datasets used in GPT-2, GPT-3, FineWeb (EDU)",
    "03:23:10 validation data split, validation loss, sampling revive",
    "03:28:23 evaluation: HellaSwag, starting the run",
    "03:43:05 SECTION 4: results in the morning! GPT-2, GPT-3 repro",
    "03:56:21 shoutout to llm.c, equivalent but faster code in raw C/CUDA",
    "03:59:39 summary, phew, build-nanogpt github repo"
]

# Parse the raw chapters into a list of dictionaries with title and start time in seconds
chapters = []
for chapter in chapters_raw:
    match = re.match(r"(\d{2}:\d{2}:\d{2})\s+(.*)", chapter)
    if match:
        time_str, title = match.groups()
        start_seconds = hms_to_seconds(time_str)
        chapters.append({"title": title.strip(), "start": start_seconds})
    else:
        print(f"Invalid chapter format: {chapter}")

# Sort chapters by start time just in case
chapters = sorted(chapters, key=lambda x: x['start'])

# Extract video ID from the URL
video_url = "https://www.youtube.com/watch?v=l8pRSuU81PU"
video_id_match = re.search(r"v=([a-zA-Z0-9_-]{11})", video_url)
if video_id_match:
    video_id = video_id_match.group(1)
else:
    raise ValueError("Invalid YouTube URL or unable to extract video ID.")

# Fetch the transcript using youtube_transcript_api
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
except Exception as e:
    print(f"Error fetching transcript: {e}")
    transcript = []

# Organize transcript into chapters
organized_transcript = {chapter['title']: [] for chapter in chapters}
current_chapter_index = 0
num_chapters = len(chapters)

for entry in transcript:
    time = entry['start']
    # Move to the correct chapter based on time
    while (current_chapter_index + 1 < num_chapters) and (time >= chapters[current_chapter_index + 1]['start']):
        current_chapter_index += 1
    current_chapter = chapters[current_chapter_index]['title']
    organized_transcript[current_chapter].append(entry['text'])

# Function to format seconds back to HH:MM:SS
def seconds_to_hms(seconds):
    """Convert total seconds to HH:MM:SS format."""
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    secs = seconds % 60
    return f"{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}"

# Display the organized transcript
for chapter in chapters:
    title = chapter['title']
    start_time = seconds_to_hms(chapter['start'])
    print(f"### {start_time} {title}\n")
    chapter_text = ' '.join(organized_transcript[title])
    print(f"{chapter_text}\n\n")

# Optionally, save the organized transcript to a Markdown file
save_to_file = True  # Set to True if you want to save the output
if save_to_file:
    with open("organized_transcript.md", "w", encoding="utf-8") as f:
        for chapter in chapters:
            title = chapter['title']
            start_time = seconds_to_hms(chapter['start'])
            f.write(f"### {start_time} {title}\n\n")
            chapter_text = ' '.join(organized_transcript[title])
            f.write(f"{chapter_text}\n\n")
    print("Organized transcript has been saved to 'organized_transcript.md'")


In [None]:
# Uninstall the original fpdf
!pip uninstall fpdf -y

# Install fpdf2
!pip install fpdf2 --quiet


In [3]:
# Install the required library
!pip uninstall fpdf -y  # Uninstall original fpdf if installed
!pip install fpdf2 --quiet  # Install fpdf2

# Import necessary libraries
from fpdf import FPDF
import re
import os
import unicodedata
import traceback

# Define a custom PDF class to handle formatting and two-column layout
class PDF(FPDF):
    def __init__(self, left_margin=15, top_margin=15, right_margin=15, gutter=10):
        """
        Initialize the PDF with custom margins and column settings.

        Parameters:
        - left_margin (int): Left margin in mm.
        - top_margin (int): Top margin in mm.
        - right_margin (int): Right margin in mm.
        - gutter (int): Space between columns in mm.
        """
        super().__init__()
        self.left_margin = left_margin
        self.top_margin = top_margin
        self.right_margin = right_margin
        self.gutter = gutter
        # Calculate column width based on page width and margins
        self.column_width = (self.w - self.left_margin - self.right_margin - self.gutter) / 2
        self.set_auto_page_break(auto=True, margin=15)
        self.add_page()
        self.set_margins(self.left_margin, self.top_margin, self.right_margin)
        # Use built-in Helvetica font
        self.set_font("Helvetica", size=12)  # Default font for body text

    def header(self):
        """
        Draw a vertical line to separate the two columns on each page.
        """
        self.set_draw_color(200, 200, 200)  # Light gray color for the line
        x_start = self.left_margin + self.column_width + self.gutter / 2
        y_start = self.top_margin
        y_end = self.h - self.b_margin
        self.line(x_start, y_start, x_start, y_end)

    def footer(self):
        """
        Add a footer with the page number at the bottom center of each page.
        """
        self.set_y(-15)  # Position 15 mm from the bottom
        self.set_font('Helvetica', 'I', 8)
        self.set_text_color(128)  # Gray color for footer text
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    def chapter_title(self, title):
        """
        Format and add a chapter title centered at the top of the page.

        Parameters:
        - title (str): The chapter title text.
        """
        self.set_font("Helvetica", 'B', 16)  # Bold, 16pt font
        self.set_text_color(0, 102, 204)  # Blue color
        # Calculate the width available for the title
        title_width = self.w - self.left_margin - self.right_margin
        # Set Y position slightly below the top margin
        self.set_y(self.top_margin + 5)
        # Add the chapter title centered
        self.multi_cell(title_width, 10, title, align='C')
        self.ln(10)  # Add space after the title
        self.set_font("Helvetica", size=12)  # Reset to body text font
        self.set_text_color(0)  # Reset text color to black

    def chapter_body(self, text):
        """
        Format and add body text to the left column.

        Parameters:
        - text (str): The body text.
        """
        self.set_font("Helvetica", size=12)  # 12pt Helvetica
        # Single-line spacing: line height equals font size
        self.multi_cell(self.column_width, 12, text)
        # No additional vertical space to ensure single-line spacing

    def add_text_left_column(self, text, is_title=False):
        """
        Add text to the left column, formatting it as a title if specified.

        Parameters:
        - text (str): The text to add.
        - is_title (bool): Whether the text is a chapter title.
        """
        if is_title:
            self.chapter_title(text)
        else:
            self.chapter_body(text)

    def add_page_if_needed(self):
        """
        Check if a new page is needed based on the current y-position.
        """
        if self.get_y() > (self.h - self.b_margin - 20):
            self.add_page()

    def sanitize_text(self, text):
        """
        Sanitize the text by replacing or removing unsupported characters.

        Parameters:
        - text (str): The original text.

        Returns:
        - str: The sanitized text.
        """
        # Normalize the text to decompose combined characters
        text = unicodedata.normalize('NFKD', text)
        # Encode to ASCII bytes, ignore characters that can't be encoded
        text = text.encode('ascii', 'ignore').decode('ascii')
        return text

    def write_content(self, markdown_file):
        """
        Read the markdown file, sanitize its content, and write it to the PDF.
        Each new chapter starts on a new page with the title centered at the top.

        Parameters:
        - markdown_file (str): Path to the markdown file.
        """
        first_chapter = True  # Flag to check if it's the first chapter
        with open(markdown_file, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if line.startswith("###"):
                    # Identify chapter titles (lines starting with ###)
                    title = line.lstrip('#').strip()
                    title = self.sanitize_text(title)
                    if not first_chapter:
                        self.add_page()  # Start a new page for new chapter
                    else:
                        first_chapter = False  # First chapter, no need to add page
                    self.add_text_left_column(title, is_title=True)
                elif line == "":
                    # Handle empty lines by adding minimal vertical space
                    self.ln(2)
                else:
                    # Sanitize and add regular body text
                    sanitized_text = self.sanitize_text(line)
                    self.add_text_left_column(sanitized_text, is_title=False)
                self.add_page_if_needed()

# Function to convert Markdown to PDF with the specified formatting
def markdown_to_pdf(markdown_file, pdf_file):
    """
    Convert a Markdown file to a formatted PDF with two columns.
    
    Parameters:
    - markdown_file (str): Path to the input Markdown file.
    - pdf_file (str): Desired path for the output PDF file.
    """
    try:
        pdf = PDF(left_margin=15, top_margin=15, right_margin=15, gutter=10)
        pdf.write_content(markdown_file)
        pdf.output(pdf_file)
        print(f"PDF has been saved to '{pdf_file}'")
    except Exception as e:
        print(f"An error occurred during PDF generation: {e}")
        traceback.print_exc()

# Example usage:
# Replace 'organized_transcript.md' with the path to your Markdown file
# and 'organized_transcript.pdf' with your desired PDF file name.
markdown_file = "organized_transcript.md"  # Path to your Markdown file
pdf_file = "organized_transcript.pdf"      # Desired PDF output file name
markdown_to_pdf(markdown_file, pdf_file)


[0m

  self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')


PDF has been saved to 'organized_transcript.pdf'


# Splitting the pdf into multiple chapters

In [5]:
# Install the required library
!pip uninstall fpdf -y  # Uninstall original fpdf if installed
!pip install fpdf2 --quiet  # Install fpdf2

# Import necessary libraries
from fpdf import FPDF
import re
import os
import unicodedata
import traceback

# Define a custom PDF class to handle formatting and two-column layout
class PDF(FPDF):
    def __init__(self, left_margin=15, top_margin=15, right_margin=15, gutter=10):
        """
        Initialize the PDF with custom margins and column settings.

        Parameters:
        - left_margin (int): Left margin in mm.
        - top_margin (int): Top margin in mm.
        - right_margin (int): Right margin in mm.
        - gutter (int): Space between columns in mm.
        """
        super().__init__()
        self.left_margin = left_margin
        self.top_margin = top_margin
        self.right_margin = right_margin
        self.gutter = gutter
        # Calculate column width based on page width and margins
        self.column_width = (self.w - self.left_margin - self.right_margin - self.gutter) / 2
        self.set_auto_page_break(auto=True, margin=15)
        # Add a new page in the initializer
        self.add_page()
        self.set_margins(self.left_margin, self.top_margin, self.right_margin)
        # Use built-in Helvetica font
        self.set_font("Helvetica", size=12)  # Default font for body text

    def header(self):
        """
        Draw a vertical line to separate the two columns on each page.
        """
        self.set_draw_color(200, 200, 200)  # Light gray color for the line
        x_start = self.left_margin + self.column_width + self.gutter / 2
        y_start = self.top_margin
        y_end = self.h - self.b_margin
        self.line(x_start, y_start, x_start, y_end)

    def footer(self):
        """
        Add a footer with the page number at the bottom center of each page.
        """
        self.set_y(-15)  # Position 15 mm from the bottom
        self.set_font('Helvetica', 'I', 8)
        self.set_text_color(128)  # Gray color for footer text
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    def chapter_title(self, title):
        """
        Format and add a chapter title centered at the top of the page.

        Parameters:
        - title (str): The chapter title text.
        """
        self.set_font("Helvetica", 'B', 16)  # Bold, 16pt font
        self.set_text_color(0, 102, 204)  # Blue color
        # Calculate the width available for the title
        title_width = self.w - self.left_margin - self.right_margin
        # Set Y position slightly below the top margin
        self.set_y(self.top_margin + 5)
        # Add the chapter title centered
        self.multi_cell(title_width, 10, title, align='C')
        self.ln(10)  # Add space after the title
        self.set_font("Helvetica", size=12)  # Reset to body text font
        self.set_text_color(0)  # Reset text color to black

    def chapter_body(self, text):
        """
        Format and add body text to the left column.

        Parameters:
        - text (str): The body text.
        """
        self.set_font("Helvetica", size=12)  # 12pt Helvetica
        # Single-line spacing: line height equals font size
        self.multi_cell(self.column_width, 12, text)
        # No additional vertical space to ensure single-line spacing

    def add_text_left_column(self, text, is_title=False):
        """
        Add text to the left column, formatting it as a title if specified.

        Parameters:
        - text (str): The text to add.
        - is_title (bool): Whether the text is a chapter title.
        """
        if is_title:
            self.chapter_title(text)
        else:
            self.chapter_body(text)

    def add_page_if_needed(self):
        """
        Check if a new page is needed based on the current y-position.
        """
        if self.get_y() > (self.h - self.b_margin - 20):
            self.add_page()

    def sanitize_text(self, text):
        """
        Sanitize the text by replacing or removing unsupported characters.

        Parameters:
        - text (str): The original text.

        Returns:
        - str: The sanitized text.
        """
        # Normalize the text to decompose combined characters
        text = unicodedata.normalize('NFKD', text)
        # Encode to ASCII bytes, ignore characters that can't be encoded
        text = text.encode('ascii', 'ignore').decode('ascii')
        return text

# Helper function to sanitize filenames
def sanitize_filename(title):
    """
    Sanitize the chapter title to create a valid filename.

    Parameters:
    - title (str): The chapter title.

    Returns:
    - str: A sanitized filename string.
    """
    # Remove or replace characters that are invalid in filenames
    sanitized = re.sub(r'[\\/*?:"<>|]', "", title)
    sanitized = sanitized.replace(" ", "_")  # Replace spaces with underscores
    return sanitized

# Function to convert Markdown to multiple PDFs, each per chapter
def markdown_to_pdfs(markdown_file, output_dir="output_pdfs"):
    """
    Convert a Markdown file to multiple formatted PDFs, each corresponding to a chapter.

    Parameters:
    - markdown_file (str): Path to the input Markdown file.
    - output_dir (str): Directory where the PDF files will be saved.
    """
    try:
        # Create the output directory if it doesn't exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        chapters = []  # List to hold chapters as tuples (title, body)
        current_title = None
        current_body = []

        # Read and parse the markdown file
        with open(markdown_file, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if line.startswith("###"):
                    # If there's an existing chapter, save it
                    if current_title is not None:
                        chapters.append((current_title, "\n".join(current_body)))
                        current_body = []
                    # Start a new chapter
                    current_title = line.lstrip('#').strip()
                elif line == "":
                    # Handle empty lines by adding minimal vertical space
                    current_body.append("")
                else:
                    # Add regular body text
                    current_body.append(line)
            # Append the last chapter after finishing the loop
            if current_title is not None:
                chapters.append((current_title, "\n".join(current_body)))

        # Iterate through each chapter and create a separate PDF
        for idx, (title, body) in enumerate(chapters, 1):
            pdf = PDF(left_margin=15, top_margin=15, right_margin=15, gutter=10)
            sanitized_title = pdf.sanitize_text(title)

            # Add chapter title and body to the PDF
            pdf.add_text_left_column(sanitized_title, is_title=True)

            # Split the body into paragraphs and add to PDF
            paragraphs = body.split('\n')
            for para in paragraphs:
                para = para.strip()
                if para == "":
                    # Add minimal vertical space for empty lines
                    pdf.ln(2)
                else:
                    sanitized_para = pdf.sanitize_text(para)
                    pdf.add_text_left_column(sanitized_para, is_title=False)
                pdf.add_page_if_needed()

            # Define the PDF filename
            filename = f"Chapter_{idx}_{sanitize_filename(title)}.pdf"
            filepath = os.path.join(output_dir, filename)

            # Output the PDF
            pdf.output(filepath)
            print(f"PDF has been saved to '{filepath}'")

    except Exception as e:
        print(f"An error occurred during PDF generation: {e}")
        traceback.print_exc()

# Example usage:
# Replace 'organized_transcript.md' with the path to your Markdown file
# and specify the desired output directory.
markdown_file = "organized_transcript.md"  # Path to your Markdown file
output_directory = "output_pdfs"           # Desired output directory for PDFs
markdown_to_pdfs(markdown_file, output_directory)


[0mPDF has been saved to 'output_pdfs/Chapter_1_000000_intro_Let’s_reproduce_GPT-2_(124M).pdf'
PDF has been saved to 'output_pdfs/Chapter_2_000339_exploring_the_GPT-2_(124M)_OpenAI_checkpoint.pdf'


  self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')


PDF has been saved to 'output_pdfs/Chapter_3_001347_SECTION_1_implementing_the_GPT-2_nn.Module.pdf'
PDF has been saved to 'output_pdfs/Chapter_4_002808_loading_the_huggingfaceGPT-2_parameters.pdf'
PDF has been saved to 'output_pdfs/Chapter_5_003100_implementing_the_forward_pass_to_get_logits.pdf'
PDF has been saved to 'output_pdfs/Chapter_6_003331_sampling_init,_prefix_tokens,_tokenization.pdf'
PDF has been saved to 'output_pdfs/Chapter_7_003702_sampling_loop.pdf'
PDF has been saved to 'output_pdfs/Chapter_8_004147_sample,_auto-detect_the_device.pdf'
PDF has been saved to 'output_pdfs/Chapter_9_004550_let’s_train_data_batches_(B,T)_→_logits_(B,T,C).pdf'
PDF has been saved to 'output_pdfs/Chapter_10_005253_cross_entropy_loss.pdf'
PDF has been saved to 'output_pdfs/Chapter_11_005642_optimization_loop_overfit_a_single_batch.pdf'
PDF has been saved to 'output_pdfs/Chapter_12_010200_data_loader_lite.pdf'
PDF has been saved to 'output_pdfs/Chapter_13_010614_parameter_sharing_wte_and_lm_head.

In [None]:
from fpdf import FPDF

def txt_to_pdf(txt_file, pdf_file):
    # Create a PDF object
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Set font
    pdf.set_font("Arial", size=12)

    # Open the text file in read mode
    with open(txt_file, 'r') as file:
        for line in file:
            pdf.cell(200, 10, txt=line, ln=True)

    # Output the PDF file
    pdf.output(pdf_file)

# Example usage:
txt_file = r"/Users/rafatsiddiqui/Downloads/oLabs/oModels/build-nanogpt/Let's reproduce GPT-2 (124M).txt"  # Replace with your .txt file path
pdf_file = "output.pdf"  # Desired output PDF file name
txt_to_pdf(txt_file, pdf_file)


In [None]:
# Import necessary libraries
import re
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from pytube import YouTube
import os

def extract_video_id(url):
    """
    Extract the YouTube video ID from a URL.
    """
    # Regular expression to extract video ID
    regex = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
    match = re.search(regex, url)
    if match:
        return match.group(1)
    else:
        raise ValueError("Invalid YouTube URL. Please provide a valid URL.")

def get_video_title(video_id):
    """
    Get the title of the YouTube video using pytube.
    """
    try:
        yt = YouTube(f"https://www.youtube.com/watch?v=l8pRSuU81PU")
        return yt.title
    except Exception as e:
        print(f"Error fetching video title: {e}")
        return video_id  # Fallback to video ID if title cannot be fetched

def fetch_transcript(video_id):
    """
    Fetch the transcript for the given YouTube video ID.
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        # Prefer English transcript; change 'en' to desired language code if needed
        transcript = transcript_list.find_transcript(['en'])
        transcript_data = transcript.fetch()
        return transcript_data
    except TranscriptsDisabled:
        raise ValueError("Transcripts are disabled for this video.")
    except NoTranscriptFound:
        raise ValueError("No transcript found for this video.")
    except Exception as e:
        raise ValueError(f"An error occurred while fetching the transcript: {e}")

def format_transcript(transcript_data):
    """
    Format the transcript data into a single string.
    """
    formatted_text = ""
    for entry in transcript_data:
        text = entry.get('text', '').replace('\n', ' ').strip()
        formatted_text += text + " "
    # Optionally, you can add more sophisticated formatting here
    return formatted_text.strip()

def save_transcript_to_file(text, filename):
    """
    Save the transcript text to a .txt file.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(text)
        print(f"Transcript successfully saved to '{filename}'.")
    except Exception as e:
        print(f"Failed to save transcript to file: {e}")

def download_youtube_transcript(url):
    """
    Main function to download the YouTube transcript.
    """
    try:
        # Extract video ID
        video_id = extract_video_id(url)
        print(f"Video ID extracted: {video_id}")

        # Get video title for filename
        title = get_video_title(video_id)
        safe_title = re.sub(r'[\\/*?:"<>|]', "", title)  # Remove illegal filename characters
        filename = f"{safe_title}.txt"

        # Fetch transcript
        print("Fetching transcript...")
        transcript_data = fetch_transcript(video_id)

        # Format transcript
        print("Formatting transcript...")
        formatted_text = format_transcript(transcript_data)

        # Save to file
        save_transcript_to_file(formatted_text, filename)

    except ValueError as ve:
        print(f"ValueError: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Example Usage
youtube_url = "https://www.youtube.com/watch?v=l8pRSuU81PU"  # Replace with your YouTube URL
download_youtube_transcript(youtube_url)
