In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from transformers import pipeline

In [2]:
def get_transcript(link):
#     https://www.youtube.com/watch?v=nt63k3bfXS0&list=PLoROMvodv4rMiGQp3WXShtMGgzqpfVfbU&index=6
    v = link.find("v=")+2
    video_id = link[v:v+11]
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return transcript
    

In [3]:
transcript = get_transcript("https://www.youtube.com/watch?v=nt63k3bfXS0&list=PLoROMvodv4rMiGQp3WXShtMGgzqpfVfbU&index=6")
# transcript = get_transcript("https://www.youtube.com/watch?v=bwVrrXk7mtY")

# Preprocessing the transcript

In [4]:
# all_paragraphs = ""
# for words in transcript:
#     all_paragraphs += words['text'] + " " 


def preprocess_transcript(transcript):
    import re
    clean_segments = []
    
    for segment in transcript:
        text = segment['text']
        timestamp = segment['start']

        # a few common filter words
        text = re.sub(r'\b(uh|um|like|you know|sort of)\b', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\s+', ' ', text).strip() 
        
        clean_segments.append({'text': text, 'timestamp': timestamp})
    
    return clean_segments

# cleaned_transcript = preprocess_transcript(transcript)

def chunk_by_timestamp(transcript, max_tokens=500):
    from transformers import AutoTokenizer
    
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    chunks = []
    current_chunk = []
    current_length = 0

    for segment in transcript:
        text = segment['text']
        tokens = tokenizer.encode(text, add_special_tokens=False)
        
        if current_length + len(tokens) > max_tokens:
            chunks.append(current_chunk)
            current_chunk = []
            current_length = 0

        current_chunk.append(segment)
        current_length += len(tokens)

    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

# chunked_transcript = chunk_by_timestamp(cleaned_transcript)

In [5]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_chunks(chunks):
    summaries = []
    for chunk in chunks:
        combined_text = " ".join([seg['text'] for seg in chunk])
        timestamp_range = f"{chunk[0]['timestamp']} - {chunk[-1]['timestamp']}"
        
        summary = summarizer(combined_text, max_length=300, min_length=50, do_sample=False)
        summaries.append({
            'summary': summary[0]['summary_text'],
            'timestamp': timestamp_range
        })
    
    return summaries

# summaries_with_timestamps = summarize_chunks(chunked_transcript)

Device set to use mps:0


In [6]:
print(summaries_with_timestamps)

NameError: name 'summaries_with_timestamps' is not defined

# OpenAI transcript processing

In [None]:
import os
from openai import OpenAI
client = OpenAI()

In [None]:
api_key = os.environ.get('OPENAI_API_KEY')
client.api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
def preprocess_for_openai(transcript):
    combined_text = ""
    for segment in transcript:
        start_time = segment['start'] / 60
        combined_text += f"[{start_time:.2f}] {segment['text']}\n"
    return combined_text

# cleaned_combined_text = preprocess_for_openai(transcript)
# print(cleaned_combined_text)

In [None]:
def summarize_chunk(text_to_summarize, model="gpt-4o-mini"):
    
    prompt = f"""
    You are an expert at creating detailed, structured lecture notes from transcripts.

    Your task is to convert the provided lecture transcript into comprehensive, well-organized, and insightful lecture notes, as well as provide timestamps that align with the video's original transcript.

    TeX syntax should be utilized to explain and elaborate on the concepts within each subsection. Generate TeX inline math environments as necessary. 
    
    Each set of notes should follow this format:
    
    # Lecture Notes on [Topic Name]
    
    ## Introduction
    - Begin with a summary of the overall lecture and its goals.
    
    ## [Main Section Title]
    ### Subsection 1
    - Elaborate on the main point(s) of the subsection.
    
    ### Subsection 2
    - Continue explaining and expanding.

    ### Additional Analysis
    - Expand on the original content, and write a paragraph or two that integrate your own expertise and insights. This should provide an extensive, educational resource on the subject.
    
    ## [Next Main Section Title]
    - Organize subsequent sections similarly, ensuring a logical flow.
    
    ## Conclusion
    - Summarize the lecture's key takeaways.
    - Provide general advice or actionable insights related to the topic.
    
    ### General Tips
    - Include any advice or practical steps relevant to understanding or applying the concepts.

    Here is the lecture transcript:
    
    {text_to_summarize}

    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "developer", "content": prompt}
        ],
        seed=0,
        temperature=0.3
    )
    return completion

# Generate the summary
# summary = summarize_chunk(cleaned_combined_text)

In [None]:
def save_summary_to_text(summarized, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(summarized)

In [None]:
import re

def sanitize_for_latex(text):
    """
    Escapes special LaTeX characters in the given text,
    except for braces `{}` used in LaTeX commands.
    """
    special_chars = {
        "#": r"\#",
        "%": r"\%",
        "_": r"\_",
        "&": r"\&",
        "$": r"\$",
    }
    for char, replacement in special_chars.items():
        text = text.replace(char, replacement)
    return text


def save_summary_to_latex(input_text, output_file):
    """
    Converts structured text to a LaTeX file.

    Args:
        input_text (str): The structured text input.
        output_file (str): The name of the output .tex file.
    """
    # LaTeX preamble and document start
    preamble = r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{geometry}
\geometry{margin=1in}
\usepackage{hyperref}
\hypersetup{
    colorlinks=true,
    linkcolor=blue,
    filecolor=magenta,      
    urlcolor=cyan,
}

\title{Lecture Notes}
\author{}
\date{}

\begin{document}
\maketitle
"""
    # End of LaTeX document
    end = r"\end{document}"

    # Convert headers to LaTeX sections
    converted_text = re.sub(r"^# (.+)$", r"\\section*{\1}", input_text, flags=re.MULTILINE)
    converted_text = re.sub(r"^## (.+)$", r"\\subsection*{\1}", converted_text, flags=re.MULTILINE)
    converted_text = re.sub(r"^### (.+)$", r"\\subsubsection*{\1}", converted_text, flags=re.MULTILINE)


    # Escape special LaTeX characters
    converted_text = sanitize_for_latex(converted_text)
    # Fix math environments

    # Handle itemize environments for bullet points
    def wrap_itemize(match):
        items = match.group(1).strip().split("\n")
        formatted_items = "\n".join([f"\\item {item[2:].strip()}" for item in items if item.strip()])
        return f"\\begin{{itemize}}\n{formatted_items}\n\\end{{itemize}}"

    # Match groups of lines starting with "- "
    converted_text = re.sub(r"(?m)(^- .+(?:\n- .+)*)", wrap_itemize, converted_text)

    # Ensure math environments are properly handled
    converted_text = re.sub(r"\\\[([^\\]+)\\\]", r"\\[\1\\]", converted_text)
    converted_text = re.sub(r"\\\((.+?)\\\)", r"$\1$", converted_text)

    # Ensure subscripts and superscripts are properly set in math mode
    converted_text = re.sub(r"([a-zA-Z])_([a-zA-Z0-9]+)", r"{\1_\{\2\}}", converted_text)
    converted_text = re.sub(r"([a-zA-Z])\^([a-zA-Z0-9]+)", r"{\1^\{\2\}}", converted_text)

    # Combine parts into the LaTeX document
    latex_content = f"{preamble}{converted_text}\n{end}"
    # print(latex_content)

    # Write to the output file
    with open(output_file, 'w') as tex_file:
        tex_file.write(latex_content)

    print(f"LaTeX file '{output_file}' generated successfully.")


In [None]:
# print(summary.choices[0].message.content)
save_summary_to_latex(summary.choices[0].message.content)

In [None]:
import subprocess
import os

In [None]:
def tex_to_pdf(tex_file, output_dir=None):
    if not tex_file.endswith(".tex"):
        raise ValueError("Input file must be a .tex file")
    
    if output_dir is None:
        output_dir = os.path.dirname(tex_file) or "."
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    try:
        result = subprocess.run(
            ["pdflatex", "-interaction=nonstopmode", "-file-line-error", "-output-directory", output_dir, tex_file],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        pdf_file = os.path.join(output_dir, os.path.basename(tex_file).replace(".tex", ".pdf"))
        if os.path.exists(pdf_file):
            print(f"PDF generated: {pdf_file}")
            return pdf_file
        else:
            raise FileNotFoundError("PDF generation failed. Check the LaTeX log for errors.")
    except subprocess.CalledProcessError as e:
        print("Error during LaTeX compilation:")
        print(e.stdout)  # Print LaTeX log output for debugging
        raise

In [None]:
tex_file_path = "./output/summary.tex"
output_pdf = tex_to_pdf(tex_file_path, "./output/tmp")

# Everything in one place

In [None]:

def pipeline(link, file_title):
    """
    input: video url
    output: summary of the video in pdf form
    """
    print("fetching transcript...")
    transcript = get_transcript(link)
    print("processing transcript...")
    processed_transcript = preprocess_for_openai(transcript)
    print("summarizing transcript...")
    summarized = summarize_chunk(processed_transcript)
    summarized = summarized.choices[0].message.content
    print("saving to txt...")
    save_summary_to_text(summarized, f'./output/{file_title}.txt')
    # output_file = f'{file_title}.tex'
    print("converting to latex...")
    save_summary_to_latex(summarized, f'./output/{file_title}.tex')
    print("converting to pdf...")
    tex_to_pdf(f'./output/{file_title}.tex', f'./output/{file_title}')
    print("complete!")
    
# pipeline("https://www.youtube.com/watch?v=8NYoQiRANpg&list=PLoROMvodv4rMiGQp3WXShtMGgzqpfVfbU&index=7", 'kernels_lecture')
# https://www.youtube.com/watch?v=9-Jl0dxWQs8
pipeline("https://www.youtube.com/watch?v=9-Jl0dxWQs8", '3b1b_DL7')
    