In [1]:
!pip install openai-whisper notion_client yt_dlp langchain_text_splitters google-generativeai faiss-cpu genanki

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting notion_client
  Downloading notion_client-2.3.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting yt_dlp
  Downloading yt_dlp-2025.3.31-py3-none-any.whl.metadata (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.2/172.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting genanki
  Downloading genanki-0.13.1-py3-none-any.whl.metadata (7.5 kB)
Collecting cached-property (from genanki)
  Downloading cached_property-2.0.1-py3-none-any.whl.metadata (10 k

In [2]:
#import libraries
import whisper
from notion_client import Client
import google.generativeai as genai
from google.generativeai import configure, GenerativeModel
from notion_client import Client
from google.colab import userdata
import json
import numpy as np
import faiss
import genanki
import yt_dlp
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Model definition
#
# `model` is the Gemini generative model that the prompt-based cells below call
# through `model.generate_content(...)`. Temperature is pinned to 0
# (presumably so repeated runs give repeatable flashcards).
# NOTE(review): the model object is built here, before `configure(api_key=...)`
# runs inside `youtube_to_notion_flashcards` - confirm the SDK resolves the key
# lazily at request time.

model = GenerativeModel('gemini-2.0-flash-thinking-exp-01-21',generation_config={"temperature": 0})
# Name of the embedding model passed to `genai.embed_content(model=...)` both
# when indexing transcript chunks and when embedding search terms.
embedding_model = 'models/text-embedding-004'

In [4]:
def download_youtube_audio(url, filename="audio.mp3"):
    """Download the best available audio stream of a video with yt-dlp.

    Args:
        url: Address of the video to download.
        filename: Output path handed to yt-dlp as its output template.
            Defaults to "audio.mp3".

    Returns:
        The requested ``filename`` as a string, ready to be passed to the
        transcription step.
    """
    output_path = str(filename)
    ydl_opts = {
        'format': 'bestaudio/best',
        # Honour the caller's filename. This used to be hard-coded to
        # 'audio.mp3', so any other filename returned a path that was never
        # written.
        'outtmpl': output_path,
        'noplaylist': True,
        # NOTE(review): the next three keys mirror yt-dlp *command-line* flag
        # names. When embedding yt-dlp, audio extraction is normally requested
        # through the 'postprocessors' option, so these look like they have no
        # effect and the file keeps the downloaded container despite its
        # ".mp3" name - confirm against the yt-dlp embedding docs.
        'extract_audio': True,
        'audioformat': 'mp3',
        'audioquality': 192,
        'overwrites': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return output_path

# transcribe the audio
def transcribe_audio(audio_file):
    """Transcribe an audio file to text with Whisper's "base" model.

    Args:
        audio_file: Path to the audio file. A relative path is resolved
            against the current working directory, which is where
            ``download_youtube_audio`` writes its output.

    Returns:
        The transcript text (the "text" entry of Whisper's result).
    """
    import os  # local import so this cell does not depend on the import cell

    # The path used to be built as "/kaggle/working/" + audio_file, which
    # broke absolute paths and any runtime whose working directory is not
    # /kaggle/working. Resolving against the working directory gives the same
    # result when the working directory is /kaggle/working.
    audio_path = os.path.abspath(audio_file)

    # Named `whisper_model` so it does not shadow the notebook-level Gemini
    # `model` while reading this function.
    whisper_model = whisper.load_model("base")
    result = whisper_model.transcribe(audio_path)
    return result["text"]

In [5]:

def create_embeddings_and_index(transcript):
    """Chunk a transcript, embed every chunk and load the vectors into FAISS.

    Args:
        transcript: Full transcript text.

    Returns:
        ``(index, chunks, dimension)`` on success, where ``index`` is a
        ``faiss.IndexFlatL2`` holding one vector per chunk. Returns
        ``(None, None, None)`` when no chunks or embeddings are produced, or
        when embedding/indexing raises.
    """
    failure = (None, None, None)

    print("Splitting text into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(transcript)
    if not chunks:
        print("Error: No chunks created from the transcript.")
        return failure

    total = len(chunks)
    print(f"Generating embeddings for {total} chunks...")
    vectors = []
    try:
        for position, piece in enumerate(chunks, start=1):
            print(f"  Embedding chunk {position}/{total}")
            # 'RETRIEVAL_DOCUMENT' marks text that is being indexed
            response = genai.embed_content(
                model=embedding_model,
                content=piece,
                task_type="RETRIEVAL_DOCUMENT",
            )
            vectors.append(response['embedding'])

        if not vectors:
            print("Error: No embeddings were generated.")
            return failure

        dimension = len(vectors[0])
        # FAISS requires float32 input
        matrix = np.array(vectors).astype('float32')

        print(f"Creating FAISS index with dimension {dimension}...")
        index = faiss.IndexFlatL2(dimension)
        index.add(matrix)
        print(f"FAISS index created successfully with {index.ntotal} vectors.")
        return index, chunks, dimension
    except Exception as e:
        print(f"Error during embedding or indexing: {e}")
        return failure


In [6]:
def extract_key_terms(transcript,num_flashcards,user_prompts):
    """Ask the generative model for key terms and a heading for a transcript.

    Args:
        transcript: Full transcript text to analyse.
        num_flashcards: Maximum number of terms requested in the prompt.
        user_prompts: Text describing what kind of terms to focus on.

    Returns:
        A ``(key_terms, heading)`` tuple on every path. ``key_terms`` is a
        list (empty when extraction fails or the response is malformed) and
        ``heading`` is a string (a generic fallback when the model supplies
        none). Failure paths used to return a bare ``[]``, which made
        ``key_terms, heading = extract_key_terms(...)`` raise ValueError.
    """
    # Fallback title used whenever the model does not give a usable heading.
    fallback_heading = "New Flashcard Set"

    print("Extracting key terms...")
    prompt = f"""
    You are an expert in identifying the most important concepts and terminology from educational content.
    Analyze the following transcript and extract MAXIMUM {num_flashcards} key terms or phrases that would be suitable for creating flashcards.
    Focus on {user_prompts}. But ONLY MAXIMUM {num_flashcards} MOST IMPORTANT!

    Return the results as a JSON list under the key "terms".
    Also give me a heading for the transcript. Return this under the key "heading".

    Example JSON format: {{"heading":"place the heading here","terms": ["term1", "concept phrase 2", "important name"]}}


    Transcript:
    ---
    {transcript}
    ---

    Extract key terms and return ONLY the JSON object:
    """
    try:
        response = model.generate_content(prompt)
        # Strip Markdown code fences the model may wrap around the JSON
        cleaned_response = response.text.strip().replace('```json', '').replace('```', '')
        data = json.loads(cleaned_response)

        if not isinstance(data, dict):
            print(f"Warning: expected a JSON object in the response but got: {data}")
            return [], fallback_heading

        # The heading becomes a page title downstream, so it must be a
        # non-empty string.
        heading = data.get("heading")
        if not isinstance(heading, str) or not heading.strip():
            heading = fallback_heading

        key_terms = data.get("terms", [])
        if not isinstance(key_terms, list):
             print(f"Warning: 'terms' field is not a list in the response: {data}")
             return [], heading
        print(f"Extracted {len(key_terms)} key terms.")
        return key_terms, heading
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response for key terms: {e}")
        print(f"LLM Raw Response: {response.text}")
        return [], fallback_heading
    except Exception as e:
        print(f"Error during key term extraction: {e}")
        return [], fallback_heading


In [7]:
def generate_flashcards(key_terms, index, chunks, confidence, k_neighbors=2):
    """Create one flashcard per key term from retrieved transcript context.

    For every term: embed it, look up the nearest transcript chunks in the
    FAISS index, ask the generative model for a flashcard JSON object built
    from those chunks, and keep the card if its self-rated confidence reaches
    the threshold.

    Args:
        key_terms: Terms to build cards for.
        index: FAISS index holding one vector per entry of ``chunks``.
        chunks: Transcript chunks, in the same order as the index vectors.
        confidence: Minimum confidence (0.0-1.0) a card needs to be kept.
        k_neighbors: Number of chunks to retrieve per term.

    Returns:
        List of card dicts with keys "topic", "information" and "confidence"
        (confidence normalised to float). Empty when the index or chunks are
        missing. Terms that fail are reported and skipped.
    """
    key_terms = key_terms or []
    print(f"Generating flashcards for {len(key_terms)} terms...")
    flashcards = []
    # Explicit None check: do not rely on the truthiness of the index object.
    if index is None or not chunks:
        print("Error: Index or chunks are missing, cannot generate flashcards.")
        return []

    for i, term in enumerate(key_terms):
        print(f"  Processing term {i+1}/{len(key_terms)}: '{term}'")
        try:
            # 1. Get query embedding
            # Use 'RETRIEVAL_QUERY' for the search term
            query_embedding_result = genai.embed_content(model=embedding_model,
                                                         content=term,
                                                         task_type="RETRIEVAL_QUERY")
            query_embedding = np.array([query_embedding_result['embedding']]).astype('float32')

            # 2. Search FAISS index
            _, indices = index.search(query_embedding, k_neighbors)

            # 3. Construct context from retrieved chunks.
            # FAISS pads the result with -1 when it finds fewer than
            # k_neighbors vectors. The lower bound matters because chunks[-1]
            # would otherwise silently inject the last chunk as context.
            context_chunks = [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]
            context = "\n---\n".join(context_chunks) # Separate chunks clearly

            if not context:
                 print(f"    Warning: No context found for term '{term}'. Skipping.")
                 continue

            # 4. Generate flashcard with context
            # NOTE(review): instruction 2 below asks for facts *not* in the
            # context, which conflicts with "Based *only* on the provided
            # context" - confirm which behaviour is intended.
            prompt = f"""
            You are an expert flashcard creator. Based *only* on the provided context,
            create a single flashcard for the key term: '{term}'.

            Follow these instructions precisely:
            1.  Give concise and accurate information about the key term '{term}', summarizing the relevant information from the context.
            2.  Include factual information not present in the context related to the key term '{term}'.
            3.  Rate your confidence (from 0.0 to 1.0) in the accuracy and relevance of the information based *solely* on the provided context. 1.0 means high confidence, 0.0 means no confidence.
            4.  Return the result as a single JSON object with the keys "topic", "information", and "confidence".
            5.  Information should be not more than 100 words.

            Key Term: "{term}"

            Context:
            ---
            {context}
            ---

            Generate the flashcard JSON object:
            """
            response = model.generate_content(prompt)
            # Strip Markdown code fences the model may wrap around the JSON
            cleaned_response = response.text.strip().replace('```json', '').replace('```', '')
            card = json.loads(cleaned_response)

            # 5. Validate and filter
            if not isinstance(card, dict) or not all(k in card for k in ["topic", "information", "confidence"]):
                 print(f"    Warning: Generated card for '{term}' missing required keys. Skipping.")
                 continue

            # The model sometimes returns the confidence as a string ("0.9").
            # Coerce it so a usable card is not lost to a str/float comparison.
            try:
                card_confidence = float(card["confidence"])
            except (TypeError, ValueError):
                print(f"    Warning: Non-numeric confidence for '{term}' ({card['confidence']!r}). Skipping.")
                continue
            card["confidence"] = card_confidence

            if card_confidence >= confidence:  # Quality filter
                flashcards.append(card)
                print(f"    Successfully generated flashcard for '{term}'.")
            else:
                print(f"    Skipping card for '{term}' due to low confidence ({card_confidence}).")

        except json.JSONDecodeError as e:
            print(f"    Error decoding JSON response for term '{term}': {e}")
            print(f"    LLM Raw Response: {response.text}")
            continue # Skip to the next term
        except Exception as e:
            print(f"    Error processing term '{term}': {e}")
            continue # Skip to the next term

    print(f"Generated {len(flashcards)} high-confidence flashcards.")
    return flashcards


In [8]:
def upload_to_notion(flashcards, notion_key, page_id,final_link,heading="New Flashcard Set"):
    """Create a Notion page with a "Flashcards" database and fill it with cards.

    Args:
        flashcards: Iterable of dicts with "topic" and "information" keys.
        notion_key: Notion integration token used to authenticate the client.
        page_id: Id of the existing Notion page under which the new page is
            created.
        final_link: Currently unused; kept so existing positional callers
            keep working.
        heading: Title of the newly created page.

    Returns:
        URL of the newly created Notion database.
    """
    notion = Client(auth=notion_key)

    # 1. Create a new parent page (optional, but useful for organization)
    parent_page = notion.pages.create(
        parent={"type": "page_id", "page_id": page_id},
        properties={"title": {"title": [{"text": {"content": heading}}]}},
    )

    # 2. Create a new database inside the parent page
    new_database = notion.databases.create(
        parent={"type": "page_id", "page_id": parent_page["id"]},
        title=[{"type": "text", "text": {"content": "Flashcards"}}],
        properties={
            "Topic": {"title": {}},  # Title column (for questions)
            "Information": {"rich_text": {}},  # Rich text column (for answers)
        },
    )

    # 3. Insert flashcards into the new database.
    # enumerate() gives the true position. The former flashcards.index(card)
    # lookup was quadratic and reported the first match for duplicate cards.
    total = len(flashcards)
    for position, card in enumerate(flashcards, start=1):
        print(f"Uploading flashcard: {position}/{total}")
        notion.pages.create(
            parent={"database_id": new_database["id"]},
            properties={
                "Topic": {
                    "title": [{"text": {"content": card["topic"]}}]
                },
                "Information": {
                    "rich_text": [{"text": {"content": card["information"]}}]
                }
            }
        )

    print(f"Created a new Notion database and uploaded {total} cards. You can access them here: {new_database['url']}")
    return new_database["url"]

In [9]:
def create_anki_deck(flashcards, deck_name="YouTube Flashcards"):
    """Write the flashcards to an Anki package file and return its filename.

    Each flashcard's "topic" becomes the question side and its "information"
    the answer side of a note. The package is saved as the deck name with
    spaces replaced by underscores plus the ".apkg" extension.
    """
    # Single front/back card layout shared by all notes
    card_template = {
        'name': 'Card 1',
        'qfmt': '{{Question}}',
        'afmt': '{{FrontSide}}<hr id="answer">{{Answer}}',
    }
    note_model = genanki.Model(
        1607392319,  # Random model ID
        'Simple Model',
        fields=[{'name': 'Question'}, {'name': 'Answer'}],
        templates=[card_template])

    deck = genanki.Deck(
        2059400110,  # Random deck ID
        deck_name)

    # One note per flashcard
    for entry in flashcards:
        deck.add_note(
            genanki.Note(
                model=note_model,
                fields=[entry['topic'], entry['information']]))

    # Persist the deck next to the notebook
    package_name = f'{deck_name.replace(" ", "_")}.apkg'
    genanki.Package(deck).write_to_file(package_name)
    print(f"Created Anki deck with {len(flashcards)} cards: {package_name}")
    return package_name

In [10]:
def _extract_notion_page_id(page_url):
    """Return the page id found at the end of a Notion page URL."""
    import re  # local import so this cell does not depend on the import cell

    # Keep only the last path segment, without query string, fragment or
    # trailing slash.
    slug = page_url.split("?", 1)[0].split("#", 1)[0].rstrip("/").rsplit("/", 1)[-1]
    # Page URLs of the form used in this notebook end in a 32-character hex id.
    match = re.search(r"[0-9a-fA-F]{32}$", slug)
    if match:
        return match.group(0)
    # Fall back to whatever follows the last hyphen (or the whole segment).
    return slug.rsplit("-", 1)[-1]


def youtube_to_notion_flashcards(url,google_key,notion_key,page_url, num_flashcards,confidence,user_prompt):
    """Run the full pipeline: YouTube audio -> transcript -> flashcards -> Notion.

    Args:
        url: YouTube video URL.
        google_key: API key passed to ``configure`` for the Google models.
        notion_key: Notion integration token.
        page_url: URL of the Notion page under which the flashcard set is
            created; the page id is taken from the end of this URL.
        num_flashcards: Maximum number of key terms to request.
        confidence: Minimum confidence a generated card needs to be kept.
        user_prompt: What kind of terms to focus on; a default description is
            used when this is None or empty.

    Returns:
        The URL returned by ``upload_to_notion``, or None when the pipeline
        stops early because a step produced nothing to upload.
    """
    configure(api_key=google_key)

    # Step 1: Download audio
    print("Downloading YouTube audio...")
    audio_file = download_youtube_audio(url)

    # Step 2: Transcribe
    print("Transcribing audio...")
    transcript = transcribe_audio(audio_file)

    # Step 3: Create embeddings
    print("Create embeddings...")
    index, chunks, dimension = create_embeddings_and_index(transcript)
    if index is None or not chunks:
        print("Stopping: the transcript could not be embedded and indexed.")
        return None

    # Step 4: Extract key terms
    print("Extracting key terms...")
    if user_prompt is None or user_prompt == "":
        user_prompt = "nouns, technical terms, definitions, important names, or core concepts"
    extracted = extract_key_terms(transcript,num_flashcards,user_prompt)
    # Tolerate a failure result that is not a (terms, heading) pair instead of
    # crashing on tuple unpacking.
    if isinstance(extracted, tuple) and len(extracted) == 2:
        key_terms, heading = extracted
    else:
        key_terms, heading = [], None
    if not key_terms:
        print("Stopping: no key terms were extracted.")
        return None
    if not isinstance(heading, str) or not heading:
        heading = "New Flashcard Set"

    # Step 5: Generate flashcards
    print("Generating flashcards...")
    flashcards = generate_flashcards(key_terms,index,chunks,confidence)
    if not flashcards:
        print("Stopping: no flashcards passed the confidence filter.")
        return None

    # Step 6: Upload to Notion
    print("Uploading to Notion...")
    # upload_to_notion takes (flashcards, notion_key, page_id, final_link,
    # heading). The call used to pass only four arguments, so the heading
    # landed in `final_link` and the page always got the default title.
    # `final_link` is unused by upload_to_notion; the source video URL is
    # passed to fill the slot.
    return upload_to_notion(flashcards,notion_key,_extract_notion_page_id(page_url),url,heading)


In [11]:
### Static Inputs
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
google_key = user_secrets.get_secret("API_KEY")                    # Google key from Google AI Studio, stored as the secret "API_KEY"
# SECURITY: the Notion integration token used to be pasted here as a string
# literal. A token committed in a notebook must be treated as leaked: revoke it
# in Notion and create a new one. Store the new token as a secret named
# "NOTION_KEY" next to "API_KEY". The integration is created in Notion's
# creator profile and has to be connected to the target page; see
# https://developers.notion.com/
notion_key = user_secrets.get_secret("NOTION_KEY")

# User Inputs
page_url = "https://www.notion.so/flashcards-1ce33276f2848062aec9ddb3763230f6"     # Add the notion page url where you want to store the flashcard.
url = "https://www.youtube.com/watch?v=aDmp2Uim0zQ&t=115s"                         # Give url for youtube video
num_flashcards = 10                                                                 # Input the number of flashcards required
confidence = 0.85                                                                   # Minimum confidence a card needs to be kept.
user_prompt = ""                                                                    # Optional focus for term extraction; empty uses the default.

youtube_to_notion_flashcards(url,google_key,notion_key,page_url,num_flashcards,confidence,user_prompt)

Downloading YouTube audio...
[youtube] Extracting URL: https://www.youtube.com/watch?v=aDmp2Uim0zQ&t=115s
[youtube] aDmp2Uim0zQ: Downloading webpage
[youtube] aDmp2Uim0zQ: Downloading tv client config
[youtube] aDmp2Uim0zQ: Downloading player 9a279502-main
[youtube] aDmp2Uim0zQ: Downloading tv player API JSON
[youtube] aDmp2Uim0zQ: Downloading ios player API JSON
[youtube] aDmp2Uim0zQ: Downloading m3u8 information
[info] aDmp2Uim0zQ: Downloading 1 format(s): 251
[download] Destination: audio.mp3
[download] 100% of    5.50MiB in 00:00:00 at 40.37MiB/s  
Transcribing audio...


100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 78.4MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Create embeddings...
Splitting text into chunks...
Generating embeddings for 5 chunks...
  Embedding chunk 1/5
  Embedding chunk 2/5
  Embedding chunk 3/5
  Embedding chunk 4/5
  Embedding chunk 5/5
Creating FAISS index with dimension 768...
FAISS index created successfully with 5 vectors.
Extracting key terms...
Extracting key terms...
Extracted 10 key terms.
Generating flashcards...
Generating flashcards for 10 terms...
  Processing term 1/10: 'LLM settings'
    Successfully generated flashcard for 'LLM settings'.
  Processing term 2/10: 'Temperature'
    Successfully generated flashcard for 'Temperature'.
  Processing term 3/10: 'Top P'
    Successfully generated flashcard for 'Top P'.
  Processing term 4/10: 'Top K'
    Successfully generated flashcard for 'Top K'.
  Processing term 5/10: 'Candidate words'
    Successfully generated flashcard for 'Candidate words'.
  Processing term 6/10: 'Probability scores'
    Successfully generated flashcard for 'Probability scores'.
  Processi