In [None]:
import os
from getpass import getpass

# Never store an API key in the notebook itself: anything typed here is saved
# with the file and shared with everyone who can read it. The key that used to
# be hard-coded on this line must be treated as compromised and revoked.
#
# Use the key already present in the environment if there is one; otherwise
# ask for it interactively (getpass does not echo the input).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Install Required Libraries
!pip install openai==0.28.0 faiss-cpu numpy pymupdf ipywidgets tiktoken

# Import Dependencies
import openai
import os
import faiss
import numpy as np
import fitz  # PyMuPDF
import json
import ipywidgets as widgets
from IPython.display import display, clear_output

# Hand the key from the OPENAI_API_KEY environment variable to the OpenAI
# client; the value is None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Function: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF's pages joined by newlines.

    Every page is read in plain "text" mode and stripped of surrounding
    whitespace. Pages whose stripped text is 50 characters or shorter are
    left out of the result.
    """
    with fitz.open(pdf_path) as document:
        stripped_pages = (page.get_text("text").strip() for page in document)
        kept_pages = [body for body in stripped_pages if len(body) > 50]
    return "\n".join(kept_pages)

# Function: Chunk Text for Retrieval
import tiktoken
def chunk_text(text, max_tokens=300):
    """Split text into chunks of whole words holding about ``max_tokens`` tokens.

    The text is split on whitespace and words are packed greedily: a chunk is
    closed as soon as adding the next word would push its token count above
    ``max_tokens``. Tokens are counted word by word with the ``cl100k_base``
    encoding, so the count for a joined chunk is an approximation.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only input gives an empty list. A single word that is
        larger than ``max_tokens`` on its own is kept as a chunk by itself
        rather than being split.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    chunks, chunk, token_count = [], [], 0
    for word in text.split():
        word_tokens = len(encoding.encode(word))
        # Only close a chunk that actually holds words. Without this guard an
        # oversized word arriving on an empty chunk emitted an empty string,
        # which is useless for retrieval and invalid input for embedding.
        if chunk and token_count + word_tokens > max_tokens:
            chunks.append(" ".join(chunk))
            chunk, token_count = [], 0
        chunk.append(word)
        token_count += word_tokens
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

# Function: Process PDF & Prepare Chunks
def process_pdf_for_rag(pdf_path, university, course):
    """Turn a PDF into a list of chunk records tagged with university and course.

    The PDF text is extracted and chunked; each chunk becomes a dict with the
    keys "university", "course", "chunk_id" (its position, starting at 0) and
    "text".
    """
    pieces = chunk_text(extract_text_from_pdf(pdf_path))
    records = []
    for position, piece in enumerate(pieces):
        records.append({
            "university": university,
            "course": course,
            "chunk_id": position,
            "text": piece,
        })
    return records

# Build the chunk records for the course PDF; the file must already exist at
# this path when the cell runs.
# NOTE(review): the recorded run of this notebook ended with FileNotFoundError
# for this exact path — confirm where the PDF is actually uploaded.
course_chunks = process_pdf_for_rag(
    pdf_path="/content/content/collegebiologysummaryquestions.pdf",
    university="University of Toronto",
    course="BIOL 101",
)

# Function: Generate OpenAI Embeddings
def generate_embedding(text):
    """Return the embedding of ``text`` from the "text-embedding-ada-002" model.

    Makes one call to the OpenAI embeddings endpoint and returns the
    "embedding" field of the first item in the response's "data" list.
    """
    result = openai.Embedding.create(model="text-embedding-ada-002", input=text)
    first_item = result["data"][0]
    return first_item["embedding"]

# Attach an "embedding" entry to every chunk record (one API call per chunk).
for chunk in course_chunks:
    chunk.update(embedding=generate_embedding(chunk["text"]))

# Function: Store Embeddings in FAISS
def store_embeddings_faiss(data):
    """Build a flat L2 FAISS index holding the "embedding" of every record.

    The index dimension is the length of the first record's embedding, and
    the vectors are added as one float32 array in the order of ``data``.
    """
    dimension = len(data[0]["embedding"])
    flat_index = faiss.IndexFlatL2(dimension)
    rows = [record["embedding"] for record in data]
    matrix = np.array(rows).astype(np.float32)
    flat_index.add(matrix)
    return flat_index

# Index the embedded course chunks so they can be searched by similarity.
faiss_index = store_embeddings_faiss(data=course_chunks)

# Function: Search FAISS for Relevant Chunks
def search_relevant_chunks(query, faiss_index, data, top_k=3):
    """Return the records of ``data`` closest to ``query`` in the FAISS index.

    Args:
        query: Text to embed and search for.
        faiss_index: Index to search; row ``i`` of the index is expected to
            hold the embedding of ``data[i]``.
        data: The records that were indexed.
        top_k: Maximum number of records to return.

    Returns:
        Up to ``top_k`` records in the order returned by the index. Fewer are
        returned when the index holds fewer than ``top_k`` vectors.
    """
    query_embedding = np.array([generate_embedding(query)]).astype("float32")
    _, indices = faiss_index.search(query_embedding, top_k)
    # FAISS fills the result with -1 when it has fewer than top_k hits. Used
    # as a list index, -1 would silently return the LAST record as if it were
    # a match, so drop every id that is not a valid position in data.
    return [data[i] for i in indices[0] if 0 <= i < len(data)]

# Function: Generate AI-Powered Flashcards
def generate_flashcards_with_context(context):
    """Ask GPT-4 for 10 flashcards based on ``context`` and return them parsed.

    Args:
        context: Course material to ground the flashcards in; it is inserted
            verbatim into the prompt.

    Returns:
        The parsed JSON reply. The prompt asks for a list of dicts with
        "question" and "answer" keys, but the shape is not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after an
            enclosing Markdown code fence has been removed.
    """
    prompt = f"""
    You are an AI assistant that generates concise and high-quality flashcards for students.

    Below is some course material:

    {context}

    Based on the above reference material, generate 10 high-quality flashcards in JSON format.
    Each flashcard should have:
    - A "question" field (formatted as a multiple-choice or open-ended question).
    - An "answer" field (a clear and concise answer).

    Return the flashcards as a JSON list of dictionaries.
    """

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7
    )

    content = response["choices"][0]["message"]["content"].strip()
    # Chat models often wrap JSON in a Markdown fence (``` or ```json). Remove
    # the opening fence line and the closing fence so json.loads sees only the
    # payload instead of failing on the first backtick.
    if content.startswith("```"):
        content = content.split("\n", 1)[1] if "\n" in content else ""
        content = content.rstrip()
        if content.endswith("```"):
            content = content[:-3]
    return json.loads(content)

# Build the form: three text inputs, two buttons and an output area.
university_input = widgets.Text(
    description="University:",
    placeholder="Enter University Name",
)
course_input = widgets.Text(
    description="Course:",
    placeholder="Enter Course Name",
)
# NOTE(review): max_length may not be a trait that widgets.Text supports, in
# which case the 150-character limit is not enforced — confirm.
topic_input = widgets.Text(
    description="Topic:",
    placeholder="Enter Topic (max 150 chars)",
    max_length=150,
)
generate_button = widgets.Button(
    button_style="success",
    description="Generate Flashcards",
)
redo_button = widgets.Button(
    button_style="warning",
    description="Redo Flashcards",
)
output_area = widgets.Output()

# The redo button stays hidden until flashcards have been generated once.
redo_button.layout.display = 'none'

def _show_message(message):
    """Replace the contents of ``output_area`` with a single message."""
    with output_area:
        output_area.clear_output()
        print(message)


def _show_flashcards(flashcards, heading):
    """Replace the contents of ``output_area`` with ``heading`` and the numbered cards."""
    # Format everything first so that a malformed card (missing "question" or
    # "answer") raises before the previous output is cleared.
    lines = []
    for i, card in enumerate(flashcards, 1):
        lines.append(f"🔹 {i}. **Q:** {card['question']}")
        lines.append(f"   ✅ **A:** {card['answer']}\n")
    with output_area:
        output_area.clear_output()
        print(heading)
        for line in lines:
            print(line)


def _generate_and_show(chunks, heading):
    """Generate flashcards from ``chunks`` and render them; return True on success."""
    try:
        context = "\n\n".join([chunk["text"] for chunk in chunks])
        flashcards = generate_flashcards_with_context(context)
        _show_flashcards(flashcards, heading)
    except Exception as exc:
        # Report the failure inside the form: an exception raised from a
        # button callback may otherwise never be shown to the user.
        _show_message(f"❌ Could not generate flashcards: {exc}")
        return False
    return True


def on_button_click(b):
    """Handle "Generate Flashcards": retrieve matching chunks and show new cards.

    Reads the three text inputs; if any is blank, a warning is shown and
    nothing else happens. Otherwise the chunks retrieved for the topic are
    stored in the global ``retrieved_chunks`` (for the redo button), the
    flashcards are rendered into ``output_area``, and the redo button is
    revealed once generation has succeeded.

    Args:
        b: The button that was clicked (unused).
    """
    global retrieved_chunks

    university = university_input.value.strip()
    course = course_input.value.strip()
    topic = topic_input.value.strip()

    if not (university and course and topic):
        _show_message("⚠️ Please fill in all fields before generating flashcards.")
        return

    clear_output(wait=True)
    display(university_input, course_input, topic_input, generate_button, redo_button, output_area)

    print(f"\n🎯 Searching for flashcards on **{topic}** in {course} at {university}...")

    # Retrieve relevant chunks
    try:
        retrieved_chunks = search_relevant_chunks(topic, faiss_index, course_chunks)
    except Exception as exc:
        _show_message(f"❌ Could not retrieve course material: {exc}")
        return

    # Show "Redo Flashcards" only after a successful generation
    if _generate_and_show(retrieved_chunks, "\n📚 AI-Generated Flashcards:\n"):
        redo_button.layout.display = 'inline-block'


def on_redo_click(b):
    """Handle "Redo Flashcards": regenerate cards from the last retrieved chunks.

    Args:
        b: The button that was clicked (unused).
    """
    # retrieved_chunks only exists after a successful retrieval; look it up
    # defensively instead of raising NameError.
    chunks = globals().get("retrieved_chunks")
    if not chunks:
        _show_message("⚠️ Generate flashcards first, then use Redo.")
        return
    _generate_and_show(chunks, "\n🔄 Regenerating new AI-Generated Flashcards...\n")

# Register the click callbacks, then render the whole form.
redo_button.on_click(on_redo_click)
generate_button.on_click(on_button_click)
display(
    university_input,
    course_input,
    topic_input,
    generate_button,
    redo_button,
    output_area,
)

Collecting openai==0.28.0
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.

FileNotFoundError: no such file: '/content/content/collegebiologysummaryquestions.pdf'

In [None]:
import json


def read_json_records(path):
    """Read every JSON document in a file and return the records as a list.

    Accepts a single JSON array, one object per line (JSONL), or several
    JSON documents simply written one after another. A plain ``json.load``
    only accepts the first form and fails with "Extra data" on the others.

    Args:
        path: Path of the UTF-8 encoded input file.

    Returns:
        A flat list: the items of every top-level array plus every other
        top-level value, in file order.

    Raises:
        json.JSONDecodeError: If some part of the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()

    decoder = json.JSONDecoder()
    records, pos, end = [], 0, len(raw)
    while True:
        # raw_decode does not skip leading whitespace, so step over the
        # blanks and newlines that separate consecutive documents.
        while pos < end and raw[pos].isspace():
            pos += 1
        if pos >= end:
            return records
        value, pos = decoder.raw_decode(raw, pos)
        if isinstance(value, list):
            records.extend(value)
        else:
            records.append(value)


# Load the JSON file
data = read_json_records("islam.json")

# Save as JSONL
with open("islam.jsonl", "w", encoding="utf-8") as f:
    for entry in data:
        f.write(json.dumps(entry) + "\n")  # Converts each object to a JSON line

print("✅ Conversion to JSONL complete. Upload 'islam.jsonl' to OpenAI.")

JSONDecodeError: Extra data: line 2 column 1 (char 352)