# Workflow of QuizTube

## 1. Extract the YouTube ID from a given link

In [None]:
from pytube import extract

urls = [
    'http://youtu.be/SA2iWivDJiE',
    'http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu',
    'http://www.youtube.com/embed/SA2iWivDJiE',
    'http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US',
    'https://www.youtube.com/watch?v=rTHlyTphWP0&index=6&list=PLjeDyYvG6-40qawYNR4juzvSOg-ezZ2a6',
    'https://www.youtube.com/watch?time_continue=9&v=n0g-Y0oo5Qs&feature=emb_logo'
]

for url in urls:
    video_id = extract.video_id(url)
    # print(video_id)

## 2. Extract & transform video captions

In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
video_id = "bcYwiwsDfGE"
transcript = YouTubeTranscriptApi.get_transcript(video_id)
transcript



[{'text': 'Okay, some of you might have heard already,\nbut you can now use Python inside Excel.',
  'start': 0.13,
  'duration': 4.74},
 {'text': "In this video, I will show some basic examples\nof how to use it, and I'll guide you through",
  'start': 4.87,
  'duration': 5.2},
 {'text': 'a real-world example to give you an idea of\nthe possibilities.',
  'start': 10.07,
  'duration': 4.199},
 {'text': "In that example, I'll show you some very useful\npandas features and some more advanced charts.",
  'start': 14.269,
  'duration': 5.301},
 {'text': "Towards the end, I'll also share my personal\nopinion about this new feature and point out",
  'start': 19.57,
  'duration': 4.58},
 {'text': 'some limitations.', 'start': 24.15, 'duration': 1.26},
 {'text': 'But, before we start, a quick shout-out to\nLuke.',
  'start': 25.41,
  'duration': 3.1},
 {'text': 'Luke was kind enough to connect me with people\nfrom Anaconda so that I could test this feature.',
  'start': 28.51,
  'duration': 4

In [None]:
import pysrt
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# Hide the main tkinter window
Tk().withdraw()

# Open the file dialog and get the file path
file_path = askopenfilename()

# Open the .srt file
subs = pysrt.open(file_path)

# Convert to list and print the first subtitle text
subs = list(subs)
# print(subs[0].text)

# Extract and combine the text from each subtitle
text = ' '.join(sub.text for sub in subs)

# Now 'text' contains all the subtitle texts combined
# print(text)


In [6]:
# transcript = " ".join([item["text"] for item in transcript])
transcript = text 

## 3. Feed video captions into LLM (OpenAI)

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import LLMChain


#########################################
# getting the API key from the .env file
from dotenv import load_dotenv
import os

load_dotenv()  # This loads the .env file at the application's root

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
#########################################


template = f"""
You are a helpful assistant programmed to generate questions based on any text provided. For every chunk of text you receive, you're tasked with designing 5 distinct questions. Each of these questions will be accompanied by 3 possible answers: one correct answer and two incorrect ones. 

For clarity and ease of processing, structure your response in a way that emulates a Python list of lists. 

Your output should be shaped as follows:

1. An outer list that contains 5 inner lists.
2. Each inner list represents a set of question and answers, and contains exactly 4 strings in this order:
- The generated question.
- The correct answer.
- The first incorrect answer.
- The second incorrect answer.

Your output should mirror this structure:
[
    ["Generated Question 1", "Correct Answer 1", "Incorrect Answer 1.1", "Incorrect Answer 1.2"],
    ["Generated Question 2", "Correct Answer 2", "Incorrect Answer 2.1", "Incorrect Answer 2.2"],
    ...
]

It is crucial that you adhere to this format as it's optimized for further Python processing.

"""

system_message_prompt = SystemMessagePromptTemplate.from_template(template)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
chain = LLMChain(
    llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY),
    prompt=chat_prompt,
)
quiz_data = chain.run(transcript)
print(type(quiz_data))
print(quiz_data)

<class 'str'>
[
    ["Who co-authored the book 'An Introduction to Statistical Learning with Applications in R'?", "Rob Tibshirani and Daniela Witten", "Gareth James and Jonathan Taylor", "Trevor and Jonathan Taylor"],
    ["When was the updated edition of the statistical learning edX course developed?", "2021", "2014", "1999"],
    ["What programming language is the new course 'statistical learning with Python' based on?", "Python", "R", "Java"],
    ["What package is provided in the course and book for statistical modeling?", "ISLP", "Jupyter", "Python"],
    ["Where can the PDF of the book be obtained for free?", "Stat Learning website", "Amazon", "Barnes & Noble"]
]


## 4. Transform output

In [8]:
import ast

quiz_data_clean = ast.literal_eval(quiz_data)
print(type(quiz_data_clean))
print(quiz_data_clean)

<class 'list'>
[["Who co-authored the book 'An Introduction to Statistical Learning with Applications in R'?", 'Rob Tibshirani and Daniela Witten', 'Gareth James and Jonathan Taylor', 'Trevor and Jonathan Taylor'], ['When was the updated edition of the statistical learning edX course developed?', '2021', '2014', '1999'], ["What programming language is the new course 'statistical learning with Python' based on?", 'Python', 'R', 'Java'], ['What package is provided in the course and book for statistical modeling?', 'ISLP', 'Jupyter', 'Python'], ['Where can the PDF of the book be obtained for free?', 'Stat Learning website', 'Amazon', 'Barnes & Noble']]


In [4]:
from helpers.openai_utils import correct_text
from helpers.youtube_utils import process_srt_file
import tkinter as tk
from tkinter import filedialog


def open_srt_file():
    # Create a new tkinter root window
    root = tk.Tk()

    # Hide the root window
    root.withdraw()

    # Open a file dialog and get the path to the selected .srt file
    srt_file_path = filedialog.askopenfilename(filetypes=[("SRT files", "*.srt")])

    # Return the path to the .srt file
    return srt_file_path

# Use the function
srt_file_path = open_srt_file()

# Open the file before passing it to the process_srt_file function
with open(srt_file_path, 'r', encoding='ISO-8859-1') as srt_file:
    text, text_copy = process_srt_file(srt_file)



AttributeError: 'str' object has no attribute 'decode'