In [2]:
!pip install youtube-transcript-api
!pip install transformers
%pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90mâ”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”â”[0m [32m76.5/76.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.28.0


In [4]:
import openai
from getpass import getpass
from youtube_transcript_api import YouTubeTranscriptApi

In [7]:
import spacy

# Shared spaCy pipeline for the whole notebook. split_into_sentences(),
# generate_questions() and filter_sentences() all call `nlp(...)` for sentence
# segmentation and part-of-speech tags, so this cell must run before any of them.
nlp = spacy.load("en_core_web_sm")  # Load a SpaCy model for English

def split_into_sentences(text):
    """Split ``text`` into sentences using the notebook-level spaCy pipeline.

    Args:
        text: The text to segment.

    Returns:
        A list with one whitespace-stripped string per sentence found by
        ``nlp(text).sents``, in document order.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text.strip())
    return sentences


In [8]:
def fetch_transcript(video_id):
    """Download a video's transcript and flatten it into one string.

    Args:
        video_id: Identifier passed straight to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``'text'`` value of every transcript entry joined with single
        spaces, or ``None`` if anything goes wrong (the error is printed,
        not raised).
    """
    try:
        # Each entry is indexed by its 'text' key; other keys are ignored.
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(segment['text'] for segment in segments)
    except Exception as err:
        # Best-effort: report the failure and let the caller test for None.
        print(f"Error obtaining transcript: {err}")
        return None

In [9]:
def summarize_text(text):
    """Ask the OpenAI chat API (gpt-3.5-turbo) to summarise ``text``.

    Args:
        text: The content to summarise; sent verbatim as the user message.

    Returns:
        The content of the first choice's message. If the request or the
        response handling raises, the error is printed and the exception
        message is returned instead.

    NOTE(review): on failure the caller receives the error text in place of a
    summary and cannot tell the two apart without inspecting the string.
    """
    system_prompt = "You are a helpful assistant tasked with providing a comprehensive summary that covers all key points in detail."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            temperature=0.5,  # moderate sampling randomness
            max_tokens=600,   # upper bound on the length of the reply
        )
        # The message is indexed like a mapping to pull out the reply text.
        return completion.choices[0].message['content']
    except Exception as err:
        print("Error during summarization:", str(err))
        return str(err)

In [5]:
def get_video_id(url):
    """Extract the YouTube video ID from a URL.

    Supported shapes:
      * any URL carrying a ``v`` query parameter, e.g.
        ``https://www.youtube.com/watch?v=<id>&t=5`` (original behaviour);
      * short links, ``https://youtu.be/<id>?si=...``;
      * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>`` and
        ``/live/<id>``.

    Args:
        url: The video URL as a string.

    Returns:
        The video ID, or ``None`` when ``url`` is empty, not a string, or
        does not match any supported shape.
    """
    from urllib.parse import urlparse, parse_qs

    if not url or not isinstance(url, str):
        return None

    parsed = urlparse(url)

    # Original behaviour: the first `v` query parameter wins, whatever the host.
    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]

    host = (parsed.hostname or '').lower()
    segments = [part for part in parsed.path.split('/') if part]

    # Short links keep the ID as the first path segment; any query string
    # (such as the `si` share token) has already been split off by urlparse.
    if host in ('youtu.be', 'www.youtu.be'):
        return segments[0] if segments else None

    # Path-based forms on the main site: /embed/<id>, /shorts/<id>, /live/<id>.
    on_youtube = host == 'youtube.com' or host.endswith('.youtube.com')
    if on_youtube and len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live'):
        return segments[1]

    return None

In [10]:
from transformers import pipeline

import json  # used by save_questions_to_json() further down

# Load the T5 model for question generation.
# This builds a Hugging Face "text2text-generation" pipeline from the
# valhalla/t5-small-qa-qg-hl checkpoint; generate_questions() calls it once per
# sentence. The captured output below shows the model files being downloaded
# when this cell ran.
question_generator = pipeline("text2text-generation", model="valhalla/t5-small-qa-qg-hl")

def generate_questions(transcript):
    """Generate one candidate question per sentence of ``transcript``.

    The transcript is segmented with the notebook-level spaCy pipeline and
    each stripped sentence is sent to ``question_generator`` behind a
    ``"generate question: "`` prefix.

    Args:
        transcript: The full transcript as a single string.

    Returns:
        The stripped generated texts that end with ``'?'``, in sentence
        order. Sentences whose generation is empty or does not end with a
        question mark contribute nothing.
    """
    collected = []
    for sentence in nlp(transcript).sents:
        outputs = question_generator(
            f"generate question: {sentence.text.strip()}", max_length=50
        )
        if not outputs:
            continue
        # Only the first generated candidate is considered.
        candidate = outputs[0]['generated_text'].strip()
        if candidate.endswith('?'):
            collected.append(candidate)
    return collected

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
def filter_sentences(sentences):
    """Keep only sentences that look informative.

    A sentence is kept when it is longer than 20 characters, does not contain
    the literal ``"example non-informative phrase"``, and the spaCy
    part-of-speech tags include at least one ``VERB`` and one ``NOUN``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of the sentences that pass, in their original order.
    """
    kept = []
    for candidate in sentences:
        # Cheap string checks first, so the spaCy pipeline only runs on
        # sentences that could still qualify.
        if len(candidate) <= 20 or "example non-informative phrase" in candidate:
            continue
        pos_tags = {token.pos_ for token in nlp(candidate)}
        if {'VERB', 'NOUN'} <= pos_tags:
            kept.append(candidate)
    return kept


In [12]:
def is_redundant(question):
    """Report whether a question repeats a word across its two halves.

    The question is lower-cased, leading/trailing ``'?'`` characters are
    removed, and it is split on whitespace. Questions of four words or fewer
    are never redundant. Longer ones are cut at ``len(words) // 2`` and count
    as redundant when any word occurs in both halves.

    Args:
        question: The question text.

    Returns:
        ``True`` if the halves share at least one word, otherwise ``False``.
    """
    tokens = question.lower().strip('?').split()
    if len(tokens) <= 4:
        return False
    midpoint = len(tokens) // 2
    leading, trailing = tokens[:midpoint], tokens[midpoint:]
    return not set(leading).isdisjoint(trailing)

def has_sufficient_information(question):
    """Check that a question carries enough distinct content words.

    The question is lower-cased and split on whitespace, duplicates are
    dropped, and the filler words "what", "is", "the", "a" and "of" are
    ignored. Punctuation is not stripped, so a token such as ``"of?"`` still
    counts as content.

    Args:
        question: The question text.

    Returns:
        ``True`` when more than four distinct non-filler tokens remain.
    """
    filler = {"what", "is", "the", "a", "of"}
    distinct_tokens = set(question.lower().split())
    return len(distinct_tokens - filler) > 4

def is_logically_coherent(question):
    """Reject questions that repeat a common question opener.

    Args:
        question: The question text; compared case-insensitively.

    Returns:
        ``False`` if any of "what is the", "how does", "where is",
        "when is" or "why does" occurs more than once, otherwise ``True``.
    """
    openers = ("what is the", "how does", "where is", "when is", "why does")
    lowered = question.lower()
    return all(lowered.count(opener) <= 1 for opener in openers)


def filter_questions(questions):
    """Keep only the generated questions that pass every quality check.

    A question is kept when it is longer than 10 characters, contains exactly
    one ``'?'``, is not flagged by ``is_redundant``, and passes both
    ``has_sufficient_information`` and ``is_logically_coherent``. The checks
    run in that order and stop at the first failure.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list of the questions that pass, in their original order.
    """
    def _passes(candidate):
        if len(candidate) <= 10:
            return False
        if candidate.count('?') != 1:
            return False
        if is_redundant(candidate):
            return False
        if not has_sufficient_information(candidate):
            return False
        return bool(is_logically_coherent(candidate))

    return [candidate for candidate in questions if _passes(candidate)]




In [13]:
def video_id_from_url(video_url):
    """Extract the video ID from a youtu.be or youtube.com URL.

    Fixes over the previous string-splitting version:
      * youtu.be links no longer return the ID with the query string attached
        (``https://youtu.be/<id>?si=...`` used to yield ``"<id>?si=..."``);
      * youtube.com URLs without a ``v`` parameter return ``None`` instead of
        raising ``IndexError``;
      * empty or ``None`` input returns ``None`` instead of raising.

    Args:
        video_url: The URL as a string, with or without a scheme.

    Returns:
        The video ID, or ``None`` when it cannot be determined.
    """
    from urllib.parse import urlparse, parse_qs

    if not video_url:
        return None

    # urlparse only recognises the host when a scheme is present, so add one
    # for bare inputs such as "youtu.be/<id>".
    if '://' in video_url:
        normalized = video_url
    else:
        normalized = 'https://' + video_url.lstrip('/')
    parsed = urlparse(normalized)

    if "youtu.be" in video_url:
        # The ID is the last path segment; query string and fragment were
        # already split off by urlparse.
        segments = [part for part in parsed.path.split('/') if part]
        return segments[-1] if segments else None
    elif "youtube.com" in video_url:
        ids = parse_qs(parsed.query).get('v')
        return ids[0] if ids else None
    return None

In [15]:
def save_questions_to_json(questions, filename='questions.json'):
    """Write ``questions`` to ``filename`` as JSON with a 4-space indent.

    Args:
        questions: A JSON-serialisable object (the notebook passes a list of
            question strings).
        filename: Destination path; any existing file is overwritten.
    """
    with open(filename, 'w') as handle:
        handle.write(json.dumps(questions, indent=4))

In [16]:
if __name__ == "__main__":
    # End-to-end run: URL -> video id -> transcript -> generated questions ->
    # filtered questions -> JSON file (save_questions_to_json's default
    # filename) plus a printed listing.
    video_url = "https://youtu.be/rY9zHNOjGrs?si=uGFsf3nbN9tVsXTa"
    video_id = video_id_from_url(video_url)
    transcript = fetch_transcript(video_id)
    if not transcript:
        # fetch_transcript() returns None on failure; an empty transcript
        # also ends up here.
        print("Failed to retrieve or process transcript.")
    else:
        questions = generate_questions(transcript)
        filtered_questions = filter_questions(questions)
        # Persist before printing so the file exists even if the listing is interrupted.
        save_questions_to_json(filtered_questions)  # Save filtered questions to JSON
        print("Filtered questions saved to JSON.")
        for question in filtered_questions:
            print(question)

Filtered questions saved to JSON.
What will you not be getting today?
What is the name of the battles and tactics that we will not be describing?
What was the name of the chess game that Abraham Lincoln played against Abraham Lincoln?
What did historians say the Civil War was about slavery?
What did Lincoln say about the slave population?
What was the name of the slaves that constituted a peculiar and powerful interest?
What is the name of the post that was created by Thought Bubble?
What did Lincoln try to downplay in the beginning of the war?
What did Goldfield say northerners would preserve God's plan to extend democracy and Christianity?
What song was a popular song for the Battle Hymn of the Republic?
What did you think I was going to sing?
What was the advantage of mechanization in northern agriculture?
What did JEB Stuart say to his friend and lieutenant?
What was the issue of overcoming class conflicts?
What was the idea that the war of attrition would eventually wear down nort