In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import json
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Full path of every file found anywhere beneath the Kaggle input directory,
# in the order os.walk yields them.
FILE_PATHS = []
for root, _, names in os.walk('/kaggle/input'):
    FILE_PATHS.extend(os.path.join(root, name) for name in names)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install bardapi

In [None]:
!pip install youtube-transcript-api faiss-cpu langchain sentence-transformers gradio-client

In [None]:
# Text-splitting settings shared by every RecursiveCharacterTextSplitter
# constructed further down in this notebook.
CHUNK_SIZE = 1000  # passed as the splitter's chunk_size
CHUNK_OVERLAP = 50  # passed as the splitter's chunk_overlap

In [None]:
import textwrap
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.docstore.document import Document

In [None]:
# Load the run configuration from the first discovered input file. Later cells
# read cookies, language settings, the video link, the run type and the query
# from `obj`. The context manager closes the handle even if parsing fails.
with open(FILE_PATHS[0], encoding="utf-8") as f:
    obj = json.load(f)

In [None]:
import os
from typing import Any, List, Mapping, Optional
from bardapi import BardCookies
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

# Cookies handed to BardCookies(cookie_dict=...) inside GPTv1._call; the values
# come from the JSON configuration loaded into `obj`.
cookie_dict = {
    "__Secure-1PSID": obj["_1PSID"],
    "__Secure-1PSIDTS": obj["_1PSIDTS"],
    "__Secure-1PSIDCC": obj["_1PSIDCC"],
    # Add any further cookie values the session object should carry here.
}

class GPTv1(LLM):
    """LangChain ``LLM`` wrapper that forwards prompts to Bard through ``bardapi``.

    The session is built from the module-level ``cookie_dict``.
    """

    @property
    def _llm_type(self) -> str:
        """Return the label identifying this LLM implementation."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` to Bard and return the text of its answer.

        Args:
            prompt: The fully rendered prompt text.
            stop: Optional stop sequences. The answer is cut just before the
                earliest occurrence of any of them.
            run_manager: Callback manager supplied by LangChain; unused here.
            **kwargs: Extra keyword arguments forwarded by LangChain; accepted
                for compatibility and ignored.

        Returns:
            The ``'content'`` field of Bard's response, truncated at the first
            stop sequence when ``stop`` is given.
        """
        # A fresh BardCookies session is created for every call.
        bard = BardCookies(cookie_dict=cookie_dict)
        response = bard.get_answer(prompt)
        text = response['content']

        if stop:
            # Honour the stop contract: keep only what precedes the earliest
            # stop sequence found in the answer.
            hits = [pos for pos in (text.find(token) for token in stop if token) if pos != -1]
            if hits:
                text = text[:min(hits)]
        return text

In [None]:
from gradio_client import Client
def get_whisper_transcription(video_url):
    """Request a transcription of a YouTube video from the hosted Whisper JAX Space.

    Args:
        video_url: URL placed in the Space's 'YouTube URL' textbox.

    Returns:
        The second element of the Space's prediction result
        (presumably the transcription text - confirm against the Space's API).
    """
    whisper_space = Client("https://sanchit-gandhi-whisper-jax.hf.space/")
    # Positional inputs, in order: 'YouTube URL' textbox, 'Task' radio,
    # 'Return timestamps' checkbox.
    prediction = whisper_space.predict(
        video_url,
        "transcribe",
        False,
        api_name="/predict_2",
    )
    transcription = prediction[1]
    return transcription


In [None]:
def create_db_from_youtube_video_url(video_url):
    """Build a FAISS vector store from the transcript of a YouTube video.

    The transcript is loaded with ``YoutubeLoader`` using the language codes
    from ``obj``. If the loader returns no documents, the transcript is obtained
    from ``get_whisper_transcription`` instead. Either way the text is split into
    chunks of ``CHUNK_SIZE`` with ``CHUNK_OVERLAP`` overlap before indexing.

    Args:
        video_url: URL of the YouTube video to index.

    Returns:
        The FAISS store created from the transcript chunks.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5"
    )
    loader = YoutubeLoader.from_youtube_url(
        video_url,
        language=[obj["output_lang_code"].lower(), obj["input_lang_code"].lower()],
        translation=obj["output_lang_code"],
    )
    transcript = loader.load()

    # One splitter serves both branches; only the input type differs.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    if len(transcript) == 0:
        # No transcript documents from YouTube: fall back to Whisper, which
        # returns plain text that must be wrapped in Document objects.
        whisper_text = get_whisper_transcription(video_url)
        docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(whisper_text)]
    else:
        docs = text_splitter.split_documents(transcript)

    return FAISS.from_documents(docs, embeddings)

In [None]:
def get_response_from_query(db, query, k=4):
    """Answer a question about a video from its most relevant transcript chunks.

    The ``k`` chunks most similar to ``query`` are retrieved from ``db``, joined
    into a single context string and sent, together with the question, to the
    ``GPTv1`` LLM through an ``LLMChain``.

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)``.
        query: The user's question.
        k: Number of transcript chunks to retrieve as context.

    Returns:
        The model's answer as a single line of text (newlines replaced by
        spaces).
    """

    docs = db.similarity_search(query, k=k)
    docs_page_content = " ".join([d.page_content for d in docs])

    chat = GPTv1()

    # Template to use for the system message prompt. It is an f-string so the
    # output language is baked in now, while {'{docs}'} renders as a literal
    # "{docs}" placeholder for LangChain to fill in later.
    template = f"""
        You are a helpful assistant that that can answer questions about youtube videos 
        based on the video's transcript: {'{docs}'}

        The language of the transcript is for you to infer.
        
        Only use the factual information from the transcript to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed, and also, in this language: {obj["output_lang"]}.

        ANSWER IN {obj["output_lang"]}:
        """

    system_message_prompt = SystemMessagePromptTemplate.from_template(template)

    # Human question prompt
    human_template = "Answer the following question: {question}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(
        human_template
    )

    chat_prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt]
    )

    chain = LLMChain(llm=chat, prompt=chat_prompt)

    response = chain.run(question=query, docs=docs_page_content)
    # Flatten to one line. Substitute a space rather than nothing so the last
    # word of one line is not glued to the first word of the next.
    response = response.replace("\n", " ")
    return response

In [None]:
from langchain.prompts import PromptTemplate
from langchain.document_transformers import (
    LongContextReorder,
)
def summarize_video(link):
    """Summarize a YouTube video with a map-reduce summarization chain.

    The transcript is loaded with ``YoutubeLoader`` using the language codes
    from ``obj``. If the loader returns no documents, the transcript comes from
    ``get_whisper_transcription`` and a copy is written to ``test.txt``. The
    text is split into chunks, reordered with ``LongContextReorder`` and
    summarized by ``GPTv1``.

    Args:
        link: URL of the YouTube video to summarize.

    Returns:
        The summary, wrapped to a width of 100 characters.
    """

    loader = YoutubeLoader.from_youtube_url(
        link,
        language=[obj["output_lang_code"].lower(), obj["input_lang_code"].lower()],
        translation=obj["output_lang_code"],
    )
    transcript = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    if len(transcript) == 0:
        # No transcript documents from YouTube: fall back to Whisper.
        transcript = get_whisper_transcription(link)
        # Keep a copy of the raw Whisper transcript; explicit UTF-8 so
        # non-ASCII transcripts are written regardless of platform default.
        with open("test.txt", "w", encoding="utf-8") as text_file:
            text_file.write(transcript)
        texts = text_splitter.split_text(transcript)
        docs = [Document(page_content=t) for t in texts]
    else:
        docs = text_splitter.split_documents(transcript)

    # Reordering must come after splitting: the chunks have to exist first.
    # (Doing it before the split referenced `docs` before assignment.)
    reordering = LongContextReorder()
    docs = reordering.transform_documents(docs)

    prompt_template = """You are given a transcript of a youtube video. The language of the transcript is for you to infer.
    Your job is to summarize the overall point of the video in the same langugage, while
    highlightning any actionable points the video has to offer. Summarize the following:


    {text}


    CONCISE SUMMARY IN """ + obj["output_lang"] + ":"

    gptlm = GPTv1()
    PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
    # Only the map step uses the custom prompt; the combine step keeps the
    # chain's own prompt, which is printed below for inspection.
    chain = load_summarize_chain(gptlm, chain_type="map_reduce", map_prompt=PROMPT)
    print(chain.llm_chain.prompt.template)
    print(chain.combine_document_chain.llm_chain.prompt.template)

    output_summary = chain.run(docs)
    response = textwrap.fill(
        output_summary,
        width=100,
        break_long_words=False,
        replace_whitespace=False,
    )
    return response

In [None]:
# Run the mode requested by the configuration: "chat" answers obj["query"]
# against an index of the video's transcript; any other value summarizes it.
if obj["type"] != "chat":
    response = summarize_video(obj["link"])
else:
    db = create_db_from_youtube_video_url(obj["link"])
    response = get_response_from_query(db, obj["query"])

In [None]:
# Persist the final answer/summary. The context manager closes the file even
# if the write fails, and explicit UTF-8 keeps non-English output writable
# regardless of the platform's default encoding.
with open("out.txt", "w", encoding="utf-8") as text_file:
    text_file.write(response)