In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found in it (directories themselves are not printed).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields file names relative to `dirname`; join them to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install faiss-cpu langchain sentence-transformers 

In [None]:
!pip install revChatGPT --upgrade

In [None]:
import json
# Load the run configuration into `obj`. Other cells in this notebook read the
# keys 'token', 'gapi_key', 'link', 'type' and 'query' from it.
# The context manager closes the file even if json.load raises, and the explicit
# encoding keeps parsing independent of the machine's locale.
with open('/kaggle/input/cocaster-data/data.json', encoding='utf-8') as f:
    obj = json.load(f)

In [None]:
from revChatGPT.V1 import Chatbot
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from typing import Any, List, Mapping, Optional, Dict

class GPTv1(LLM):
    """LangChain ``LLM`` adapter that answers prompts through a revChatGPT ``Chatbot``.

    Every instance uses the single ``chatbot`` client defined on the class.
    """

    # Created once, when the class body is evaluated, using the access token
    # from the notebook configuration (`obj`).
    chatbot = Chatbot(
        config={
            "access_token": obj['token']
        }
    )

    @property
    def _llm_type(self) -> str:
        """Return the type label reported for this LLM implementation."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` to the chatbot and return the final reply text.

        Args:
            prompt (str): The fully formatted prompt to send.
            stop (Optional[List[str]]): Stop sequences. When given, the reply is
                cut just before the earliest occurrence of any of them.
            run_manager (Optional[CallbackManagerForLLMRun]): Accepted for
                interface compatibility; not used.
            **kwargs: Extra keyword arguments are accepted and ignored so the
                method can be called with or without them.

        Returns:
            str: The reply text, or "" if the chatbot yielded nothing.
        """
        print(prompt)
        response = ""
        # `ask` is iterated to completion and only the last yielded "message"
        # is kept (presumably each item is a progressively longer partial reply).
        for data in self.chatbot.ask(prompt):
            response = data["message"]
        print("actually using revChatGPT")

        # Honour stop sequences instead of silently ignoring them.
        if stop:
            cut_points = [response.find(sequence) for sequence in stop if sequence]
            cut_points = [position for position in cut_points if position != -1]
            if cut_points:
                response = response[: min(cut_points)]
        return response

In [None]:
import re
from datetime import datetime

# Compiled once at definition time rather than on every call.
_BR_TAG_RE = re.compile(r"<br\s*/?>", re.IGNORECASE)
_HTML_TAG_RE = re.compile(r"<[^>]*>")


def clean_text(text: str):
    """
    Cleans a block of text by removing HTML tags, decoding HTML entities, and
    collapsing newlines and extra whitespace into single spaces.

    Args:
        text (str): The text to clean. Empty or None input yields "".

    Returns:
        str: The cleaned text, with no leading or trailing whitespace.
    """
    import html  # local import: the surrounding cell only imports `re` and `datetime`

    if not text:
        return ""

    # Turn line-break tags into spaces first so adjacent words are not glued together.
    without_breaks = _BR_TAG_RE.sub(" ", text)
    without_tags = _HTML_TAG_RE.sub("", without_breaks)
    # Decode entities only after tags are stripped, so an escaped "&lt;" typed by
    # the author is not mistaken for the start of a tag.
    decoded = html.unescape(without_tags)
    # str.split() with no argument splits on any whitespace run, including newlines.
    return " ".join(decoded.split())

def convert_date_unix(string_date: str):
    """
    Converts a UTC string date in the format "%Y-%m-%dT%H:%M:%SZ" to a Unix timestamp.

    Fractional seconds ("%Y-%m-%dT%H:%M:%S.%fZ") are also accepted and truncated.

    Args:
        string_date (str): The string date to convert.

    Returns:
        int: The Unix timestamp.

    Raises:
        ValueError: If the string matches neither accepted format.
    """
    from datetime import timezone  # local import: the cell only imports `datetime`

    accepted_formats = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ")
    for date_format in accepted_formats:
        try:
            date_obj = datetime.strptime(string_date, date_format)
        except ValueError:
            continue
        # strptime returns a naive datetime, and naive .timestamp() assumes local
        # time. The trailing "Z" means UTC, so attach that zone explicitly to get
        # the same result on every machine.
        return int(date_obj.replace(tzinfo=timezone.utc).timestamp())

    raise ValueError(
        f"time data {string_date!r} does not match any of {accepted_formats!r}"
    )

In [None]:
from googleapiclient.discovery import build, Resource


# Module-level accumulator for scraped comment records. get_comments_helper
# declares it `global`, resets it when called with an empty page token, and
# appends one dict per kept comment or reply.
all_comments: List[Dict[str, Any]] = []


def _extract_video_id(link: str) -> str:
    """Return the video ID from a YouTube link, or the input itself if it is a bare ID."""
    from urllib.parse import parse_qs, urlparse  # local import: not imported by this cell

    parsed = urlparse(link.strip())
    # Standard watch URLs carry the ID in the `v` query parameter, wherever it
    # appears among the parameters.
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]
    # Otherwise take the last path segment, which covers short links, embed and
    # shorts style paths, and a bare ID passed with no URL around it.
    segments = [part for part in parsed.path.split("/") if part]
    if segments and segments[-1] != "watch":
        return segments[-1]
    raise ValueError(f"Could not find a YouTube video ID in {link!r}")


def get_comments(video_id: str):
    """
    Given a YouTube video link (or bare video ID), retrieves the video's comments
    and replies and returns them.

    The records are collected by get_comments_helper in the module-level
    `all_comments` list; this function returns that list.

    Args:
        video_id (str): Link to (or ID of) the YouTube video to scrape.

    Returns:
        List[Dict[str, Any]]: One dictionary per kept comment or reply.

    Raises:
        ValueError: If no video ID can be found in `video_id`.
    """
    # Resolve the ID before building the API client so a bad link fails fast.
    resolved_id = _extract_video_id(video_id)

    gapi_key = obj['gapi_key']

    # Create a youtube resource object
    youtube = build("youtube", "v3", developerKey=gapi_key)

    get_comments_helper(youtube, resolved_id, "")
    # Read the global only after the helper ran: the helper rebinds the name.
    return all_comments


def _build_comment_record(
    snippet: Dict[str, Any], comment_id: str, parent_id: str
) -> Optional[Dict[str, Any]]:
    """Turn one comment snippet into a flat record, or None if its text is too short."""
    cleaned_text = clean_text(snippet["textDisplay"])
    # Skip comments with 10 or fewer characters left after cleaning and stripping.
    if len(cleaned_text.strip()) <= 10:
        return None
    return {
        "comment_text": cleaned_text,
        "comment_author": snippet["authorDisplayName"],
        "comment_author_url": snippet["authorChannelUrl"],
        "comment_author_image": snippet["authorProfileImageUrl"],
        "comment_published_at": snippet["updatedAt"],
        "comment_published_at_unix": convert_date_unix(snippet["updatedAt"]),
        "comment_like_count": snippet["likeCount"],
        "comment_id": comment_id,
        "comment_parent_id": parent_id,
    }


def _iter_replies(youtube, parent_id: str):
    """Yield every reply item under `parent_id`, following `nextPageToken` page by page."""
    request_args = {"part": "snippet", "maxResults": 100, "parentId": parent_id}
    while True:
        replies_response = youtube.comments().list(**request_args).execute()
        yield from replies_response["items"]
        next_token = replies_response.get("nextPageToken")
        if not next_token:
            return
        request_args["pageToken"] = next_token


def get_comments_helper(youtube: Resource, video_id: str, token: str = ""):
    """
    Retrieves the top-level comments and their replies for a given YouTube video
    and appends them to the module-level `all_comments` list.

    Pages are fetched in a loop rather than by recursion, so the number of result
    pages is not limited by the interpreter's recursion depth.

    Args:
        youtube (Any): The YouTube resource object.
        video_id (str): The ID of the YouTube video to scrape.
        token (str, optional): Page token to start from. An empty (or blank) token
            resets `all_comments` before scraping. Defaults to "".

    Returns:
        list: Always an empty list; the results are in `all_comments`.
    """

    global all_comments

    if len(token.strip()) == 0:
        all_comments = []

    request_args = {
        "part": "snippet",
        "maxResults": 100,
        "videoId": video_id,
        "order": "relevance",
    }
    page_token = token

    while True:
        # The first page of a fresh scrape is requested without a pageToken.
        if page_token != "":
            request_args["pageToken"] = page_token
        video_response = youtube.commentThreads().list(**request_args).execute()

        # Loop comments from the video:
        for item in video_response["items"]:
            top_level = item["snippet"]["topLevelComment"]
            # A top-level comment is recorded as its own parent.
            thread_record = _build_comment_record(
                top_level["snippet"], top_level["id"], top_level["id"]
            )
            if thread_record is not None:
                all_comments.append(thread_record)

            # If the comment has replies, get all of them:
            if item["snippet"]["totalReplyCount"] > 0:
                for reply in _iter_replies(youtube, item["id"]):
                    reply_record = _build_comment_record(
                        reply["snippet"], reply["id"], item["id"]
                    )
                    if reply_record is not None:
                        all_comments.append(reply_record)

        # A missing or empty token means this was the last page.
        page_token = video_response.get("nextPageToken", "")
        if not page_token:
            break

    print("Fin")
    return []


In [None]:
# Scrape the video named in the config. The comment records are accumulated in
# the module-level `all_comments` list, which the following cell iterates over.
comments = get_comments(obj['link'])

In [None]:
# Concatenate every comment into one string. Each comment is followed by the
# three-character delimiter apostrophe, newline, apostrophe.
# A single join avoids re-copying the growing string on every iteration.
corpus = "".join(comment['comment_text'] + "'\n'" for comment in all_comments)
count_comments = len(all_comments)

In [None]:
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.prompts import PromptTemplate

# Split the comment corpus into chunks (chunk_size=2000, other splitter settings
# left at their defaults) and wrap each chunk in a LangChain Document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
)
texts = text_splitter.split_text(corpus)
docs = [
    Document(page_content=chunk)
    for chunk in texts
]

In [None]:
def summarize_comments(docs):
    """
    Summarizes comment documents with a map-reduce summarization chain run on GPTv1.

    Each document is summarized with the map prompt, and the per-document
    summaries are then merged with the combine prompt. The document count and
    both prompt templates are printed along the way.

    Args:
        docs: The documents to summarize (as passed to `chain.run`).

    Returns:
        The combined summary produced by the chain.
    """
    print(len(docs))
    llm = GPTv1()

    # Map step: applied to each chunk of comments on its own.
    map_template = """You are given a string of youtube comments, delimited by '\n' character. Your job is to summarize the overall sentiment of these comments, while
    highlightning any constructive criticism that any of these comments might have. Summarize the following:


    {text}


    CONCISE SUMMARY:"""

    # Reduce step: applied to the collected per-chunk summaries.
    reduce_template = """You are given a sentiments for batches comments of a youtube video, along with any constructive critique they might contain.
    Summarize the sentiment for all batches in just a few sentences. Especially highlight any critique mentioned in a numbered list format, along
    with an actionable step that the creator can take to fix it.
    Summarize the following:


    {text}


    CONCISE SUMMARY:"""

    map_prompt = PromptTemplate(template=map_template, input_variables=["text"])
    combine_prompt = PromptTemplate(template=reduce_template, input_variables=["text"])
    summary_chain = load_summarize_chain(
        llm,
        chain_type="map_reduce",
        map_prompt=map_prompt,
        combine_prompt=combine_prompt,
    )

    # Echo both templates as wired into the chain.
    print(summary_chain.llm_chain.prompt.template)
    print(summary_chain.combine_document_chain.llm_chain.prompt.template)

    return summary_chain.run(docs)

In [None]:
def create_db_from_comments(docs):
    """
    Embeds the comment documents and indexes them in a FAISS vector store.

    Args:
        docs: The documents to index (as passed to `FAISS.from_documents`).

    Returns:
        The FAISS vector store built from `docs`.
    """
    # No device is requested, so the embedding backend chooses its own default.
    # An earlier `model_kwargs = {'device': 'gpu'}` local was dropped: it was never
    # passed to the embeddings, and "gpu" is not a PyTorch device name ("cuda" is).
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
    )
    return FAISS.from_documents(docs, embeddings)

In [None]:
def get_response_from_query(db, query, k=2):
    """
    Answers a question using the comment chunks most similar to it.

    The `k` most similar chunks are retrieved from `db`, joined with single
    spaces, and given to a GPTv1-backed LLMChain as the `docs` prompt variable,
    with the query itself as the `question` variable.

    Args:
        db: Vector store exposing `similarity_search(query, k=...)`.
        query: The question to answer; also used as the search text.
        k: Number of chunks to retrieve. Defaults to 2.

    Returns:
        The chain's response.
    """
    matched_docs = db.similarity_search(query, k=k)
    context = " ".join(match.page_content for match in matched_docs)

    llm = GPTv1()

    # System message: the retrieved comments plus the answering rules.
    system_template = """
        You are a helpful assistant that that can extract a comment base on the main topic: {docs}
        
        Only use information from the comments to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed.
        """
    system_prompt = SystemMessagePromptTemplate.from_template(system_template)

    # Human message: the question itself.
    human_prompt = HumanMessagePromptTemplate.from_template(
        "Answer the following question: {question}"
    )

    qa_chain = LLMChain(
        llm=llm,
        prompt=ChatPromptTemplate.from_messages([system_prompt, human_prompt]),
    )
    return qa_chain.run(question=query, docs=context)

In [None]:
# Pick the mode from the config: "chat" answers obj["query"] against a vector
# index of the comments; any other value produces an overall summary.
if obj["type"] != "chat":
    response = summarize_comments(docs)
else:
    db = create_db_from_comments(docs)
    response = get_response_from_query(db, obj["query"])

In [None]:
# Save the response to out.txt in the current working directory.
# The context manager closes the file even if the write fails, and UTF-8 keeps
# non-ASCII characters in the response from failing under a narrower locale.
with open("out.txt", "w", encoding="utf-8") as text_file:
    n = text_file.write(response)