In [1]:
# import libraries
import googleapiclient
from googleapiclient.discovery import build
import pandas as pd
import re
import ollama
from langchain.schema import HumanMessage
import chromadb
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# initialize an instance of the database
# persist ensures that the database is saved to the computer so I can reference it in other scripts
database = chromadb.PersistentClient(path="./youtube_comment_database")
# create a collection (group of documents and their embeddings)
collection = database.create_collection(name="youtube_comments")

# import sentence embedder from huggingface
# using all-MiniLM-L6-v2 since llama2:7B doesn't have an encoder and this one is light enough for me to run
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
api_key = ""
# video id's for videos about the civic si
video_id = ["RrZSuz-e9NY", "gGmdz9tA1Y8", "rGMWjQX5LG8", "nafje4-tv-w", "JcvQC0eYwJA", "DkW0Fr5KGf0", "ezgZCGM-_bg", "f6WAqT6073w", "F8IEZHeycS4", "VY91tZ3m-qU", "wczsTzaIgcE", "1h4MB5K_w1I", "JOp1xZrbuQM", "_e5mIqafwMA", "evTLpZZp6R0", "pUTj3C-Owx8"]
# create a list dictionary that will store the output
output = []
# use api key to create youtube object
youtube = build('youtube', 'v3', developerKey=api_key)

In [6]:
# build function to call youtube api to go through as many pages of comments as possible per youtube video
# video == id of particular video
# output == output list
# comments_to_view == number of comments model will go through per video (not all will have replies)
def getCommentsPerVideo(video, output, comments_to_view=2000):
    # parameter to specify the next page of comments, must be None for the first page
    # each subsequent api call provides to value to load the next page
    nextPageToken=None

    # lets say we want no more than 1000 comments per video, each api call can get up to 100 comments
    # so we will call the api in a 10 iteration loop and exit early if the nextPageToken is not given (means we already went through all comments)
    # realistically most comments won't have a reply, so by iterating through 2000 we will get between 500 and 1000 usable data entries per video
    for i in range(comments_to_view//100):

        apiCall = youtube.commentThreads().list(part=["snippet","replies"], videoId=video, maxResults=100, order="relevance", pageToken=nextPageToken).execute()

        # iterate through the API response to save all comment-reply pairs (ignore comments that don't have any replies)
        # iterate through the comments the api returned
        for j in range(len(apiCall["items"])):
            
            # get comment text
            textOutput = apiCall["items"][j]["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            
            # get count of replies
            replyCount = apiCall["items"][j]["snippet"]["totalReplyCount"]
            
            if replyCount > 0:
                
                # get list of all the returned replies (api usually returns 5 replies)
                replies = apiCall["items"][j]["replies"]["comments"]

                # get the likes per reply
                likes = []
                for reply in replies:
                    likes.append(reply["snippet"]["likeCount"])
                    
                # get index of comment with most likes
                maxIndex = likes.index(max(likes))
                # get reply with most likes
                mostLikedReplyText = replies[maxIndex]["snippet"]["textDisplay"]

                # save comment text and most liked reply text to output list dictionary
                output.append({"comment":textOutput, "reply":mostLikedReplyText})
            
            # update next page token
            next_page_token = apiCall.get("nextPageToken")
            if not next_page_token:
                break
    
    return output


# build function to get comments for all youtube videos specified
# videos == list of video IDs
def fetchYouTubeComments(videos):

    # define output list
    output = []

    # iterate through list of video IDs
    for video in videos:
        
        output = getCommentsPerVideo(video, output, comments_to_view=5000)

    return output

output = fetchYouTubeComments(video_id)

In [7]:
# function for data cleaning
# accepts the output of the fetchYouTubeComments function

def cleanData(output):

    # convert list of dictionaries to dataframe
    output = pd.DataFrame(output)

    # drop any duplicate entries we have
    output.drop_duplicates(inplace=True)

    # in order: remove links, html tags, special characters and punctuation, emojis
    output['comment'] = output['comment'].astype(str) \
        .str.replace(r"http\S+|www\S+|https\S+", "", regex=True) \
        .str.replace(r"<.*?>", "", regex=True) \
        .str.replace(r"[^\w\s]", "", regex=True) \
        .str.replace(r"[\U00010000-\U0010ffff]", "", regex=True)

    output['reply'] = output['reply'].astype(str) \
        .str.replace(r"http\S+|www\S+|https\S+", "", regex=True) \
        .str.replace(r"<.*?>", "", regex=True) \
        .str.replace(r"[^\w\s]", "", regex=True) \
        .str.replace(r"[\U00010000-\U0010ffff]", "", regex=True)
    
    return output

output = cleanData(output)

In [8]:
output.info()

<class 'pandas.core.frame.DataFrame'>
Index: 509 entries, 0 to 23979
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  509 non-null    object
 1   reply    509 non-null    object
dtypes: object(2)
memory usage: 11.9+ KB


In [10]:
# function to add data into vector database
# accepts output from the cleanData function

def uploadToVectorDB(df):

    # initialize an instance of the database
    # persist ensures that the database is saved to the computer so I can reference it in other scripts
    database = chromadb.PersistentClient(path="./youtube_comment_database")
    # create a collection (group of documents and their embeddings)
    collection = database.get_or_create_collection(name="youtube_comments")

    # import sentence embedder from huggingface
    # using all-MiniLM-L6-v2 since llama2:7B doesn't have an encoder and this one is light enough for me to run
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # prepping data for embedding model

    # only going to embed the comments
    # users will prompt the llm with a comment and the llm will draft a reply to the comment based on the comment-reply pairs the semantic search returns
    # when the semantic search is happening, we should only be searching the comments, I want to see the comment-reply pairs for the most similar comments
    # therefore the replies will be stored as metadata in the database while the only the comments will be embedded

    # convert dataframe items to lists
    comments = df["comment"].to_list()
    replies = df["reply"].to_list()

    # convert replies to list of dictionaries so I can pass it as metadata
    replies_dict = [{"reply":reply} for reply in replies]

    # embed comments
    encoded_comments = embedding_model.encode(comments)

    # add data into database
    collection.add(
        ids=[str(i) for i in range(len(comments))],
        embeddings=encoded_comments,
        documents=comments,
        metadatas=replies_dict
    )

    return

uploadToVectorDB(output)

In [11]:
# embed the prompt
# load client
database = chromadb.PersistentClient(path="../Data Collection/youtube_comment_database")
# get collection
collection = database.get_or_create_collection(name="youtube_comments")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
prompt="i love the civic"
promptEncoded = embedding_model.encode(prompt)

# search the database using the encoded query and get 5 most related comment-reply pairs
# distance metric is cosine similarity by default, need to set it when I set up the collection
semantic_search_results = collection.query(query_embeddings=promptEncoded, n_results=6)

In [12]:
semantic_search_results

{'ids': [['366', '128', '46', '144', '344', '300']],
 'embeddings': None,
 'documents': [['The civic has always been pretty good to great I just wish it wasn39t getting bigger and bigger every generation',
   'The latest generation of Civic looks so nice',
   'I feel right in saying this generation makes me like the Civic as someone who39s never owned one',
   'The Civic looks so much better to me than the CorollaCamry competitors',
   'You guys rock  I love watching your Honda videos and listening to your thoughts on the new Civic Si  I think this is the best Civic Si Honda has ever made',
   'Wow I remember when the Civic was a nifty little crackerbox that was a blast to drive They were a blast There was a station wagon too']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'reply': 'I know this civic is 4 INCHES longer than the civic 25 years ago'},
   {'reply': 'insid3493 Spot on dude IMO modern Japanese styling is way above any

In [11]:
apiCall = youtube.commentThreads().list(part=["snippet","replies"], videoId=video_id[0], maxResults=250, order="relevance", pageToken="Z2V0X3JhbmtlZF9zdHJlYW1zLS1Db0lFQ0lBRUZSZTMwVGdhOXdNSzhnTUkyRjhRZ0FRWUJ5TG5BNHVZSDYycW1xSW9Jd1l5b2lBekxobkw3VDBrbkRtMTd3MUJEQXN4UmRGc3NJQ2Mxd211TVZzaEtUOFVLMmpseFRlcjdDbzFHTWJqMDB4RXhseFEyV3NpRkZwTHJtVm9hNGFrWFVubzZHa29tenZyR01vVk1wdmVxRVF5TEozMGRHc0wxQkdzcENTa285QmZSUERzZ0MxdzM4QkVyU2FMemhYR085VWgyRUhwQ2lHb2RRMXlWenZqZFRtTEo0eEdFRGN3UzRoRjlnbVBCbzlrd1ViYWQ4bzd5RkxSeE42S3h3U1hNQmNHSGJHQzRCU25vcEo1NWhoaWFCVVcySldicFljdGtqN25PME1DY3k2VTl1T0JVOGlPZEVKcEFZX1BUY0VhczNKcUZIRlJSRkstVW84d0tteGQteFRoWGk1Rkp3S1RCNEdJbFZLTzBrVGF3Y3hRaFNIRlcxUkZjSTIxcjNVWnFycXBLNmRFa2JCUmRRTWZhSlVjNkw4a19JdzlYUE12RkJONEZiNGlBVlhRbkl2QWhFTG8zaXZIRWxVSklRZ3kwdEI5LW14V1ZhdFVsWmxnaWd6X3h6aGozem90Q0Niemh1aWs2ZnQ5UTJraWRqY09WWko1dExBc3M0clpyb0xST0lHbVM5cXJYUFlVU2xVNjQtTmh6Unh4aVhJX2l5TkpJRUxVekRjWFl6Rnd0aEFsdFhwSkZwblhINlFIVnRHc1Q0U1NpWnJha3FBUHFLZ0lzZ1YxZ2pnNEVFd0hoZ21qMHRNc1lzZTRzQlBCVFJvZE1TaU5yTlpCZHhHNXhHaUp0ZHdoVTdHaWJTRHJPRTVnblEyWWNVN25EU1F4YlZKQkJpWmhlZ0FRWkJJSENJVWdFR1FZQVJJRkNLZ2dHQUFTQlFpSUlCZ0FFZ1VJaHlBWUFCSUhDSVFnRUVJWUFSSUZDSWtnR0FB").execute()

# iterate through the comments the api returned
for i in range(len(apiCall["items"])):
    
    # get comment text
    textOutput = apiCall["items"][i]["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
    
    # get count of replies
    replyCount = apiCall["items"][i]["snippet"]["totalReplyCount"]
    
    if replyCount > 0:
        
        # get list of all the returned replies (api usually returns 5 replies)
        replies = apiCall["items"][i]["replies"]["comments"]

        # get the likes per reply
        likes = []
        for reply in replies:
            likes.append(reply["snippet"]["likeCount"])
            
        # get index of comment with most likes
        maxIndex = likes.index(max(likes))
        # print comment with most likes
        mostLikedReplyText = replies[maxIndex]["snippet"]["textDisplay"]

        # save comment text and most liked reply text to output list dictionary
        output.append({"comment":textOutput, "reply":mostLikedReplyText})

In [4]:
# function to call api and produce a list of comment-reply pairs

# video == specify id of the video to get comments from
# amount_of_comments == amount of comments to retrieve from current video
# output == list item to store comment-reply pairs (pass as argument so function can continuously add to it)
def call_youTube_API(video, amount_of_comments, output):

    # call api to get comments on a particular video using video id
    # order comments by relevance, popular comments are more likely to have replies
    apiCall = youtube.commentThreads().list(part=["snippet","replies"], videoId=video, maxResults=amount_of_comments, order="relevance").execute()
    
    # iterate through the comments the api returned
    for i in range(len(apiCall["items"])):
        
        # get comment text
        textOutput = apiCall["items"][i]["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
        
        # get count of replies
        replyCount = apiCall["items"][i]["snippet"]["totalReplyCount"]
        
        if replyCount > 0:
            
            # get list of all the returned replies (api usually returns 5 replies)
            replies = apiCall["items"][i]["replies"]["comments"]

            # get the likes per reply
            likes = []
            for reply in replies:
                likes.append(reply["snippet"]["likeCount"])
                
            # get index of comment with most likes
            maxIndex = likes.index(max(likes))
            # print comment with most likes
            mostLikedReplyText = replies[maxIndex]["snippet"]["textDisplay"]

            # save comment text and most liked reply text to output list dictionary
            output.append({"comment":textOutput, "reply":mostLikedReplyText})
        
    return

In [5]:
# call function to get comments through all videos
for video in video_id:
    
    call_youTube_API(video, 250, output)

# convert list of dictionaries to dataframe
output = pd.DataFrame(output)

In [6]:
# do some data cleaning

# in order: remove links, html tags, special characters and punctuation, emojis
output['comment'] = output['comment'].astype(str) \
    .str.replace(r"http\S+|www\S+|https\S+", "", regex=True) \
    .str.replace(r"<.*?>", "", regex=True) \
    .str.replace(r"[^\w\s]", "", regex=True) \
    .str.replace(r"[\U00010000-\U0010ffff]", "", regex=True)

output['reply'] = output['reply'].astype(str) \
    .str.replace(r"http\S+|www\S+|https\S+", "", regex=True) \
    .str.replace(r"<.*?>", "", regex=True) \
    .str.replace(r"[^\w\s]", "", regex=True) \
    .str.replace(r"[\U00010000-\U0010ffff]", "", regex=True)

In [7]:
# prepping data for embedding model

# only going to embed the comments
# users will prompt the llm with a comment and the llm will draft a reply to the comment based on the comment-reply pairs the semantic search returns
# when the semantic search is happening, we should only be searching the comments, I want to see the comment-reply pairs for the most similar comments
# therefore the replies will be stored as metadata in the database while the only the comments will be embedded

# convert dataframe items to lists
comments = output["comment"].to_list()
replies = output["reply"].to_list()

# convert replies to list of dictionaries so I can pass it as metadata
replies_dict = [{"reply":reply} for reply in replies]

# embed comments
encoded_comments = embedding_model.encode(comments)

In [8]:
# add data into database
collection.add(
    ids=[str(i) for i in range(len(comments))],
    embeddings=encoded_comments,
    documents=comments,
    metadatas=replies_dict
)