In [None]:
one = 'https://www.youtube.com/watch?v=Ae1X8HZunok'
two = 'https://www.youtube.com/watch?v=-8J4wl9eUe4'
three = 'https://www.youtube.com/watch?v=ouZrZa5pLXk'

In [3]:
video_id = 'Ae1X8HZunok'
output_filename = 'adhd_top3_search'

# SELENIUM

In [50]:
import json # for saving html
import os #for checking if file exists
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import ChromiumOptions

class Hardcoded:
    """This class stores all the xpaths and links that are 
    currently used to get youtube comments, but may change in the future.
    """
    BASE_YOUTUBE_URL = 'https://www.youtube.com/watch?v='
    
    COOKIES_POPUP_PATH = '//ytd-consent-bump-v2-lightbox/tp-yt-paper-dialog'
    COOKIES_AGREE_PATH = "//*[contains(@aria-label,'Agree to the use of cookies')]"

    #for pausing autoplay
    VIDEO_PLAYER_PATH = "//div[@id='movie_player']"
    
    #for video stats + info:
    # DEPENDING ON THE FORMAT YOUTUBE SUPPLIES, THIS ONE GET RAW HTML (NO DETAIL EXTRACTED YET)
    VIDEO_METADATA_PATH = "//ytd-watch-metadata"
    DESCRIPTION_PATH = "//div[@id='description']"
    DESCRIPTION_MORE_PATH = "//tp-yt-paper-button[@id='more']"
    
    # DEPENDING ON THE FORMAT YOUTUBE SUPPLIES, THIS ONE GETS FULL DETAILS
    VIDEO_TITLE_PATH = "//ytd-video-primary-info-renderer/div[@id='container']/h1"
    _DESC_LOC_PATH_ = "//div[@id='meta-contents']/ytd-video-secondary-info-renderer/div[@id='container']/ytd-expander"
    VIDEO_DESC_TEXT_PATH = _DESC_LOC_PATH_ + "/div[@id='content']/div[@id='description']/yt-formatted-string"
    VIDEO_EXPAND_DESC_BTN_PATH = _DESC_LOC_PATH_ + "/tp-yt-paper-button[@id='more']"
    VIDEO_VIEWS_CLASS = "view-count"
    VIDEO_LIKES_PATH = "//*[contains(@aria-label,' likes')]" #use .get_attribute('aria-label')
    VIDEO_DISLIKES_PATH = "//*[contains(@aria-label,' dislikes')]" 
    VIDEO_TOTAL_COMMENTS_PATH = "//*[starts-with(@class,'count-text')]"
    PARENT_PATH = "parent::*"
    AUTHOR_PATH = "//*[contains(@class,'ytd-channel-name')]" #href=channel url, .text = channel name
    AUTHOR_CHANNEL_URL = "//ytd-video-owner-renderer/a"
    AUTHOR_SUB_COUNT_PATH = "//*[@id='owner-sub-count']" #aria-label = num subscribers
    
    #for sorting comments
    SORT_COMMENTS_DROPDOWN_PATH = "//tp-yt-paper-button[@aria-label='Sort comments']"
    SORT_COMMENTS_DESELECTED_PATH = "//a[@aria-selected='false']" 
    INDIVIDUAL_COMMENT_PATH = "//ytd-comment-thread-renderer"
    
    #for loading more comments
    COMMENT_PAGINATOR_PATH = "//ytd-continuation-item-renderer"
    COMMENT_PAGINATOR_SPINNER_PATH = COMMENT_PAGINATOR_PATH + "/tp-yt-paper-spinner"  #aria-hidden='true'
    COMMENT_RENDERER_BLOCK_PATH = "//ytd-item-section-renderer[@id='sections']"
    
    SET_COMMENT_EXTRACTION_ABSOLUTE_MAX = 1*10**9 #(1 billion)
    
def handle_cookies_popup(driver_):
    """Accepts the cookies policy if it is blocking the page
    """
    try:
        pageload = WebDriverWait(driver_, 5).until(
        EC.presence_of_element_located(
            (By.XPATH, Hardcoded.COOKIES_POPUP_PATH)
        ))
        cookies_popup = driver_.find_element(
            By.XPATH, Hardcoded.COOKIES_POPUP_PATH)
        
        if cookies_popup.is_displayed():
            print('Handling cookies popup...')
            
            accept_cookies_btn = cookies_popup.find_element(
                By.XPATH, Hardcoded.COOKIES_AGREE_PATH)
            print('... agree button found')
            
            accept_cookies_btn.click()
            print('... Cookies popup has been agreed to!\n')
            return
    except:
        return
    
def pause_autoplay(driver_):
    """Pauses video that autoplays within 10 seconds of page loading.
    """
    print('Overriding Autoplay...\n')
    wait_till_video_playing = WebDriverWait(driver_, 10).until(
        EC.text_to_be_present_in_element_attribute(
            (By.XPATH, Hardcoded.VIDEO_PLAYER_PATH),
            'class','playing-mode'
        ))
    driver_.find_element(
        By.XPATH, Hardcoded.VIDEO_PLAYER_PATH
    ).click()
    return

def get_video_detail(driver_):
    """Returns the video name and description, number of views/likes/dislikes/comments
    """
    def get_title(driver_):
        print('\tgetting video title...')
        video_title = driver_.find_element(By.XPATH, Hardcoded.VIDEO_TITLE_PATH).text
        return video_title
    
    def get_views(driver_):
        print('\tgetting current view count...')
        video_views = driver_.find_element(By.CLASS_NAME,Hardcoded.VIDEO_VIEWS_CLASS).text
        return video_views

    def get_likes(driver_):
        print('\tgetting current likes count...')
        video_likes = driver_.find_element(By.XPATH, Hardcoded.VIDEO_LIKES_PATH).get_attribute('aria-label')
        return video_likes
    
    def get_dislikes(driver_):
        print('\tgetting current dislikes count...')
        video_dislikes = driver_.find_element(By.XPATH, Hardcoded.VIDEO_DISLIKES_PATH).get_attribute('aria-label')
        return video_dislikes
    
    def get_description(driver_):
        print("\tgetting video description...")
        btn_path = Hardcoded.VIDEO_EXPAND_DESC_BTN_PATH
        btn = driver_.find_element(By.XPATH, btn_path)
        
        # scroll to dropdown
        driver_.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
        
        if btn.is_displayed():
            print('...expanding description to get full text')
            btn.click()
        else:
            pass
        
        wait_till_desc_loaded = WebDriverWait(driver_, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, btn_path.replace("id='more'","id='less'"))
                ))
        video_desc = driver_.find_element(By.XPATH, Hardcoded.VIDEO_DESC_TEXT_PATH).text
        return video_desc
    
    def get_comment_count(driver_):
        """Returns the current total comments count from the video.
        
        note this scrolls down to the comments section, which also loads the box
         for sorting comments by Newest First, which is needed for the next part.
        """
        print('\tgetting comment count')
        #scroll to load first batch of comments
        comments_section = WebDriverWait(driver_, 30).until(
            EC.presence_of_element_located(
                (By.ID, "comments")
            ))
        driver_.execute_script("arguments[0].scrollIntoView();", comments_section)
        ensure_fullyloaded = WebDriverWait(driver_, 30).until(
            EC.presence_of_element_located(
                (By.XPATH, Hardcoded.VIDEO_TOTAL_COMMENTS_PATH)
            ))
        num_comments = driver_.find_element(
            By.XPATH, Hardcoded.VIDEO_TOTAL_COMMENTS_PATH
        ).text
        return num_comments
    
    detail_fncs = {
        'video_title':get_title, 
        'video_views':get_views, 
        'video_likes':get_likes, 
        'video_dislikes':get_dislikes, 
        'video_desc':get_description,
        'video_num_comments':get_comment_count
    }
    # run all the 6 functions above and save this video metadata to a dictionary
    video_details = {}
    for fnc_name, fnc in detail_fncs.items():
        try:
            video_details[fnc_name] = fnc(driver_)
        except:
            print(f'Failed to extract {fnc_name}')
        
    return video_details

def get_author_detail(driver_):
    """Returns video uploader name, their channel url, their number of subscribers
    """
    def get_author_name(driver_):
        print("\tgetting video author...")
        author_name = driver_.find_element(
            By.XPATH, Hardcoded.AUTHOR_PATH).text
        return author_name
    
    def get_author_url(driver_):
        print("\tgetting video author url...")
        author_url = driver_.find_element(
            By.XPATH, Hardcoded.AUTHOR_CHANNEL_URL).get_attribute('href')
        return author_url
    
    def get_author_subcount(driver_):
        print("\tgetting video author subscriber count...")
        author_subs = driver_.find_element(
            By.XPATH, Hardcoded.AUTHOR_SUB_COUNT_PATH).get_attribute('aria-label')
        return author_subs
    
    author_details = {
        'author_name':get_author_name(driver_),
        'author_url':get_author_url(driver_),
        'author_subs':get_author_subcount(driver_)
    }
    return author_details

def get_metadata(driver_):
    """This function gets all the video details from other functions."""
    print('Getting Video Metadata:\n')
    metadata = {}
    
    print('...expanding description to get full text')
    md_elem = driver_.find_element(By.XPATH, Hardcoded.VIDEO_METADATA_PATH)
    
    if not md_elem.is_displayed():
        print('Getting Video Metadata:\n')
        metadata = {}
        metadata.update(get_video_detail(driver_))
        if len(metadata.keys()) < 6:
            print("missing some of the metadata")
            md_elem = driver_.find_element(By.XPATH, Hardcoded.VIDEO_METADATA_PATH)
            metadata[driver_.current_url] = md_elem.get_attribute("innerHTML")
        else:
            pass
        
        metadata.update(get_author_detail(driver_))
        return metadata
    
    else:
        print("Couldn't extract metadata, you will need to do this manually later.")
    
        # scroll to dropdown
        driver_.execute_script("arguments[0].scrollIntoView({block:'center'});", metadata_element)

        try:
            exp_desc_elem = md_elem.find_element(By.XPATH, Hardcoded.DESCRIPTION_PATH
                                                ).find_element(By.XPATH, Hardcoded.DESCRIPTION_MORE_PATH)

            if exp_desc_elem.is_displayed():
                exp_desc_elem.click()

            while driver_.find_element(By.XPATH, Hardcoded.VIDEO_METADATA_PATH
                                      ).find_element(By.XPATH, Hardcoded.DESCRIPTION_PATH
                                                    ).find_element(By.XPATH, Hardcoded.DESCRIPTION_MORE_PATH
            ).is_displayed():
                WebDriverWait(driver_,1)
        except:
            pass

        md_elem = driver_.find_element(By.XPATH, Hardcoded.VIDEO_METADATA_PATH)
        metadata[driver_.current_url] = md_elem.get_attribute("innerHTML")
        return metadata

def sort_comments(driver_,sort_choice='Newest first'):
    """Either sort by 'Newest first' or by 'Top comments'
    Newest first - for repeat scraping of the same video over time
    Top comments - for when you only want a sample of comments per video
    """    
    if sort_choice not in ["Newest first","Top comments"]:
        print("Error: sort_choice is case sensitive, choose from 'Newest first' and 'Top comments'")
    
    print(f"Sorting Comments by {sort_choice}")
    
    # scroll to load first batch of comments
    comments_section = WebDriverWait(driver_, 30).until(
        EC.presence_of_element_located(
            (By.ID, "comments")
        ))
    
    driver_.execute_script("arguments[0].scrollIntoView({block:'center'});",
                           comments_section)
    
    wait_till_first_cmt_loaded = WebDriverWait(driver_, 30).until(
        EC.presence_of_element_located(
            (By.XPATH, Hardcoded.INDIVIDUAL_COMMENT_PATH)
        ))
    
    dropdown_elem = driver_.find_element(
        By.XPATH, Hardcoded.SORT_COMMENTS_DROPDOWN_PATH)

    # expand the sort options dropdown
    if dropdown_elem.get_attribute('aria-expanded')=='false':

          # scroll to dropdown
        driver_.execute_script("arguments[0].scrollIntoView({block:'center'});",
                               dropdown_elem)

        wait_till_clickable = WebDriverWait(driver_, 10).until(
            EC.element_to_be_clickable(dropdown_elem))

        dropdown_elem.click()

        # wait for dropdown options to load
        wait_till_loaded = WebDriverWait(driver_, 10).until(
            EC.text_to_be_present_in_element_attribute(
                (By.XPATH, Hardcoded.SORT_COMMENTS_DROPDOWN_PATH),
                'aria-expanded','true'))
    
    # find the sort options buttons
    dropdown_elem = driver_.find_element(
        By.XPATH, Hardcoded.SORT_COMMENTS_DROPDOWN_PATH)

    # scroll to dropdown
    driver_.execute_script("arguments[0].scrollIntoView({block:'center'});",
                           dropdown_elem)
    
    wait_till_dropdown_loads = WebDriverWait(driver_,10).until(
        EC.element_to_be_clickable(
            (By.LINK_TEXT, sort_choice)
        ))
    
    comments_sorter = driver_.find_element(
        By.LINK_TEXT, sort_choice
    ).get_attribute('class')
    
    print('comments sorter',comments_sorter)
    
    if 'iron-selected' in comments_sorter:
        #reclose dropdown if your preference is already selected.
        
        dropdown_elem = driver_.find_element(
            By.XPATH, Hardcoded.SORT_COMMENTS_DROPDOWN_PATH)
        
        dropdown_elem.click()
        pass
    
    else:    
        deselected_option = dropdown_elem.find_element(
            By.XPATH, Hardcoded.SORT_COMMENTS_DESELECTED_PATH)
    
        deselected_option.click()
    
        wait_till_comments_reload = WebDriverWait(driver_, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, Hardcoded.INDIVIDUAL_COMMENT_PATH)
            ))
    return

def get_comments(driver_, max_comments=None):
    """Scrolls down to load more comments until you have seen them all"""
    if type(max_comments) == type(None):
        max_comments = Hardcoded.SET_COMMENT_EXTRACTION_ABSOLUTE_MAX
        
    current_comment_elements = driver_.find_elements(
        By.XPATH, Hardcoded.INDIVIDUAL_COMMENT_PATH
    )
    visible_comments = len(current_comment_elements)

    while visible_comments <= max_comments:
        print(f"Pre-scroll count = {visible_comments}")

        driver_.execute_script("arguments[0].scrollIntoView();",
                               current_comment_elements[-1]) #yt_paginator_element)
        
        stage_failed = ''
        try:
            # spinner element is present when the ajax? request is being made
            stage_failed = 'wait_till_request_made'
            wait_till_request_made = WebDriverWait(driver_, 10).until(
                EC.text_to_be_present_in_element_attribute(
                    (By.XPATH, Hardcoded.COMMENT_PAGINATOR_SPINNER_PATH),
                    'aria-hidden','true'))
            
            # the attribute 'can-show-more' only appears when comments are rendering
            stage_failed = 'wait_till_render_start'
            wait_till_render_start = WebDriverWait(driver_, 20).until(
                EC.element_attribute_to_include(
                    (By.XPATH, Hardcoded.COMMENT_RENDERER_BLOCK_PATH),
                    'can-show-more'))
            
            # wait for 'can-show-more' to disappear
            stage_failed = 'wait_till_render_end'
            wait_till_render_end = WebDriverWait(driver_, 20).until_not(
                EC.element_attribute_to_include(
                    (By.XPATH, Hardcoded.COMMENT_RENDERER_BLOCK_PATH),
                    'can-show-more'))

            # recount comments now
            stage_failed = 'current_comment_elements'
            current_comment_elements = driver_.find_elements(
                By.XPATH, Hardcoded.INDIVIDUAL_COMMENT_PATH)
            visible_comments = len(current_comment_elements)

        except:
            if stage_failed != '':
                print(f'Failed at stage: {stage_failed}')
                
            # NOTE: this exception will occur when the comments are all loaded.
            total_comments = [x.get_attribute("innerHTML") for x in current_comment_elements]

            print(f'\nWe read a total of {visible_comments} comments')
            return total_comments
    
    print(f'Returning {max_comments} which is the limit set in the function. You can adjust this.')
    limited_comments = current_comment_elements[:max_comments]
    limited_comments = [x.get_attribute("innerHTML") for x in limited_comments]
    return limited_comments

def save_json_output(filename_, metadata_dict,comments_list):
    if os.path.exists(filename_):
        print('you already had a file saved, not yet set up to append. Exiting.')
        return
    else:
        data = metadata_dict
        data['comments_soup'] = comments_list
        
        try:
            extracted_comments_text = [
                bs4.BeautifulSoup(soup).find('yt-formatted-string',
                                             {'id':'content-text'}
                                            ).get_text() for soup in data['comments_text']
            ]
            data['comments_text'] = extracted_comments_text
            
        except:
            data['comments_text'] = ['not extracted yet from soup']
            print("couldn't extract comment text, saved as raw html list instead")
            
        with open(f'{filename_}.json','a') as f:
            json.dump(data, f)
        return


In [51]:
def get_youtube_video_comments(video_id_, output_filename_, sort_comments_by="Newest first"):
    if os.path.exists(output_filename_):
#         probably only a few tweaks to the function save_json_output needed to append new data
# AND also the get_comments function should stop loading comments when it reaches any comments taken from the past, when sorted by newest first.
        return print('you have already scraped this video. this is not yet set up to run repeat scraping')
    else:
#         opt = ChromiumOptions()
#         opt.add_argument("--headless")
#         opt.add_argument("--blink-settings=imagesEnabled=false")
#         with webdriver.Chrome(options=opt) as browser:
        with webdriver.Chrome() as browser:
            video_url = Hardcoded.BASE_YOUTUBE_URL + video_id_
            browser.get(video_url)

            handle_cookies_popup(browser)
            pause_autoplay(browser)

            video_metadata = get_metadata(browser)

            sort_comments(browser,sort_choice=sort_comments_by)

            video_comments = get_comments(browser, max_comments=None)
            
            print('\n\t | Closing browser. |')

            save_json_output(output_filename_, video_metadata, video_comments)

        return print("\n-- Finished! --\n")



In [53]:
# actual scraper:

get_youtube_video_comments(video_id_=video_id, output_filename_=output_filename, sort_comments_by="Top comments")





Handling cookies popup...
... agree button found
... Cookies popup has been agreed to!

Overriding Autoplay...

Getting Video Metadata:

...expanding description to get full text
Getting Video Metadata:

	getting video title...
	getting current view count...
	getting current likes count...
	getting current dislikes count...
Failed to extract video_dislikes
	getting video description...
...expanding description to get full text
	getting comment count
missing some of the metadata
	getting video author...
	getting video author url...
	getting video author subscriber count...
Sorting Comments by Top comments
comments sorter yt-simple-endpoint style-scope yt-dropdown-menu iron-selected
Pre-scroll count = 20
Pre-scroll count = 40
Pre-scroll count = 60
Pre-scroll count = 80
Failed at stage: wait_till_render_start

We read a total of 80 comments

	 | Closing browser. |
couldn't extract comment text, saved as raw html list instead

-- Finished! --



In [7]:
# with webdriver.Chrome() as browser:
    
#     video_url = Hardcoded.BASE_YOUTUBE_URL + video_id
#     browser.get(video_url)

#     handle_cookies_popup(browser)
#     pause_autoplay(browser)

#     video_metadata = get_metadata(browser)

#     sort_comments(browser,sort_choice="Newest first")

#     video_comments = get_comments(browser, max_comments=None)

#     save_json_output(output_filename, video_metadata, video_comments)

#     print("finished! Closing browser.")

NOTE: when you scroll down, the comments renderer div changes class temporarily until the new comments are all rendered

<img src='files/loading new comments.png'>

# extract info from raw html on comments

In [10]:
import bs4

In [8]:
with open(f"{output_filename}.json",'r') as f:
    data = json.load(f)

In [9]:
data.keys()

dict_keys(['video_title', 'video_views', 'video_likes', 'video_dislikes', 'video_desc', 'video_num_comments', 'author_name', 'author_url', 'author_subs', 'comments_text'])

In [13]:
len(data['comments_text'])

544

In [27]:
extracted_comments_text = [bs4.BeautifulSoup(soup
                                            ).find(
    'yt-formatted-string',{'id':'content-text'}
).get_text() for soup in data['comments_text']
                          ]

In [29]:
data_extracted = {k:v for k,v in data.items() if k!='comments_text'}
data_extracted['comments_list'] = 

'Attention deficit hyperactivity disorder (ADHD) is a commonly misunderstood mental health condition. This video discusses what ADHD is and what the symptoms are (ADD and ADHD). The video also discusses treatment options to help with ADHD symptoms: medication and psychotherapy. Finally, the video touches on recent research that highlights the use of video games as a treatment option for ADHD.\n\nThis video was made by McMaster Demystifying Medicine students Abhiti Kuhad, Veronica Iskander, Sundas Ramzan and Nirushaa Pathmanathan. \n\nCopyright McMaster University 2019\n\nReferences:\n\nRamtekkar, U. P., Reiersen, A. M., Todorov, A. A., & Todd, R. D. (2010). Sex and age differences in attention-deficit/hyperactivity disorder symptoms and diagnoses: implications for DSM-V and ICD-11. Journal of the American Academy of Child & Adolescent Psychiatry, 49(3), 217-228. Retrieved from https://www.ncbi.nlm.nih.gov/pmc/arti...\n\nRucklidge, J. J. (2010). Gender differences in attention-deficit/h

In [21]:
# #comments were loaded by manually scrolling down in youtube, then saving the page as Format: WebPage, Complete
# with open('Unexpected Guest _ John Lewis & Partners _ Christmas Ad 2021 - YouTube.html','r') as f:
#     soup = bs4.BeautifulSoup(f)

In [18]:
comment_blocks = soup.find_all('ytd-comment-thread-renderer')

cleaned_details = []
for cm_block in comment_blocks:
    details = {}
    author = cm_block.find('a',{'id':'author-text'})
    details['author_name'] = author.get_text()
    details['author_channel'] = author['href'] + '/about'
    details['comment_text'] = cm_block.find('yt-formatted-string',{'id':'content-text'}).get_text()
    details['comment_likes'] = cm_block.find('span',{'id':'vote-count-left'}).get_text()
    
    def text_cleaner(html_tag):
        return html_tag.replace('\n',' ').strip()
    for d in details:
        details[d] = text_cleaner(details[d])
    
    cleaned_details.append(details)

# https://www.youtube.com/channel/UC8pW1RS00Y9ILi9XSiTSEVg/about
# get influence of commenter:
#     'https://www.youtube.com/channel/' <their channel id eg: UC8pW1RS00Y9ILi9XSiTSEVg> '/about' #gets bio



In [19]:
cleaned_details

[]

In [67]:
import pandas as pd

In [68]:
df = pd.DataFrame(cleaned_details)
df.head()

Unnamed: 0,author_name,author_channel,comment_text,comment_likes
0,Joyride F,https://www.youtube.com/channel/UC-9ZN4ZUtUk3A...,"The man on the moon was my favourite, this is ...",2
1,Francis Smith,https://www.youtube.com/channel/UCPa7xCHWAja5O...,Good trailer. When is this film being released...,0
2,mark layton,https://www.youtube.com/channel/UCxOKYsMj-q19X...,The music is good ....but spaceships dont mak...,0
3,Stephen,https://www.youtube.com/channel/UCy6DSJBFoUyjY...,Worst JL advert!,5
4,Lewis 96,https://www.youtube.com/channel/UCgFBYKPbuS2qm...,Her outfit kinda reminds of Jack Frost. It’s s...,0


In [70]:
df.comment_likes.unique()

array(['2', '0', '5', '1', '6', '4', '3', '32', '9', '7', '8', '16', '13',
       '45', '41', '44', '29', '20', '30', '21', '100', '31', '15', '175',
       '25', '68', '90', '24', '10', '144', '11', '35', '27', '352', '33',
       '54', '14', '18', '17', '34', '42', '186', '22', '118', '12', '56',
       '77', '180', '36', '70', '127', '92', '19', '297', '65', '55',
       '84', '572', '95', '99', '37', '91', '43', '131', '78', '40', '51',
       '28', '114', '282', '62', '87', '226', '88', '230', '253', '293',
       '396', '507', '193', '239', '250', '50', '121', '518', '46',
       '1.7K', '98', '49', '64', '97', '215', '196'], dtype=object)

In [105]:
df.comment_likes = (df.comment_likes.replace(r'[KM]+$', '', regex=True).astype(float) \
        * df.comment_likes.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1) \
            .replace(['K','M'], [10**3, 10**6]).astype(int))

In [158]:
df.to_csv('yt_JL_comments_Monday8th.csv',index=False)

Author: George Goldberg (Nov 2021)

# IF I DID COMMENT REPLIES:

In [None]:


# #wait for sorting options to load (timeout after 30 seconds)
# xpath_sort_btn = "//tp-yt-paper-button[@aria-label='Sort comments']"
# elem = WebDriverWait(browser, 30).until(
#     EC.presence_of_element_located((By.XPATH, xpath_sort_btn))
#     )

# #PLAN : once sort btn is loaded above, you need to:
# # - set it as an element
# # - click it, or change the attribute 'aria-expanded' from 'false' to 'true' for the link that is attached to the text "Newest First"
# # hopefully, this replaces the current sorting and reloads comments (check visually if the browser has changed)
# # - then wait for EC.presence_of_element_located for 

# COMMENT STRUCTURE:
# comment container tag <ytd-comments id='comments'>
#  --- top level spinner (dont use) <tp-yt-paper-spinner-lite aria-hidden="true" #( or "false" if still loading)
#  --- comment block <ytd-item-section-renderer id='sections'>
#     --- comment header/metadata (total comments, sort dropdown etc) <div id='header'>
#         --- <ytd-comments-header-renderer>
#             --- <div id='title'>
#                 --- <h2 id='count'>.text
# # (from above=TOTAL VIDEO COMMENTS as string:will need to remove "Comments" and strip() and remove commas too and format as number)

#                 --- <span id='sort-menu'>
#                     --- <yt-sort-filter-sub-menu-renderer>
#                         --- <yt-dropdown-menu icon-label='Sort by'>
#                             --- <tp-yt-paper-menu-button>
#                                 --- <div id='trigger'>
#                                     --- <tp-yt-paper-button id='label' role='button' aria-label='Sort comments'> 
# # (use aria-label='Sort comments' as locator here??? as it's quite deep into the html tags)
# # (.click this button above to open the dropdown options)
                                         
#                                 --- <tp-yt-iron-dropdown id='dropdown' aria-hidden='true'> (hidden until dropdown clicked)
#                                     --- <div id='contentWrapper'><div slot='dropdown-content'><tp-yt-paper-listbox id='menu'>
#                                           --- <a aria-selected='true' (also has 'iron-selected' in the class name)
#                                               --- <tp-yt-paper-item role='option'>
#                                                   --- <div class='item'>.text = "Top comments"
#                                           --- <a aria-selected='false'
#                                               --- <tp-yt-paper-item role='option'>
#                                                   --- <div class='item'>.text = "Newest first"
#     --- comment spinner (has 'active' property whilst it is loading/rendering comments.)
#     --- comment contents 
#        |--- first comment <ytd-comment-thread-renderer>
#        |--- second comment <ytd-comment-thread-renderer>
#        |--- ...
#        |--- last rendered comment <ytd-comment-thread-renderer id='comment'>
#             --- comment itself <ytd-comment-renderer>
#                 --- <div id='body'><div id='main'>
#                    |--- <ytd-expander id='expander'>
#                         |--- <div id='content'> <yt-formatted-string id='content-text'>.text
# # (from above=COMMENT TEXT)

#                         |--- tp-yt-paper-button id='less'
#                         |--- <tp-yt-paper-button id='more'> hidden='' 
# # (from above= EXPAND LONGER COMMENTS TO SEE FULL TEXT)
# #             hidden property will be there if the comment can't be expanded.
# #             to get full text: use visiblity_of_element/visiblity_of_element_located/invisiblity_of_element_located/element_to_be_clickable
# #             might be better to save this until the end - save it all THEN expand and resave, incase it breaks and you lose everything
            
#                         |--- <ytd-comment-action-buttons-renderer id='action-buttons'>
#                              --- <div id='toolbar'> <span id="vote-count-left" aria-label='90K likes'> 
# # (from above=COMMENT LIKES: get attribute aria-label from span tag) Then pandas formating to get it as a number:
#     ## pandas regex converting '20K' to float(20000)
#     #     df.comment_likes = (df.comment_likes.replace(r'[KM]+$', '', regex=True).astype(float) \
#     #         * df.comment_likes.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1) \
#     #             .replace(['K','M'], [10**3, 10**6]).astype(int))
                
                
#             --- comment replies div <div id='replies'> 
#                     <ytd-comment-replies-renderer>
#                         <div id='expander'>
#                             |
#                             --- <ytd-button-renderer>
#                                     <a class='yt-simple-endpoint style-scope ytd-button-renderer'>
#                                         <tp-yt-paper-button id='button' role='button' aria-label='View Reply OR View 180 Replies'> 
#                                         (CLICK THIS TO LOAD MORE) + use get_attribute('aria-label') to get the replies count
#                             |
#                             --- <div id='expander-contents'>
#                                  ---<div id='contents'>
                                
#                                    |--- comment reply 1 <ytd-comment-renderer>
#                                         --- <div id='body'><div id='main'>
#                                                  |--- <div id='header'> (reply author + date)
#                                                      --- <div id='header-author'>
#                                                         ---<h3><a id='author-text'> 
# (from above=COMMENT REPLY AUTHOR CHANNEL url from <a>href attribute)
# (from above=COMMENT REPLY AUTHOR NAME from .text either on <h3>.text (use parent xpath locator "parent::*") / <h3><a>.text / <h3><a><span>.text
                                                            
#                                                  |--- <ytd-expander id='expander'> (reply text)
#                                                     |--- <div id='content'>
#                                                         --- <yt-formatted-string id='content-text'> .text 
# (from above=COMMENT REPLY TEXT)
                                                            
#                                                     |--- <tp-yt-paper-button id='less' role="button" hidden="">
#                                                     |--- <tp-yt-paper-button id='more' role="button" hidden=""> If property=hidden, do nothing, else click for replies to this comment reply. 
#                                                  |--- reply date <yt-formatted-string class='published-time-text'>.text
#                                                  |--- reply likes <ytd-comment-action-buttons-renderer id='action-buttons'>
#                                                      --- <div id='toolbar'> <span id="vote-count-left" aria-label='90K likes'> get attribute aria-label
                                                    
#                                     |--- comment reply 2 <ytd-comment-renderer>
#                                     |--- comment reply N
                                        
                                        

#             --- replies block (to a comment) sometimes will be hidden??? <div id="replies" class="style-scope ytd-comment-thread-renderer"><ytd-comment-replies-renderer class="style-scope ytd-comment-thread-renderer"><!--css-build:shady--><div id="expander" class="style-scope ytd-comment-replies-renderer">
#                 |
#                 --- more replies button container
#                     <ytd-button-renderer id="more-replies" class="more-button style-scope ytd-comment-replies-renderer" aria-expanded="false" noink="" use-keyboard-focused="" button-renderer="true" is-paper-button-with-icon="" is-paper-button=""><a class="yt-simple-endpoint style-scope ytd-button-renderer" tabindex="-1"><tp-yt-paper-button id="button" class="style-scope ytd-button-renderer" role="button" tabindex="0" animated="" elevation="0" aria-disabled="false" aria-label="View reply"><!--css-build:shady--><yt-icon class="style-scope ytd-button-renderer"><svg viewBox="0 0 24 24" preserveAspectRatio="xMidYMid meet" focusable="false" class="style-scope yt-icon" style="pointer-events: none; display: block; width: 100%; height: 100%;"><g class="style-scope yt-icon"><path d="M18,9l-6,6L6,9H18z" class="style-scope yt-icon"></path></g></svg><!--css-build:shady--></yt-icon><yt-formatted-string id="text" class="style-scope ytd-button-renderer">View reply</yt-formatted-string></tp-yt-paper-button></a></ytd-button-renderer>
#                 --- fewer replies button container
#                     <ytd-button-renderer id="less-replies" class="less-button style-scope ytd-comment-replies-renderer" aria-expanded="true" noink="" hidden="" use-keyboard-focused="" button-renderer="true" is-paper-button-with-icon="" is-paper-button=""><a class="yt-simple-endpoint style-scope ytd-button-renderer" tabindex="-1"><tp-yt-paper-button id="button" class="style-scope ytd-button-renderer" role="button" tabindex="0" animated="" elevation="0" aria-disabled="false" aria-label="Hide reply"><!--css-build:shady--><yt-icon class="style-scope ytd-button-renderer"><svg viewBox="0 0 24 24" preserveAspectRatio="xMidYMid meet" focusable="false" class="style-scope yt-icon" style="pointer-events: none; display: block; width: 100%; height: 100%;"><g class="style-scope yt-icon"><path d="M18,15H6l6-6L18,15z" class="style-scope yt-icon"></path></g></svg><!--css-build:shady--></yt-icon><yt-formatted-string id="text" class="style-scope ytd-button-renderer">Hide reply</yt-formatted-string></tp-yt-paper-button></a></ytd-button-renderer>
#                 --- expander div (default is hidden - where the replies are rendered when you click for more)
#                     <div id="expander-contents" class="style-scope ytd-comment-replies-renderer" hidden="">
                  
                  
# #     <div id="contents" class="style-scope ytd-comment-replies-renderer"><ytd-continuation-item-renderer class="style-scope ytd-comment-replies-renderer"><!--css-build:shady--><div id="ghost-cards" class="style-scope ytd-continuation-item-renderer"></div>
# # <tp-yt-paper-spinner id="spinner" class="style-scope ytd-continuation-item-renderer" hidden="" aria-hidden="true"><!--css-build:shady--><div id="spinnerContainer" class="active cooldown style-scope tp-yt-paper-spinner"><div class="spinner-layer layer-1 style-scope tp-yt-paper-spinner"><div class="circle-clipper left style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div><div class="circle-clipper right style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div></div><div class="spinner-layer layer-2 style-scope tp-yt-paper-spinner"><div class="circle-clipper left style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div><div class="circle-clipper right style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div></div><div class="spinner-layer layer-3 style-scope tp-yt-paper-spinner"><div class="circle-clipper left style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div><div class="circle-clipper right style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div></div><div class="spinner-layer layer-4 style-scope tp-yt-paper-spinner"><div class="circle-clipper left style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div><div class="circle-clipper right style-scope tp-yt-paper-spinner"><div class="circle style-scope tp-yt-paper-spinner"></div></div></div></div></tp-yt-paper-spinner>
# # <div id="button" class="style-scope ytd-continuation-item-renderer"></div>
# # </ytd-continuation-item-renderer></div>
# #     <div class="cont-button style-scope ytd-comment-replies-renderer" id="continuation"></div>
# #     <yt-next-continuation id="fake-continuation" class="cont-button style-scope ytd-comment-replies-renderer" show-button="true" show-icon="true" hidden=""><!--css-build:shady-->
# # <dom-if class="style-scope yt-next-continuation"><template is="dom-if"></template></dom-if>

# #   <tp-yt-paper-button class="style-scope yt-next-continuation" role="button" tabindex="0" animated="" elevation="0" aria-disabled="false"><!--css-build:shady-->
# #     <yt-icon icon="subdirectory_arrow_right" class="style-scope yt-next-continuation"><svg viewBox="0 0 24 24" preserveAspectRatio="xMidYMid meet" focusable="false" class="style-scope yt-icon" style="pointer-events: none; display: block; width: 100%; height: 100%;"><g class="style-scope yt-icon"><path fill="none" d="M0 0h24v24H0V0z" class="style-scope yt-icon"></path><path d="M19 15l-6 6-1.42-1.42L15.17 16H4V4h2v10h9.17l-3.59-3.58L13 9l6 6z" class="style-scope yt-icon"></path></g></svg><!--css-build:shady--></yt-icon>
# #     <yt-formatted-string class="style-scope yt-next-continuation"></yt-formatted-string>
# #   </tp-yt-paper-button>
# # <dom-if class="style-scope yt-next-continuation"><template is="dom-if"></template></dom-if>
# # </yt-next-continuation>
# #   </div>
# # </div>
# # <div id="teaser-replies" class="style-scope ytd-comment-replies-renderer"></div>
# # </ytd-comment-replies-renderer></div>
        
#         --- final tag: <ytd-continuation-item-renderer>  (is this hidden when all comments have loaded?)
#            |
#             --- another spinner (for loading more comments) 
#                 <tp-yt-paper-spinner class="style-scope ytd-continuation-item-renderer" aria-hidden="true">
#             --- (HIDDEN) button for loading next batch: 
#                 <div id="button" class="style-scope ytd-continuation-item-renderer" hidden=""></div>
# #                 (either [scroll into view] OR [click] OR [unhide/remove hide property and click])
    
# #         (continuation tag shows there are more commnets? 
# #         use this in while loop - while it is visible, keep looping for more comments)
        
# #         NOTE: (SEEMS TO BE 20 COMMENTS AT ONCE.)




    
# # COMMENTS BLOCK <ytd-comments id='comments'>
# # def COMMENTS LOAD CHECKER()??????:
# #     WebDriverWait((browser, 30)).until(
# #     comments.find_element(
# #         By.XPATH,"tp-yt-paper-spinner-lite"
# #     ).get_attribute('area-hidden')=='true')


# # >inside comments block, at top: spinner for loading comments which will be hidden if loaded. Wait for true (inside  as <tp-yt-paper-spinner-lite aria-hidden="true")
# # (COMMENTS ARE: inside block <ytd-comments id='comments'> 
# # then subblock <ytd-item-section-renderer>
# #  > div id= header has the total comments metadata
# #  > div id=contents contains all the individual comments (each as its own tag: ytd-comment-thread-renderer)
# # then get the Likes, the commentername, and channel, the comment text, and date?












# # sorting_block = browser.find_element(By.XPATH, "//tp-yt-paper-button[@aria-label='Sort comments']")
# #toggle expand: set the aria-expanded attribute to "true" (OR click the element)
# # wait a second for it to load/expand? then..
# # XPATH, "//tp-yt-paper-listbox[@id=menu]/a[1 (is it zero indexed? you need the second one (Newest FIrst))]"
# # click the 'a' class or toggle the aria-selected to true and? class name append iron-selected and remove it from the other (top comments) anchor tag
# # inside the anchor class you want to toggle: tp-yt-paper-item-body/div[@class=item].text (or text of its parent elemnt - should be "Newest First")

    
    
# sort_dropdown = browser.find_element(By.XPATH, xpath_sort_btn)

