## System setup 

Before we start, make sure to install the required libraries:
    
    pip install bs4
    pip install selenium


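Selenium also needs a driver for your browser. For example, for Chrome, download the appropriate webdriver from http://chromedriver.chromium.org/downloads, unzip it, and save it in the current directory (or update the driver path in the setup cell below to wherever you saved it).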

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time
import html
import string


from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
 


In [3]:
#create a webdriver object and set options for headless browsing
options = Options()
# Pass the Chrome flag itself rather than setting the `headless` attribute:
# the attribute is deprecated in newer Selenium releases, while the
# command-line flag is understood regardless of the Selenium version.
options.add_argument('--headless')
# NOTE(review): passing the driver path positionally is only accepted by older
# Selenium releases; newer ones expect a Service object -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome('/Users/macuser/Downloads/chromedriver', options=options)#Need to change this to your path

In [4]:
#Use this function when scraping the channels the person is subscribed to (it scrolls to the bottom until no more content loads)
def get_subs_soup(url,driver,scroll_pause_time=10):
    """Load `url`, scroll until the page stops growing, and return it parsed.

    The page is scrolled to the bottom repeatedly; after each scroll the
    function sleeps so lazily loaded content can arrive, then compares the
    document height with the previous one. Scrolling stops as soon as the
    height no longer changes.

    Args:
        url: Address of the page to open.
        driver: Selenium webdriver used to load and scroll the page.
        scroll_pause_time: Seconds to wait after every scroll before the
            height is measured again. Defaults to 10, the value that used to
            be hard-coded.

    Returns:
        A BeautifulSoup object built from the body HTML of the fully
        scrolled page.
    """
    driver.get(url)

    height_script = "return document.documentElement.scrollHeight"

    # Height before the first scroll; the loop ends when a scroll fails to change it.
    last_height = driver.execute_script(height_script)

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

        # Wait to load page
        time.sleep(scroll_pause_time)

        # No growth since the previous scroll means nothing more was loaded.
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height

    res_html = driver.execute_script('return document.body.innerHTML')
    # beautiful soup object to be used for parsing html content
    return BeautifulSoup(res_html,'html.parser')


#Use this function for normal scraping
def get_soup(url,driver):
    """Open `url` in `driver` and return the page body parsed by BeautifulSoup.

    The HTML is read through JavaScript (`document.body.innerHTML`) right
    after navigation, without any scrolling or waiting.
    """
    driver.get(url)
    body_markup = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(body_markup, 'html.parser')

In [5]:
#extracts all channels that the person is subscribed to
def scrape_subs_page(dir_url,driver):
    """Collect the channel links and channel names listed on a subscriptions page.

    Args:
        dir_url: URL of the page that lists the subscribed channels.
        driver: Selenium webdriver handed on to `get_soup`.

    Returns:
        A tuple `(channel_links, channel_names)`: absolute channel URLs taken
        from the first anchor of every channel tile, and the text of every
        other matching `span` (the first span of each consecutive pair).
    """
    rule = '-'*20
    print (rule,'Scraping subscriptions page',rule)
    base_url = 'https://www.youtube.com'

    # Parse the page as it is right after loading.
    soup = get_soup(dir_url,driver)

    # One tile per channel; the first anchor inside carries the relative link.
    tiles = soup.find_all('ytd-grid-channel-renderer', class_ = 'style-scope ytd-grid-renderer')
    channel_links = [base_url + tile.find('a')['href'] for tile in tiles]

    # The spans are read in pairs and only the first of each pair is kept;
    # a trailing unpaired span is ignored.
    spans = soup.find_all('span', class_ = 'style-scope ytd-grid-channel-renderer')
    pair_count = len(spans) // 2
    channel_names = [spans[2 * pair].text for pair in range(pair_count)]

    print (rule,'Found {} channels urls'.format(len(channel_links)),rule)
    return channel_links, channel_names

In [6]:
#For a particular channel collects the links for the last 30 videos
def get_all_video_links(channel_url,driver):
    """Collect link and title information for the videos listed on a channel.

    The channel page is opened, its second tab is clicked, and the page the
    driver lands on is reloaded and parsed. Every video tile found in that
    parsed page contributes one entry to each returned list.

    Args:
        channel_url: URL of the channel page.
        driver: Selenium webdriver used for navigation and clicking.

    Returns:
        A tuple `(videos_links, video_titles, videos_url)` of equally long
        lists:
          * videos_links -- the URL of the tab page concatenated with each
            video's relative link (kept as-is for existing callers),
          * video_titles -- the title text of each video tile ('' when the
            tile has no title element),
          * videos_url   -- 'https://www.youtube.com' plus the relative link.
    """
    # Only the navigation matters here; the parsed landing page is not used.
    get_soup(channel_url,driver)

    # Click on the videos tab to show the list of all the videos for the channel.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide.
    driver.find_element(By.XPATH, '//*[@id="tabsContent"]/tp-yt-paper-tab[2]').click()
    video_html = driver.current_url

    #load the HTML for the videos page
    soup_2 = get_soup(video_html, driver)

    base_url = 'https://www.youtube.com'
    videos_links = []
    video_titles = []
    videos_url = []

    tiles = soup_2.find_all('ytd-grid-video-renderer', class_ = 'style-scope ytd-grid-renderer')  #get list of all videos
    for link_holder in tiles:
        anchor = link_holder.find('a', href=True)
        if anchor is None:
            # A tile without a link cannot be opened later, so leave it out
            # of all three lists to keep them aligned.
            continue
        rel_link = anchor['href']

        # Read the title from this tile. A page-wide lookup for
        # id="video-title" would return the first video's title every time.
        title_node = link_holder.find(id='video-title')
        video_titles.append(title_node.get_text(strip=True) if title_node is not None else '')

        videos_links.append(video_html+rel_link)
        videos_url.append(base_url+rel_link)

    print ('Found {} videos'.format(len(videos_links)))
    return videos_links, video_titles, videos_url



In [7]:
#Gets the number of videos that were posted in the last month for a particular channel
def get_dates_scrape(video_url, driver):
    """Count the videos on a page whose age label falls within the last month.

    The page is loaded with `get_soup`, and the matching `span` elements are
    read in pairs: the second span of every pair is treated as the
    "... ago" age label of one video.

    A label counts when it ends in "<number> <unit>(s) ago" and is at most
    24 hours, 30 days or 4 weeks old; labels in seconds or minutes always
    count. Singular labels such as "1 hour ago" or "1 week ago" are counted
    too. Anything else (months, years, unrecognised text) is ignored.

    Args:
        video_url: URL of the page listing the channel's videos.
        driver: Selenium webdriver handed on to `get_soup`.

    Returns:
        The number of age labels that fall inside the window, as an int.
    """
    #load the HTML file for the channel page
    soup = get_soup(video_url,driver)

    #get the list of the date posted for each video (second span of each pair)
    spans = soup.find_all('span', class_ = 'style-scope ytd-grid-video-renderer')
    dates = [spans[2 * pair + 1].text for pair in range(len(spans) // 2)]

    # Largest amount per unit that is still "within the last month";
    # None means every amount of that unit qualifies.
    max_amount = {'second': None, 'minute': None, 'hour': 24, 'day': 30, 'week': 4}
    # Anchored at the end so a leading word in front of the age does not
    # prevent a match; the optional "s" covers singular and plural labels.
    age_pattern = re.compile(r'(\d+)\s+(second|minute|hour|day|week)s?\s+ago\s*$')

    count = 0
    for label in dates:
        match = age_pattern.search(label)
        if match is None:
            continue
        amount = int(match.group(1))
        limit = max_amount[match.group(2)]
        if limit is None or amount <= limit:
            count += 1

    return count

In [8]:
#Gets the title and description of each video
def get_stuff(url, driver):
    """Open a video page and return its title and description text.

    Each element is awaited for up to 10 seconds until it is present in the
    page; the wait raises if the element does not appear in time.

    Args:
        url: Address of the video page.
        driver: Selenium webdriver used to load the page.

    Returns:
        A tuple `(v_title, v_description)` with the text of the title
        element and of the description element.
    """
    driver.get(url)
    waiter = WebDriverWait(driver, 10)

    title_locator = (By.CSS_SELECTOR, "h1.title yt-formatted-string")
    description_locator = (By.CSS_SELECTOR, "div#description yt-formatted-string")

    # Title first, then description, each awaited separately.
    title_element = waiter.until(EC.presence_of_element_located(title_locator))
    v_title = title_element.text

    description_element = waiter.until(EC.presence_of_element_located(description_locator))
    v_description = description_element.text

    return v_title, v_description

In [14]:
#Find any deals or promotions from the descriptions
def find_deal(desc):
    """Return the last line of a description that looks like a deal or promotion.

    A line matches when, ignoring letter case, it either
      * contains "off" together with a "%" or "$" sign, or
      * contains one of "sponsored", "sponsoring", "discount", "giveaway".

    Case is ignored so that spellings such as "Sponsored by ...",
    "20% Off" or "Giveaway" are recognised as well as the all-lower and
    all-upper forms.

    Args:
        desc: Video description text; lines are separated by "\\n".

    Returns:
        The matching line with surrounding whitespace stripped. When several
        lines match, the last one is returned; when none match, "".
    """
    keywords = ("sponsored", "sponsoring", "discount", "giveaway")
    last_match = ""

    for line in desc.split("\n"):
        lowered = line.lower()
        price_cut = "off" in lowered and ("%" in line or "$" in line)
        if price_cut or any(word in lowered for word in keywords):
            # Later matches replace earlier ones, so the last match wins.
            last_match = line.strip()

    return last_match

# Test Section

In [10]:
#TEST SECTION 1
# For every subscribed channel: list its videos, count those posted within the
# last month, and report whether each of those videos advertises a deal.

dir_url = 'https://www.youtube.com/channel/UC2Gv-b5RwRR0Bk74-ll6f1Q/channels?view=56&shelf_id=0' #url for the subscriptions page
channel_links, channel_names = scrape_subs_page(dir_url,driver)


# zip stops at the shorter list, so a mismatch between the number of names and
# links cannot raise IndexError.
for channel_name, channel_link in zip(channel_names, channel_links):
    print (channel_name)
    videos_links,video_titles, videos_url = get_all_video_links(channel_link,driver)

    # get_all_video_links finishes by loading the page it scraped, so the
    # driver's current URL is that page. Using it avoids indexing the
    # per-channel video list with the channel's position, which fails for a
    # channel that has fewer videos than its position in the list.
    n = get_dates_scrape(driver.current_url, driver)
    print("Number of videos in the last month: ", n)
    # Slice instead of indexing so a count larger than the number of scraped
    # URLs cannot run past the end of the list.
    for video_url in videos_url[:n]:
        title, descr = get_stuff(video_url,driver)

        deals = find_deal(descr)

        if (deals == ""):
            print('Any deals?: No')
        else:
            print('Any deals?: Yes')
            print('Deals:', deals, 'from', video_url)
    print('-'*40)

-------------------- Scraping subscriptions page --------------------
-------------------- Found 13 channels urls --------------------
ur mom ashley
Found 30 videos
Number of videos in the last month:  5
Any deals?: Yes
Deals: now this is interesting....🤪 btw freebeat has a holiday deal for a limited time! Click here https://bit.ly/3FKWcLm & use CODE: ashleyyspecial for an extra $50 off on top of the existing deals on their website! Or on Amazon,  use my codes to get 5% off! from https://www.youtube.com/watch?v=laYMhcMJl80
Any deals?: Yes
Deals: are my roomies stylish or what 😎 BTW get an EXTRA 30% off your 1st thredUP order and FREE shipping w/ code: ASHLEY30 https://bit.ly/thredUPnov21_ASHLEY30 (Offer expires 12/31/21. Applies to US & Canada customers only. See site for full terms!) from https://www.youtube.com/watch?v=LnGyzNJ7IME
Any deals?: No
Any deals?: No
Any deals?: No
----------------------------------------
krist & yu
Found 30 videos
Number of videos in the last month:  3
Any

In [None]:
#TEST SECTION 2
# Same walk as the first test section, for a different subscriptions page:
# list each channel's videos, count the recent ones, and report any deals.

dir_url = 'https://www.youtube.com/channel/UCSm4whN-PJgZ4uzioQDlUCQ/channels?view=56&shelf_id=0' #url for the subscriptions page
channel_links, channel_names = scrape_subs_page(dir_url,driver)


# zip stops at the shorter list, so a mismatch between the number of names and
# links cannot raise IndexError.
for channel_name, channel_link in zip(channel_names, channel_links):
    print (channel_name)
    videos_links,video_titles, videos_url = get_all_video_links(channel_link,driver)

    # get_all_video_links finishes by loading the page it scraped, so the
    # driver's current URL is that page. Using it avoids indexing the
    # per-channel video list with the channel's position, which fails for a
    # channel that has fewer videos than its position in the list.
    n = get_dates_scrape(driver.current_url, driver)
    print("Number of videos in the last month: ", n)
    # Slice instead of indexing so a count larger than the number of scraped
    # URLs cannot run past the end of the list.
    for video_url in videos_url[:n]:
        title, descr = get_stuff(video_url,driver)

        deals = find_deal(descr)

        if (deals == ""):
            print('Any deals?: No')
        else:
            print('Any deals?: Yes')
            print('Deals:', deals, 'from', video_url)
    print('-'*40)

-------------------- Scraping subscriptions page --------------------
-------------------- Found 6 channels urls --------------------
Study To Success
Found 30 videos
Number of videos in the last month:  3
Any deals?: Yes
Deals: - This giveaway is international and open to all countries! The shipping is free (completely covered by me) 🥰 However, if you’re from India, there’s a small chance you might have to pay an import tax depending on where you live. Over the past year I’ve sent prizes to nine Indians and two had to pay an import tax (apparently it wasn’t very high though). So please ask your parents if they would be okay with paying the tax if you end up having to before entering! This may apply to other countries I haven't shipped packages to yet, so be sure to double check your country's policies! from https://www.youtube.com/watch?v=nDFL37juBiY
Any deals?: No
Any deals?: No
----------------------------------------
Fatima Bah
Found 30 videos
Number of videos in the last month:  9