In [1]:
import json
import pandas as pd
import os
import time
from googleapiclient.discovery import build 
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
#import networkx as nx
import random
from igraph import Graph
import plotly.graph_objects as go
from textblob import TextBlob

In [2]:
# change function name??
def get_related_data(driver):
    
    #driver = webdriver.Chrome(executable_path = 'chromedriver_linux64/chromedriver')
    #driver.get(url)
    
    # possibly excessive amount of time to wait for data to load
    time.sleep(2)
    
    # selected video title
    selected_title_path = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string')
    selected_title = selected_title_path.text
    
    # get titles - unnecessary due to API calls but does work consistently
    related_videos = driver.find_elements_by_xpath("//*[@id='video-title']")
    related_video_titles = [videos.text for videos in related_videos]
    related_video_titles = related_video_titles[:10]
    # add selected title to top of list
    related_video_titles.insert(0, selected_title)
    
    # get links
    related_links = driver.find_elements_by_xpath("//*[@id='dismissible']/div/div[1]/a")
    related_links_out = [link.get_attribute('href') for link in related_links]
    related_links_out = related_links_out[:10]
    # add selected link to top of list
    related_links_out.insert(0, url)
    
    # we need to remove playlists from suggestions, as they require a different API call.
    # dictionary to zip links/titles together for filtering. 
    data_dict = dict(zip(related_links_out, related_video_titles))
    # store filtered values.
    filt_dict = {}
    
    # removing links with substring 'list' will avoid all suggested playlists/mixes.
    for link in data_dict.keys():
        if 'list' not in link:
            filt_dict.update({link:data_dict[link]})        
    
    
    # store in dataframe
    out_df = pd.DataFrame()
    out_df['Title'] = filt_dict.values()
    out_df['Link'] = filt_dict.keys()
    
    # parse for URI for API use
    out_df['Id'] = out_df['Link'].apply(lambda x: x.split('=')[1])
    
    return out_df

In [3]:
def related_api_requests(in_df):
    # update func/variable names
    # limit to 50 at a time??
    # build youtube resource object
    youtube = build('youtube','v3',developerKey='AIzaSyCgwz5GP-y0t2u1srVZvaolyZkfARwKNwM') 
    
    # video Ids to feed into API
    related_Ids = list(in_df['Id'])
    
    # contentDetails videos request to get video length
    vid_request = youtube.videos().list(
        part = 'contentDetails',
        id = related_Ids)
    vid_response = vid_request.execute()
    
    # loop through durations
    durations = []
    for item in vid_response['items']:
        durations.append(item['contentDetails']['duration'])
    
    # stat request for likes, dislikes, comment counts, and view counts
    stat_request = youtube.videos().list(
        part = 'statistics',
        id = related_Ids)
    stat_response = stat_request.execute()
    
    # empty lists to store data
    likes = []
    dislikes = []
    views = []
    comments = []

    # loop through stats
    for stat in stat_response['items']:
        try:
            likes.append(stat['statistics']['likeCount'])
        except KeyError:
            likes.append(0)
        try:
            dislikes.append(stat['statistics']['dislikeCount'])
        except KeyError:
            dislikes.append(0)
        try:
            views.append(stat['statistics']['viewCount'])
        except KeyError:
            views.append(0)
        try:
            comments.append(stat['statistics']['commentCount'])
        except KeyError:
            comments.append(0)
            
    # get channel titles
    snip_request = youtube.videos().list(
        part = 'snippet',
        id = related_Ids)
    snip_response = snip_request.execute()
    
    # lists for titles
    channels = []
    #titles = []
    upload_date = []
    
    # loop through snippets
    for snip in snip_response['items']:
        try:
            channels.append(snip['snippet']['channelTitle'])
        except:
            channels.append('api_error')
        #titles.append(snip['snippet']['title'])
        try:
            upload_date.append(snip['snippet']['publishedAt'])
        except:
            upload_date.append('api_error')
        
    # add fields to dataframe
    #fields = [durations, likes, dislikes, views, comments]
    df = pd.DataFrame()
    df['Title'] = in_df['Title']
    df['Channel'] = channels
    df['Length'] = durations
    df['Likes'] = likes
    df['Dislikes'] = dislikes
    df['Views'] = views
    #df['LikeRatio'] = 
    df['Comments'] = comments
    df['Uploaded'] = upload_date
    df['Depth'] = in_df['depth']
    
    # convert to int
    fields = ['Likes', 'Dislikes', 'Views', 'Comments']
    #fields = ['Likes', 'Dislikes', 'Views']
    for field in fields:
        df[field] = df[field].apply(lambda x: int(x))
        
    # create LikeRatio
    df['LikeRatio'] = df['Likes'] / (df['Likes'] + df['Dislikes'])
    return df

In [5]:
def get_video_data(df):
    out_df = pd.DataFrame()
    for i in range(0,len(df)-50,50):
        temp = related_api_requests(df[i:i+50])
        out_df = out_df.append(temp)
    # verbose way to get the starting index of the remainder
    remainder = len(df) - (len(df) % 50)
    temp = related_api_requests(df[remainder:remainder + len(df) % 50])
    out_df = out_df.append(temp)
    index = [i for i in range(len(out_df))]
    out_df['index'] = index
    return out_df

In [6]:
#def make_dataframe(driver):
    # get links and titles of related videos to feed into API
#    df = get_related_data(driver)
#    # feed into api
#    out_df = related_api_requests(df)
    
#    return out_df

In [7]:
#url = 'https://www.youtube.com/watch?v=XcOG5iZpV-k' # community
#url = 'https://www.youtube.com/watch?v=a7RoP1LKMeM' # office
#url = 'https://www.youtube.com/watch?v=TUTAL9LDHRc' # p&r
#url = 'https://www.youtube.com/watch?v=avfVD6Par0M' # friends
#url = 'https://www.youtube.com/watch?v=TuXL9RN70Bo' # 30 rock

In [8]:
/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[12]/ytd-watch-next-secondary-results-renderer/div[2]/ytd-compact-video-renderer[2]/div[1]/div/div[1]/a/h3/span
/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[12]/ytd-watch-next-secondary-results-renderer/div[2]/ytd-compact-video-renderer[4]/div[1]/div/div[1]/a/h3/span
/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[12]/ytd-watch-next-secondary-results-renderer/div[2]/ytd-compact-video-renderer[3]/div[1]/div/div[1]/a/h3/span

NameError: name 'html' is not defined

In [9]:
xpath = "/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[12]/ytd-watch-\
next-secondary-results-renderer/div[2]/ytd-compact-video-renderer[" + str(3) + "]/div[1]/div/div[1]/a/h3/span"

In [10]:
xpath

'/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[12]/ytd-watch-next-secondary-results-renderer/div[2]/ytd-compact-video-renderer[3]/div[1]/div/div[1]/a/h3/span'

In [14]:
def rabbit_hole(url, channel_name):
    chrome_options = Options()
    #chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(executable_path = 'chromedriver_linux64/chromedriver', options = chrome_options)

    final_df = pd.DataFrame()
    selected_ids = []
    ctr = 0

    driver.get(url)
    # max number of videos...TBS rules all
    for i in range(50):
        # go to selected video
        #driver.get(url)
        # get video title, link, and id
        df = get_related_data(driver)
        # add depth
        df['depth'] = i
        # append to output dataframe
        final_df = final_df.append(df)
        # select next video (random)
        # selected video is at top of df, so we start at 1 to avoid repeats
        rand = random.randint(1, len(df) - 1)
        #url = df['Link'][rand]
        # getting the proper xpath. I tried using backslash for a newline, but it doesn't work within the
        # function for some reason. it works in the cell above...something to do with jupyter maybe?
        vid1 = "/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[12]/ytd-watch-"
        vid2 = "next-secondary-results-renderer/div[2]/ytd-compact-video-renderer[" + str(rand) + "]/div[1]/div/div[1]/a/h3/span"
        vid = vid1 + vid2
        selected = driver.find_element_by_xpath(vid)
        # sometimes the click method gives an ElementClickInterceptedException, so this is an 
        # attempt to handle that exception. just go straight to the url instead of clicking on the video. 
        try:
            selected.click()
        except:
            driver.get(df['Link'][rand])
        #selected_ids.append(rand + ctr)
        #ctr += len(df)
        # keep it on the same channel, the official one for the show. if the channel isn't the official one,
        # end the loop, as the user has exited the channel's sphere of influence
        channel = driver.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[9]/div[2]/ytd-video-secondary-info-renderer/div/div/ytd-video-owner-renderer/div[1]/ytd-channel-name/div/div/yt-formatted-string/a')
        name = channel.text
        if name != channel_name:
            break
        selected_ids.append(rand + ctr)
        ctr += len(df)
    #driver.quit()
    # manually add an index because the append method starts the 
    # index at 0 for every temp df added
    index = [i for i in range(len(final_df))]
    final_df['index'] = index
    return final_df, selected_ids

In [11]:
url = 'https://www.youtube.com/watch?v=TUTAL9LDHRc' # 'Parks and Recreation'

In [92]:
df, selected = rabbit_hole(url, 'Parks and Recreation')

In [93]:
df[df['index'].isin(selected)]

Unnamed: 0,Title,Link,Id,depth,index
5,"Ron, April and Andy's Wild Puzzle - Parks and ...",https://www.youtube.com/watch?v=fsCeZQ4cjGk,fsCeZQ4cjGk,0,5
10,Digging Up Dirt - Parks and Recreation,https://www.youtube.com/watch?v=4iUeVSeZtoc,4iUeVSeZtoc,1,21
1,Best Of Duke Silver - Parks and Recreation,https://www.youtube.com/watch?v=UJdPeMZrcMw,UJdPeMZrcMw,2,23
6,Ron's Terrible Confession - Parks and Recreation,https://www.youtube.com/watch?v=yVjnidcT2ts,yVjnidcT2ts,3,39
4,Burt Macklin's Pregnancy Investigation - Parks...,https://www.youtube.com/watch?v=oxU-NWXoZnI,oxU-NWXoZnI,4,48
6,Tom's Bistro Disaster Run - Parks and Recreation,https://www.youtube.com/watch?v=S55FMFWAOxw,S55FMFWAOxw,5,61
3,April the Wine Connoisseur - Parks and Recreation,https://www.youtube.com/watch?v=Kf_4LSvNrsc,Kf_4LSvNrsc,6,69
4,April's All-Time Insults for Ann (Supercut) - ...,https://www.youtube.com/watch?v=9qLuGDEtcnc,9qLuGDEtcnc,7,81
7,Andy And Lord Covintgon - Parks and Recreation,https://www.youtube.com/watch?v=70_Y9WcOsZk,70_Y9WcOsZk,8,95
4,Educating Andy Dwyer - Parks and Recreation,https://www.youtube.com/watch?v=O_TsgP4ls5g,O_TsgP4ls5g,9,103


In [95]:
full_df = get_video_data(df)

In [96]:
full_df

Unnamed: 0,Title,Channel,Length,Likes,Dislikes,Views,Comments,Uploaded,Depth,LikeRatio,index
0,Ron Swanson and the Coffeepot Mystery - Parks ...,Parks and Recreation,PT1M17S,53457,252,1477918,1111,2020-04-28T13:00:25Z,0,0.995308,0
1,Best of Grumpy Ron Swanson - Parks and Recreation,Parks and Recreation,PT10M15S,9359,135,678493,368,2021-03-09T17:00:02Z,0,0.985780,1
2,"Survival Skills: Oh, I just used the trident t...",Survival Skills TCX,PT8M28S,37,5,1776,2,2021-07-20T12:00:12Z,0,0.880952,2
3,Into The Wild With RON SWANSON | Parks and Rec...,Comedy Bites,PT9M55S,12218,202,911929,494,2020-09-02T17:00:08Z,0,0.983736,3
4,Jim's Pranks Against Dwight - The Office US,The Office,PT12M3S,643165,9053,44186838,13779,2017-07-08T16:00:09Z,0,0.986120,4
...,...,...,...,...,...,...,...,...,...,...,...
5,Andy & April Visit The Hospital - Parks and Re...,Parks and Recreation,PT5M12S,14439,157,1055148,315,2018-09-24T16:00:02Z,9,0.989244,104
6,Best of Jean Ralphio - Parks and Recreation,Parks and Recreation,PT10M3S,23665,404,1919757,1148,2018-07-02T16:00:03Z,9,0.983215,105
7,Best Of Dr. Harris - Parks and Recreation,Parks and Recreation,PT10M8S,14509,248,1224608,444,2019-08-28T15:45:00Z,9,0.983194,106
8,April Meets Tynnyfer - Parks and Recreation,Parks and Recreation,PT4M3S,24301,397,1991204,818,2018-10-11T16:00:06Z,9,0.983926,107


In [97]:
full_df[full_df['index'].isin(selected)]

Unnamed: 0,Title,Channel,Length,Likes,Dislikes,Views,Comments,Uploaded,Depth,LikeRatio,index
5,"Ron, April and Andy's Wild Puzzle - Parks and ...",Parks and Recreation,PT5M49S,29149,340,1916811,566,2020-02-04T14:00:14Z,0,0.98847,5
10,Digging Up Dirt - Parks and Recreation,Parks and Recreation,PT4M29S,16622,201,1269074,564,2020-06-18T16:00:01Z,1,0.988052,21
1,Best Of Duke Silver - Parks and Recreation,Parks and Recreation,PT9M45S,20046,254,1685744,718,2018-09-12T16:00:07Z,2,0.987488,23
6,Ron's Terrible Confession - Parks and Recreation,Parks and Recreation,PT4M10S,22370,350,2605734,679,2015-01-21T05:00:00Z,3,0.984595,39
4,Burt Macklin's Pregnancy Investigation - Parks...,Parks and Recreation,PT4M55S,10993,108,692790,339,2020-06-06T16:00:18Z,4,0.990271,48
6,Tom's Bistro Disaster Run - Parks and Recreation,Parks and Recreation,PT4M35S,6504,100,549770,236,2020-05-07T16:00:26Z,5,0.984858,61
3,April the Wine Connoisseur - Parks and Recreation,Parks and Recreation,PT2M28S,4111,45,249516,177,2020-12-10T17:00:07Z,6,0.989172,69
4,April's All-Time Insults for Ann (Supercut) - ...,Parks and Recreation,PT2M36S,20697,250,2182148,581,2014-09-02T17:21:25Z,7,0.988065,81
7,Andy And Lord Covintgon - Parks and Recreation,Parks and Recreation,PT4M58S,17761,122,936956,825,2019-12-19T17:00:01Z,8,0.993178,95
4,Educating Andy Dwyer - Parks and Recreation,Parks and Recreation,PT10M6S,21174,341,1585252,628,2020-01-01T17:00:04Z,9,0.984151,103


In [12]:
# put a loop inside it, return a dictionary of the dataframes...
def generate_data(url, channel):
    data_dict = {}
    selected_dict = {}
    for i in range(100):
        # go down the rabbit hole, current max of 50 vids (changed to 10)
        df, selected = rabbit_hole(url, channel)
        # get metrics like views, comments, date, likes, dislikes
        full_df = get_video_data(df)
        # isolate the videos that were actually 'clicked' on
        selected_df = full_df[full_df['index'].isin(selected)]
        data_dict[i] = full_df
        selected_dict[i] = selected_df
        print(i)
    return data_dict, selected_dict

In [15]:
url = 'https://www.youtube.com/watch?v=TuXL9RN70Bo'
channel = '30 Rock Official'
full, selected = generate_data(url, channel)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [16]:
selected[6].head()

Unnamed: 0,Title,Channel,Length,Likes,Dislikes,Views,Comments,Uploaded,Depth,LikeRatio,index
2,Liz Lemon's Boyfriends - 30 Rock,30 Rock Official,PT10M3S,6648,185,1031527,464,2017-12-29T17:00:05Z,0,0.972926,2
8,Liz Exposes Abby - 30 Rock,30 Rock Official,PT2M38S,10805,292,1502943,761,2018-01-03T17:00:08Z,1,0.973687,19
1,Liz Lemon Was A BULLY! | Liz Lemon Was A MEAN ...,30 Rock Official,PT5M17S,5917,95,542076,533,2020-11-26T14:00:05Z,2,0.984198,23
3,Jack Dates Crazy Claire - 30 Rock,30 Rock Official,PT3M24S,3287,64,577451,258,2019-09-14T13:00:04Z,3,0.980901,36
8,Liz Lemon Dates Her Cousin - 30 Rock,30 Rock Official,PT2M45S,1900,36,202266,297,2020-01-18T14:00:01Z,4,0.981405,52


In [17]:
import pickle

try:
    file = open('30rock_full', 'wb')
    pickle.dump(full, file)
    file.close()
  
except:   
    print("Something went wrong")
    
    
try:
    file = open('30rock_selected', 'wb')
    pickle.dump(selected, file)
    file.close()
  
except:   
    print("Something went wrong")

In [156]:
hope = pd.read_pickle(r'parks_rec_full')

In [157]:
hope[8]

Unnamed: 0,Title,Channel,Length,Likes,Dislikes,Views,Comments,Uploaded,Depth,LikeRatio,index
0,Ron Swanson and the Coffeepot Mystery - Parks ...,Parks and Recreation,PT1M17S,53486,252,1478610,1111,2020-04-28T13:00:25Z,0,0.995311,0
1,Best of Grumpy Ron Swanson - Parks and Recreation,Parks and Recreation,PT10M15S,9375,135,679703,369,2021-03-09T17:00:02Z,0,0.985804,1
2,"Ron, April and Andy's Wild Puzzle - Parks and ...",Parks and Recreation,PT5M49S,29157,340,1917483,566,2020-02-04T14:00:14Z,0,0.988473,2
3,parks and rec but the moments i audibly laughe...,Remy,PT11M6S,90374,858,2768540,2320,2021-01-30T12:00:13Z,0,0.990595,3
4,April Ludgate: The Worst Assistant in the Worl...,Comedy Bites,PT9M32S,12033,214,670393,395,2021-03-31T16:30:03Z,0,0.982526,4
5,So...who broke it? [We Bare Bears],soniana252,PT1M4S,13676,133,273776,298,2019-01-18T22:28:42Z,0,0.990369,5
6,Best Of The Shoeshine Stand - Parks and Recrea...,Parks and Recreation,PT10M4S,15043,234,1207165,421,2019-07-03T15:45:00Z,0,0.984683,6
7,ron swanson is a relatable king | Parks & Recr...,Comedy Bites,PT10M18S,4035,33,173491,118,2021-07-14T16:30:05Z,0,0.991888,7
8,Jerry's Perfect Life (and Wife!) | Parks & Rec...,Comedy Bites,PT10M43S,12384,100,726654,722,2021-03-24T17:30:04Z,0,0.99199,8
9,April and Ron: The Student and Master - Parks ...,Parks and Recreation,PT10M2S,51668,746,3834462,1273,2018-12-05T17:00:08Z,0,0.985767,9
