In [1]:
import os
import re
from datetime import datetime
from itertools import groupby
from pprint import PrettyPrinter
from time import sleep

import pandas as pd
from func_timeout import FunctionTimedOut, func_timeout  # type: ignore
from googleapiclient.discovery import build  # type: ignore
from selenium import webdriver
from tqdm import tqdm

from apikeys import youtubeCreds


tqdm.pandas()
pp = PrettyPrinter().pprint
y = build("youtube", "v3", developerKey=youtubeCreds)

In [2]:
def startDriver() -> webdriver:
    """
    Starts webdriver instance

    Returns:
        WebDriver: selenium Webdriver
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument('--lang=en')
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(options=options)
    return driver


def formatNumber(numberStr: str) -> int:
    """
    Formats a str such as '1,245.6K' as an int 1245600
    * removes commas
    * replaces K, B, M
    ! expects input to be a whole number

    Args:
        numberStr (str): number as string

    Returns:
        int: number as int
    """
    assert type(numberStr) is str
    x = numberStr.replace(',', '')
    if 'K' in x:  # thousands
        if len(x) > 1:
            return int(float(x.replace('K', '')) * 1000)
        return 1000
    elif 'M' in x:  # millions
        if len(x) > 1:
            return int(float(x.replace('M', '')) * 1000000)
        return 1000000
    elif 'B' in x:  # billions
        return int(float(x.replace('B', '')) * 1000000000)
    else:
        return int(x)


def to_days(mydate: str) -> int:
    """
    Returns time periods in days

    Args:
        date (str): something like "3m" or "1 year"

    Returns:
        int: 90 or 365
    """
    mydate = str(mydate)

    def split_text(s):
        for k, g in groupby(s, str.isalpha):
            yield ''.join(g)
    a = [c.strip() for c in list(split_text(mydate))]
    try:
        if a[-1].lower() in ['m', 'month', 'months']:
            return int(a[0]) * 30
        elif a[-1].lower() in ['y', 'year', 'years']:
            return int(a[0]) * 365
        else:
            return int(a[0])
    except ValueError:
        print('Malformed range')
        return 100000


def findID(channelname: str, apiInstance):
    """
    From a description/name, finds a channel ID.
    Prints channel name and description for confirmation

    Args:
        channelname (str): Name, eg: "actualol"
        apiInstance (_type_): Initialized Google API v3

    Returns:
        str: channelID
        None: no results
    """
    request = apiInstance.search().list(
        part="id, snippet", q=channelname, maxResults=5)
    res = request.execute()
    for item in res['items']:
        if item['id']['kind'] == 'youtube#channel':
            print('Channel Name: ', item['snippet']['title'], '\n',
                  'Description: ', item['snippet']['description'], sep='')
            return item['id']['channelId']
    print('No Results')


def pageScraper(url: str, driver: webdriver) -> dict[str, None]:
    """
    Scrapes views, likes, comments, and game name from a video url given a webdriver instance
    returns a dict with the keys 'likes', 'views', 'comments', 'game'

    Args:
        url (str): url for video page
        driver (webdriver): selenium webdriver instance

    Returns:
        dict[str, None]: likes, views, comments, and name of game
    """
    driver.get(url)
    container = {
        'likes': None,
        'views': None,
        'comments': None,
        'game': None
    }
    substitutions = {'likes': '''video along with ([\d,KM]+)''',
                     'views': '''"views":\\{"simpleText":"\s*([\d,KM]+)\s*views"\\}''',
                     'comments': '''\\{"text":"Comments"\\}\\]\\},"contextualInfo":\\{"runs":\\[\\{"text":"([0-9,.KG]+)"\\}\\]\\}''',
                     'game': '''"simpleText":"([\w\\-.\\?\s:'"!\\$@&|\\+\\*]+)"\\},(?:"subtitle":\\{"simpleText":"[0-9]+"\\},)?"callToAction":\\{"runs":\\[\\{"text":"Browse game"\\}\\]\\}'''}
    for key in container:
        try:
            container[key] = re.findall(re.compile(
                substitutions[key]), driver.page_source)[0]
        except IndexError:
            if key == 'game':
                continue
            print(key, ' empty for ',
                  # ! exception, continuing
                  driver.title[:-10], ' (', url, ')', sep='')
            substitutions[key] = 0
            continue

    for key in container:
        if key == 'game':
            continue
        try:
            container[key] = formatNumber(container[key])
        except AssertionError:  # Nonetype, empty result
            pass
    return container


def videoListScraper(channelId: str, apiInstance, daterange='100000') -> pd.DataFrame:
    """
    Given a channel id, returns a list of all videos uploaded by that channel in the past daterange

    Args:
        channelId (str): youtube channelID, eg 'UC7_YxT-KID8kRbqZo7MyscQ'
        apiInstance (): initialized google discovery API v3
        daterange (str): range of videos to scrape, can be int or str like '2 years' or '3m'

    Returns:
        pd.DataFrame: dataframe of videos on the channel
    """
    container = []
    # channel 'uploads' playlist is hardcoded as this
    uploadsId = 'UU' + channelId[2:]
    request = apiInstance.playlistItems().list(
        part='snippet', playlistId=uploadsId, maxResults=50)
    res = request.execute()
    for item in res['items']:
        data = {'channel': item['snippet']['channelTitle'], 'title': item['snippet']['title'],
                'videoID': item['snippet']['resourceId']['videoId'], 'date': item['snippet']['publishedAt']}
        if pd.to_datetime(data['date']).date() < datetime.today().date() - pd.Timedelta(to_days(daterange), unit='D'):
            container.append(data)
            df = pd.DataFrame(container)
            print(df.head())
            # df.date =pd.to_datetime(df.date).dt.date
            return df
        container.append(data)
    currentToken = res['nextPageToken']
    # def generator():
    #     '''
    #     infinite generator for tqdm
    #     '''
    #     while True:
    #         yield
    # for _ in tqdm(generator()):
    t = tqdm(total=round(res['pageInfo']['totalResults']/50))
    while True:
        t.update(1)
        request = apiInstance.playlistItems().list(part='snippet', playlistId=uploadsId,
                                                   maxResults=50, pageToken=currentToken)
        # sleep(1)
        res = request.execute()
        for item in tqdm(res['items'], leave=False):
            data = {'channel': item['snippet']['channelTitle'], 'title': item['snippet']['title'],
                    'videoID': item['snippet']['resourceId']['videoId'], 'date': item['snippet']['publishedAt']}
            container.append(data)
        try:
            currentToken = res['nextPageToken']
        except:
            t.close()
            break
        # TEMP
        # print(container)
        # break
    df = pd.DataFrame(container)
    df.date = pd.to_datetime(df.date).dt.date
    return df


def videoInfoScraper(dfWithVideoID: pd.DataFrame, apiInstance, daterange='100000') -> pd.DataFrame:
    """
    Given a dataframe of videos, returns likes, downloads, etc

    Args:
        dfWithVideoID (DataFrame): dataframe of videos # ! with  videoId at index 3
        daterange (str): range of videos to scrape, can be int or str like '2 years' or '3m'

    Returns:
        pd.DataFrame: dataframe with scraped data
    """
    dfWithVideoID = dfWithVideoID.reindex(columns=dfWithVideoID.columns.tolist(
    ) + ['likes', 'comments', 'views', 'favourites'])

    # failed = False  # boolean for if it's incomplete
    # try:
    # dfWithVideoID = pd.read_pickle('./_videoInfoScrape.pkl')
    # except:
    # dfWithVideoID = dfWithVideoID.reindex(
    # columns=dfWithVideoID.columns.tolist() + ['likes', 'comments', 'views', 'favourites'])
    # try:
    # driver.quit()
    # except:
    # pass
    # driver = startDriver()
    dfWithVideoID.date = pd.to_datetime(dfWithVideoID.date).dt.date
    dfWithVideoID = dfWithVideoID[dfWithVideoID.date > datetime.today(
    ).date() - pd.Timedelta(to_days(daterange), unit='D')]
    t = tqdm(dfWithVideoID.itertuples(), total=dfWithVideoID.shape[0])
    for row in t:
        idx = row[0]
        # skipping rows for restart
        # if (dfWithVideoID.loc[idx, 'likes'] > 0) or (dfWithVideoID.loc[idx, 'comments'] > 0) or (dfWithVideoID.loc[idx, 'views'] > 0):
        # continue
        url = row[3]
        request = apiInstance.videos().list(part='statistics', id=url)
        res = request.execute()
        thing = res['items'][0]['statistics']

        for column, stat in [('likes', 'likeCount'), ('comments', 'commentCount'), ('views', 'viewCount'), ('favourites', 'favoriteCount')]:
            try:
                dfWithVideoID.at[idx, column] = int(thing.get(stat))
            except TypeError:
                dfWithVideoID.at[idx, column] = 0
    return dfWithVideoID
    #     url = 'https://www.youtube.com/watch?v=' + row[3]
    #     try:
    #         result = func_timeout(90, pageScraper, args=(url, driver))
    #     except FunctionTimedOut:  # ! rate limiting catch
    #         print("Timeout @", idx)
    #         # saves at breakpoint
    #         dfWithVideoID.to_pickle('_videoInfoScrape.pkl')
    #         driver.quit()
    #         sleep(10)
    #         failed = True
    #         break
    #     for column in result:
    #         dfWithVideoID.at[idx, column] = result[column]
    #     t.set_postfix({"Page": driver.title[:-10]})
    #     # if idx + 1 % 500 == 0:
    #     #     # pp.pprint(result)
    #     #     driver.quit()
    #     #     sleep(2)
    #     #     driver = startDriver()
    # if failed:
    #     return videoInfoScraper(dfWithVideoID)
    # try:
    #     os.remove('_videoInfoScrape.pkl')  # backup
    # except OSError:
    #     pass
    # driver.quit()
    # return dfWithVideoID


def streamerPipeline(channelID: str, apiInstance, daterange='100000') -> pd.DataFrame:
    """
    Pipeline to get youtube videos for a streamer

    Args:
        channelId (str): youtube channelID
        apiInstance (_type_): intialized youtube API instance
        daterange (str): range of videos to scrape, can be int or str like '2 years' or '3m'

    Raises:
        NameError: Can't find channel from ID

    Returns:
        pd.DataFrame: Dataframe with all videos and video details
    """
    try:
        videoDf = videoListScraper(channelID, apiInstance, daterange)
    except:
        print("Couldn't get videos")
        raise NameError
    print('Done getting videos for {}'.format(videoDf.channel.iloc[0]))
    df = videoInfoScraper(videoDf, apiInstance, daterange)
    return df

In [3]:
# smant = 'UC0VVYtw21rg2cokUystu2Dw'
# jacksepticeye = 'UCYzPXprvl5Y-Sf0g4vX-m6g'
# markiplier = 'UC7_YxT-KID8kRbqZo7MyscQ'
# mathas = 'UCOHBVUV8aDg4tQiHnUqi_QA'
# dantdm = 'UCS5Oz6CHmeoF7vSad0qqXfw'
# vanos = 'UCKqH_9mk1waLgBiL2vT5b9g'
# matn = 'UCDYZxJE8kLZ-o6nL8E1bXdQ'
# h20del = 'UCClNRixXlagwAd--5MwJKCw' #* group
# radbrad = 'UCpqXJOEqGS-TCnazcHCo0rA'
# smosh = 'UCJ2ZDzMRgSrxmwphstrm8Ww' #* group
# seananers = 'UCq54nlcoX-0pLcN5RhxHyug' #* group
northernlion = 'UC3tNpTOHsTnkmbwztCs30sA'
# quill18 = 'UCbx1TZgxfIauUZyPuBzEwZg'
# splattercat = 'UC8nZUXCwCTffxthKLtOp6ng'
# blitz = 'UCK3eoeo-HGHH11Pevo1MzfQ'
# kathysky = 'UCTIV3KbAvaGEyNjoMoNaGtQ'
bestinslot = 'UCWB212HLA4MyZBpn0qlBpXA'
partyelite = 'UCg3BiSs8eBE9hA9EWYTgtAg'
nerd3 = 'UCKab3hYnOoTZZbEUQBMx-ww'
# angryjoe = 'UCsgv2QHkT2ljEixyulzOnUQ'

In [4]:
rahdoid = findID('rahdo', y)
rahdodf = videoListScraper(
    rahdoid, y, daterange='1y')
rahdo = videoInfoScraper(rahdodf, y, daterange='1y')
rahdo.sort_values(by=['views'], ascending=False).head(20)

Channel Name: Rahdo
Description: Hi, welcome to the channel, where we specialize in filming boardgame runthrough and preview videos that show, rather than tell, ...


 99%|█████████▉| 137/138 [00:23<00:00,  5.88it/s]
100%|██████████| 556/556 [00:48<00:00, 11.39it/s]


Unnamed: 0,channel,title,videoID,date,likes,comments,views,favourites
360,Rahdo,Rahdo Runs Through►►► Top 30 Games of 2023 (pr...,BpM9vpHjf2g,2023-12-25,513.0,123.0,19944.0,0.0
528,Rahdo,Dragon Eclipse | Shea's Final Thoughts,K490ZfKnNTs,2023-09-16,186.0,22.0,18353.0,0.0
41,Rahdo,Our 20 MUST-HAVE Games of Gencon 2024!,TmG3FgLRJsk,2024-07-29,354.0,63.0,17823.0,0.0
493,Rahdo,Apiary | Rahdo's Final Thoughts,IVCq5UR5tl4,2023-10-02,444.0,55.0,17764.0,0.0
367,Rahdo,Top 16 Board Games of 2023 with Cole Wehrle of...,FrHUnieFChE,2023-12-20,295.0,25.0,16743.0,0.0
36,Rahdo,Arcs - What did we think?,uLiJ6tUcGWc,2024-08-01,398.0,44.0,15368.0,0.0
542,Rahdo,Nucleum | Rahdo's Final Thoughts,fIIVSHtdVTk,2023-09-09,324.0,29.0,14275.0,0.0
502,Rahdo,Rahdo Previews►►► Cargo Empire,W1PrkS1eKis,2023-09-27,144.0,7.0,13901.0,0.0
332,Rahdo,Earthborne Rangers | Maggie's Final Thoughts,qghKPaZXIdw,2024-01-15,405.0,30.0,12882.0,0.0
495,Rahdo,Rahdo Runs Through►►► Top 20 Essen Spiel Must-...,lF1RkyfLQwI,2023-10-01,288.0,30.0,12248.0,0.0


In [7]:
dtid = findID('The Dice Tower', y)
dtdf = videoListScraper(
    dtid, y, daterange='5y')
dt = videoInfoScraper(dtdf, y, daterange='5y')
dt.sort_values(by=['views'], ascending=False).head(20)

Channel Name: The Dice Tower
Description: Video reviews of games, top 10 lists, live plays, variety shows and more! Tom Vasel and the gang love games, so we're hoping ...


 10%|█         | 42/400 [00:11<01:14,  4.80it/s]

KeyboardInterrupt: 

In [6]:
{'viewCount': '832', 'likeCount': '21',
    'favoriteCount': '0', 'commentCount': '1'}.get()

TypeError: get expected at least 1 argument, got 0

In [None]:
dir(y)

In [None]:
a = y.videos().list(part='statistics', id=[
    'aqdKVri2f1Q']).execute(
)

a

In [None]:
a['items'][0]['statistics']['viewCount']

In [None]:
def geval(dic, keypath):
    for key in keypath:
        dic = dic.get(key)
    return dic


x = a['items'][0]
b = ['statistics', 'viewCount']

geval(x, ['statistics', 'likeCount'])

In [None]:
a

In [None]:
c = a['items'][0]
c

In [None]:
c.get('statistics')

In [None]:
for column, stat in [('likes', 'likeCount'), ('comments', 'commentCount'), ('views', 'viewcount'), ('favourites', 'favouritecount')]:
    print(column, stat)

In [None]:
request = y.videos().list(part='statistics', id='4bc1Sa7X3xk')
res = request.execute()
thing = res['items'][0]['statistics']

for column, stat in [('likes', 'likeCount'), ('comments', 'commentCount'), ('views', 'viewCount'), ('favourites', 'favoriteCount')]:
    print(column, thing.get(stat))

In [None]:
thing

In [16]:
a = videoListScraper(dtid, y, daterange='1m')


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [17]:
a.count()

channel    20000
title      20000
videoID    20000
date       20000
dtype: int64

In [19]:
a.tail(20)

Unnamed: 0,channel,title,videoID,date
19980,The Dice Tower,"Q & A with Tom Vasel - January 12, 2015",LER1c8Nggg8,2015-01-12
19981,The Dice Tower,Board Game Breakfast: Episode 59 - Game Weight,4dL-hYGOzHk,2015-01-12
19982,The Dice Tower,"Miami Dice, Episode 153 - Temporum",x-IHRy6Xh80,2015-01-10
19983,The Dice Tower,Foreclosed Review - with Tom Vasel,7ViAUQOQQZo,2015-01-10
19984,The Dice Tower,Dead Man's Draw Review - with Tom Vasel,CjpFr0GkYHQ,2015-01-10
19985,The Dice Tower,"Miami Dice, Episode 152 - Rattlebones",bl1N6MeGBog,2015-01-09
19986,The Dice Tower,"Miami Dice, Episode 151 - The Ancient World",44u0me4zDj0,2015-01-09
19987,The Dice Tower,Carcassonne Gold Rush Review - with the Game B...,Ichy2lR5TDM,2015-01-09
19988,The Dice Tower,Warring Kingdom Review - with Indiana John,nJ4zlhtgg4U,2015-01-09
19989,The Dice Tower,Live: Star Wars: Queen's Gambit Showdown!,F0DxNp0HJok,2015-01-08


In [20]:
to_days('1m')

30

In [22]:
pd.to_datetime('2024-08-12').date() < datetime.today().date() - \
    pd.Timedelta(to_days('1m'), unit='D')

False

In [34]:
request = y.playlistItems().list(
    part='snippet', playlistId='UUiwBbXQlljGjKtKhcdMliRA', maxResults=50)
res = request.execute()
for item in res['items']:
    data = {'channel': item['snippet']['channelTitle'], 'title': item['snippet']['title'],
            'videoID': item['snippet']['resourceId']['videoId'], 'date': item['snippet']['publishedAt']}
    print(pd.to_datetime(data['date']).date() < datetime.today(
    ).date() - pd.Timedelta(to_days('15'), unit='D'))

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [23]:
dtid

'UCiwBbXQlljGjKtKhcdMliRA'

In [35]:
item = res['items'][-1]

In [36]:
item

{'kind': 'youtube#playlistItem',
 'etag': 'IQukveyNNY9VKeErVWxs_1IfHMY',
 'id': 'VVVpd0JiWFFsbGpHakt0S2hjZE1saVJBLmpCVHlhQnM0YTI4',
 'snippet': {'publishedAt': '2024-08-18T05:30:13Z',
  'channelId': 'UCiwBbXQlljGjKtKhcdMliRA',
  'title': 'Hungry Monkey Review with Tom Vasel',
  'description': 'Tom Vasel takes a look at Hungry Monkey.\n\nSee the full video at https://www.youtube.com/watch?v=_zP2tRfJU8Q\n\nCheck out Great Tables, Games, & Bags at: https://www.allplay.com\n\nFind Conventions, Merchandise, and Connect With Us:  https://linktr.ee/dicetower\n\nDice Tower Amazon Storefront: https://www.amazon.com/shop/thedicetower\nAs an Amazon Associate, we earn from qualifying purchases.\n\nBGG Link: https://boardgamegeek.com/boardgame/355735/hungry-monkey\n\n#dicetower #thedicetower',
  'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/jBTyaBs4a28/default.jpg',
    'width': 120,
    'height': 90},
   'medium': {'url': 'https://i.ytimg.com/vi/jBTyaBs4a28/mqdefault.jpg',
    'width':

In [37]:
data = {'channel': item['snippet']['channelTitle'], 'title': item['snippet']['title'],
        'videoID': item['snippet']['resourceId']['videoId'], 'date': item['snippet']['publishedAt']}

data['date']

'2024-08-18T05:30:13Z'

In [41]:
pd.to_datetime(data['date']).date() < datetime.today(
).date() - pd.Timedelta(to_days('10'), unit='D')

True