## **Dependencies & Imports**

In [None]:
!pip install -q youtube_transcript_api
!pip install -q google-api-python-client

In [None]:
import operator
import os
import pickle
import time
from functools import reduce
from itertools import chain
from typing import Union, Callable, Generator, Tuple
from urllib.parse import parse_qs, urlparse

import googleapiclient.discovery
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from youtube_transcript_api import YouTubeTranscriptApi

## **Class `YouTubeClient`**


In [None]:
class YouTubeClient():
  """
  Read-only scraper that combines YouTube Data API metadata with video transcripts.

  Only to scrape data. Not to upload or delete videos.

  Attributes:
    youtube_metadata_client: object returned by `googleapiclient.discovery.build("youtube", "v3", ...)`.
    youtube_transcript_client: a `YouTubeTranscriptApi` instance.
    transcript_delay: seconds to sleep after every successful transcript download.
    state: dict with 'units_consumed' (incremented once per executed metadata request)
      and 'daily_quota' (stored for reference; not enforced anywhere in this class).
  """
  # Class-level default so that the attribute always exists; `__init__` may override it per instance.
  transcript_delay = 8

  def __init__(self, API_KEY: str, transcript_delay: float = 8):
    """
    Args:
      API_KEY: developer key passed to `googleapiclient.discovery.build`.
      transcript_delay: pause (in seconds) after each successful transcript request.
    """
    self.__API_KEY = API_KEY
    self._api_service_name = "youtube"
    self._api_version = "v3"
    self.youtube_metadata_client = googleapiclient.discovery.build(self._api_service_name, 
                                                                   self._api_version, 
                                                                   developerKey=self.__API_KEY)
    self.youtube_transcript_client = YouTubeTranscriptApi()
    self.transcript_delay = transcript_delay
    self.state = {'units_consumed': 0, 'daily_quota': 10000}

  def _execute_query(self, query_kind: str, query: str, **query_params) -> Generator:
    """
    Lazily run a query and yield its result page by page.

    Args:
      query_kind: 'metadata' or 'transcript'.
      query: for 'metadata', '<resource>().<action>' (e.g. 'videos().list') resolved on
        `youtube_metadata_client`; for 'transcript', a method name resolved on
        `youtube_transcript_client`.
      **query_params: keyword arguments forwarded to the resolved method.

    Yields:
      'metadata': the 'items' list of every response page (an empty list if the
        response has no 'items' key), following 'nextPageToken' until exhausted.
      'transcript': the method's return value, or a single placeholder row of NaNs
        when the call raises an `Exception`.

    Raises:
      AssertionError: if `query_kind` is not supported (raised on first iteration,
        because this is a generator).
    """
    assert query_kind == 'metadata' or query_kind == 'transcript', "'query_kind' must be one of ['metadata', 'transcript']"
    if query_kind == 'metadata':
      resource, action = query.split('.')
      query_params["pageToken"] = query_params.get("pageToken", "")
      has_next_page = True
      while has_next_page:
        collection = getattr(self.youtube_metadata_client, resource.replace("()", ""))()
        response = getattr(collection, action)(**query_params).execute()
        query_params["pageToken"] = response.get("nextPageToken", "")
        has_next_page = bool(query_params["pageToken"])
        # Quota is charged per request, not per returned item (see the YouTube Data API
        # quota cost table), so count one unit regardless of `maxResults`.
        self.state["units_consumed"] += 1
        yield response.get('items', [])
    if query_kind == 'transcript':
      try:
        response = getattr(self.youtube_transcript_client, query)(**query_params)
      except Exception:
        # Best effort: a failed transcript becomes a NaN row. `Exception` (not a bare
        # `except`) lets KeyboardInterrupt/GeneratorExit propagate.
        yield [{'duration': np.nan, 'start': np.nan, 'text': np.nan}]
      else:
        # Throttle outside the `try` so interrupting the pause is not mistaken for a failure.
        time.sleep(self.transcript_delay)
        yield response

  def _process_query(self, query_kind: str, response: Generator) -> pd.DataFrame:
    """
    A generic processing function that should apply to any query. 
    To achieve more specific processing modify code in the specific public methods.

    Args:
      query_kind: 'metadata' or 'transcript'.
      response: iterable of pages (lists of records), e.g. the generator returned by `_execute_query`.

    Returns:
      'metadata': one row per record, without the 'kind' and 'etag' columns when present.
      'transcript': a single-row frame with 'subtitles' (list of the 'text' values) and
        'timestamps' (list of (start, start + duration) tuples).
    """
    assert query_kind == 'metadata' or query_kind == 'transcript', "'query_kind' must be one of ['metadata', 'transcript']"
    df = pd.DataFrame(list(chain.from_iterable(response)))
    if query_kind == "metadata":
      df = df.drop(columns=['kind', 'etag'], errors='ignore')
    if query_kind == "transcript":
      # Guarantees the three columns exist even when the transcript has no rows.
      df = df.reindex(columns=['text', 'start', 'duration'])
      subtitles = df['text'].to_list()
      timestamps = list(zip(df['start'].to_list(), (df['start'] + df['duration']).to_list()))
      df = pd.DataFrame({'subtitles': [subtitles], 
                         'timestamps': [timestamps]})
    return df

  @staticmethod
  def _dig(cell, path: list):
    """Follow `path` (successive keys/indices) into `cell`; return NaN if any step is missing."""
    try:
      return reduce(operator.getitem, path, cell)
    except (KeyError, IndexError, TypeError):
      return np.nan

  def _extract_and_add_as_column(self, df: pd.DataFrame, extract_dict: dict, clean_up: bool) -> pd.DataFrame:
    """
    Pull fields out of nested (dict-valued) columns into columns of their own.

    Args:
      df: frame to extend; it is modified in place and also returned.
      extract_dict: {'extract': [field, ...], 'from': [source, ...]} with equally long lists.
        A source is either a column name (the field is read directly from that column's
        value) or a list [column, key, ...] giving the keys to descend before reading the field.
      clean_up: when True, drop every source column after extraction.

    Returns:
      `df`, with one new column per extracted field (NaN where the field is absent).

    Raises:
      AssertionError: if `extract_dict` is malformed or references columns missing from `df`.
    """
    # Validate the dict before touching its entries so that a malformed dict
    # produces the assertion message rather than a bare KeyError.
    assert len(extract_dict.keys()) == 2 and extract_dict.keys() == {'extract', 'from'}, "Passed dict should have only two fields: ('extract', 'from')"
    assert len(extract_dict['extract']) == len(extract_dict['from']), "Fields to be extracted are not equal to the columns specified"
    target_columns = [source[0] if isinstance(source, list) else source for source in extract_dict['from']]
    assert (pd.Series(target_columns).isin(df.columns).all()), "Column(s) from which fields are to be extracted, do not exist in the passed pd.DataFrame object"
    # Pair by key instead of by dict insertion order.
    for extract, from_column in zip(extract_dict['extract'], extract_dict['from']):
      if isinstance(from_column, list):
        column, path = from_column[0], from_column[1:] + [extract]
      else:
        column, path = from_column, [extract]
      df[extract] = df[column].apply(lambda cell, path=path: self._dig(cell, path))
    if clean_up:
      df.drop(columns=list(set(target_columns)), inplace=True)
    return df

  def _align(self, *list_of_dfs: pd.DataFrame, on: str, how: str) -> pd.DataFrame:
    """
    Merge the given frames left to right with `pd.merge`.

    Args:
      *list_of_dfs: frames to merge.
      on: column to merge on.
      how: 'inner' or 'outer'.
    """
    assert how in ['inner', 'outer'], f"The argument 'how' should be one of [{'inner', 'outer'}]"
    return reduce(lambda x,y: pd.merge(x,y, on=on, how=how), list_of_dfs)

  def _get_channel_upload_id(self, username: str) -> Tuple[str, str]:
    """
    Look up a channel by username.

    Returns:
      (channel id, id of the channel's 'uploads' playlist), both taken from the
      last record returned by `channels().list`.

    Raises:
      KeyError: if the query returns no channel.
    """
    response = self._execute_query(query_kind="metadata", query="channels().list", part="contentDetails", forUsername=username)
    metadata = self._process_query(query_kind="metadata", response=response)
    if metadata.empty:
      raise KeyError(f"No channel found for username {username!r}")
    channel_id = metadata['id'].to_list().pop()
    channel_upload_id = metadata['contentDetails'].apply(lambda x: x['relatedPlaylists']['uploads']).to_list().pop()
    return channel_id, channel_upload_id

  def _from_playlist_ids(self, *playlist_ids: Union[str, list]) -> pd.DataFrame:
    """
    Fetch playlist-level metadata, one `playlists().list` query per id.

    Returns:
      Frame with 'playlistId', 'playlist_title', 'playlist_description' and 'itemCount'.
    """
    extract_dict = {'extract': ['title', 
                                'description', 
                                'itemCount'], 
                    'from': ['snippet', 
                             'snippet', 
                             'contentDetails']}
    metadata = pd.concat([self._process_query(query_kind="metadata", 
                                              response=self._execute_query(query_kind="metadata", 
                                                                           query="playlists().list", 
                                                                           id=playlist_id, 
                                                                           part="contentDetails, snippet")) 
                          for playlist_id in tqdm(playlist_ids)])
    metadata = self._extract_and_add_as_column(metadata, extract_dict, clean_up=True)
    metadata.rename(columns={'id': 'playlistId', 'title': 'playlist_title', 'description': 'playlist_description'}, inplace=True)
    return metadata

  def _from_video_ids(self, *video_ids: Union[str, list]) -> pd.DataFrame:
    """
    Fetch video-level metadata, one `videos().list` query per id.

    Returns:
      Frame with 'videoId', 'video_title', 'video_description' and the other fields
      listed in `extract_dict` below ('url' is read from snippet -> thumbnails -> high).
    """
    extract_dict = {'extract': ['definition', 
                                'defaultAudioLanguage',
                                'publishedAt',
                                'description',
                                'title',
                                'tags',
                                'url',
                                'commentCount',
                                'dislikeCount',
                                'favoriteCount',
                                'likeCount',
                                'viewCount'],
                    'from': ['contentDetails', 
                             'snippet',
                             'snippet',
                             'snippet',
                             'snippet',
                             'snippet',
                             ['snippet', 'thumbnails', 'high'],
                             'statistics',
                             'statistics',
                             'statistics',
                             'statistics',
                             'statistics']}
    metadata = pd.concat([self._process_query(query_kind="metadata", 
                                              response=self._execute_query(query_kind="metadata", 
                                                                           query="videos().list", 
                                                                           id=video_id, 
                                                                           part="snippet, contentDetails, statistics")) 
                          for video_id in tqdm(video_ids)])
    metadata = self._extract_and_add_as_column(metadata, extract_dict, clean_up=True)
    metadata.rename(columns={'id': 'videoId', 'title': 'video_title', 'description': 'video_description'}, inplace=True)
    return metadata

  @staticmethod
  def _playlist_id_from(playlist: str) -> str:
    """
    Return the playlist id held in a URL's `list` query parameter, or `playlist`
    itself (stripped) when it is not a URL.

    Raises:
      ValueError: if `playlist` is a URL without a `list` parameter.
    """
    candidate = playlist.strip()
    if '://' not in candidate:
      return candidate
    list_values = parse_qs(urlparse(candidate).query).get('list')
    if not list_values:
      raise ValueError(f"No 'list' query parameter found in playlist URL: {playlist!r}")
    return list_values[0]

  def from_playlist(self, *playlists: Union[str, list]) -> pd.DataFrame:
    """
    Scrape playlist metadata, video metadata and transcripts for the given playlists.

    Args:
      *playlists: playlist ids or URLs carrying a `list=` parameter; each argument may
        be a single string or a list of such strings.

    Returns:
      Frame indexed by ['playlistId', 'itemCount', 'playlist_title',
      'playlist_description', 'videoId'], holding the transcript columns
      ('subtitles', 'timestamps') and the video metadata columns.
    """
    playlist_ids = [self._playlist_id_from(entry)
                    for entry in chain.from_iterable([arg] if isinstance(arg, str) else arg for arg in playlists)]
    playlist_metadata = self._from_playlist_ids(*playlist_ids)
    extract_dict = {'extract': ['playlistId', 
                                'videoId'], 
                    'from': ['snippet', 
                             'contentDetails']}
    metadata = pd.concat([self._process_query(query_kind="metadata", 
                                              response=self._execute_query(query_kind="metadata", 
                                                                           query="playlistItems().list", 
                                                                           playlistId=playlist_id, 
                                                                           part="snippet, contentDetails", 
                                                                           maxResults=50)) 
                          for playlist_id in tqdm(playlist_ids)])
    metadata = self._extract_and_add_as_column(metadata, extract_dict, clean_up=True)
    metadata.drop(columns=['id'], inplace=True)
    video_metadata = self._from_video_ids(*metadata.videoId.to_list())
    transcript = pd.concat([self._process_query(query_kind="transcript", 
                                                response=self._execute_query(query_kind="transcript", 
                                                                             query="get_transcript", 
                                                                             video_id=video_id))
                            for video_id in tqdm(video_metadata.videoId.values)])
    # One transcript row was produced per video, in the same order as `video_metadata`.
    transcript["videoId"] = video_metadata.videoId.to_list()

    # merge on 'videoId', 'playlistId' and return
    video_data_merged = self._align(metadata, transcript, video_metadata, on="videoId", how="inner")
    playlist_data_merged = self._align(video_data_merged, playlist_metadata, on="playlistId", how="outer").drop_duplicates(subset='videoId')
    dataset = playlist_data_merged.set_index(['playlistId', 'itemCount', 'playlist_title', 'playlist_description', 'videoId'])
    return dataset

  def from_channel(self, username: str) -> pd.DataFrame:
    """
    Scrape every video of a channel by running `from_playlist` on the channel's
    'uploads' playlist.

    Args:
      username: channel username, passed to `channels().list` as `forUsername`.
    """
    channel_id, channel_upload_id = self._get_channel_upload_id(username=username)
    dataset = self.from_playlist(channel_upload_id)
    return dataset

In [None]:
# Scratch copy of the field map built inside `YouTubeClient._from_video_ids`; the only
# difference is that the thumbnail 'url' is read from 'default' instead of 'high'.
# Each pair below is (field to extract, column or nested path it is read from).
extract_dict = dict(zip(
    ('extract', 'from'),
    map(list, zip(
        ('definition', 'contentDetails'),
        ('defaultAudioLanguage', 'snippet'),
        ('publishedAt', 'snippet'),
        ('description', 'snippet'),
        ('title', 'snippet'),
        ('tags', 'snippet'),
        ('url', ['snippet', 'thumbnails', 'default']),
        ('commentCount', 'statistics'),
        ('dislikeCount', 'statistics'),
        ('favoriteCount', 'statistics'),
        ('likeCount', 'statistics'),
        ('viewCount', 'statistics'),
    )),
))

## **Scrape YouTube**

In [None]:
# Playlists whose transcripts will be downloaded: one watch URL per line,
# each carrying the playlist id in its `list=` parameter.
# The text block is turned into a list of URLs in a single expression.
playlists = '''

https://www.youtube.com/watch?v=Dl8MUnLfEsk&list=PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT
https://www.youtube.com/watch?v=cxoQdEhHaT8&list=PL3uDtbb3OvDNXmmy_3Q7SCHIZdz9ja4SG
https://www.youtube.com/watch?v=yL_fgyXXnSM&list=PL3uDtbb3OvDPHh7DWhekbw-ywA7SCnstr
https://www.youtube.com/watch?v=O1B0lDS1Jnw&list=PL3uDtbb3OvDNsDLMnmyR94MTfGHQh6HtP
https://www.youtube.com/watch?v=zO8QzMWZbN4&list=PL3uDtbb3OvDMpNqWoWfsY9qqT7UZijw0w
https://www.youtube.com/watch?v=4OBLAW7oQYo&list=PL3uDtbb3OvDPup8tDy1viWElFkPZcL4pM
https://www.youtube.com/watch?v=GM0lU5Dq7eA&list=PL3uDtbb3OvDPZG2coablWM-9XX6JQtSQT
https://www.youtube.com/watch?v=vvntRXe6YcU&list=PL3uDtbb3OvDPBGzSYKBeEFlrG48_0DBC4
https://www.youtube.com/watch?v=DTWMwHtF-UA&list=PL3uDtbb3OvDNnH5j_UFzZwR2KWg4TShJn
https://www.youtube.com/watch?v=kAMvYHqTWs0&list=PL3uDtbb3OvDPt8Ayn5QQ_13Juo98-EDxP
https://www.youtube.com/watch?v=xswUGZOVdc4&list=PL3uDtbb3OvDPLLMGlDi3C3-uAwyTBXtnR
https://www.youtube.com/watch?v=3J-cYxxHQGQ&list=PL3uDtbb3OvDNxpFp3baiPKRM4tGtD3_Me
https://www.youtube.com/watch?v=f7-lwz_FacE&list=PL3uDtbb3OvDMjs6pYa27tCweTBKBgxUij
https://www.youtube.com/watch?v=a6danRWYxpo&list=PL3uDtbb3OvDNVQJSz1__CuW-IS2s2kw2A
https://www.youtube.com/watch?v=uoIXz3KcwME&list=PL3uDtbb3OvDMgLTgfZe4fDN48SYfEtesX
https://www.youtube.com/watch?v=bJggjXvB52c&list=PL3uDtbb3OvDMHDwKA8sPrEi2SV3IKnT0S
https://www.youtube.com/watch?v=4OBLAW7oQYo&list=PL3uDtbb3OvDPAcaMIq68euWqHvZosh8JI
https://www.youtube.com/watch?v=QAsJvKsd2Xk&list=PL3uDtbb3OvDNWKnzD4MJRQRX_wBAT9iDC
https://www.youtube.com/watch?v=UT_nWVLi4Ws&list=PL3uDtbb3OvDMMbCg-hvVjXYZ3osM4rpr2
https://www.youtube.com/watch?v=X_fHa73_nOg&list=PL3uDtbb3OvDNo0TvQIHbB6TLndA7jEMTR
https://www.youtube.com/watch?v=HIkgY0Rz1jU&list=PL3uDtbb3OvDMaNezBWgE_SNQ6QkeYkV1w
https://www.youtube.com/watch?v=AHS1c_vqjxI&list=PL3uDtbb3OvDMBO-NUpWCvV_zhJh1pFlEX
https://www.youtube.com/watch?v=diFkCJ802vY&list=PL3uDtbb3OvDMdjRscdox0QYkcE9cghmvx
https://www.youtube.com/watch?v=rbYdXbEVm6E&list=PL3uDtbb3OvDONMcvq4e82gs33laM4IJ_z
https://www.youtube.com/watch?v=235gIzWOkrM&list=PL3uDtbb3OvDNDjm-mp82KCJB6VDpqZi3I

'''.strip().split('\n')

In [None]:
# The API key is read from the environment instead of being embedded in the notebook.
# The key that was previously committed here should be treated as leaked and rotated.
# Set it before starting the kernel, e.g. `export YOUTUBE_API_KEY=...`.
client = YouTubeClient(os.environ["YOUTUBE_API_KEY"])
# Scrape metadata and transcripts for the first playlist in the list only.
youtube_scrapped = client.from_playlist(playlists[0])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [None]:
# Preview only the first rows; rendering the whole frame bloats the saved notebook.
youtube_scrapped.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,subtitles,timestamps,definition,defaultAudioLanguage,publishedAt,video_description,video_title,tags,url,commentCount,dislikeCount,favoriteCount,likeCount,viewCount
playlistId,itemCount,playlist_title,playlist_description,videoId,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",Dl8MUnLfEsk,[Sadhguru: But he must be punished? (Laughter)...,"[(17.086, 19.820999999999998), (26.261, 30.329...",hd,en,2015-07-13T12:51:39Z,Sadhguru looks at how human relationships are ...,How to Deal with Relationships? | Sadhguru,"[Sadhguru (Person), Isha Foundation, Guru, Mys...",https://i.ytimg.com/vi/Dl8MUnLfEsk/hqdefault.jpg,136,1484,0,70081,3391064
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",o3C7pz8E_9c,"[Heart is not trying to say anything., It is j...","[(0.14, 2.65), (2.65, 4.4399999999999995), (4....",hd,en,2018-12-19T07:59:13Z,Sadhguru shares his wisdom on how to make deci...,"Head or Heart, which one to listen to? - Sadhguru","[Sadhguru 2018, sad guru, Sadguru, satguru, sa...",https://i.ytimg.com/vi/o3C7pz8E_9c/hqdefault.jpg,1140,1605,0,60986,2618689
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",Kgowgm1KeZ4,"[Sadhguru: Good evening everyone., Juhi Chawla...","[(66.161, 68.421), (68.421, 71.17500000000001)...",hd,en,2014-11-05T05:30:00Z,In an episode of the In Conversation With The ...,Of Love and Life - Juhi Chawla In Conversation...,"[enlightenment, Spiritual Journey, Spiritual T...",https://i.ytimg.com/vi/Kgowgm1KeZ4/hqdefault.jpg,227,1068,0,20717,2368603
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",mtETTbjKdWw,"[Questioner: Sadhguru, Namaskaram, how to cond...","[(6.546, 8.617), (8.617, 12.700000000000001), ...",hd,en,2014-08-07T06:16:06Z,Responding to a question about exploitative re...,How to Deal with an Exploitative Spouse? Sadhguru,"[sadhguru, spiritual discourse, yoga, how to d...",https://i.ytimg.com/vi/mtETTbjKdWw/hqdefault.jpg,76,1247,0,19898,1813185
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",UT_nWVLi4Ws,"[Questioner: My name is Ananth., My question i...","[(6.58, 8.346), (8.346, 11.475999999999999), (...",hd,en,2015-03-02T07:01:08Z,"​Why does the institution of marriage exist, a...",Sadhguru on Marriage – Choosing Consciously,"[Meditation for Health, Yoga Practice, marriag...",https://i.ytimg.com/vi/UT_nWVLi4Ws/hqdefault.jpg,22,543,0,22488,1730233
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",wHzsmZ5t_K0,[Moderator (Gaganpreet): \nWhen people say tha...,"[(0.16, 2.12), (2.12, 4.18), (4.18, 7.4), (7.4...",hd,en,2018-09-29T14:30:02Z,Sadhguru answers a question about why even the...,Why do Couples Fall Out of Love After Some Tim...,"[Sadhguru 2018, youth and truth sadhguru 2018,...",https://i.ytimg.com/vi/wHzsmZ5t_K0/hqdefault.jpg,720,620,0,38277,1650369
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",qdVJsJJXPtY,"[Sadhguru: So,, ‘How did I ever marry you?’, I...","[(18.773, 20.235), (23.605, 25.825), (27.605, ...",hd,en,2015-11-02T11:06:14Z,Sadhguru answers a question about relationship...,Stop Digging Into The Past - Sadhguru,"[Sadhguru (Person), Isha Foundation, Guru, Mys...",https://i.ytimg.com/vi/qdVJsJJXPtY/hqdefault.jpg,87,612,0,21532,1525065
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",5nkbmQeJvr8,"[(Questioner): Sadhguru, what are your views o...","[(6.282, 9.663), (9.663, 12.582), (15.252, 18....",sd,en,2012-11-21T09:11:25Z,Adultery in a relationship is not about right ...,Is it Wrong to be Sexually Involved Outside of...,"[marital affair, cheating, Spirituality, affai...",https://i.ytimg.com/vi/5nkbmQeJvr8/hqdefault.jpg,60,446,0,13087,1121058
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",fTkdd9Grm4Q,[Juhi Chawla: We have one question from the so...,"[(6.035, 11.526), (11.526, 13.982), (13.982, 1...",hd,en,2014-08-20T15:29:45Z,Juhi Chawla and Sadhguru look at the fundament...,What is the Real Meaning of Love - Juhi Chawla...,"[Yoga Techniques, enlightenment, Mysticism, Yo...",https://i.ytimg.com/vi/fTkdd9Grm4Q/hqdefault.jpg,24,213,0,16556,819798
PL3uDtbb3OvDOWpCZ8ERCXHMcslGaBEOBT,29,Love & Relationships,"If you become a source of joy by yourself, you will have wonderful relationships. - Sadhguru",Cz2NOxarkMs,[Sadhguru: Love has become a relevant thing in...,"[(11.274, 14.937), (14.94, 22.28), (22.28, 25....",hd,en,2015-01-12T11:38:47Z,"Sadhguru looks at the significance of love, an...",The Guaranteed Love Affair - Sadhguru,"[kinds of love, How to Meditate, SADHGURU MEDI...",https://i.ytimg.com/vi/Cz2NOxarkMs/hqdefault.jpg,71,248,0,11664,811918


In [None]:
# Persist the scraped frame next to the notebook so it can be reloaded without re-scraping.
youtube_scrapped.to_pickle(path='youtube_scrapped.pickle')

## **Dev Space**

Development of `from_channel()`

In [None]:
# get all videos from channel directly
# The API key is read from the environment instead of being embedded in the notebook.
# The key that was previously committed here should be treated as leaked and rotated.
client = YouTubeClient(os.environ["YOUTUBE_API_KEY"])
youtube_scrapped_complete = client.from_channel(username="sadhguru")

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1970.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1970.0), HTML(value='')))