In [1]:
import json
import requests
from tqdm import tqdm


class YTstats:
    """Collect channel-level and per-video statistics via the YouTube Data API v3.

    Typical use: build an instance with an API key and a channel id, call
    ``extract_all()`` and then ``dump()`` to write everything to a JSON file.
    """

    def __init__(self, api_key, channel_id):
        self.api_key = api_key
        self.channel_id = channel_id
        # Both stay None until the matching get_* method has been called;
        # dump() relies on that to detect missing data.
        self.channel_statistics = None
        self.video_data = None

    def extract_all(self):
        """Fetch the channel statistics, then the data of the channel's videos."""
        self.get_channel_statistics()
        self.get_channel_video_data()

    def get_channel_statistics(self):
        """Extract the channel statistics.

        Returns the ``statistics`` dict of the first returned item, or an empty
        dict when the response holds no usable item. The result is also stored
        in ``self.channel_statistics``.
        """
        print('get channel statistics...')
        url = f'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={self.channel_id}&key={self.api_key}'

        # Context manager so the progress bar is closed even if the request raises.
        with tqdm(total=1) as pbar:
            json_url = requests.get(url)
            data = json.loads(json_url.text)
            try:
                data = data['items'][0]['statistics']
            except (KeyError, IndexError):
                # KeyError: no 'items'/'statistics' key in the payload.
                # IndexError: 'items' is present but empty.
                print('Could not get channel statistics')
                data = {}

            self.channel_statistics = data
            pbar.update()
        return data

    def get_channel_video_data(self):
        """Extract all video information of the channel.

        Lists the channel's videos, then enriches every entry with the
        'snippet', 'statistics', 'contentDetails' and 'topicDetails' parts.
        Returns the ``{video_id: info}`` dict and stores it in ``self.video_data``.
        """
        print('get video data...')
        # Playlists are returned as well but are not used here.
        channel_videos, _ = self._get_channel_content(limit=50)

        parts = ["snippet", "statistics", "contentDetails", "topicDetails"]
        for video_id in tqdm(channel_videos):
            for part in parts:
                data = self._get_single_video_data(video_id, part)
                channel_videos[video_id].update(data)

        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """
        Extract further information for a single video
        parts can be: 'snippet', 'statistics', 'contentDetails', 'topicDetails'

        Returns the requested part as a dict, or an empty dict when the
        response does not contain it.
        """

        url = f"https://www.googleapis.com/youtube/v3/videos?part={part}&id={video_id}&key={self.api_key}"
        json_url = requests.get(url)
        data = json.loads(json_url.text)
        try:
            data = data['items'][0][part]
        except (KeyError, IndexError):
            # IndexError covers an empty 'items' list in the response.
            print(f'Error! Could not get {part} part of data: \n{data}')
            data = dict()
        return data

    def _get_channel_content(self, limit=None, check_all_pages=True, max_extra_pages=10):
        """
        Extract all videos and playlists, can check all available search pages
        channel_videos = videoId: title, publishedAt
        channel_playlists = playlistId: title, publishedAt
        return channel_videos, channel_playlists

        limit: optional int forwarded as ``maxResults`` (results per page).
        check_all_pages: follow ``nextPageToken`` links when True.
        max_extra_pages: upper bound on the number of follow-up pages fetched
            after the first one.
        """
        url = f"https://www.googleapis.com/youtube/v3/search?key={self.api_key}&channelId={self.channel_id}&part=snippet,id&order=date"
        if limit is not None and isinstance(limit, int):
            url += "&maxResults=" + str(limit)

        vid, pl, npt = self._get_channel_content_per_page(url)
        pages_followed = 0
        while check_all_pages and npt is not None and pages_followed < max_extra_pages:
            nexturl = url + "&pageToken=" + npt
            next_vid, next_pl, npt = self._get_channel_content_per_page(nexturl)
            vid.update(next_vid)
            pl.update(next_pl)
            pages_followed += 1

        return vid, pl

    def _get_channel_content_per_page(self, url):
        """
        Extract all videos and playlists per page
        return channel_videos, channel_playlists, nextPageToken
        """
        json_url = requests.get(url)
        data = json.loads(json_url.text)
        channel_videos = dict()
        channel_playlists = dict()
        if 'items' not in data:
            print('Error! Could not get correct channel data!\n', data)
            # Return two distinct dicts: the caller updates them independently,
            # so they must never alias each other.
            return channel_videos, channel_playlists, None

        nextPageToken = data.get("nextPageToken", None)

        item_data = data['items']
        for item in item_data:
            try:
                kind = item['id']['kind']
                published_at = item['snippet']['publishedAt']
                title = item['snippet']['title']
                if kind == 'youtube#video':
                    video_id = item['id']['videoId']
                    channel_videos[video_id] = {'publishedAt': published_at, 'title': title}
                elif kind == 'youtube#playlist':
                    playlist_id = item['id']['playlistId']
                    channel_playlists[playlist_id] = {'publishedAt': published_at, 'title': title}
            except KeyError:
                # Skip malformed items but keep processing the rest of the page.
                print('Error! Could not extract data from item:\n', item)

        return channel_videos, channel_playlists, nextPageToken

    def dump(self):
        """Dumps channel statistics and video data in a single json file.

        The file is named after the channel title found in the video data
        (lower-cased, spaces replaced by underscores), falling back to the
        channel id when no video carries a 'channelTitle'.
        """
        if self.channel_statistics is None or self.video_data is None:
            print('data is missing!\nCall get_channel_statistics() and get_channel_video_data() first!')
            return

        fused_data = {self.channel_id: {"channel_statistics": self.channel_statistics,
                              "video_data": self.video_data}}

        # Only read the title: popping an entry would drop that video from both
        # self.video_data and the dumped file, and would fail on an empty dict.
        channel_title = next(
            (info['channelTitle'] for info in self.video_data.values() if 'channelTitle' in info),
            self.channel_id,
        )
        channel_title = channel_title.replace(" ", "_").lower()
        filename = channel_title + '.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(fused_data, f, indent=4)

        print('file dumped to', filename)

In [65]:
import json

# Location of the sample dump on this machine; change it here when the file moves.
EXAMPLE_JSON_PATH = "/Users/elhqdjigagny/Desktop/astrotube/confluence/data/example.json"

# Read as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(EXAMPLE_JSON_PATH, encoding="utf-8") as f:
    data = json.load(f)

# Last expression of the cell: shows the top-level container type of the dump.
type(data)

dict

In [66]:
# Column names of the flattened per-video table, in output order.
# The order matters: values are paired with these names positionally.
properties = [
    # channel-level columns
    "channel_id", "channel_views", "channel_suscribers", "channel_videos",
    # video identity and descriptive columns
    "video_hash", "publish_at", "video_title", "video_description",
    "channel_title", "video_tags", "video_lang",
    # video counters and duration
    "video_views", "video_likes", "video_comments", "video_duration",
]


In [71]:
def channel_stats_from_json_to_df(json_file):
    """Flatten a channel dump into a DataFrame with one row per video.

    Parameters
    ----------
    json_file : str or path-like
        JSON file shaped like
        ``{channel_id: {"channel_statistics": {...}, "video_data": {video_id: {...}}}}``.

    Returns
    -------
    pandas.DataFrame
        One row per video. Columns are named after the module-level
        ``properties`` list, in that order. Fields absent from the dump
        become missing values instead of raising.
    """
    import json
    import pandas as pd

    # Read the file the caller asked for (UTF-8, the JSON default encoding).
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    records = []
    for channel_id, channel_info in data.items():
        channel_stats = channel_info["channel_statistics"]
        # .get(): a counter may be absent, e.g. when the channel hides it.
        channel_values = (
            channel_id,
            channel_stats.get("viewCount"),
            channel_stats.get("subscriberCount"),
            channel_stats.get("videoCount"),
        )

        for video_id, video_info in channel_info["video_data"].items():
            # .get(): like/comment counts and other parts can be missing
            # for individual videos.
            video_values = (
                video_id,
                video_info.get("publishedAt"),
                video_info.get("title"),
                video_info.get("description"),
                video_info.get("channelTitle"),
                video_info.get("tags"),
                video_info.get("defaultAudioLanguage"),
                video_info.get("viewCount"),
                video_info.get("likeCount"),
                video_info.get("commentCount"),
                video_info.get("duration"),
            )
            # Values are paired positionally with the names in `properties`.
            records.append(dict(zip(properties, channel_values + video_values)))

    # Build the frame once from the collected records.
    return pd.DataFrame.from_records(records)
        


In [72]:
# Build the frame explicitly: `temp_df` is otherwise only a local name inside
# channel_stats_from_json_to_df and would not exist on a fresh kernel.
temp_df = channel_stats_from_json_to_df(
    "/Users/elhqdjigagny/Desktop/astrotube/confluence/data/example.json"
)
temp_df

Unnamed: 0,channel_id,channel_views,channel_suscribers,channel_videos,video_hash,publish_at,video_title,video_description,channel_title,video_tags,video_lang,video_views,video_likes,video_comments,video_duration
0,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,qAh5dDODJ5k,2022-07-28T14:30:12Z,HTTPX Tutorial - A next-generation HTTP client...,"In this Python Tutorial we learn about HTTPX, ...",Python Engineer,[Python],en,1407,216,8,PT11M2S
1,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,lnTPnx9O6nM,2022-07-24T13:36:52Z,How to find N-Largest and N-Smallest Numbers i...,Quick Python tip how to efficiently find the n...,Python Engineer,[Python],en,25362,2167,28,PT27S
2,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,PaGp7Vi5gfM,2022-07-20T14:00:12Z,Schedule Python Scripts with GitHub Actions FO...,Learn how to schedule Python scripts with GitH...,Python Engineer,[Python],en,10766,607,34,PT12M33S
3,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,07Pxa3TbQc4,2022-07-16T12:00:00Z,I learned something new about Strings in Python!,Today I learned a new formatting rule for Stri...,Python Engineer,[Python],en,73018,5969,57,PT27S
4,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,O7CX81quvzE,2022-07-07T14:00:09Z,Don't make this mistake with Strings in Python!,Don't forget that Strings are immutable and ma...,Python Engineer,[Python],en,122467,6462,69,PT30S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,UdcPhnNjSEw,2019-06-16T09:11:31Z,Collections in Python - Advanced Python 06 - P...,Collections in Python - Advanced Python 06 - P...,Python Engineer,"[Python, Programming, Tutorial]",,52631,1178,58,PT14M10S
176,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,e6ivlABOYRI,2019-06-14T18:05:55Z,Strings in Python - Advanced Python 05 - Progr...,Strings in Python - Advanced Python 05 - Progr...,Python Engineer,"[Python, Programming, Coding, Strings]",,21816,451,46,PT24M16S
177,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,Qs3BSFZnZSI,2019-05-28T20:30:09Z,Sets in Python - Advanced Python 04 - Programm...,"In this Python Advanced Tutorial, we will be l...",Python Engineer,"[Python, Tutorial, Programming]",,15258,302,32,PT16M19S
178,UCbXgNpp0jedKWcQiULLbDTA,10624878,160000,181,LTXnQdrwyrw,2019-05-26T16:01:16Z,Dictionaries in Python - Advanced Python 03 - ...,In this Python Advanced Tutorial we will be le...,Python Engineer,"[Python, Tutorial, Programming]",,43101,680,53,PT13M5S
