# Scraping YouTube Data with the YouTube API

## Setup

### Library

In [1]:
import configparser
from googleapiclient.discovery import build
import pandas as pd

### Important Variables

In [2]:
# Load the YouTube API key from a local INI file so that the key itself never
# appears in the notebook.
config_file = 'youtube.ini'
config = configparser.ConfigParser()
# ConfigParser.read() returns the list of files it managed to parse and
# silently skips missing/unreadable ones; fail here with the file name instead
# of a later, opaque KeyError on the 'youtube' section.
if not config.read(config_file):
    raise FileNotFoundError(f"Could not read configuration file {config_file!r}")
API_KEY = config['youtube']['api_key']

In [3]:
# Workbook with the basic per-channel information; it must provide a
# 'channel_id' column, which drives every API request below.
basic_file = 'hololive.xlsx'
df_basic = pd.read_excel(io=basic_file)
channels = df_basic.loc[:, 'channel_id']

### Connect to the YouTube API

In [4]:
# Build the client for the 'youtube' service, version 'v3', authenticated with
# the developer key loaded from the config file.
service_name, service_version = 'youtube', 'v3'

youtube = build(service_name, service_version, developerKey=API_KEY)

### Request Channels Info

In [5]:
# Fetch id/snippet/contentDetails/statistics for every channel in `channels`.
#
# The IDs are sent in slices of at most 50 per call, the same batch size this
# notebook uses for videos().list, instead of one unbounded comma-separated
# list. NOTE(review): the 50-ID ceiling for channels.list should be confirmed
# against the API reference; with 50 or fewer channels this performs exactly
# one request, as before.
channel_ids = list(channels)
response = {'items': []}  # stays empty (and no request is sent) if there are no channels

for start in range(0, len(channel_ids), 50):
    request = youtube.channels().list(
        part="id,snippet,contentDetails,statistics",
        id=','.join(channel_ids[start:start + 50]),
    )
    batch = request.execute()
    if start == 0:
        # Keep the first payload whole so `response` looks like a single API
        # response; only its 'items' list grows with later batches, so any
        # other top-level fields describe the first batch only.
        batch.setdefault('items', [])
        response = batch
    else:
        response['items'].extend(batch.get('items', []))

### Select Data from Response 

In [6]:
# Flatten each channel resource of the API response into one flat record.
# Every field is read with direct indexing, so a resource that lacks one of
# them raises KeyError.
all_data = []

for respon in response['items']:
    data = {
        'channel_id': respon['id'],
        'channel_name': respon['snippet']['title'],
        'subscribers': respon['statistics']['subscriberCount'],
        'views': respon['statistics']['viewCount'],
        'total_videos': respon['statistics']['videoCount'],
        # The "uploads" playlist is what later cells walk to list the videos.
        'playlist_id': respon['contentDetails']['relatedPlaylists']['uploads'],
    }
    all_data.append(data)

In [7]:
# One row per channel record collected above; preview the first five rows.
df_holo = pd.DataFrame(data=all_data)
df_holo.head(n=5)

Unnamed: 0,channel_id,channel_name,subscribers,views,total_videos,playlist_id
0,UCl_gCybOJRIgOXw6Qb4qJzQ,Rushia Ch. 潤羽るしあ,1450000,124020285,403,UUl_gCybOJRIgOXw6Qb4qJzQ
1,UCP0BspO_AMEe3aQqqpo89Dg,Moona Hoshinova hololive-ID,911000,43805860,317,UUP0BspO_AMEe3aQqqpo89Dg
2,UCL_qhgtOy0dy1Agp8vkySQg,Mori Calliope Ch. hololive-EN,1770000,199723483,336,UUL_qhgtOy0dy1Agp8vkySQg
3,UCoSrY_IQQVpmIRZ9Xf-y93g,Gawr Gura Ch. hololive-EN,3550000,198246837,251,UUoSrY_IQQVpmIRZ9Xf-y93g
4,UCXTpFs_3PqI41qX2d9tL2Rw,Shion Ch. 紫咲シオン,904000,79195197,378,UUXTpFs_3PqI41qX2d9tL2Rw


### Merge Basic Info Data with Scraped Data from YouTube

In [11]:
# Join the spreadsheet rows with the API results on the shared channel ID
# (pandas' default join type, i.e. only channels present in both frames).
df = df_basic.merge(df_holo, on="channel_id")

In [12]:
# The API delivers the counters as strings; convert each one to a numeric dtype.
for column in ('subscribers', 'views', 'total_videos'):
    df[column] = pd.to_numeric(df[column])
df.head()

Unnamed: 0,nick_name,full_name,branch,gen,channel_id,channel_name,subscribers,views,total_videos,playlist_id
0,Sora,Tokino Sora,JP,JP-0,UCp6993wxpyDPHUpavwDFqgg,SoraCh. ときのそらチャンネル,880000,73798412,502,UUp6993wxpyDPHUpavwDFqgg
1,Suisei,Hoshimachi Suisei,JP,JP-0,UC5CwaMl1eIgY8h02uZw7u8A,Suisei Channel,1200000,170144931,416,UU5CwaMl1eIgY8h02uZw7u8A
2,Roboco,Roboco-san,JP,JP-0,UCDqI2jOz0weumE8s7paEk6g,Roboco Ch. - ロボ子,765000,54335853,821,UUDqI2jOz0weumE8s7paEk6g
3,Azki,AZKi,JP,JP-0,UC0TXe_LYZ4scaW2XMyi5_kw,AZKi Channel,530000,23188749,209,UU0TXe_LYZ4scaW2XMyi5_kw
4,Mel,Yozora Mel,JP,JP-1,UCD8HOxPs4Xvsm8H0ZxXGiBw,Mel Channel 夜空メルチャンネル,631000,17007371,185,UUD8HOxPs4Xvsm8H0ZxXGiBw


### Request Video ID

In [36]:
def get_video_ids(youtube, playlist_id):
    """Return the IDs of all videos in a playlist, following pagination.

    Args:
        youtube: API client exposing ``playlistItems().list(...)``, whose
            result provides ``execute()``.
        playlist_id: ID of the playlist to enumerate.

    Returns:
        list: the ``contentDetails.videoId`` of every playlist item, in the
        order the pages were returned.
    """
    video_ids = []
    paging = {}  # empty for the first page, then {'pageToken': <token>}

    while True:
        page = youtube.playlistItems().list(
            part='contentDetails',
            playlistId=playlist_id,
            maxResults=50,
            **paging,
        ).execute()

        video_ids.extend(entry['contentDetails']['videoId'] for entry in page['items'])

        token = page.get('nextPageToken')
        if token is None:
            # No further page advertised: the playlist has been fully read.
            break
        paging = {'pageToken': token}

    return video_ids

In [None]:
# One list of video IDs per channel, in the same row order as `df`.
vid_ids = [
    get_video_ids(youtube, playlist_id)
    for playlist_id in df['playlist_id'].tolist()
]

### Request Video Info

In [164]:
def get_video_details(youtube, playlistid, video_ids, list_stats=None):
    """Fetch details for the given videos and append one record per video.

    Videos are requested in batches of 50 IDs per ``videos().list`` call.

    Args:
        youtube: API client exposing ``videos().list(...)``, whose result
            provides ``execute()``.
        playlistid: value stored under the ``channel_id`` key of every record
            (the notebook passes the owning channel's ID here).
        video_ids: list of video IDs to look up.
        list_stats: optional list to extend in place with the new records.
            When omitted, a fresh list is created for this call.

    Returns:
        list: ``list_stats`` (or the new list) with one dict appended per
        video returned by the API. ``comments``, ``likes`` and ``dislikes``
        are ``None`` when the API omits them.

    Raises:
        KeyError: if a returned video lacks ``statistics.viewCount`` or one of
            the other directly indexed fields.
    """
    # A `[]` default would be created once and shared by every call that omits
    # the argument, silently accumulating rows across unrelated calls.
    all_video_stats = [] if list_stats is None else list_stats

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
                    part='id,contentDetails,snippet,statistics',
                    id=','.join(video_ids[i:i+50]))
        response = request.execute()

        for video in response['items']:
            statistics = video['statistics']
            video_stats = dict(channel_id = playlistid,
                               video_id = video['id'],
                               title = video['snippet']['title'],
                               published_date = video['snippet']['publishedAt'],
                               duration = video['contentDetails']['duration'],
                               views = statistics['viewCount'],
                               comments = statistics.get('commentCount'),
                               likes = statistics.get('likeCount'),
                               dislikes = statistics.get('dislikeCount'))
            all_video_stats.append(video_stats)

    return all_video_stats

In [165]:
# Accumulate the video records of every channel into one flat list;
# vid_ids[k] holds the videos of the channel in row k of `df`.
data_list = []
for position, channel_video_ids in enumerate(vid_ids):
    owner_channel_id = df['channel_id'].iloc[position]
    data_list = get_video_details(youtube, owner_channel_id, channel_video_ids, data_list)

In [166]:
# One row per video record gathered across all channels.
df_videos = pd.DataFrame(data=data_list)

In [167]:
# Display the video table; the recorded output is a truncated first/last-rows preview.
df_videos

Unnamed: 0,channel_id,video_id,title,published_date,duration,views,comments,likes,dislikes
0,UCp6993wxpyDPHUpavwDFqgg,KJdDlwzvDC8,【あつ森】別荘がつくれる！？早速やっていかないと！！【#ときのそら生放送】,2021-11-07T04:47:18Z,PT2H35M21S,65130,23,8732,15
1,UCp6993wxpyDPHUpavwDFqgg,9bOom2Gl704,【Minecraft】運動会の練習だ！！！#2【#SorAZ/#ときのそら生放送】,2021-11-04T15:21:35Z,PT1H39M23S,125476,51,11316,7
2,UCp6993wxpyDPHUpavwDFqgg,I61tJJKVlnc,【#ときのそらガルパ配信中day2】ガルパホロライブカップ！いっぱい遊ぶぞ～,2021-11-03T13:13:40Z,PT1H1M35S,34988,44,5497,6
3,UCp6993wxpyDPHUpavwDFqgg,oMqH1n9tgPo,【重大発表】ときのそらTheatrical Cover Live『Role:Play』開催決...,2021-11-01T12:06:13Z,PT1H15S,59297,58,8760,6
4,UCp6993wxpyDPHUpavwDFqgg,E2hTb2IUaNU,【歌枠】ハロウィンだし小物つけながら歌おうかな【＃ときのそら生放送】,2021-10-31T07:35:14Z,PT1H30M11S,60865,81,9857,5
...,...,...,...,...,...,...,...,...,...
20528,UCgmPnx-EEeOrZSg5Tiw7ZRQ,olB5J4PfXuc,≪空気読み１＋２＋３≫ 空気を読むネズミの子。GOOD RAT CAN READ ATMOS...,2021-08-26T14:38:47Z,PT2H25M,247523,176,22059,52
20529,UCgmPnx-EEeOrZSg5Tiw7ZRQ,sfxIojed54Q,《RAT SIMULATOR》- i am one true rat.,2021-08-25T08:09:26Z,PT1H53M5S,378803,381,32785,59
20530,UCgmPnx-EEeOrZSg5Tiw7ZRQ,W9rAtIytnHk,《CHIT-CHAT》- POST DEBUT Q&A,2021-08-23T11:59:25Z,PT1H22M44S,433954,9,33950,70
20531,UCgmPnx-EEeOrZSg5Tiw7ZRQ,7S9QhFeGw60,【COUNCIL MEETING】The Council is in SESSION! + ...,2021-08-23T02:12:07Z,PT1H5M,789925,1470,48040,162


In [168]:
# Number of missing values per column (comments/likes/dislikes are optional in the API data).
df_videos.isnull().sum()

channel_id          0
video_id            0
title               0
published_date      0
duration            0
views               0
comments          125
likes              14
dislikes           14
dtype: int64

In [169]:
# Persist the scraped rows (without the index column) so later analysis can start from the CSV.
df_videos.to_csv(path_or_buf='hololive_videos.csv', index=False)

In [170]:
# Reload the saved video table from disk.
df_videos2 = pd.read_csv(filepath_or_buffer='hololive_videos.csv')

In [190]:
# Column dtypes of the reloaded table.
# NOTE(review): the recorded output already shows published_date as
# datetime64[ns, UTC], i.e. this cell (In [190]) was executed after the
# pd.to_datetime conversion cell (In [181]) that is listed below it.
df_videos2.dtypes

channel_id                     object
video_id                       object
title                          object
published_date    datetime64[ns, UTC]
duration                       object
views                           int64
comments                      float64
likes                         float64
dislikes                      float64
dtype: object

In [181]:
# Parse the timestamp strings in published_date into pandas datetimes.
df_videos2['published_date'] = df_videos2['published_date'].pipe(pd.to_datetime)

In [191]:
# Write the table back to the same CSV file (without the index column).
df_videos2.to_csv(path_or_buf='hololive_videos.csv', index=False)

### Most Watched Videos

In [183]:
# Row(s) whose view count equals the overall maximum (ties are all kept).
top_view_count = df_videos2['views'].max()
df_videos2.loc[df_videos2['views'].eq(top_view_count)]

Unnamed: 0,channel_id,video_id,title,published_date,duration,views,comments,likes,dislikes
18864,UCL_qhgtOy0dy1Agp8vkySQg,5y3xh8gs24c,[ORIGINAL SONG] 失礼しますが、RIP♡ || “Excuse My Rud...,2020-09-12 17:52:11+00:00,PT3M13S,26163473,22320.0,592988.0,4187.0
