# YouTube Home Page Data Crawling

## Importing Libraries

In [8]:
import pandas as pd
import requests as req
from http.cookiejar import MozillaCookieJar
import json
import re
import datetime as dt
from bs4 import BeautifulSoup
import numpy as np
pd.options.mode.copy_on_write = True 

## Load The Cookies

In [2]:
# Location of the cookie file (the MozillaCookieJar format, i.e. a cookies.txt export).
cookie_path = "C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/research/www.youtube.com_cookies.txt"

# Bind a jar to that file; nothing is read from disk until load() is called.
cookie_jar = MozillaCookieJar(filename=cookie_path)

In [3]:
# Read the cookies from cookie_path into the jar; the two flags ask http.cookiejar
# to also keep cookies marked as discardable and cookies that have already expired.
cookie_jar.load(ignore_discard=True, ignore_expires=True)

## Make a Request

In [4]:
session = req.Session() #-> make a request session object

# request header
# update() layers these on top of the defaults a new Session already carries
# (e.g. Accept-Encoding) instead of replacing the whole header mapping.
session.headers.update({
    # Adjacent literals are joined with a single space; a backslash continuation
    # inside one literal would embed the next line's indentation in the header.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
    "Sec-Fetch-Mode": "navigate",
})

# request cookies
session.cookies = cookie_jar

# do a request with get method
# timeout keeps the cell from hanging forever; raise_for_status surfaces HTTP errors
# here instead of as a confusing parse failure further down.
response = session.get("https://www.youtube.com/", timeout=30)
response.raise_for_status()

# save the cookie
cookie_jar.save(ignore_discard=True, ignore_expires=True)

# parsing data
html = response.text #-> save a text response from prev request
# Non-greedy capture: stop at the first ";</script>" after the assignment so that
# later <script> blocks on the same line are not swallowed into the JSON text.
regex = r"var ytInitialData = (.*?);</script>" #-> regular expression to selecting data
found = re.search(regex, html)
if found is None:
    raise ValueError("ytInitialData was not found in the home page HTML")
match = found.group(1) #-> parsing data
json_data = json.loads(match) #-> load string data as json

## Function To Construct Data

In [6]:
# Matches texts such as "68,735 views" or "1 view"; anything else is treated as unknown.
_VIEW_COUNT_PATTERN = re.compile(r"\s*(\d[\d,]*)\s+views?\s*")


def _parse_view_count(view_count_text):
    """Convert a ``viewCountText`` node into an int, or None when it holds no plain count."""
    simple_text = (view_count_text or {}).get('simpleText')
    if simple_text is None:
        return None
    matched = _VIEW_COUNT_PATTERN.fullmatch(simple_text)
    if matched is None:
        return None
    return int(matched.group(1).replace(",", ""))


def _fetch_video_tags(video_id):
    """Request the watch page of *video_id* and return its ``og:video:tag`` values as ``str(list)``."""
    url = f'https://www.youtube.com/watch?v={video_id}'
    response = session.get(url)
    cookie_jar.save(ignore_discard=True, ignore_expires=True)
    # Name the parser explicitly so the result does not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    meta_tags = soup.find_all("meta", property="og:video:tag")
    return str([tag.get("content", None) for tag in meta_tags])


def _build_video_record(renderer, time):
    """Build one row of the video table from a ``videoRenderer`` node."""
    owner = renderer['ownerText']['runs'][0]
    browse_endpoint = owner['navigationEndpoint']['browseEndpoint']
    return {
        'video_id': renderer['videoId'],
        'video_title': renderer['title']['runs'][0]['text'],
        'view_count': _parse_view_count(renderer.get('viewCountText')),
        'channel_name': owner['text'],
        'channel_id': browse_endpoint['browseId'],
        'channel_url': browse_endpoint['canonicalBaseUrl'],
        'time_record': time,
        'video_tag': _fetch_video_tags(renderer['videoId']),
    }


def _build_short_record(reel, time):
    """Build one row of the shorts table from a ``reelItemRenderer`` node."""
    return {
        'short_id': reel['videoId'],
        'short_headline': reel['headline']['simpleText'],
        'short_view_count': (reel.get('viewCountText') or {}).get('simpleText'),
        'url': reel['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
        'time_record': time,
        'video_tag': _fetch_video_tags(reel['videoId']),
    }


def construct_home_page_data(json_data, time=None):
    """Split parsed ``ytInitialData`` of the home page into video, playlist and shorts tables.

    Parameters
    ----------
    json_data : dict
        The decoded ``ytInitialData`` object of the home page.
    time : datetime.datetime, optional
        Value written to the ``time_record`` column of every row. When omitted,
        the current time is taken at call time (not once at definition time).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(video_df, playlist_df, short_df)``. All three are empty when
        *json_data* does not contain the expected rich-grid path; a frame is
        also empty when the page holds no item of that kind.

    Notes
    -----
    Each video and short triggers one extra request through the module-level
    ``session`` to read its tags, and the module-level ``cookie_jar`` is saved
    after each of those requests.
    """
    if time is None:
        time = dt.datetime.now()

    try:
        data_holder = json_data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']
    except (KeyError, IndexError, TypeError):
        print('data not valid')
        # Nothing to parse: hand back three empty tables instead of failing later
        # on an unbound variable.
        return pd.DataFrame.from_records([]), pd.DataFrame.from_records([]), pd.DataFrame.from_records([])

    # First pass: sort the grid entries by kind without doing any network work.
    video_renderers = []
    radio_renderers = []
    short_shelves = []
    for entry in data_holder:
        if 'richItemRenderer' in entry:
            content = entry['richItemRenderer']['content']
            if 'videoRenderer' in content:
                video_renderers.append(content['videoRenderer'])
            elif 'radioRenderer' in content:
                # Playlists ("mixes") must be collected here, otherwise the
                # playlist table can never receive a row.
                radio_renderers.append(content['radioRenderer'])
        elif 'richSectionRenderer' in entry:
            shelf = entry['richSectionRenderer']['content'].get('richShelfRenderer')
            if shelf is None:
                continue
            title_runs = shelf.get('title', {}).get('runs') or [{}]
            if title_runs[0].get('text') == 'Shorts':
                short_shelves.append(shelf)

    # Second pass: build the rows (videos first, then shorts, as one request each).
    video_df = [_build_video_record(renderer, time) for renderer in video_renderers]

    playlist_df = [
        {
            'playlist_id': renderer['playlistId'],
            'playlist_title': renderer['title']['simpleText'],
            'time_record': time,
        }
        for renderer in radio_renderers
    ]

    short_df = []
    # Zero Shorts shelves simply yields an empty table; every shelf found is read.
    for shelf in short_shelves:
        for shelf_item in shelf.get('contents', []):
            reel = shelf_item.get('richItemRenderer', {}).get('content', {}).get('reelItemRenderer')
            if reel is None:
                # Shelf entry of a kind this parser does not know; skip it.
                continue
            short_df.append(_build_short_record(reel, time))

    return pd.DataFrame.from_records(video_df), pd.DataFrame.from_records(playlist_df), pd.DataFrame.from_records(short_df)


## Preview Data And Save

In [9]:
# Stamp this snapshot with the time of the call rather than relying on the
# function's default value for ``time``.
video_df, playlist_df, short_df = construct_home_page_data(json_data, time=dt.datetime.now())

In [10]:
# Write the home-page video table to disk without the index column,
# then leave the frame as the last expression so the notebook displays it.
video_csv_path = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home/video_home1.csv'
video_df.to_csv(video_csv_path, index=False)
video_df

Unnamed: 0,video_id,video_title,view_count,channel_name,channel_id,channel_url,time_record,video_tag
0,cIbwFCdWDYk,Mau Random Bentar!,,ShinMei,UCahetHoMuR0j0EKLogexCIg,/@ShinMeii,2024-03-18 23:32:23.757362,[]
1,vZ96jD3jh1Q,Kenapa Eropa Pro LGBT banget? Padahal Dulu Pal...,68735.0,Sepulang Sekolah,UCfQHaUbD0oEBH_FRYHE5qIg,/@SepulangSekolah,2024-03-18 23:32:23.757362,"['sepulang sekolah', 'koi', 'jui', 'SMA', 'pel..."
2,LwLHttdqGsc,Frieren: Beyond Journey's End - Episode 27 [En...,277353.0,Muse Asia,UCGbshtvS9t-8CW11W7TooQg,/@MuseAsia,2024-03-18 23:32:23.757362,[]
3,BkN_K-Px-Kw,The Wellerman Song and Drunken Sailor - Stormy...,261.0,Fenrir Music,UC-SR4l7G-X1Q_5e18QE1Wpg,/@FenrirAudioVisual,2024-03-18 23:32:23.757362,"['wellerman', 'drunken sailor', 'sea shanty', ..."
4,ekh1uwg14Vc,Three two one koncoo,,NapLive,UChXsCcj64gdYp8FVCFTzwiw,/@naplive7,2024-03-18 23:32:23.757362,"['napking', 'naplive', 'livestreaming', 'game'..."
5,6Zy-bJmSwS4,Kehidupan ke-7 Nona Antagonis - Episode 11 [Ta...,183477.0,Muse Indonesia,UCxxnxya_32jcKj4yN1_kD7A,/@MuseIndonesia,2024-03-18 23:32:23.757362,"['ルプなな', '7thloop', 'villainess', 'fantasi', '..."
6,MfSE956GpUE,The Apothecary Diaries OP/Opening - Hana ni N...,980579.0,AniClipsCollection,UCCCFdfIWVMZt2Gb6kjrj4QA,/@accytofficial,2024-03-18 23:32:23.757362,[]
7,yM9nzbzJNec,Sousou no Frieren Ep. 27 Emotional OST- Flower...,7006.0,TaeIn Kim Music,UCO-CX5f2TOpRxeoi0Xi9C1A,/channel/UCO-CX5f2TOpRxeoi0Xi9C1A,2024-03-18 23:32:23.757362,"['sousou', 'sousou no frieren', 'frieren', 'fr..."
8,1h1w876eIAo,MENDING AZKA KATOLIK SELAMANYA DARIPADA DEKET ...,514332.0,Deddy Corbuzier,UCYk4LJI0Pr6RBDWowMm-KUw,/@corbuzier,2024-03-18 23:32:23.757362,"['Login', 'Log in', 'Habib Jafar', 'Onad', 'On..."
9,wI-M4kMTTso,Genshin Impact: Natlan Landscapes — Natlan Music,196.0,vescores,UCxQNI44gl-sduLl9e3_OW1w,/channel/UCxQNI44gl-sduLl9e3_OW1w,2024-03-18 23:32:23.757362,"['composer', 'soundtrack', 'original music', '..."


In [13]:
# Write the home-page playlist table to disk without the index column,
# then leave the frame as the last expression so the notebook displays it.
playlist_csv_path = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home/playlist_on_homepage1.csv'
playlist_df.to_csv(playlist_csv_path, index=False)
playlist_df

In [12]:
# Write the home-page shorts table to disk without the index column,
# then leave the frame as the last expression so the notebook displays it.
short_csv_path = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home/short_on_home1.csv'
short_df.to_csv(short_csv_path, index=False)
short_df

Unnamed: 0,short_id,short_headline,short_view_count,url,time_record,video_tag
0,Du_PKRdB50Y,She did NOT hesitate giving her number 💀 (Juju...,2.4M views,/shorts/Du_PKRdB50Y,2024-03-18 23:32:23.757362,"['HARISSM', 'anime reaction', 'anime mashup', ..."
1,VmVe5NpLmmw,What else in that water 😭😭😭,7.8M views,/shorts/VmVe5NpLmmw,2024-03-18 23:32:23.757362,[]
2,kIBs7cS12HY,"New animated story video for Milunka Savić, a....",129K views,/shorts/kIBs7cS12HY,2024-03-18 23:32:23.757362,"['sabaton', 'сабатон', 'heavy metal', 'metal',..."
3,TIXpAHkgqQM,Hololive Kobo Kanaeru Realistic AI Generated,111K views,/shorts/TIXpAHkgqQM,2024-03-18 23:32:23.757362,"['kobo', 'kobo kanaeru', 'aI', 'aiart', 'Photo..."
4,GE1hMny0Tt0,He cut the whole ice-cream bucket into two par...,1.9M views,/shorts/GE1hMny0Tt0,2024-03-18 23:32:23.757362,[]
5,yeseJ_WyvQg,Maomao's new little student😂😂 [The Apothecary ...,217K views,/shorts/yeseJ_WyvQg,2024-03-18 23:32:23.757362,[]
6,6Roy8P4NkHU,Why Shylily's Model Is So Good,1.7M views,/shorts/6Roy8P4NkHU,2024-03-18 23:32:23.757362,"['Emiru', 'Emiru Twitch', 'emiru clips', 'emir..."
7,Di4pA8-gBY0,Meaning of Swastika Symbol In Japan vs. The We...,12M views,/shorts/Di4pA8-gBY0,2024-03-18 23:32:23.757362,"['Asian Boss', 'Asia', 'Stay Curious']"
8,Du2LFI-bkF0,when your gf walks in but you're addicted to wow,5.4M views,/shorts/Du2LFI-bkF0,2024-03-18 23:32:23.757362,[]
9,pUy60FwN8mY,Toph Bends Metal For The First Time Reaction,1.9M views,/shorts/pUy60FwN8mY,2024-03-18 23:32:23.757362,"['toph and Aang', 'katara and toph', 'toph and..."


## Collecting Data With a 5-Second Pause Between Runs Until 100 Snapshots Are Parsed

In [15]:
import time

In [16]:
# Folder that receives three CSV files (videos, playlists, shorts) per snapshot.
output_dir = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home'

# Non-greedy capture: stop at the first ";</script>" after the assignment so that
# later <script> blocks on the same line are not swallowed into the JSON text.
regex = r"var ytInitialData = (.*?);</script>" #-> regular expression to selecting data

# request cookies (same jar for every iteration, so set it once)
session.cookies = cookie_jar

data_counter = 0
while data_counter < 100:
    try:
        # do a request with get method; the timeout stops one stalled request
        # from blocking the whole collection run
        response = session.get("https://www.youtube.com/", timeout=30)
        response.raise_for_status()

        # parsing data
        html = response.text #-> save a text response from prev request
        found = re.search(regex, html)
        if found is None:
            raise ValueError("ytInitialData was not found in the home page HTML")
        match = found.group(1) #-> parsing data
        json_data = json.loads(match) #-> load string data as json
    except (req.RequestException, ValueError) as err:
        # A failed fetch/parse costs only this snapshot; the run carries on.
        print(f'{data_counter}: snapshot skipped ({err})')
    else:
        # save the cookie
        cookie_jar.save(ignore_discard=True, ignore_expires=True)

        # Stamp every snapshot with its own collection time rather than relying
        # on the function's default value for ``time``.
        video_df, playlist_df, short_df = construct_home_page_data(json_data, time=dt.datetime.now())
        video_df.to_csv(f'{output_dir}/video_home_{data_counter}.csv',index=False)
        playlist_df.to_csv(f'{output_dir}/playlist_on_homepage_{data_counter}.csv',index=False)
        short_df.to_csv(f'{output_dir}/short_on_home_{data_counter}.csv',index=False)
        print(data_counter)
    time.sleep(5) #-> make sure that not time out
    data_counter += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Only able to collect 68 data sets.