# YouTube Home Page Data Crawling

## Importing Libraries

In [38]:
import pandas as pd
import requests as req
from http.cookiejar import MozillaCookieJar
import json
import re
import datetime as dt

## Load The Cookies

In [39]:
# Location of the Netscape/Mozilla-format cookie file exported from the browser.
cookie_path = "C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/research/www.youtube.com_cookies.txt"

# Bind a cookie jar to that file; nothing is read from disk at this point.
cookie_jar = MozillaCookieJar(filename=cookie_path)

In [40]:
# Read the cookie file into the jar, keeping session-only ("discard") cookies
# and cookies whose expiry date has already passed.
cookie_jar.load(
    ignore_expires=True,
    ignore_discard=True,
)

## Make a Request

In [41]:
session = req.Session()  # one session object carries the headers and cookies below

# Browser-like request headers. The User-Agent is built from adjacent string
# literals: a backslash continuation inside a single literal would copy the
# indentation of the following source line into the header value.
session.headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
    "Sec-Fetch-Mode": "navigate",
}

# Send the cookies loaded from the cookie file with every request.
session.cookies = cookie_jar

# Fetch the home page. The timeout keeps the cell from hanging forever on a
# stalled connection, and an HTTP error status is reported as such instead of
# surfacing later as a confusing parse failure.
response = session.get("https://www.youtube.com/", timeout=30)
response.raise_for_status()

# Write cookies updated by the response back to the cookie file.
cookie_jar.save(ignore_discard=True, ignore_expires=True)

# The page embeds its initial data as a JavaScript assignment inside a
# <script> tag; capture the JSON text that follows "var ytInitialData = ".
html = response.text
regex = r"var ytInitialData = (.*);<\/script>"
initial_data_match = re.search(regex, html)
if initial_data_match is None:
    # Fail with an explicit message rather than AttributeError on None.group().
    raise ValueError("ytInitialData was not found in the YouTube home page response")
match = initial_data_match.group(1)
json_data = json.loads(match)  # captured text -> Python objects

## Function To Construct Data

In [42]:
def _parse_view_count(text):
    """Return the leading integer of a view-count label, or None if there is none.

    "1,203,581 views" gives 1203581; labels whose first word is not a plain
    integer (for example "No views") give None instead of raising.
    """
    if not text:
        return None
    words = text.split()
    if not words:
        return None
    try:
        return int(words[0].replace(",", ""))
    except ValueError:
        return None


def _video_record(renderer, time):
    """Flatten one ``videoRenderer`` dict into a row for the video table."""
    owner = renderer['ownerText']['runs'][0]
    browse_endpoint = owner['navigationEndpoint']['browseEndpoint']
    view_label = (renderer.get('viewCountText') or {}).get('simpleText')
    return {
        'video_id': renderer['videoId'],
        'video_title': renderer['title']['runs'][0]['text'],
        'view_count': _parse_view_count(view_label),
        'channel_name': owner['text'],
        'channel_id': browse_endpoint['browseId'],
        'channel_url': browse_endpoint['canonicalBaseUrl'],
        'time_record': time,
    }


def _playlist_record(renderer, time):
    """Flatten one ``radioRenderer`` dict into a row for the playlist table."""
    return {
        'playlist_id': renderer['playlistId'],
        'playlist_title': renderer['title']['simpleText'],
        'time_record': time,
    }


def _short_record(renderer, time):
    """Flatten one ``reelItemRenderer`` dict into a row for the Shorts table."""
    return {
        'short_id': renderer['videoId'],
        'short_headline': renderer['headline']['simpleText'],
        'short_view_count': renderer['viewCountText'].get('simpleText'),
        'url': renderer['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
        'time_record': time,
    }


def construct_home_page_data(json_data,time=None):
    """Turn the home page's ``ytInitialData`` into video, playlist and Shorts tables.

    Parameters
    ----------
    json_data : dict
        Parsed ``ytInitialData`` object taken from the home page HTML.
    time : datetime.datetime, optional
        Value written to the ``time_record`` column of every row. When omitted,
        the current time is taken at call time. (A ``dt.datetime.now()``
        default in the signature would be evaluated only once, when the
        function is defined, and then reused by every later call.)

    Returns
    -------
    tuple of pandas.DataFrame
        ``(video_df, playlist_df, short_df)``. A table is empty when the page
        holds no item of that kind. All three are empty, and 'data not valid'
        is printed, when the expected grid is missing from ``json_data``.
    """
    if time is None:
        time = dt.datetime.now()

    try:
        data_holder = json_data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']
    except (KeyError, IndexError, TypeError):
        print('data not valid')
        # Stop here: there is no grid to walk, so hand back three empty tables.
        return pd.DataFrame.from_records([]), pd.DataFrame.from_records([]), pd.DataFrame.from_records([])

    # First pass: sort grid entries into plain items (videos and playlists)
    # and shelves titled 'Shorts'. Everything else is ignored.
    home_items = []
    short_list = []
    for entry in data_holder:
        if 'richItemRenderer' in entry:
            content = entry['richItemRenderer'].get('content', {})
            # Keep radioRenderer items too; otherwise the playlist table can
            # never receive a row.
            if 'videoRenderer' in content or 'radioRenderer' in content:
                home_items.append(content)
        elif 'richSectionRenderer' in entry:
            shelf = entry['richSectionRenderer'].get('content', {}).get('richShelfRenderer')
            if shelf is None:
                continue
            title_runs = shelf.get('title', {}).get('runs') or [{}]
            if title_runs[0].get('text') == 'Shorts':
                short_list.append(shelf)

    # Second pass: flatten each kept item into a record.
    video_df = []
    playlist_df = []
    for content in home_items:
        if 'videoRenderer' in content:
            video_df.append(_video_record(content['videoRenderer'], time))
        else:
            playlist_df.append(_playlist_record(content['radioRenderer'], time))

    # Only the first Shorts shelf is read; a page without one yields no rows
    # instead of an IndexError.
    short_df = []
    if short_list:
        for item in short_list[0]['contents']:
            short_df.append(_short_record(item['richItemRenderer']['content']['reelItemRenderer'], time))

    return pd.DataFrame.from_records(video_df), pd.DataFrame.from_records(playlist_df), pd.DataFrame.from_records(short_df)


## Preview Data And Save

In [45]:
# Pass the timestamp explicitly so time_record reflects when this cell ran
# rather than whatever default was bound when the function was defined.
video_df, playlist_df, short_df = construct_home_page_data(json_data, time=dt.datetime.now())

In [46]:
# Write the home-page video table to CSV without the row index, then show it.
video_home_csv = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home/video_home1.csv'
video_df.to_csv(path_or_buf=video_home_csv, index=False)
video_df

Unnamed: 0,video_id,video_title,view_count,channel_name,channel_id,channel_url,time_record
0,V4ft79VokKs,Semua episode Kemono Jihen [Bahasa Indonesia],1203581,Muse ID Best Anime Collection,UCjcAfwNEjkXA352e2uMt5Jg,/@museidbestanimecollection,2024-03-13 21:44:18.945156
1,WiEyqIRbxI4,【Human: Fall Flat】JADI MANUSIA LENJEH BERSAMA ...,354259,Kobo Kanaeru Ch. hololive-ID,UCjLEmnpCNeisMxy134KPwWw,/@KoboKanaeru,2024-03-13 21:44:18.945156
2,_Ci8uRh4__k,Reviewer Nggak Ngerti Mobil & Pilot Kopilot Tidur,6952,TnM,UCZdmuthWjKAC-oiaoh1bfOw,/@andriTNM,2024-03-13 21:44:18.945156
3,jHn8Ts3NwN8,Caranya Punya Dua Otak,12865,Raditya Dika,UC0rzsIrAxF4kCsALP6J2EsA,/@radityadika,2024-03-13 21:44:18.945156
4,q2RAA0LmbXA,"""White Death"" – Sabaton Cover by Red Mage",330,Laurie Hawke Ch. – {RedMageMetal},UCXCrTqkAwxvGXAWZsdWTxcA,/@RedMageMetal,2024-03-13 21:44:18.945156
5,51bYjGrUjys,Kalian Optimis OPM S3? Masih Kinclong Loh - No...,6893,Megane Sensei,UC6Yoq_MyO8-nkrvCj7USqzw,/@MeganeSensei,2024-03-13 21:44:18.945156
6,JGdL5xupAVY,[📺Semua Episode] Aku Tidak Ingin Terluka Jadi ...,1402822,Muse Indonesia,UCxxnxya_32jcKj4yN1_kD7A,/@MuseIndonesia,2024-03-13 21:44:18.945156
7,ZX5UjkEb8ZI,Shylily Reacts To Daily Dose Of Internet | Try...,54680,VTubeMoment,UC5aVBQ1ZwMnM3ddNfGChmkw,/@VTubeMoment,2024-03-13 21:44:18.945156
8,rMbira1-7gM,PENGAKUAN! SAYA SEORANG GUY! Kenapa Orang Kita...,607348,Sepulang Sekolah,UCfQHaUbD0oEBH_FRYHE5qIg,/@SepulangSekolah,2024-03-13 21:44:18.945156
9,u3EBlmzb9ds,Sousou no Frieren - Time Flows Ever Onward For...,331464,Apollonomelody,UC06h2n_lZb30_TscWPA1tWA,/@apollonomelody,2024-03-13 21:44:18.945156


In [47]:
# Write the home-page playlist table to CSV without the row index, then show it.
playlist_home_csv = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home/playlist_on_homepage1.csv'
playlist_df.to_csv(path_or_buf=playlist_home_csv, index=False)
playlist_df

In [48]:
# Write the home-page Shorts table to CSV without the row index, then show it.
short_home_csv = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home/short_on_home1.csv'
short_df.to_csv(path_or_buf=short_home_csv, index=False)
short_df

Unnamed: 0,short_id,short_headline,short_view_count,url,time_record
0,2C1HWI1xAtw,Bro casually Rizzed her😂 - Horimiya missing pi...,2.7M views,/shorts/2C1HWI1xAtw,2024-03-13 21:44:18.945156
1,Ln_jQL8HOn0,Why Kobo Kanaeru hitting 1 million subscribers...,1.1M views,/shorts/Ln_jQL8HOn0,2024-03-13 21:44:18.945156
2,jP92oR3Hv9E,did you know that Toph... (part 2) | Avatar #S...,12M views,/shorts/jP92oR3Hv9E,2024-03-13 21:44:18.945156
3,vKcjjT0La2Q,"New Sabaton music: ""Weapons Of The Modern Age""...",387K views,/shorts/vKcjjT0La2Q,2024-03-13 21:44:18.945156
4,TIXpAHkgqQM,Hololive Kobo Kanaeru Realistic AI Generated,110K views,/shorts/TIXpAHkgqQM,2024-03-13 21:44:18.945156
5,SUmhpFkUE0E,I just can’t stop watching it! 😂😂😂 #Funny,1.1M views,/shorts/SUmhpFkUE0E,2024-03-13 21:44:18.945156
6,DxN6u3_MkuQ,"sok”an cinta lo dek, rapor lu benerin dlo #cov...",718K views,/shorts/DxN6u3_MkuQ,2024-03-13 21:44:18.945156
7,Ust_HCp8bns,Free Palestine 🇵🇸,4.4K views,/shorts/Ust_HCp8bns,2024-03-13 21:44:18.945156
8,ooMYhmSMis0,that's my gurl #kpop #fypシ #seventeen #emmamye...,1.5M views,/shorts/ooMYhmSMis0,2024-03-13 21:44:18.945156
9,BgxOek3IKUI,Theapothecarydiaries Collage🌱maomao #theapothe...,3M views,/shorts/BgxOek3IKUI,2024-03-13 21:44:18.945156


## Collecting Data Every 30 Minutes For The Next 24 Hours

In [50]:
import time

In [None]:
# Take 48 snapshots of the home page, 1800 seconds (30 minutes) apart, and
# write each snapshot to its own set of CSV files numbered by data_counter.
output_dir = 'C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/data/home'

data_counter = 0
while data_counter < 48:
    try:
        cookie_path = "C:/Users/snsv/Documents/Data Analyst/personal youtube analysis/research/www.youtube.com_cookies.txt"
        cookie_jar = MozillaCookieJar(cookie_path)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)

        # A fresh session per snapshot; the context manager closes it so 48
        # iterations do not leave 48 sessions open.
        with req.Session() as session:
            # Browser-like request headers. Adjacent string literals keep
            # source indentation out of the User-Agent value.
            session.headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-us,en;q=0.5",
                "Sec-Fetch-Mode": "navigate",
            }

            # Send the saved cookies with the request.
            session.cookies = cookie_jar

            # Record when this snapshot was taken, then fetch the page. The
            # timeout prevents one stalled connection from freezing the run.
            snapshot_time = dt.datetime.now()
            response = session.get("https://www.youtube.com/", timeout=30)
            response.raise_for_status()

        # Write cookies updated by the response back to the cookie file.
        cookie_jar.save(ignore_discard=True, ignore_expires=True)

        # Extract the JSON assigned to ytInitialData inside the page's <script>.
        html = response.text
        regex = r"var ytInitialData = (.*);<\/script>"
        initial_data_match = re.search(regex, html)
        if initial_data_match is None:
            raise ValueError("ytInitialData was not found in the YouTube home page response")
        match = initial_data_match.group(1)
        json_data = json.loads(match)

        # Pass the timestamp explicitly so each snapshot carries its own time.
        video_df, playlist_df, short_df = construct_home_page_data(json_data, time=snapshot_time)
        video_df.to_csv(f'{output_dir}/video_home_{data_counter}.csv',index=False)
        playlist_df.to_csv(f'{output_dir}/playlist_on_homepage_{data_counter}.csv',index=False)
        short_df.to_csv(f'{output_dir}/short_on_home_{data_counter}.csv',index=False)
    except (req.RequestException, OSError, ValueError, KeyError, IndexError) as error:
        # One failed snapshot must not abort the remaining collection; report
        # it and keep the schedule. (json.JSONDecodeError is a ValueError.)
        print(f'snapshot {data_counter} failed: {error!r}')

    data_counter += 1
    # No need to wait after the final snapshot.
    if data_counter < 48:
        time.sleep(1800)

In [51]:
# Thirty minutes expressed in seconds.
30 * 60

1800