# Get initial URLs from YouTubers.me

In [1]:
import requests
import bs4
from bs4 import BeautifulSoup


def get_redirected_url(url, timeout=10):
    """Return the redirect target announced for ``url`` without following it.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled server can block the notebook
        indefinitely.

    Returns
    -------
    str or None
        Value of the ``Location`` response header, or ``None`` when the
        response carries no such header (the URL did not redirect).

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts)
    are not caught here and propagate to the caller.
    """
    # allow_redirects=False stops at the first response so that its
    # Location header can be read instead of being followed.
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    return response.headers.get('Location')


def extract_youtube_urls(limit=10, timeout=10):
    """Collect YouTube channel URLs from the YouTubers.me top-1000 ranking.

    The ranking page is downloaded, every link inside its first ``<table>``
    whose href contains ``/youtuber-stats`` is rewritten to the site's
    ``youtube`` path, and that path is resolved through
    ``get_redirected_url``.

    Parameters
    ----------
    limit : int or None, optional
        Maximum number of ranking links to resolve. Defaults to 10;
        pass ``None`` to resolve every link found in the table.
    timeout : float or tuple, optional
        Seconds to wait for the ranking page, forwarded to ``requests.get``.

    Returns
    -------
    list
        One entry per resolved link, in page order. An entry is whatever
        ``get_redirected_url`` returned for it, so it can be ``None`` when
        the site answered without a redirect. The list is empty when the
        page contains no ``<table>``.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    base_url = "https://us.youtubers.me"
    url = base_url + "/global/all/top-1000-youtube-channels"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    channel_urls = []
    table_element = soup.find('table')
    if table_element:
        # `href and ...` skips anchors that have no href attribute at all.
        anchor_elements = table_element.find_all(
            'a', href=lambda href: href and '/youtuber-stats' in href
        )[:limit]
        for element in anchor_elements:
            youtube_path = element['href'].replace("youtuber-stats", "youtube")
            # urljoin handles root-relative and absolute hrefs alike.
            youtube_url = urljoin(base_url, youtube_path)
            channel_urls.append(get_redirected_url(youtube_url))

    return channel_urls




In [2]:
%%time
# Scrape the ranking page once; `youtube_urls` is reused by a later cell.
youtube_urls = extract_youtube_urls()
# Print one resolved channel URL per line.
for url in youtube_urls:
    print(url)

https://www.youtube.com/channel/UCq-Fj5jknLsUf-MWSy4_brA
https://www.youtube.com/channel/UCbCmjCuTUZos6Inko4u57UQ
https://www.youtube.com/channel/UCpEhnqL0y41EpW2TvWAHD7Q
https://www.youtube.com/channel/UC6-F5tO8uklgE9Zy8IvbdFw
https://www.youtube.com/channel/UCk8GzjMOrta8yxDcKfylJYw
https://www.youtube.com/channel/UCJplp5SjeGSdVdwsfb9Q7lQ
https://www.youtube.com/channel/UCJ5v_MCY6GNUBTO8-D3XoAg
https://www.youtube.com/channel/UCvlE5gTbOvjiolFlEm-c_Ow
https://www.youtube.com/channel/UCppHT7SZKKvar4Oc9J4oljQ
https://www.youtube.com/channel/UC55IWqFLDH1Xp7iu1_xknRA
CPU times: user 1.31 s, sys: 52.5 ms, total: 1.36 s
Wall time: 6.07 s


# Get channel ID from URLs

In [5]:
# Variant 1: regex only (not using BeautifulSoup)

import requests
import re


def get_redirected_url(url, timeout=10):
    """Return the redirect target announced for ``url`` without following it.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled server can block the notebook
        indefinitely.

    Returns
    -------
    str or None
        Value of the ``Location`` response header, or ``None`` when the
        response carries no such header (the URL did not redirect).

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts)
    are not caught here and propagate to the caller.
    """
    # allow_redirects=False stops at the first response so that its
    # Location header can be read instead of being followed.
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    return response.headers.get('Location')

# The pattern is anchored on the "c4TabbedHeaderRenderer" key. A bare
# '"channelId":"..."' search is unreliable: when a page also lists secondary
# channels it contains several such keys and the first hit may be the wrong one.
channel_id_pattern = re.compile(r'"c4TabbedHeaderRenderer":{"channelId":"([a-zA-Z0-9-_]+)"')

def get_channel_id_from_url(channel_url, timeout=10):
    """Extract a YouTube channel id from a channel URL using regexes only.

    The URL is first probed for a redirect. If the resulting URL (or the
    original one, when there is no redirect) already has the form
    ``.../channel/UC...``, the id is read straight from it. Otherwise the
    page is downloaded and searched with ``channel_id_pattern``.

    Parameters
    ----------
    channel_url : str
        Channel address, e.g. a ``/channel/UC...`` or ``/@handle`` URL.
    timeout : float or tuple, optional
        Seconds to wait for the page download, forwarded to ``requests.get``.

    Returns
    -------
    str or None
        The channel id, or ``None`` when the page source does not match
        ``channel_id_pattern``.
    """
    # get_redirected_url returns None when the server does not redirect.
    # Fall back to the original URL, otherwise re.search below would be
    # handed None and raise TypeError.
    channel_url = get_redirected_url(channel_url) or channel_url

    match_id = re.search(r"channel/(UC[a-zA-Z0-9-_]+)", channel_url)
    if match_id:
        return match_id.group(1)

    # The URL itself does not carry the id: download the page and read the
    # id out of the embedded page data.
    response = requests.get(channel_url, allow_redirects=True, timeout=timeout)
    match = channel_id_pattern.search(response.text)
    if match:
        return match.group(1)
    return None

In [6]:
# Variant 2: using BeautifulSoup

import requests
import re
import json


def get_redirected_url(url, timeout=10):
    """Return the redirect target announced for ``url`` without following it.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled server can block the notebook
        indefinitely.

    Returns
    -------
    str or None
        Value of the ``Location`` response header, or ``None`` when the
        response carries no such header (the URL did not redirect).

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts)
    are not caught here and propagate to the caller.
    """
    # allow_redirects=False stops at the first response so that its
    # Location header can be read instead of being followed.
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    return response.headers.get('Location')


def get_channel_id_from_url_2(channel_url, timeout=10):
    """Extract a YouTube channel id by parsing the page's ``ytInitialData``.

    The URL is first probed for a redirect. If the resulting URL (or the
    original one, when there is no redirect) already has the form
    ``.../channel/UC...``, the id is read straight from it. Otherwise the
    page is downloaded, the ``var ytInitialData = {...};`` assignment is
    located in the prettified markup, and the id is taken from
    ``header -> c4TabbedHeaderRenderer -> channelId`` of the decoded JSON.

    ``BeautifulSoup`` is not imported in this cell; it must already be in
    the notebook namespace from an earlier cell.

    Parameters
    ----------
    channel_url : str
        Channel address, e.g. a ``/channel/UC...`` or ``/@handle`` URL.
    timeout : float or tuple, optional
        Seconds to wait for the page download, forwarded to ``requests.get``.

    Returns
    -------
    str or None
        The channel id, or ``None`` when the page has no parseable
        ``ytInitialData`` or the expected keys are absent.
    """
    # get_redirected_url returns None when the server does not redirect.
    # Fall back to the original URL, otherwise re.search below would be
    # handed None and raise TypeError.
    channel_url = get_redirected_url(channel_url) or channel_url

    match_id = re.search(r"channel/(UC[a-zA-Z0-9-_]+)", channel_url)
    if match_id:
        return match_id.group(1)

    # The URL itself does not carry the id: download and parse the page.
    response = requests.get(channel_url, allow_redirects=True, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    data_match = re.search(r"var ytInitialData = ({.*});", soup.prettify())
    if data_match is None:
        return None

    try:
        json_data = json.loads(data_match.group(1))
    except json.JSONDecodeError:
        # The greedy capture can include trailing non-JSON text; treat that
        # the same as "id not found".
        return None

    # Walk the nesting defensively so a missing level yields None.
    header = json_data.get('header') or {}
    renderer = header.get('c4TabbedHeaderRenderer') or {}
    return renderer.get('channelId')
    

In [7]:
# Resolve a channel id for each collected URL, capped at the first 100.
# get_channel_id_from_url returns None when no id is found, so the list may
# contain None entries.
test_list=[]
for url in youtube_urls[:100]:
    channelid = get_channel_id_from_url(url)
    test_list.append(channelid)
    
    

In [8]:
# Display the collected channel ids.
test_list

['UC3gNmTGu-TTbFPpfSs5kNkg',
 'UCRv76wLBC73jiP7LX4C3l8Q',
 'UCbTLwN10NoCU4WDzLf1JMOA',
 'UCBAb_DK4GYZqZR9MFA7y2Xg',
 'UCyoXW-Dse7fURq30EWl_CUA',
 'UCAOtE1V7Ots4DjM8JLlrYgg',
 'UCUe6ZpY6TJ0no8jI4l2iLxw',
 'UCj-SWZSE0AmotGSQ3apROHw',
 'UCYQo8CdhXD22qfwUBUw591Q',
 'UCN8S4CqMRy6tVKVXvs7Bzeg',
 'UCV4XcEqBswMCryorV_gNENw',
 'UCuSo4gcgxJRf4Bzu43wwVyg',
 'UCi8e0iOVk1fEOogdfu4YgfA',
 'UCPNxhDvTcytIdvwXWAm43cA',
 'UCdgDIIKpFlpHB1L0LZnh5EQ',
 'UCUAL--p3qAa27luR0IYbjZA',
 'UCK9F8nycURBsR0YlrBsu1Ag',
 'UCFeUyPY6W8qX8w2o6oSiRmw',
 'UC6eq3sR4CtbvGdmInchzWUA',
 'UCIeNlITYK46VkR7yIuTL8GQ',
 'UCiBigY9XM-HaOxUc269ympg',
 'UCFMFB7_Ik8jBnCwSuh295wA',
 'UCnB5W_ZJgiDFnklejRGADxw',
 'UCR1c65UsjpaVgcLKa7eM1tg',
 'UCcKJJuOe2tOqgrKw0Gks-sw',
 'UCOF23vGxkbhN4wl7ROrgXsA',
 'UCYzEMRKqrh01-tauv7MYyVQ',
 'UCu7Hg0f3rxqZ6qs-188ZbjQ',
 'UC3uTUlmgECBnsP777U4GPKQ',
 'UCCyKsYaRi62axS-kasXW7wA',
 'UCdX5KXiCTPYWYZscfphgQ4g',
 'UC2el0G8cIcOOdjlHy2KOBkQ',
 'UC0ebPa2q1rwZVN7dYemunUQ',
 'UCrreHSUa5rnuCVDeO8dX4eA',
 'UC-GAhNmY6bk

In [104]:
%%time
# Time the regex-based lookup on a handle-style (@name) URL.
get_channel_id_from_url('https://www.youtube.com/@coreyms')

CPU times: user 24.5 ms, sys: 5.31 ms, total: 29.8 ms
Wall time: 345 ms


'UCCezIgC97PvUuR4_gbFUs5g'

In [105]:
%%time
# Time the BeautifulSoup/JSON-based lookup on the same handle-style URL.
get_channel_id_from_url_2('https://www.youtube.com/@coreyms')

CPU times: user 49.8 ms, sys: 7.19 ms, total: 57 ms
Wall time: 299 ms


'UCCezIgC97PvUuR4_gbFUs5g'