# Web Crawler of Top US YouTube Channels

Course: IS590PR Final Project <br>
Web Crawling Date: 04/15/2019 <br>
Source: https://socialblade.com/youtube/top/country/us


In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

### Top 250 YouTube Channels in the United States Sorted by Subscribers/Video Views/SocialBlade Grade

In [2]:
def _parse_count(text):
    """Turn a scraped count such as '94,996,769' into an ``int``.

    Everything except digits and '.' is removed before conversion. If what is
    left is not a valid integer literal (an empty string, or a value that still
    contains '.'), the original text is returned unchanged so the caller still
    gets exactly one entry per row.
    """
    try:
        return int(re.sub(r"[^\d.]", "", text))
    except ValueError:
        return text


def get_top_channels(url, limit=250, timeout=30):
    """ This function will get the rank of the top youtube channels and part of their information

    :param url: the website that we want to perform web crawler
    :param limit: maximum number of rows to read from the page (default 250)
    :param timeout: seconds to wait for the server before the request is aborted
    :return: seven lists of equal length that store rank, grade, name, uploads,
        subs, views and url. uploads, subs and views hold ``int`` values, or
        the raw scraped text for any cell that cannot be read as an integer.
    :raises requests.exceptions.HTTPError: when the server answers with an error status
    >>> get_top_channels(123)
    Traceback (most recent call last):
    requests.exceptions.MissingSchema: Invalid URL '123': No schema supplied. Perhaps you meant http://123?
    >>> len(get_top_channels('https://socialblade.com/youtube/top/country/us/mostsubscribed'))
    7
    """
    res = requests.get(url, timeout=timeout)  # returns a requests.Response object
    res.raise_for_status()  # fail here with a clear error instead of parsing an error page
    soup = BeautifulSoup(res.text, 'html.parser')

    # Each column of the ranking table is located through its inline style.
    grade_div = soup.find_all('div', attrs={'style': 'float: left; width: 70px; font-size: 1.1em;'})  # get grade
    name_div = soup.find_all('div', attrs={'style': 'float: left; width: 350px; line-height: 25px;'})  # get name
    uploads_div = soup.find_all('div', attrs={'style': 'float: left; width: 80px;'})  # get uploads
    subs_views_div = soup.find_all('div', attrs={'style': 'float: left; width: 150px;'})  # get subs and views

    # NOTE(review): the leading matches are skipped, presumably because they are
    # the table's header cells - confirm against the page markup.
    uploads_div = uploads_div[1:]
    # Subscriber and view cells share one style and alternate in the match list.
    subs_div = subs_views_div[2::2]
    views_div = subs_views_div[3::2]

    # Read only rows for which every column was found, so a shorter page
    # yields shorter lists instead of an IndexError.
    row_count = min(limit, len(grade_div), len(name_div), len(uploads_div),
                    len(subs_div), len(views_div))

    url_main = 'https://socialblade.com'
    rank = list(range(1, row_count + 1))
    grade = []
    name = []
    channel_urls = []  # separate name: the `url` parameter is no longer shadowed
    uploads = []
    subs = []
    views = []
    for i in range(row_count):
        # grade list: the grade is the second token of the cell text
        grade.append(re.split('\n| ', grade_div[i].text)[1])

        # name list: drop newlines and space+whitespace pairs, keep single spaces
        name.append(''.join(re.split(r'\n| \s', name_div[i].text)))

        # url list: the first link in the name cell is site-relative
        url_channel = name_div[i].find_all('a')[0].get('href')
        channel_urls.append(url_main + url_channel)

        # upload list: the count is the first token of the cell text
        uploads.append(_parse_count(re.split('\n| ', uploads_div[i].text)[0]))

        # subs / views lists: converted independently, so one unreadable cell
        # can neither add a duplicate entry nor force the other to stay text
        subs.append(_parse_count(re.split('\n| ', subs_div[i].text)[1]))
        views.append(_parse_count(re.split('\n| ', views_div[i].text)[1]))

    return rank, grade, name, uploads, subs, views, channel_urls

In [3]:
def _parse_earning(text):
    """Convert an abbreviated money string such as '$3.3K' into a ``float``.

    A 'K' anywhere in the text scales the number by 1,000 and an 'M' by
    1,000,000 ('K' is checked first). Characters other than digits, '-' and
    '.' are dropped before conversion. Text that does not leave a valid number
    is returned unchanged.
    """
    if 'K' in text:
        scale = 1000
    elif 'M' in text:
        scale = 1000000
    else:
        scale = 1
    try:
        return float(re.sub(r"[^-\d.]", "", text)) * scale
    except ValueError:
        return text


def get_channel_info(url_list, timeout=30):
    """ This function will get the information of specific Youtube channels

    :param url_list: the url list we output from the get_top_channels function
    :param timeout: seconds to wait for each page before the request is aborted
    :return: a channel information dictionary mapping each column name to a
        list with one entry per url, in the order of url_list. Earnings are
        floats (or the raw text when a value cannot be parsed); the daily
        averages and world ranks are strings of the kept characters.
    :raises requests.exceptions.HTTPError: when a page answers with an error status
    >>> url_list = [123, 456, 789]
    >>> get_channel_info(url_list)
    Traceback (most recent call last):
    requests.exceptions.MissingSchema: Invalid URL '123': No schema supplied. Perhaps you meant http://123?
    >>> url_list = 'https://socialblade.com/youtube/user/taylorswiftvevo'
    >>> get_channel_info(url_list)
    Traceback (most recent call last):
    requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
    >>> url_list = ['https://socialblade.com/youtube/user/taylorswiftvevo']
    >>> type(get_channel_info(url_list))
    <class 'dict'>
    """
    token_sep = '\n| '  # scraped text is tokenised on newlines and single spaces

    # Insertion order is the column order of the table built from this dict.
    # daily average subscribers and views are calculated based on the subs and views in the past 30 days
    channel_info = {
        'Category': [],
        'Created Date': [],
        'Daily Average Subs': [],
        'Daily Average Views': [],
        'Min Daily Earning': [],
        'Max Daily Earning': [],
        'Min Monthly Earning': [],
        'Max Monthly Earning': [],
        'Min Yearly Earning': [],
        'Max Yearly Earning': [],
        'World Subscriber Rank': [],
        'World Video View Rank': [],
    }

    for url in url_list:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()  # fail here with a clear error instead of parsing an error page
        soup = BeautifulSoup(res.text, 'html.parser')

        # get channel type
        category_a = soup.find_all('a', attrs={'id': 'youtube-user-page-channeltype'})
        category = category_a[0].text

        # get created date: first bold span of the last top-info box
        created_date_div = soup.find_all('div', attrs={'class': 'YouTubeUserTopInfo'})
        created_date = created_date_div[-1].find_all('span', attrs={'style': 'font-weight: bold;'})[0].text

        # get world subscriber / view rank, keeping digits only
        sub_p = soup.find_all('p', attrs={'id': 'afd-header-subscriber-rank'})
        view_p = soup.find_all('p', attrs={'id': 'afd-header-videoview-rank'})
        sub_world_rank = re.sub(r"\D", "", sub_p[0].text)
        view_world_rank = re.sub(r"\D", "", view_p[0].text)

        # get daily average subscribers / views / estimated earnings
        daysubs_earnings_div = soup.find_all('div', attrs={'style': 'width: 205px; height: 40px; line-height: 40px; float: left;'})
        dayviews_div = soup.find_all('div', attrs={'id': 'averagedailyviews'})
        # get monthly / yearly estimated earnings
        month_year_earning = soup.find_all('p', attrs={'style': 'font-size: 1.4em; color:#41a200; font-weight: 600; padding-top: 20px;'})

        # daily averages stay strings; only digits, '-' and '.' are kept
        daysubs = re.sub(r"[^-\d.]", "", re.split(token_sep, daysubs_earnings_div[0].text)[1])
        dayviews = re.sub(r"[^-\d.]", "", re.split(token_sep, dayviews_div[0].text)[1])

        # Each earnings text is split once: token 1 is the minimum, token 3 the maximum.
        day_earn = re.split(token_sep, daysubs_earnings_div[1].text)
        mon_earn = re.split(token_sep, month_year_earning[0].text)
        year_earn = re.split(token_sep, month_year_earning[1].text)

        channel_info['Category'].append(category)
        channel_info['Created Date'].append(created_date)
        channel_info['Daily Average Subs'].append(daysubs)
        channel_info['Daily Average Views'].append(dayviews)
        # change string to float (ex: $10K -> 10000.0); every value is converted
        # on its own, so one unparseable value no longer leaves the others as text
        channel_info['Min Daily Earning'].append(_parse_earning(day_earn[1]))
        channel_info['Max Daily Earning'].append(_parse_earning(day_earn[3]))
        channel_info['Min Monthly Earning'].append(_parse_earning(mon_earn[1]))
        channel_info['Max Monthly Earning'].append(_parse_earning(mon_earn[3]))
        channel_info['Min Yearly Earning'].append(_parse_earning(year_earn[1]))
        channel_info['Max Yearly Earning'].append(_parse_earning(year_earn[3]))
        channel_info['World Subscriber Rank'].append(sub_world_rank)
        channel_info['World Video View Rank'].append(view_world_rank)

    return channel_info

In [4]:
# Ranking page crawled in this run: US channels on the "mostsubscribed" listing.
url_by_subs = 'https://socialblade.com/youtube/top/country/us/mostsubscribed'
# Alternative ranking pages, kept for reference and not crawled here:
# url_by_grade = 'https://socialblade.com/youtube/top/country/us'
# url_by_views = 'https://socialblade.com/youtube/top/country/us/mostviewed'
# Fetch the page and unpack the seven lists returned by get_top_channels.
rank, grade, name, uploads, subs, views, url = get_top_channels(url_by_subs)

In [5]:
# Ranking-page columns, keyed by the header each one gets in the final table.
channel_rank = dict(zip(
    ('US Rank', 'Grade', 'Name', 'Uploads', 'Subscribers', 'Views'),
    (rank, grade, name, uploads, subs, views),
))
# The channel links live in their own single-column mapping.
url_dict = dict(URL=url)

In [6]:
# Fetch the detail page of every channel (one HTTP request per entry in `url`).
# The name keeps its original spelling because the merge step reads `channel_indo`.
channel_indo = get_channel_info(url)

In [7]:
# Combine the column mappings in order: ranking columns, per-channel details, URL.
channels = {**channel_rank, **channel_indo, **url_dict}

In [8]:
# Build a DataFrame from the dict of column lists; `channels` is rebound from
# the dict to the DataFrame here.
channels = pd.DataFrame(channels)
# Preview the first rows as the cell output.
channels.head()

Unnamed: 0,US Rank,Grade,Name,Uploads,Subscribers,Views,Category,Created Date,Daily Average Subs,Daily Average Views,Min Daily Earning,Max Daily Earning,Min Monthly Earning,Max Monthly Earning,Min Yearly Earning,Max Yearly Earning,World Subscriber Rank,World Video View Rank,URL
0,1,A,PewDiePie,3816,94996769,21273091209,Entertainment,"Apr 29th, 2010",110345,13049800,3300,52200,97900,1600000.0,1200000.0,18800000.0,3,11,https://socialblade.com/youtube/user/pewdiepie
1,2,A+,5-Minute Crafts,3018,54468986,13457658545,Howto,"Nov 15th, 2016",62777,14743200,3700,59000,110600,1800000.0,1300000.0,21200000.0,7,36,https://socialblade.com/youtube/user/295-dw_td...
2,3,A++,Cocomelon - Nursery Rhymes,418,45767781,26686250508,Education,"Sep 1st, 2006",133986,93106600,23300,372400,698300,11200000.0,8400000.0,134100000.0,11,7,https://socialblade.com/youtube/user/checkgate
3,4,A+,WWE,40962,42631850,31739359936,Sports,"May 11th, 2007",48579,26197400,6500,104800,196500,3100000.0,2400000.0,37700000.0,12,3,https://socialblade.com/youtube/user/wwefannation
4,5,A,Dude Perfect,203,41489500,7748697510,Sports,"Mar 17th, 2009",26817,6670330,1700,26700,50000,800400.0,600300.0,9600000.0,13,111,https://socialblade.com/youtube/user/corycotton


In [9]:
# Save the combined table as a CSV file, using a path relative to the working directory.
# The DataFrame index is written as well (pandas default `index=True`), so the
# file starts with an unnamed index column.
channels.to_csv('top250_us_youtube_by_subs.csv')