# Web Crawler for Top US YouTube Channels

Course: IS590PR Final Project <br>
Web Crawling Date: 04/15/2019 <br>
Source: https://socialblade.com/youtube/top/country/us/mostsubscribed


In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## TOP 250 YOUTUBERS IN THE UNITED STATES SORTED BY SUBSCRIBERS

In [2]:
def _clean_int(text):
    """Return ``text`` as an int after dropping everything except digits and dots."""
    return int(re.sub(r"[^\d\.]", "", text))


def get_top_channels(url, limit=250):
    """Scrape the channel ranking table found at ``url``.

    The page is located by the inline ``style`` attributes of its ``div``
    cells, so the result depends on the page keeping that layout.

    Parameters
    ----------
    url : str
        Address of the ranking page to download.
    limit : int, optional
        Maximum number of rows to read (default 250). Fewer rows are
        returned when the page contains fewer matching cells.

    Returns
    -------
    tuple of lists
        ``(rank, grade, name, uploads, subs, views, url)`` — seven parallel
        lists with one entry per channel. ``rank`` is simply 1..n in page
        order; ``uploads``, ``subs`` and ``views`` are ints; ``url`` holds
        the absolute address of each channel's detail page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a numeric cell contains no parseable integer.
    """
    # Bound the wait so a stalled connection cannot hang the notebook.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    grade_div = soup.find_all('div', attrs={'style': 'float: left; width: 70px; font-size: 1.1em;'})  # grade cells
    name_div = soup.find_all('div', attrs={'style': 'float: left; width: 350px; line-height: 25px;'})  # name cells
    uploads_div = soup.find_all('div', attrs={'style': 'float: left; width: 80px;'})  # uploads cells
    subs_views_div = soup.find_all('div', attrs={'style': 'float: left; width: 150px;'})  # subs and views cells

    # The first upload match and the first two subs/views matches are skipped
    # (presumably table header cells - confirm against the page). After that,
    # subscribers and views alternate within the same list of matches.
    uploads_div = uploads_div[1:]
    subs_div = subs_views_div[2::2]
    views_div = subs_views_div[3::2]

    # Never index past what the page actually provided.
    row_count = max(0, min(limit, len(grade_div), len(name_div),
                           len(uploads_div), len(subs_div), len(views_div)))

    url_main = 'https://socialblade.com'

    rank = list(range(1, row_count + 1))
    grade = []
    name = []
    urls = []  # kept distinct from the ``url`` parameter
    uploads = []
    subs = []
    views = []
    for i in range(row_count):
        # grade: second token once the text is split on newlines/spaces
        grade.append(re.split(r'\n| ', grade_div[i].text)[1])

        # name: strip newlines and space+whitespace runs, keep single spaces
        name.append(''.join(re.split(r'\n| \s', name_div[i].text)))

        # channel page: the first link inside the name cell is site-relative
        url_channel = name_div[i].find_all('a')[0].get('href')
        urls.append(url_main + url_channel)

        # numeric columns
        uploads.append(_clean_int(re.split(r'\n| ', uploads_div[i].text)[0]))
        subs.append(_clean_int(re.split(r'\n| ', subs_div[i].text)[1]))
        views.append(_clean_int(re.split(r'\n| ', views_div[i].text)[1]))

    return rank, grade, name, uploads, subs, views, urls

In [3]:
# Multipliers for the abbreviated amounts shown on the page (e.g. "$3.4K").
_EARNINGS_SUFFIXES = {'K': 1000, 'M': 1000000, 'B': 1000000000}


def _parse_earnings(token):
    """Convert an abbreviated amount such as ``'$3.4K'`` to a float.

    Returns ``None`` when the token holds no parseable number, so the
    resulting column stays numeric instead of mixing floats and raw strings.
    """
    multiplier = 1
    for suffix, factor in _EARNINGS_SUFFIXES.items():
        if suffix in token:
            multiplier = factor
            break
    try:
        return float(re.sub(r"[^-\d\.]", "", token)) * multiplier
    except ValueError:
        return None


def get_channel_info(url_list):
    """Scrape per-channel statistics from each channel page in ``url_list``.

    Parameters
    ----------
    url_list : iterable of str
        Channel page addresses; one HTTP request is made per entry.

    Returns
    -------
    tuple of lists
        ``(daysubs_list, dayviews_list, min_earn_list, max_earn_list)``,
        parallel to ``url_list``. The two daily-average lists hold the
        cleaned digit strings (sign, digits and dots only); the two earnings
        lists hold floats with K/M/B suffixes expanded, or ``None`` where no
        amount could be parsed.

    Raises
    ------
    requests.HTTPError
        If a channel page answers with an error status.
    IndexError
        If a page lacks the expected statistic cells.
    """
    # daily average subscribers and views are calculated based on the subs and views in the past 30 days
    daysubs_list = []
    dayviews_list = []
    min_earn_list = []
    max_earn_list = []

    for url in url_list:
        # Bound the wait so one stalled page cannot hang the whole crawl.
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        daysubs_earnings_div = soup.find_all('div', attrs={'style': 'width: 205px; height: 40px; line-height: 40px; float: left;'})
        dayviews_div = soup.find_all('div', attrs={'id': 'averagedailyviews'})

        # daily average subscribers: second token of the first matching cell
        daysubs = re.split(r'\n| ', daysubs_earnings_div[0].text)[1]
        daysubs_list.append(re.sub(r"[^-\d\.]", "", daysubs))

        # daily average video views
        dayviews = re.split(r'\n| ', dayviews_div[0].text)[1]
        dayviews_list.append(re.sub(r"[^-\d\.]", "", dayviews))

        # daily estimated earnings: tokens 1 and 3 of the second matching
        # cell are the minimum and maximum of the displayed range
        earnings_tokens = re.split(r'\n| ', daysubs_earnings_div[1].text)
        min_earn_list.append(_parse_earnings(earnings_tokens[1]))
        max_earn_list.append(_parse_earnings(earnings_tokens[3]))

    return daysubs_list, dayviews_list, min_earn_list, max_earn_list
        

In [4]:
# Ranking page to crawl.
main_url = 'https://socialblade.com/youtube/top/country/us/mostsubscribed'
# Unpack the seven parallel lists produced for the ranking table.
(rank, grade, name,
 uploads, subs, views, url) = get_top_channels(main_url)

In [5]:
# Visit every channel page collected in `url`; one HTTP request is made per channel.
(daysubs, dayviews,
 min_earn, max_earn) = get_channel_info(url)

In [6]:
# Column name -> list of values, one entry per scraped channel.
channel_info = {
    'Rank': rank,
    'Grade': grade,
    'Name': name,
    'Uploads': uploads,
    'Subscribers': subs,
    'Views': views,
    'Daily Average Subs': daysubs,
    'Daily Average Video Views': dayviews,
    'Daily Min Estimated Earnings': min_earn,
    'Daily Max Estimated Earnings': max_earn,
    'URL': url,
}

In [9]:
# Build the table from the column mapping and preview its first five rows.
channels = pd.DataFrame(data=channel_info)
channels.head(n=5)

Unnamed: 0,Rank,Grade,Name,Uploads,Subscribers,Views,Daily Average Subs,Daily Average Video Views,Daily Min Estimated Earnings,Daily Max Estimated Earnings,URL
0,1,A+,PewDiePie,3804,94213512,21154615410,144446,13416000,3400,53700,https://socialblade.com/youtube/user/pewdiepie
1,2,A+,5-Minute Crafts,2970,53776226,13280449943,63044,15396600,3800,61600,https://socialblade.com/youtube/user/295-dw_td...
2,3,A++,Cocomelon - Nursery Rhymes,415,44010502,25460364278,122242,88514800,22100,354100,https://socialblade.com/youtube/user/checkgate
3,4,A+,WWE,40771,42069396,31415288099,44395,24296400,6100,97200,https://socialblade.com/youtube/user/wwefannation
4,5,A,Dude Perfect,202,41255100,7679716467,36551,7820910,2000,31300,https://socialblade.com/youtube/user/corycotton


In [8]:
# Persist the scraped table (row index included) next to the notebook.
channels.to_csv(path_or_buf='top250_us_youtube_channels.csv')