<div style="text-align: center; background-color: #750E21; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
  FINAL PROJECT: RESEARCHING ON MUSIC TASTE WORDWIDELY 📌
</div>

<div style="text-align: center; background-color: #0766AD; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
  Stage 01 - Data collecting 📌
</div>

## **IMPORT LIBRARY** 🎄

In [2]:
import requests 
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
import isodate
from datetime import datetime
import threading
import time

<div style="text-align: left; font-family: 'Trebuchet MS', Arial, sans-serif; color: #FF90BC; padding: 20px; font-size: 30px; font-weight: bold; border-radius: 0 0 0 0">
  STEP 1: Get data of toplist music video on Youtube from Kworb.net statistic 🔥
</div>

In [3]:
soup = BeautifulSoup(requests.get("https://kworb.net/youtube/topvideos.html").content, "html.parser")

music_data = []
for rank,tr in enumerate(soup.find_all("tr")[1:]):
    tds = tr.find_all("td")
    
    music_data.append({
        'Ranking': rank + 1,
        'Video Url': tds[0].a['href'],
        'Title': tds[0].text,
        'Views': tds[1].text,
        'Yesterday Views': tds[2].text,
    })

music_data = pd.DataFrame(music_data).set_index('Ranking')
music_data

Unnamed: 0_level_0,Video Url,Title,Views,Yesterday Views
Ranking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,video/kJQP7kiw5Fk.html,Luis Fonsi - Despacito ft. Daddy Yankee,8326155933,689927
2,video/JGwWNGJdvx8.html,Ed Sheeran - Shape of You (Official Music Video),6147796380,703914
3,video/RgKAFK5djSk.html,Wiz Khalifa - See You Again ft. Charlie Puth [...,6107002744,964865
4,video/OPf0YbXqDm0.html,Mark Ronson - Uptown Funk (Official Video) ft....,5097010247,734633
5,video/9bZkp7q19f0.html,PSY - GANGNAM STYLE(강남스타일) M/V,4975484655,987234
...,...,...,...,...
2496,video/HC172grgTwU.html,Same Time Same Jagah (Chaar Din) ● Sandeep Bra...,324922403,83510
2497,video/cAMHx-m9oh8.html,Kya Loge Tum | Akshay Kumar | Amyra Dastur | B...,324515340,320080
2498,video/Fd7lYEtevxQ.html,Xúc Xắc Xúc Xẻ - Bé Bảo An ft Phi Long,324464042,155994
2499,video/uQFVqltOXRg.html,Daniel Caesar - Get You ft. Kali Uchis [Offici...,324242080,50527


In [4]:
music_video_id = []
for url in music_data['Video Url']:
    music_video_id.append(re.findall(r'video/(.*).html', url)[0])

def generate_video_url(video_id):
    url_arr = []
    for video in video_id:
        url_arr.append(f'https://www.youtube.com/watch?v={video}')
    return url_arr

def save_to_txt(url_arr, file_name):
    with open('../data/' + file_name, 'w') as f:
        for url in url_arr:
            f.write(url + '\n')
    print('Save to txt file successfully!')

youtube_video_url = generate_video_url(music_video_id)
save_to_txt(youtube_video_url, 'youtube_video_url.txt')

#save a column of a dataframe to an array
kworb_video_url = music_data['Video Url'].to_numpy()
kworb_video_url = ['https://kworb.net/youtube/' + url for url in kworb_video_url]

save_to_txt(kworb_video_url, 'kworb_video_url.txt')

Save to txt file successfully!
Save to txt file successfully!


<div style="text-align: left; font-family: 'Trebuchet MS', Arial, sans-serif; color: #FF90BC; padding: 20px; font-size: 30px; font-weight: bold; border-radius: 0 0 0 0">
  STEP 2: Crawling data from youtube using api key
</div>

In [5]:
api_key = 'AIzaSyBzj644ZQin_REcV1t_O6lZuv2kYeY8ZO0'

In [6]:
def get_channel_info(api_key, channel_id):
    youtube = build('youtube', 'v3', developerKey=api_key)

    try:
        response = youtube.channels().list(
            part='snippet, contentDetails, statistics',
            id=channel_id
        ).execute()

        channel_info = response['items'][0]

        # Extract relevant information
        channel_name = channel_info['snippet']['title']
        subscriber_count = channel_info['statistics']['subscriberCount']

        return {
            'channel_name': channel_name,
            'subscriber_count': subscriber_count
        }

    except Exception as e:
        print(f'An error occurred: {e}')
        return None

In [7]:
def get_video_info(api_key, video_id, view_list, like_list, duration_list, channel_name_list, subscriber_list, 
                   publish_time_list, hashtag_list):
    youtube = build('youtube', 'v3', developerKey=api_key)
    
    response = youtube.videos().list(
        part='snippet, contentDetails, statistics',
        id=video_id
    ).execute()

    if (not response['items']):
        view_list.append(np.nan)
        like_list.append(np.nan)
        duration_list.append(np.nan)
        channel_name_list.append(np.nan)
        subscriber_list.append(np.nan)
        publish_time_list.append(np.nan)
        hashtag_list.append(np.nan)
    else:
        video_info = response['items'][0]

        # Extract relevant information
        views = video_info['statistics']['viewCount']
        
        try: 
            likes = video_info['statistics']['likeCount']
        except: 
            likes = np.nan
            
        duration_iso = video_info['contentDetails']['duration']
        channel_id = video_info['snippet']['channelId']

        # Get number of hashtags
        description = video_info['snippet']['description']
        hashtag_count = description.count('#')

        # Get published time
        published_at = video_info['snippet']['publishedAt']
        publish_time = datetime.strptime(published_at, '%Y-%m-%dT%H:%M:%SZ')

        # Convert ISO duration to human-readable format
        duration_human = isodate.parse_duration(duration_iso)

        # Extract channel name and subscribers
        channel_data = get_channel_info(api_key, channel_id)
        channel_name = channel_data['channel_name']
        subscribers = channel_data['subscriber_count']

        view_list.append(views)
        like_list.append(likes)
        duration_list.append(str(duration_human))
        channel_name_list.append(channel_name)
        subscriber_list.append(subscribers)
        publish_time_list.append(publish_time)
        hashtag_list.append(hashtag_count)

In [8]:
def collect_data(music_video_id, api_key):
    # Init empty list to store the values of each attribute.
    view_list = []
    like_list = []
    duration_list = []
    channel_name_list = []
    subscriber_list = []
    publish_time_list = []
    hashtag_list = []
    
    threads = []
    for video_id in music_video_id:
        # Checking whether video_id is blank or not
        if (video_id == ''): 
            continue
        
        # Create thread
        while (threading.active_count() > 20):
            time.sleep(0.1)
        
        thread = threading.Thread(target=get_video_info, args=(api_key, video_id, view_list, like_list, duration_list, 
                                                               channel_name_list, subscriber_list, publish_time_list, 
                                                               hashtag_list))
        threads.append(thread)
        thread.start()
        
    for thread in threads:
        thread.join()
        
    data = pd.DataFrame({'View': view_list,
                         'Like': like_list,
                         'Duration': duration_list,
                         'Channel_name': channel_name_list,
                         'Subscriber': subscriber_list,
                         'Publish_time': publish_time_list,
                         'Hasgtag': hashtag_list})
    
    return data

In [9]:
df = collect_data(music_video_id, api_key)
df

Unnamed: 0,View,Like,Duration,Channel_name,Subscriber,Publish_time,Hasgtag
0,3699397437,14910553,0:04:57,Ed Sheeran,53900000,2014-10-07 13:57:37,3.0
1,8326595309,52530642,0:04:42,LuisFonsiVEVO,16900000,2017-01-13 05:00:02,8.0
2,4975685528,27824172,0:04:13,officialpsy,18400000,2012-07-15 07:46:32,4.0
3,3741038508,16304426,0:03:26,JustinBieberVEVO,31500000,2015-10-22 20:00:02,5.0
4,3972177919,15905086,0:05:02,Maroon5VEVO,14400000,2015-01-14 15:00:11,0.0
...,...,...,...,...,...,...,...
2495,325548763,1133757,0:03:15,Henrique e Juliano,15800000,2014-05-26 17:19:44,0.0
2496,325617449,2009740,0:06:04,ChrisBrownVEVO,11900000,2011-06-17 07:00:00,6.0
2497,324614753,3067877,0:04:01,DM - Desi Melodies,18700000,2023-05-15 12:30:06,3.0
2498,324489563,867382,0:03:07,Ruby Bảo An,1640000,2011-01-31 13:52:25,8.0


In [10]:
df.to_csv('../data/youtube_data.csv', index=False)