Data from [Kaggle](https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset)

In [3]:
# imports
import pandas as pd
import numpy as np
import json
import string
from datetime import datetime, timedelta

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

from IPython.display import Markdown, display

In [4]:
# read data
category_ids = json.load(open('US_category_id.json'))
data = pd.read_csv('US_youtube_trending_data.csv')

# preprocess
category_ids = {c['id']: c['snippet']['title'] for c in category_ids['items']}
data.drop_duplicates(subset='video_id', keep='first', inplace=True)
data['category'] = [category_ids[str(i)] for i in data.categoryId]
data = data[['title', 'channelTitle', 'category', 'publishedAt', 'tags', 'view_count', 'likes', 'comment_count',
             'thumbnail_link']].rename(columns={'channelTitle': 'channel_title', 'publishedAt': 'published_at'})
data['published_at'] = [datetime.strptime(time, '%Y-%m-%dT%H:%M:%SZ') for time in data['published_at']]

print(f'Number of videos indexed: {len(data)}')

Number of videos indexed: 37882


In [5]:
# sort by views
data.sort_values('view_count', ascending=False, inplace=True)

display(Markdown('### Top Videos of All Time'))
display(data.head())


# get videos published this week
data_week = data[data.published_at >= (datetime.today() - timedelta(days=7))]

display(Markdown('### Top Videos This Week'))
display(data_week.head())

### Top Videos of All Time

Unnamed: 0,title,channel_title,category,published_at,tags,view_count,likes,comment_count,thumbnail_link
80193,LISA - 'LALISA' M/V,BLACKPINK,Music,2021-09-10 04:00:13,YG Entertainment|YG|와이지|K-pop|BLACKPINK|블랙핑크|블...,85890366,5921316,1958529,https://i.ytimg.com/vi/awkkyBH2zEo/default.jpg
100194,Crazy #alluarjun #painting #shorts #viral #tr...,Dr.Harrsha Artist,Film & Animation,2021-12-08 13:16:02,[None],79283769,5311001,24705,https://i.ytimg.com/vi/CvCtn5Q_nzs/default.jpg
51,Cardi B - WAP feat. Megan Thee Stallion [Offic...,Cardi B,Music,2020-08-07 04:00:10,Cardi B|Cardi|Atlantic Records|rap|hip hop|tra...,76805026,2820345,270263,https://i.ytimg.com/vi/hsm4poTWjMs/default.jpg
114216,"Hey man, we are Italian 🇮🇹😅🤷🏼‍♀️#shorts #funny...",Jessi & Sean,People & Blogs,2022-02-20 20:42:28,[None],71401624,3209656,7844,https://i.ytimg.com/vi/LrJYKxyrMwg/default.jpg
119591,Watch the uncensored moment Will Smith smacks ...,Guardian News,News & Politics,2022-03-28 03:06:53,Jada Pinkett Smith|Jada Pinkett Smith chris ro...,69880692,965557,201533,https://i.ytimg.com/vi/myjEoDypUD8/default.jpg


### Top Videos This Week

Unnamed: 0,title,channel_title,category,published_at,tags,view_count,likes,comment_count,thumbnail_link
208591,ATEEZ(에이티즈) - 'BOUNCY (K-HOT CHILLI PEPPERS)' ...,KQ ENTERTAINMENT,Music,2023-06-16 04:00:04,KQ|케이큐|에이티즈|ATEEZ|THE WORLD 스토리필름|에이티즈 스토리필름|A...,16216516,352923,49951,https://i.ytimg.com/vi/U0G5OA6ZH5w/default.jpg
208990,i met ronaldo 🇵🇹,IShowSpeed,Gaming,2023-06-17 22:00:51,[None],9352163,1123111,61999,https://i.ytimg.com/vi/Fiq9XMRr4jg/default.jpg
208790,6ix9ine - Pa Ti (feat. Yailin La Más Viral) (O...,Tekashi 6ix9ine,Music,2023-06-16 19:05:47,[None],7809395,481929,30333,https://i.ytimg.com/vi/sCyCQ8KLIM0/default.jpg
207990,BTS (방탄소년단) 'Take Two' Live Clip #2023BTSFESTA,BANGTANTV,Music,2023-06-13 09:13:02,방탄소년단|BTS|BANGTAN|알엠|RM|슈가|SUGA|제이홉|jhope|지민|정...,5086417,1218843,76767,https://i.ytimg.com/vi/owjVpYCmwcg/default.jpg
208988,ONE PIECE | Official Teaser Trailer | Netflix,Netflix,Entertainment,2023-06-17 22:26:00,Adventure|Alvida|Anime|Buggy|Eiichiro Oda|Emil...,4225201,159937,22364,https://i.ytimg.com/vi/lNMSqxQtO0w/default.jpg


In [8]:
# get popular channels
channels = data.groupby('channel_title').agg(video_count=('title', 'count'), category=('category', lambda x: x.mode()[0]), total_views=('view_count', 'sum'))
channels.sort_values('total_views', ascending=False, inplace=True)

print(f'{len(channels)} total channels')
display(Markdown('### Top 10 Most Popular Channels'))
display(channels.head(10))

7567 total channels


### Top 10 Most Popular Channels

Unnamed: 0_level_0,video_count,category,total_views
channel_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MrBeast,64,Entertainment,1258200526
NBA,365,Sports,716335767
BLACKPINK,57,Music,689071265
HYBE LABELS,58,Music,617682002
SMTOWN,71,Music,588656748
NFL,329,Sports,529529265
BANGTANTV,72,Music,483997142
MrBeast Gaming,76,Gaming,464474880
JYP Entertainment,71,Music,456012992
SSSniperWolf,117,Entertainment,441121235


In [9]:
# get popular categories
categories = data.groupby('category').agg(top_channel=('channel_title', lambda x: x.mode()[0]), video_count=('title', 'count'), total_views=('view_count', 'sum'))
categories.sort_values('total_views', ascending=False, inplace=True)

display(Markdown('### Top 10 Most Popular Categories'))
display(categories.head(10))

### Top 10 Most Popular Categories

Unnamed: 0_level_0,top_channel,video_count,total_views
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Entertainment,SSSniperWolf,7410,10763071304
Music,BANGTANTV,5970,10358472199
Gaming,SSundee,7528,7559681841
Sports,NBA,4773,5363648250
People & Blogs,Ryland vlogs,3329,3225807284
Film & Animation,The Film Theorists,1438,1913792321
Comedy,The Try Guys,1892,1737728245
Science & Technology,SpaceX,1142,1618817851
News & Politics,Vox,1400,1339840831
Education,Veritasium,881,819295810


In [None]:
# NLP setup
wnl = WordNetLemmatizer()
punct = set(string.punctuation)
to_remove = set(stopwords.words('english')) | punct | set(['–', '—', '...'])

token_freq = pd.DataFrame(columns=['token', 'frequency'])
token_freq.set_index('token')

In [11]:
# return list of tokens, and track number of times each token appears
def analyze_tokens(title):
    tokens = wordpunct_tokenize(title.lower().replace('"', ''))
    cleaned = set()
    
    for t in tokens:
        # skip unwanted tokens
        if t in to_remove:
            continue
        
        # convert word to base form
        t = wnl.lemmatize(t)
        
        # track number of times each token appears
        if t in token_freq.token:
            token_freq.loc[t, 'frequency'] += 1
        else:
            token_freq.loc[t] = [t, 1]
            
        # add to output list of tokens
        cleaned.add(t)
    # end loop
        
    return cleaned

In [12]:
# tokenize title and tags
x = data.title + ' ' + data.tags
transformer = CountVectorizer(analyzer=analyze_tokens).fit(x)
x = transformer.transform(x)

In [19]:
# train model to detect category
categorize = 'Sports'
y = (data.category == categorize)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)
model = LogisticRegression(random_state=30, solver='sag', max_iter=10000)
model.fit(x_train, y_train)

LogisticRegression(max_iter=10000, random_state=30, solver='sag')

In [21]:
# test accuracy of model
predictions = model.predict(x_test)
print(f'Detects {categorize.lower()} videos with {round(accuracy_score(y_test, predictions) * 100, 2)} % accuracy')

Detects sports videos with 98.22 % accuracy


In [22]:
# sort keywords by frequency
token_freq.sort_values('frequency', ascending=False, inplace=True)

display(Markdown('### Most Common Keywords'))
display(token_freq.head(10))

### Most Common Keywords

Unnamed: 0,token,frequency
minecraft,minecraft,34266
video,video,23786
game,game,21530
new,new,18626
v,v,17498
highlight,highlight,17326
official,official,14242
music,music,13128
none,none,12562
fortnite,fortnite,12314


In [23]:
# get videos with most common keyword
print(f'The most common keyword was {token_freq.token[0]}.')
display(Markdown(f'### Top {token_freq.token[0].capitalize()} Videos'))
display(data[[(token_freq.token[0] in title) for title in data.title]].head())

The most common keyword was minecraft.


### Top Minecraft Videos

Unnamed: 0,title,channel_title,category,published_at,tags,view_count,likes,comment_count,thumbnail_link
83636,realistic lava vs water in minecraft,steveee,Gaming,2021-09-27 07:00:10,minecraft|realistic|physics|water|shaders|mine...,4264951,103451,3920,https://i.ytimg.com/vi/MUikqGprlOg/default.jpg
81799,realistic lava in minecraft,steveee,Gaming,2021-09-18 07:00:30,minecraft|realistic|physics|water|snapshot|mod...,3870630,87029,5503,https://i.ytimg.com/vi/q3xZgkeUM9I/default.jpg
122015,when minecraft removed the inventory... (april...,camman18,Entertainment,2022-04-09 15:00:01,camman18|camman18 minecraft|minecraft|minecraf...,2885109,375613,2075,https://i.ytimg.com/vi/-g5EZMKAcLI/default.jpg
119244,revisiting old minecraft textures,camman18,Entertainment,2022-03-26 15:00:17,camman18|camman18 minecraft|minecraft|minecraf...,2274038,252135,3595,https://i.ytimg.com/vi/iWE53peAQsc/default.jpg
116391,what if minecraft didn't have wood...,camman18,Entertainment,2022-03-12 16:00:21,camman18|camman18 minecraft|minecraft|minecraf...,2224013,258606,2412,https://i.ytimg.com/vi/qtvxN0CNl-k/default.jpg
