In [1]:
# 1. Scrape data form YT videos
# 2. Scrape stats from channels
# 3. Download thumbnails
# 4. Download videos
# 5. Run a feature extractor over thumbnails
# 6. Run a feature extractor over videos (video and audio)
# 7. Create a preprocessors and dataloader for all data
# 8. Create the regression model
# 9. Train the model
# 10. Test the model

In [2]:
import pandas as pd

In [98]:
video_data_file = 'data/yt_video_data.csv'
channel_data_file = 'data/yt_channel_data.csv'

video_df = pd.read_csv(video_data_file, index_col=0)
channel_df = pd.read_csv(channel_data_file, index_col=0)

In [4]:
print(video_df.columns.tolist())
print(channel_df.columns.tolist())

['thumbnail_link', 'view_count', 'date', 'video_title', 'video_description', 'subscriber_count', 'likes', 'scrape_date', 'channel_name', 'channel_link', 'video_url']
['channel_name', 'channel_url', 'title', 'upload_date', 'view_count', 'scrape_date']


In [5]:
len(video_df['channel_name'].unique())

8226

In [57]:
"""
The program makes the necessary imports, then creates an image feature extractor.
The model used for feature extraction is a pretrained resnet50 model.
Torchfx is used to make the output of the model the 2nd and 4th layer outputs of the resnet model.
"""

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import tqdm

from models import ImageFeatureExtractor

DEVICE = 'cuda:0'


In [43]:
"""
Creates a dataset of all the images in the "thumbnails" folder.
The ImageFolder class is not used because there are not classes,
rather, the folder directly contains all the images.
"""
import os
from PIL import Image

# # Source: https://discuss.pytorch.org/t/how-to-resize-and-pad-in-a-torchvision-transforms-compose/71850
# class SquarePad:
#     def __call__(self, image):
#         max_wh = max(image.size)
#         p_left, p_top = [(max_wh - s) // 2 for s in image.size]
#         p_right, p_bottom = [max_wh - (s+pad) for s, pad in zip(image.size, [p_left, p_top])]
#         padding = (p_left, p_top, p_right, p_bottom)
#         return F.pad(image, padding, 0, 'constant')

# TODO: Try redoing this with cropping or square padding
img_transform = transforms.Compose([
        # transforms.Resize(256),
        # transforms.CenterCrop(224),
        # SquarePad(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225])
    ])

class ImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.img_names = os.listdir(self.root_dir)

    def __len__(self):
        return len(self.img_names)

    def __getitem__(self, idx):
        img_name = self.img_names[idx]
        img_path = os.path.join(self.root_dir, img_name)
        img = Image.open(img_path)
        if self.transform:
            img = self.transform(img)
        return img_name, img

In [44]:
tds = ImageDataset(root_dir='thumbnails', transform=img_transform)
# Create a dataloader for the images
tdl = DataLoader(tds, batch_size=2, shuffle=False)
a = next(tdl.__iter__())

In [45]:
a[1].shape

torch.Size([2, 3, 224, 224])

In [46]:
fe = ImageFeatureExtractor()
fe(a[1]).shape

torch.Size([2, 2048])

In [68]:
# Add thumbnail embeddings to the video dataframe
thumbnail_dataset = ImageDataset(root_dir='thumbnails', transform=img_transform)
thumbnail_loader = DataLoader(tds, batch_size=64, shuffle=False)

feature_extractor = ImageFeatureExtractor().to(DEVICE)

thumbnail_features = {}
print('Generating thumbnail features...')
for img_names, imgs in tqdm.tqdm(thumbnail_loader):
    imgs = imgs.to(DEVICE)
    all_features = feature_extractor(imgs)
    all_features = all_features.cpu().detach().numpy()
    for img_name, img_features in zip(img_names, all_features):
        img_idx = int(img_name.split('.')[0])
        thumbnail_features[img_idx] = img_features

Generating thumbnail features...


100%|██████████| 280/280 [01:08<00:00,  4.06it/s]


In [69]:
video_df['thumbnail_embedding'] = video_df.apply(lambda x: thumbnail_features.get(x.name), axis=1)

In [99]:
print(video_df.columns)
print(channel_df.columns)

Index(['thumbnail_link', 'view_count', 'date', 'video_title',
       'video_description', 'subscriber_count', 'likes', 'scrape_date',
       'channel_name', 'channel_link', 'video_url'],
      dtype='object')
Index(['channel_name', 'channel_link', 'title', 'upload_date', 'view_count',
       'scrape_date'],
      dtype='object')


In [102]:
video_df.shape

(26005, 11)

In [None]:
# Data prep steps:
# 1. Generate features for the video df thumbnails
# 2. Change the channel df column names
# 3. Merge the video and channel dfs
# 4. Drop NA rows and then drop rows with duplicate video urls
# 5. Save the new dataframe

In [None]:
video_df

In [None]:
final_df = 

In [105]:
pd.merge(video_df, channel_df, how='left', on=['channel_name', 'channel_link'])

Unnamed: 0,thumbnail_link,view_count_x,date,video_title,video_description,subscriber_count,likes,scrape_date_x,channel_name,channel_link,video_url,title,upload_date,view_count_y,scrape_date_y
0,,412161,"Dec 12, 2021",Giants vs. Chargers Week 14 Highlights | NFL 2021,The New York Giants take on the Los Angeles Ch...,8520000,4884.0,"Dec 14, 2021",NFL,https://www.youtube.com/channel/UCDVYQ4Zhbm3S2...,https://www.youtube.com/watch?v=ziXPgr8hiLM,"(""How Rodney McLeod's 'Game Changers' Provides...","('7 hours ago', '12 hours ago', 'Streamed 14 h...","(4300, 47000, 51000, 81000, 9900, 9900, 123000...","Dec 15, 2021"
1,https://i.ytimg.com/vi/LZzH9VqfFEU/hqdefault.j...,412161,"Dec 12, 2021",Giants vs. Chargers Week 14 Highlights | NFL 2021,The New York Giants take on the Los Angeles Ch...,8520000,4884.0,"Dec 14, 2021",NFL,https://www.youtube.com/channel/UCDVYQ4Zhbm3S2...,https://www.youtube.com/watch?v=LZzH9VqfFEU,"(""How Rodney McLeod's 'Game Changers' Provides...","('7 hours ago', '12 hours ago', 'Streamed 14 h...","(4300, 47000, 51000, 81000, 9900, 9900, 123000...","Dec 15, 2021"
2,,14837155,"May 22, 2021",Math Has a Fatal Flaw,Not everything that is true can be proven. Thi...,11100000,542231.0,"Dec 14, 2021",Veritasium,https://www.youtube.com/channel/UCHnyfMqiRRG1u...,https://www.youtube.com/watch?v=HeQX2HjkcNo,"('The Snowflake Mystery', ""Most People Don't K...","('2 weeks ago', '2 weeks ago', '3 weeks ago', ...","(5700000, 10000000, 10000000, 9600000, 5800000...","Dec 15, 2021"
3,,25240078,"Oct 31, 2012","Skydiving into the Blue Hole, Belize",http://yogaFLIGHT.com\n\nThe ultimate skydive ...,46600,258377.0,"Dec 14, 2021",yogaFLIGHT,https://www.youtube.com/channel/UCbmzgkvMl7SHK...,https://www.youtube.com/watch?v=jmZ0fJC5lwQ,"('Toronto: Top 6 of the 6ix!', 'A yogaFLIGHT B...","('3 years ago', '7 years ago', '7 years ago', ...","(3700, 3000, 14000, 1100, 3700, 14000, 16000, ...","Dec 15, 2021"
4,,259512,"Dec 2, 2021",Top 10 Worst Video Games of 2021,These games were undoubtedly the worst to rele...,24100000,5746.0,"Dec 14, 2021",WatchMojo.com,https://www.youtube.com/channel/UCaWd5_7JhbQBe...,https://www.youtube.com/watch?v=EGWBKvB098Q,"('On This Day In 2000 | RetroVideo', 'Top 10 S...","('1 hour ago', '6 hours ago', '9 hours ago', '...","(2300, 18000, 32000, 24000, 40000, 16000, 1900...","Dec 15, 2021"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26000,https://i.ytimg.com/vi/Czg_9C7gw0o/hqdefault.j...,7523088,"Jun 25, 2019",Waking up as a meme-hero | Andras Arato | TEDx...,What would you feel if one day you wake up and...,33400000,405913.0,"Dec 14, 2021",TEDx Talks,https://www.youtube.com/channel/UCsT0YIqwnpJCM...,https://www.youtube.com/watch?v=Czg_9C7gw0o,,,,
26001,https://i.ytimg.com/vi/xI8lLpYtQ6M/hqdefault.j...,3289008,"Jun 22, 2017",Is Social Media Hurting Your Mental Health? | ...,Scrolling through our social media feeds feels...,33400000,93058.0,"Dec 14, 2021",TEDx Talks,https://www.youtube.com/channel/UCsT0YIqwnpJCM...,https://www.youtube.com/watch?v=xI8lLpYtQ6M,,,,
26002,https://i.ytimg.com/vi/KElkR9qz-f4/hqdefault.j...,1529396,"Dec 11, 2021","I Survived 2,600 Days in HARDCORE Minecraft...","2,500 DAYS HARDCORE VIDEO: https://youtu.be/F_...",1180000,5926.0,"Dec 14, 2021",SB737,https://www.youtube.com/channel/UCh_qU1GUf4FcN...,https://www.youtube.com/watch?v=KElkR9qz-f4,,,,
26003,https://i.ytimg.com/vi/QlmlqAW0cUQ/hqdefault.j...,3354281,"Oct 5, 2015",Mind is Everything | Dr. David Hendricks | TED...,NOTE FROM TED: Do not look to this talk for me...,33400000,22285.0,"Dec 14, 2021",TEDx Talks,https://www.youtube.com/channel/UCsT0YIqwnpJCM...,https://www.youtube.com/watch?v=QlmlqAW0cUQ,,,,


In [95]:
len(video_df.dropna().drop_duplicates(subset=['thumbnail_link', 'video_title']))

16058

In [48]:
thumbnail_features

{1: array([0.3192735 , 0.68836987, 0.7462309 , ..., 0.79271585, 0.24264106,
        0.02141257], dtype=float32),
 10: array([0.11920096, 0.26325592, 0.6720083 , ..., 0.758535  , 0.34693724,
        0.06105711], dtype=float32),
 100: array([0.03960769, 0.692111  , 0.26814315, ..., 0.41287103, 1.0050873 ,
        0.11512326], dtype=float32),
 10000: array([0.65285236, 0.78685135, 1.1871752 , ..., 0.3095802 , 0.30660123,
        0.15129216], dtype=float32),
 10001: array([0.76707625, 0.77118087, 0.35248336, ..., 0.32322475, 0.05263793,
        0.50666314], dtype=float32),
 10002: array([0.43951142, 0.21792398, 0.3948624 , ..., 0.23010172, 0.461626  ,
        0.8046133 ], dtype=float32),
 10003: array([0.2955636 , 0.15273872, 0.46796793, ..., 0.18608952, 0.35784912,
        0.2490325 ], dtype=float32),
 10004: array([0.8736051 , 0.08869307, 0.8069382 , ..., 0.560225  , 0.5274395 ,
        0.18770084], dtype=float32),
 10005: array([0.6603811 , 0.13746347, 0.44977596, ..., 0.70550156, 0.027