## 0.2. Initial Data Processing

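This notebook performs the initial processing of the raw data in `data/raw/dataset.csv`: it derives `days_until_trend`, drops unused columns, keeps only the videos published within one day of the most recent publication date, adds thumbnail features (YOLOv5 object flags plus brightness, contrast, and saturation), and cleans, language-tags, and translates the video titles. The result is written to the interim data directory. The same steps have also been added to `youtube_trends/dataset.py` as a processing step.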

In [1]:
import re
import unicodedata
import warnings
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import emoji
import numpy as np
import pandas as pd
import requests
import torch
from dateutil.relativedelta import relativedelta
from deep_translator import GoogleTranslator
from langdetect import detect
from PIL import Image, ImageStat
from tqdm import tqdm

from youtube_trends.config import RAW_DATA_DIR, INTERIM_DATA_DIR

# Suppress every warning for the rest of the session so cell output stays compact.
warnings.filterwarnings(action='ignore')

# Use the GPU when torch can see one; otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

2025-05-02 06:52:31.054 | INFO     | youtube_trends.config:<module>:11 - PROJ_ROOT path is: C:\Users\eddel\OneDrive\Documents\MCD\AAA\youtube_trends\venv\src\youtube-trends


In [2]:
# Load the raw export; RAW_DATA_DIR is provided by youtube_trends.config.
df = pd.read_csv(RAW_DATA_DIR / "dataset.csv")

In [3]:
# Show the column names and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2857423 entries, 0 to 2857422
Data columns (total 28 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   video_id                         object 
 1   video_published_at               object 
 2   video_trending__date             object 
 3   video_trending_country           object 
 4   channel_id                       object 
 5   video_title                      object 
 6   video_description                object 
 7   video_default_thumbnail          object 
 8   video_category_id                object 
 9   video_tags                       object 
 10  video_duration                   object 
 11  video_dimension                  object 
 12  video_definition                 object 
 13  video_licensed_content           object 
 14  video_view_count                 float64
 15  video_like_count                 float64
 16  video_comment_count              float64
 17  channel_

In [4]:
# Parse both timestamp columns the same way: values that cannot be parsed
# become NaT (errors='coerce') and timezone information is removed, so the
# two columns can be subtracted from each other.
for col in ('video_published_at', 'video_trending__date'):
    df[col] = pd.to_datetime(df[col], errors='coerce').dt.tz_localize(None)

# Whole days elapsed between publication and the trending date.
df['days_until_trend'] = (df['video_trending__date'] - df['video_published_at']).dt.days

In [5]:
# Columns that the remaining cells of this notebook never read.
unused_columns = [
    'video_id',
    'video_trending__date',
    'video_trending_country',
    'video_description',
    'video_tags',
    'video_dimension',
    'video_definition',
    'video_licensed_content',
    'channel_id',
    'channel_title',
    'channel_description',
    'channel_published_at',
    'channel_country',
    'channel_have_hidden_subscribers',
    'channel_video_count',
    'channel_localized_title',
    'channel_localized_description',
]
df = df.drop(columns=unused_columns)

In [6]:
# Newest publications first, so row 0 carries the most recent timestamp.
df = df.sort_values(by='video_published_at', ascending=False)

# Keep only videos published within one day of that most recent timestamp.
start_date = df['video_published_at'].iloc[0] - relativedelta(days=1)

# Apply the window, discard rows with any missing value, and renumber rows from 0.
df = (
    df[df['video_published_at'] >= start_date]
    .dropna()
    .reset_index(drop=True)
)

In [7]:
# Number of rows left after the date-window filter and dropna().
print(len(df))

523


In [8]:
def detect_thumbnail(thumbnail_url, idx, class_names, model, pbar):
    """Flag which object classes the detector reports for one thumbnail.

    Args:
        thumbnail_url: Image reference passed unchanged to ``model``.
        idx: Caller-side row position; returned untouched so the result can
            be matched back to its input.
        class_names: Sequence whose length sets the size of the flag vector.
        model: Callable detector. Its result must expose ``xyxy[0]`` with the
            class id stored in column 5.
        pbar: Progress bar; advanced by one once the detection has finished.

    Returns:
        ``(idx, flags)`` where ``flags`` is an int array holding 1 at every
        detected class id and 0 everywhere else.
    """
    predictions = model(thumbnail_url)
    found_ids = predictions.xyxy[0][:, 5].int().tolist()

    flags = np.zeros(len(class_names), dtype=int)
    # De-duplicate first: the vector records presence, not counts.
    flags[np.array(sorted(set(found_ids)), dtype=int)] = 1

    pbar.update(1)
    return idx, flags

In [9]:
def thumbnail_parallel_processing(df):
    """Append one 0/1 column per object class detected in the thumbnails.

    Loads the ``yolov5n`` model through ``torch.hub``, runs
    ``detect_thumbnail`` on every URL in ``df['video_default_thumbnail']``
    from a thread pool, and concatenates the resulting indicator columns
    (named ``thumbnail_<class>``) to ``df``. Classes whose indicator is the
    same for every row carry no information and are left out.

    Args:
        df: Frame with a ``video_default_thumbnail`` column.

    Returns:
        A new frame: ``df`` followed by the varying ``thumbnail_*`` columns,
        on the same index as ``df``.
    """
    thumbnail_urls = df['video_default_thumbnail'].values

    # NOTE(review): a single model instance is shared by all worker threads;
    # confirm that concurrent inference is safe for this model.
    model = torch.hub.load('ultralytics/yolov5', 'yolov5n', verbose=False).to(device)
    class_names = ['thumbnail_' + name.replace(' ', '_') for name in model.names.values()]
    detections_array = np.zeros((len(thumbnail_urls), len(class_names)), dtype=int)

    with tqdm(total=len(thumbnail_urls), desc="Processing thumbnails") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(detect_thumbnail, thumbnail_url, idx, class_names, model, pbar)
                for idx, thumbnail_url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, detections = future.result()
                detections_array[idx] = detections

    # Build the indicator frame on df's own index: concat(axis=1) aligns on
    # index labels, so a fresh 0..n-1 index would misalign (and NaN-pad) any
    # df that has not been reset.
    detections_df = pd.DataFrame(detections_array, columns=class_names, index=df.index)

    # Keep only classes whose flag takes more than one value. Filtering the
    # detection columns alone (no helper URL column) means the result no
    # longer depends on whether the URLs happen to differ, and nunique()
    # also copes with an empty frame.
    varying = detections_df.nunique() > 1
    detections_df = detections_df.loc[:, varying]

    return pd.concat([df, detections_df], axis=1)

In [10]:
# Append the YOLOv5 object-presence columns, then inspect the widened schema.
df = thumbnail_parallel_processing(df)
df.info()

YOLOv5  2025-4-28 Python-3.12.9 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 
Processing thumbnails: 100%|██████████| 523/523 [00:07<00:00, 71.05it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   video_published_at        523 non-null    datetime64[ns]
 1   video_title               523 non-null    object        
 2   video_default_thumbnail   523 non-null    object        
 3   video_category_id         523 non-null    object        
 4   video_duration            523 non-null    object        
 5   video_view_count          523 non-null    float64       
 6   video_like_count          523 non-null    float64       
 7   video_comment_count       523 non-null    float64       
 8   channel_custom_url        523 non-null    object        
 9   channel_view_count        523 non-null    float64       
 10  channel_subscriber_count  523 non-null    float64       
 11  days_until_trend          523 non-null    float64       
 12  thumbnail_person      




In [11]:
def thumbnail_stats(thumbnail_url, idx, pbar):
    """Download one thumbnail and summarise its brightness, contrast and saturation.

    Args:
        thumbnail_url: URL fetched with ``requests.get`` (10 second timeout).
        idx: Caller-side row position; returned untouched so the result can
            be matched back to its input.
        pbar: Progress bar; advanced by one once the statistics are computed.

    Returns:
        ``(idx, [brightness, contrast, saturation])`` where brightness is the
        average of the per-channel RGB means, contrast is the average of the
        per-channel RGB standard deviations, and saturation is the mean of
        the HSV saturation channel divided by 255.
    """
    payload = requests.get(thumbnail_url, timeout=10).content
    rgb_image = Image.open(BytesIO(payload)).convert('RGB')
    channel_stats = ImageStat.Stat(rgb_image)

    # Channel 1 of the HSV conversion is saturation.
    saturation_plane = np.array(rgb_image.convert('HSV'))[:, :, 1]

    metrics = [
        sum(channel_stats.mean) / 3,
        sum(channel_stats.stddev) / 3,
        saturation_plane.mean() / 255,
    ]

    pbar.update(1)
    return idx, metrics

In [12]:
def thumbnails_stats_parallel(df):
    """Append brightness, contrast and saturation columns for every thumbnail.

    Runs ``thumbnail_stats`` on each URL in ``df['video_default_thumbnail']``
    from a thread pool and concatenates the three resulting float columns
    to ``df``.

    Args:
        df: Frame with a ``video_default_thumbnail`` column.

    Returns:
        A new frame: ``df`` followed by ``brightness``, ``contrast`` and
        ``saturation``, on the same index as ``df``.
    """
    thumbnail_urls = df['video_default_thumbnail'].values
    stats_array = np.zeros((len(thumbnail_urls), 3), dtype=float)

    with tqdm(total=len(thumbnail_urls), desc="Processing thumbnails") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(thumbnail_stats, thumbnail_url, idx, pbar)
                for idx, thumbnail_url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, stats = future.result()
                stats_array[idx] = stats

    # Build the stats frame on df's own index: concat(axis=1) aligns on index
    # labels, so a fresh 0..n-1 index would misalign (and NaN-pad) any df
    # that has not been reset.
    stats_df = pd.DataFrame(
        stats_array,
        columns=['brightness', 'contrast', 'saturation'],
        index=df.index,
    )
    return pd.concat([df, stats_df], axis=1)

In [13]:
# Add the brightness/contrast/saturation columns, then discard the URL column,
# which no later cell reads.
df = thumbnails_stats_parallel(df).drop(columns=['video_default_thumbnail'])
df.info()

Processing thumbnails: 100%|██████████| 523/523 [00:05<00:00, 92.11it/s] 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   video_published_at        523 non-null    datetime64[ns]
 1   video_title               523 non-null    object        
 2   video_category_id         523 non-null    object        
 3   video_duration            523 non-null    object        
 4   video_view_count          523 non-null    float64       
 5   video_like_count          523 non-null    float64       
 6   video_comment_count       523 non-null    float64       
 7   channel_custom_url        523 non-null    object        
 8   channel_view_count        523 non-null    float64       
 9   channel_subscriber_count  523 non-null    float64       
 10  days_until_trend          523 non-null    float64       
 11  thumbnail_person          523 non-null    int64         
 12  thumbnail_car         




In [14]:
def detect_and_translate(title):
    """Detect the language of a title and translate it to English.

    Best-effort: if detection or translation raises, the title is returned
    unchanged and the language is reported as ``'unknown'``.

    Args:
        title: Title text to analyse.

    Returns:
        ``(lang, translated)``: the detected language code and the English
        translation, or ``('unknown', title)`` when either step fails.
    """
    try:
        lang = detect(title)
        # NOTE(review): confirm that GoogleTranslator accepts every code that
        # langdetect can return (e.g. region-suffixed codes); an unsupported
        # code would land in the fallback below.
        translated = GoogleTranslator(source=lang, target='en').translate(title)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit still stop a long-running batch.
        lang = 'unknown'
        translated = title
    return lang, translated

In [15]:
def _drop_unless_combining_mark(match):
    """Return the matched character if it is a combining mark, else ''."""
    ch = match.group()
    # Variation selectors are also category Mn but only steer glyph/emoji
    # presentation, so they are removed like other non-word characters.
    if '\ufe00' <= ch <= '\ufe0f':
        return ''
    if unicodedata.category(ch) in ('Mn', 'Mc'):
        return ch
    return ''


def clean_title(title):
    """Remove emoji and punctuation from a title, keeping its letters intact.

    ``\\w`` does not match Unicode combining marks (categories Mn/Mc), so a
    plain ``[^\\w\\s]`` substitution would delete vowel signs and diacritics
    and corrupt titles written in scripts such as Devanagari, Thai or Arabic.
    Those marks are kept; every other non-word, non-space character is removed.

    Args:
        title: Raw title text.

    Returns:
        The title without emoji, punctuation or symbols.
    """
    title = emoji.replace_emoji(title, replace='')
    title = re.sub(r'[^\w\s]', _drop_unless_combining_mark, title)
    return title

In [16]:
def process_titles_parallel(df):
    """Clean, language-tag and translate every video title.

    Missing titles are treated as empty strings. Each title is passed through
    ``clean_title`` and then ``detect_and_translate`` using thread pools.

    Args:
        df: Frame with a ``video_title`` column. It is modified in place:
            ``video_title_language`` and ``video_title_translated`` are added.

    Returns:
        The same ``df`` object with the two new columns.
    """
    titles = df['video_title'].fillna('').astype(str).tolist()

    with ThreadPoolExecutor() as executor:
        clean_titles = list(tqdm(executor.map(clean_title, titles), total=len(titles), desc="Cleaning titles"))

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(detect_and_translate, clean_titles), total=len(clean_titles), desc="Processing video title"))

    # Split the (language, translation) pairs with comprehensions instead of
    # unpacking zip(*results): the unpacking raises ValueError when the frame
    # has no rows.
    df['video_title_language'] = [language for language, _ in results]
    df['video_title_translated'] = [translation for _, translation in results]
    return df

In [17]:
# Add the language and translation columns, then discard the raw title,
# which is superseded by its translated form.
df = process_titles_parallel(df).drop(columns=['video_title'])
df.info()

Cleaning titles: 100%|██████████| 523/523 [00:00<00:00, 174359.83it/s]
Processing video title: 100%|██████████| 523/523 [00:05<00:00, 88.40it/s] 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523 entries, 0 to 522
Data columns (total 43 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   video_published_at        523 non-null    datetime64[ns]
 1   video_category_id         523 non-null    object        
 2   video_duration            523 non-null    object        
 3   video_view_count          523 non-null    float64       
 4   video_like_count          523 non-null    float64       
 5   video_comment_count       523 non-null    float64       
 6   channel_custom_url        523 non-null    object        
 7   channel_view_count        523 non-null    float64       
 8   channel_subscriber_count  523 non-null    float64       
 9   days_until_trend          523 non-null    float64       
 10  thumbnail_person          523 non-null    int64         
 11  thumbnail_car             523 non-null    int64         
 12  thumbnail_motorcycle  




In [18]:
# Write the processed frame to the interim data directory, without the index column.
df.to_csv(INTERIM_DATA_DIR / 'dataset.csv', index=False)