## 0.2. Initial Data Processing

This notebook contains the initial processing of the data in `data/raw/dataset.csv`. The steps performed in this notebook were added to the `youtube_trends/dataset.py` code as a processing step.

In [1]:
import re
import emoji
import torch
import isodate
import warnings
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from io import BytesIO
from langdetect import detect
from PIL import Image, ImageStat
from sklearn.decomposition import PCA
from torchvision import models, transforms
from deep_translator import GoogleTranslator
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from youtube_trends.config import RAW_DATA_DIR, INTERIM_DATA_DIR

warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[32m2025-05-04 07:49:09.996[0m | [1mINFO    [0m | [36myoutube_trends.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\eddel\OneDrive\Documents\MCD\AAA\youtube_trends\venv\src\youtube-trends[0m


In [2]:
df = pd.read_csv(RAW_DATA_DIR / "dataset.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2857423 entries, 0 to 2857422
Data columns (total 28 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   video_id                         object 
 1   video_published_at               object 
 2   video_trending__date             object 
 3   video_trending_country           object 
 4   channel_id                       object 
 5   video_title                      object 
 6   video_description                object 
 7   video_default_thumbnail          object 
 8   video_category_id                object 
 9   video_tags                       object 
 10  video_duration                   object 
 11  video_dimension                  object 
 12  video_definition                 object 
 13  video_licensed_content           object 
 14  video_view_count                 float64
 15  video_like_count                 float64
 16  video_comment_count              float64
 17  channel_

In [4]:
df = df.drop(['video_id', 'video_trending_country', 'video_description', 'video_dimension', 'video_definition', 'video_licensed_content', 'channel_id',  'channel_title', 'channel_published_at',
              'channel_description', 'channel_country', 'channel_have_hidden_subscribers',  'channel_video_count', 'channel_localized_title', 'channel_localized_description'], axis=1)

In [5]:
df['video_published_at'] = pd.to_datetime(df['video_published_at'], errors='coerce').dt.tz_localize(None)
df['video_trending__date'] = pd.to_datetime(df['video_trending__date'], errors='coerce').dt.tz_localize(None)

In [6]:
df = df.sort_values(by='video_published_at', ascending=False)
start_date = df['video_published_at'].iloc[0] - relativedelta(days=1)
df = df[df['video_published_at'] >= start_date]
df.reset_index(drop=True, inplace=True)

In [7]:
df['published_dayofweek'] = df['video_published_at'].dt.dayofweek
df['published_hour'] = df['video_published_at'].dt.hour
df['days_to_trend'] = (df['video_trending__date'] - df['video_published_at']).dt.days
df = df.drop(['video_trending__date'], axis=1) 

In [8]:
def convert_duration(duration):
    try:
        return isodate.parse_duration(duration).total_seconds()
    except:
        return np.nan

In [9]:
durations = df['video_duration'].fillna('').astype(str).tolist()
with ThreadPoolExecutor() as executor: duration_secs = list(tqdm(executor.map(convert_duration, durations), total=len(durations), desc="Converting durations"))
df['video_duration'] = duration_secs

Converting durations: 100%|██████████| 527/527 [00:00<00:00, 189603.55it/s]


In [10]:
df['video_title_length'] = df['video_title'].str.split().str.len()
df['video_tag_count'] = df['video_tags'].str.split('|').str.len()
df['video_tag_count'] = df['video_tag_count'].fillna(0)
df = df.drop(['video_tags'], axis=1)

In [11]:
def clean_title(title):
    title = emoji.replace_emoji(title, replace='')
    title = re.sub(r'[^\w\s]', '', title)
    title = re.sub(r'\s+', ' ', title)     
    return title

In [12]:
def detect_and_translate(title):
    try:
        lang = detect(title)
    except:
        return '', ''
    
    if lang == 'en':
        return 'en', title
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(title)
        return lang, translated
    except:
        return lang, title

In [13]:
def process_titles_parallel(df):
    titles = df['video_title'].fillna('').astype(str).tolist()
    
    with ThreadPoolExecutor() as executor:
        clean_titles = list(tqdm(executor.map(clean_title, titles), total=len(titles), desc="Cleaning titles"))

    languages = []
    translations = []
    
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(detect_and_translate, title) for title in clean_titles]
        for future in tqdm(futures, desc="Processing video title"):
            lang, translated = future.result()
            languages.append(lang)
            translations.append(translated)
    
    df['video_title_language'] = languages
    df['video_title_translated'] = translations
    df = df.drop(['video_title'], axis=1)
    return df

In [14]:
df = process_titles_parallel(df)

Cleaning titles: 100%|██████████| 527/527 [00:00<00:00, 263613.38it/s]
Processing video title: 100%|██████████| 527/527 [00:11<00:00, 46.52it/s]


In [15]:
df['video_category_id'] = df['video_category_id'].str.replace(' ', '_')
df = pd.get_dummies(df, columns=['video_category_id'])

In [16]:
def detect_thumbnail(thumbnail_url, idx, class_names, model, pbar):
    results = model(thumbnail_url)
    class_ids = results.xyxy[0][:, 5].int().tolist()
    detections = np.zeros(len(class_names), dtype=int)
    for cls_id in set(class_ids):
        detections[int(cls_id)] = 1
    pbar.update(1)
    return idx, detections

In [17]:
def  thumbnail_parallel_processing(df):
    thumbnail_urls = df['video_default_thumbnail'].values
    model = torch.hub.load('ultralytics/yolov5', 'yolov5n', verbose=False).to(device)
    class_names = ['thumbnail_' + name.replace(' ', '_') for name in model.names.values()]
    detections_array = np.zeros((len(thumbnail_urls), len(class_names)), dtype=int)
    
    with tqdm(total=len(thumbnail_urls), desc="Processing thumbnails") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(detect_thumbnail, thumbnail_url, idx, class_names, model, pbar)
                for idx, thumbnail_url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, detections = future.result()
                detections_array[idx] = detections
    
    detections_df = pd.DataFrame(detections_array, columns=class_names)
    detections_df['video_default_thumbnail'] = df['video_default_thumbnail'].values
    detection_means = (detections_df == detections_df.iloc[0]).mean()
    detection_to_drop = detection_means[(detection_means > 0.9) | (detection_means < 0.1)].index
    detections_df = detections_df.drop(columns=detection_to_drop)
    df = pd.concat([df, detections_df.iloc[:, :-1]], axis=1)

    return df

In [18]:
df = thumbnail_parallel_processing(df)

YOLOv5  2025-4-28 Python-3.12.9 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 
Processing thumbnails: 100%|██████████| 527/527 [00:08<00:00, 65.45it/s]


In [19]:
def thumbnail_stats(thumbnail_url, idx, pbar):
    response = requests.get(thumbnail_url, timeout=10)
    img = Image.open(BytesIO(response.content)).convert('RGB')
    stat = ImageStat.Stat(img)

    brightness = sum(stat.mean) / 3
    contrast = sum(stat.stddev) / 3
    hsv = np.array(img.convert('HSV'))
    saturation = hsv[:, :, 1].mean() / 255

    pbar.update(1)
    return idx, [brightness, contrast, saturation]

In [20]:
def thumbnails_stats_parallel(df):
    thumbnail_urls = df['video_default_thumbnail'].values
    stats_array = np.zeros((len(thumbnail_urls), 3), dtype=float)

    with tqdm(total=len(thumbnail_urls), desc="Processing thumbnails") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(thumbnail_stats, thumbnail_url, idx, pbar) 
                for idx, thumbnail_url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, stats = future.result()
                stats_array[idx] = stats

    stats_df = pd.DataFrame(stats_array, columns=['thumbnail_brightness', 'thumbnail_contrast', 'thumbnail_saturation'])
    stats_df['video_default_thumbnail'] = thumbnail_urls
    df = pd.concat([df, stats_df.iloc[:, :-1]], axis=1)
    return df

In [21]:
df = thumbnails_stats_parallel(df) 

Processing thumbnails: 100%|██████████| 527/527 [00:04<00:00, 123.44it/s]


In [22]:
def embedding_thumbnail(thumbnail_url, idx, transform, model, pbar):
    try:
        response = requests.get(thumbnail_url, timeout=5)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img = transform(img).unsqueeze(0).to(device)  
        with torch.no_grad():
            features = model.features(img)
            features = features.mean([2, 3]).squeeze().cpu().numpy() 
    except Exception as e:
        print(f"Error procesando {thumbnail_url}: {e}")
        features = np.full((1280,), np.nan) 
    pbar.update(1)
    return idx, features

In [23]:
def thumbnail_parallel_embeddings(df):
    thumbnail_urls = df['video_default_thumbnail'].values
    
    model = models.mobilenet_v2(pretrained=True)
    model.eval()
    model = model.to(device) 
    
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    n_samples = len(thumbnail_urls)
    embedding_dim = 1280  
    embeddings_array = np.zeros((n_samples, embedding_dim), dtype=np.float32)

    with tqdm(total=n_samples, desc="Extracting embeddings") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(embedding_thumbnail, url, idx, transform, model, pbar)
                for idx, url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, embedding = future.result()
                embeddings_array[idx] = embedding

    pca_complete = PCA().fit(embeddings_array)
    cumulative_variance = np.cumsum(pca_complete.explained_variance_ratio_)
    n_components = np.searchsorted(cumulative_variance, 0.70) + 1 
    n_components = min(n_components, 40)
    pca = PCA(n_components=n_components)  
    reduced_embeddings = pca.fit_transform(embeddings_array)

    embed_cols = [f'thumb_emb_{i}' for i in range(reduced_embeddings.shape[1])]  
    embeddings_df = pd.DataFrame(reduced_embeddings, columns=embed_cols)

    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [24]:
df = thumbnail_parallel_embeddings(df)
df = df.drop(['video_default_thumbnail'], axis=1)

Extracting embeddings: 100%|██████████| 527/527 [00:07<00:00, 72.87it/s]


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 62 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   video_published_at                      527 non-null    datetime64[ns]
 1   video_duration                          527 non-null    float64       
 2   video_view_count                        527 non-null    float64       
 3   video_like_count                        523 non-null    float64       
 4   video_comment_count                     527 non-null    float64       
 5   channel_custom_url                      527 non-null    object        
 6   channel_view_count                      527 non-null    float64       
 7   channel_subscriber_count                527 non-null    float64       
 8   published_dayofweek                     527 non-null    int32         
 9   published_hour                          527 non-null  

In [26]:
df.to_csv(INTERIM_DATA_DIR / 'dataset.csv', index=False)