## 0.2. Initial Data Processing

This notebook contains the initial processing of the data in `data/raw/dataset.csv`. The steps performed in this notebook were added to the `youtube_trends/dataset.py` code as a processing step.

In [1]:
import re
import emoji
import torch
import isodate
import warnings
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from io import BytesIO
from langdetect import detect
from PIL import Image, ImageStat
from sklearn.decomposition import PCA
from torchvision import models, transforms
from deep_translator import GoogleTranslator
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor
from youtube_trends.config import RAW_DATA_DIR, INTERIM_DATA_DIR

warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[32m2025-05-04 21:09:35.689[0m | [1mINFO    [0m | [36myoutube_trends.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\eddel\OneDrive\Documents\MCD\AAA\youtube_trends\venv\src\youtube-trends[0m


In [2]:
df = pd.read_csv(RAW_DATA_DIR / "dataset.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2857423 entries, 0 to 2857422
Data columns (total 28 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   video_id                         object 
 1   video_published_at               object 
 2   video_trending__date             object 
 3   video_trending_country           object 
 4   channel_id                       object 
 5   video_title                      object 
 6   video_description                object 
 7   video_default_thumbnail          object 
 8   video_category_id                object 
 9   video_tags                       object 
 10  video_duration                   object 
 11  video_dimension                  object 
 12  video_definition                 object 
 13  video_licensed_content           object 
 14  video_view_count                 float64
 15  video_like_count                 float64
 16  video_comment_count              float64
 17  channel_

In [4]:
df = df.drop(['video_id', 'video_trending_country', 'video_description', 'video_dimension', 'video_definition', 'video_licensed_content', 
                  'channel_id',  'channel_title', 'channel_published_at', 'channel_description', 'channel_country', 'channel_video_count',
                  'channel_have_hidden_subscribers', 'channel_localized_title', 'channel_localized_description'], axis=1)

In [5]:
df['video_published_at'] = pd.to_datetime(df['video_published_at'], errors='coerce').dt.tz_localize(None)
df['video_trending__date'] = pd.to_datetime(df['video_trending__date'], errors='coerce').dt.tz_localize(None)

In [6]:
df = df.sort_values(by='video_published_at', ascending=False)
start_date = df['video_published_at'].iloc[0] - relativedelta(days=1)
df = df[df['video_published_at'] >= start_date]
df.reset_index(drop=True, inplace=True)

In [7]:
df['published_dayofweek'] = df['video_published_at'].dt.dayofweek
df['published_hour'] = df['video_published_at'].dt.hour
df['days_to_trend'] = (df['video_trending__date'] - df['video_published_at']).dt.days
df = df[df['days_to_trend'] >= 0]
df = df.drop(['video_trending__date'], axis=1) 

In [8]:
df['video_title_length'] = df['video_title'].str.split().str.len()
df['video_tag_count'] = df['video_tags'].str.split('|').str.len()
df['video_tag_count'] = df['video_tag_count'].fillna(0)
df = df.drop(['video_tags'], axis=1)
df = df.dropna()

In [9]:
def convert_duration(duration):
    try:
        return isodate.parse_duration(duration).total_seconds()
    except:
        return np.nan

In [10]:
durations = df['video_duration'].fillna('').astype(str).tolist()
with ThreadPoolExecutor() as executor: duration_secs = list(tqdm(executor.map(convert_duration, durations), total=len(durations), desc="Converting durations"))
df['video_duration'] = duration_secs

Converting durations: 100%|██████████| 398/398 [00:00<00:00, 199204.41it/s]


In [11]:
def clean_title(title):
    title = emoji.replace_emoji(title, replace='')
    title = re.sub(r'[^\w\s]', '', title)
    title = re.sub(r'\s+', ' ', title)     
    return title

In [12]:
def detect_and_translate(title):
    try:
        lang = detect(title)
    except:
        return '', ''
    
    if lang == 'en':
        return 'en', title
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(title)
        return lang, translated
    except:
        return lang, title

In [13]:
def process_titles_parallel(df):
    titles = df['video_title'].fillna('').astype(str).tolist()
    
    with ThreadPoolExecutor() as executor:
        clean_titles = list(tqdm(executor.map(clean_title, titles), total=len(titles), desc="Cleaning titles"))

    languages = []
    translations = []
    
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(detect_and_translate, title) for title in clean_titles]
        for future in tqdm(futures, desc="Processing video title"):
            lang, translated = future.result()
            languages.append(lang)
            translations.append(translated)
    
    df['video_title_language'] = languages
    df['video_title_translated'] = translations
    df = df.drop(['video_title'], axis=1)
    return df

In [14]:
df = process_titles_parallel(df)

Cleaning titles: 100%|██████████| 398/398 [00:00<00:00, 398028.85it/s]
Processing video title: 100%|██████████| 398/398 [00:04<00:00, 79.82it/s] 


In [15]:
df['video_category_id'] = df['video_category_id'].str.replace(' ', '_')
df = pd.get_dummies(df, columns=['video_category_id'])
dummy_cols = [col for col in df.columns if col.startswith('video_category_id_')]
df[dummy_cols] = df[dummy_cols].astype(int)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 398 entries, 127 to 526
Data columns (total 28 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   video_published_at                      398 non-null    datetime64[ns]
 1   video_default_thumbnail                 398 non-null    object        
 2   video_duration                          398 non-null    float64       
 3   video_view_count                        398 non-null    float64       
 4   video_like_count                        398 non-null    float64       
 5   video_comment_count                     398 non-null    float64       
 6   channel_custom_url                      398 non-null    object        
 7   channel_view_count                      398 non-null    float64       
 8   channel_subscriber_count                398 non-null    float64       
 9   published_dayofweek                     398 non-null    i

In [17]:
def detect_thumbnail(thumbnail_url, idx, class_names, model, pbar):
    results = model(thumbnail_url)
    class_ids = results.xyxy[0][:, 5].int().tolist()
    detections = np.zeros(len(class_names), dtype=int)
    for cls_id in set(class_ids):
        detections[int(cls_id)] = 1
    pbar.update(1)
    return idx, detections

In [18]:
def detect_thumbnail(thumbnail_url, idx, class_names, model, pbar, img_size=640):
    try:
        response = requests.get(thumbnail_url, timeout=5)
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img_resized = img.resize((img_size, img_size))
        results = model(img_resized, size=img_size)
        detected_classes = [class_names[int(cls)] for cls in results.pred[0][:, -1].cpu().numpy()]
        detections = [1 if name in detected_classes else 0 for name in class_names]
        return idx, detections
    except Exception as e:
        print(f"Error en {thumbnail_url}: {e}")
        return idx, [0] * len(class_names)
    finally:
        pbar.update(1)

In [19]:
def  thumbnail_parallel_detect(df):
    thumbnail_urls = df['video_default_thumbnail'].values
    model = torch.hub.load('ultralytics/yolov5', 'yolov5n', verbose=False).to(device)
    class_names = ['thumbnail_' + name.replace(' ', '_') for name in model.names.values()]
    detections_array = np.zeros((len(thumbnail_urls), len(class_names)), dtype=int)
    
    with tqdm(total=len(thumbnail_urls), desc="Processing thumbnails class") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(detect_thumbnail, thumbnail_url, idx, class_names, model, pbar)
                for idx, thumbnail_url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, detections = future.result()
                detections_array[idx] = detections
    
    detections_df = pd.DataFrame(detections_array, columns=class_names)
    detections_df['video_default_thumbnail'] = df['video_default_thumbnail'].values
    detection_means = (detections_df == detections_df.iloc[0]).mean()
    detection_to_drop = detection_means[(detection_means > 0.95) | (detection_means < 0.05)].index
    detections_df = detections_df.drop(columns=detection_to_drop)
    df = pd.concat([df, detections_df.iloc[:, :-1]], axis=1)
    df = df.dropna()

    return df

In [20]:
df = thumbnail_parallel_detect(df)

YOLOv5  2025-4-28 Python-3.12.9 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 
Processing thumbnails class: 100%|██████████| 398/398 [00:05<00:00, 69.42it/s]


In [21]:
def thumbnail_stats(thumbnail_url, idx, pbar):
    response = requests.get(thumbnail_url, timeout=10)
    img = Image.open(BytesIO(response.content)).convert('RGB')
    stat = ImageStat.Stat(img)

    brightness = sum(stat.mean) / 3
    contrast = sum(stat.stddev) / 3
    hsv = np.array(img.convert('HSV'))
    saturation = hsv[:, :, 1].mean() / 255

    pbar.update(1)
    return idx, [brightness, contrast, saturation]

In [22]:
def thumbnails_stats_parallel(df):
    thumbnail_urls = df['video_default_thumbnail'].values
    stats_array = np.zeros((len(thumbnail_urls), 3), dtype=float)

    with tqdm(total=len(thumbnail_urls), desc="Processing thumbnails stats") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(thumbnail_stats, thumbnail_url, idx, pbar) 
                for idx, thumbnail_url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, stats = future.result()
                stats_array[idx] = stats

    stats_df = pd.DataFrame(stats_array, columns=['thumbnail_brightness', 'thumbnail_contrast', 'thumbnail_saturation'])
    stats_df['video_default_thumbnail'] = thumbnail_urls
    df = pd.concat([df, stats_df.iloc[:, :-1]], axis=1)
    return df

In [23]:
df = thumbnails_stats_parallel(df) 

Processing thumbnails stats: 100%|██████████| 271/271 [00:02<00:00, 90.36it/s] 


In [24]:
def embedding_thumbnail(thumbnail_url, idx, transform, model, pbar):
    try:
        response = requests.get(thumbnail_url, timeout=5)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img = transform(img).unsqueeze(0).to(device)  
        with torch.no_grad():
            features = model.features(img)
            features = features.mean([2, 3]).squeeze().cpu().numpy() 
    except Exception as e:
        print(f"Error procesando {thumbnail_url}: {e}")
        features = np.full((1280,), np.nan) 
    pbar.update(1)
    return idx, features

In [25]:
def thumbnail_parallel_embeddings(df):
    thumbnail_urls = df['video_default_thumbnail'].values
    
    model = models.mobilenet_v2(pretrained=True)
    model.eval()
    model = model.to(device) 
    
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    n_samples = len(thumbnail_urls)
    embedding_dim = 1280  
    embeddings_array = np.zeros((n_samples, embedding_dim), dtype=np.float32)

    with tqdm(total=n_samples, desc="Extracting thumbnails embeddings") as pbar:
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(embedding_thumbnail, url, idx, transform, model, pbar)
                for idx, url in enumerate(thumbnail_urls)
            ]
            for future in futures:
                idx, embedding = future.result()
                embeddings_array[idx] = embedding

    valid_rows = ~np.isnan(embeddings_array).any(axis=1) 
    embeddings_array = embeddings_array[valid_rows] 

    pca_complete = PCA().fit(embeddings_array)
    cumulative_variance = np.cumsum(pca_complete.explained_variance_ratio_)
    n_components = np.searchsorted(cumulative_variance, 0.70) + 1 
    n_components = min(n_components, 40)
    pca = PCA(n_components=n_components)  
    reduced_embeddings = pca.fit_transform(embeddings_array)

    embed_cols = [f'thumb_emb_{i}' for i in range(reduced_embeddings.shape[1])]  
    embeddings_df = pd.DataFrame(reduced_embeddings, columns=embed_cols)
    df = pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)
    df = df.dropna()

    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [26]:
df = thumbnail_parallel_embeddings(df)
df = df.drop(['video_default_thumbnail'], axis=1)

Extracting thumbnails embeddings:  97%|█████████▋| 387/398 [00:04<00:00, 396.00it/s]

Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Error procesando nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant h

Extracting thumbnails embeddings: 100%|██████████| 398/398 [00:04<00:00, 94.45it/s] 


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 71 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   video_published_at                      144 non-null    datetime64[ns]
 1   video_duration                          144 non-null    float64       
 2   video_view_count                        144 non-null    float64       
 3   video_like_count                        144 non-null    float64       
 4   video_comment_count                     144 non-null    float64       
 5   channel_custom_url                      144 non-null    object        
 6   channel_view_count                      144 non-null    float64       
 7   channel_subscriber_count                144 non-null    float64       
 8   published_dayofweek                     144 non-null    float64       
 9   published_hour                          144 non-null  

In [28]:
df.to_csv(INTERIM_DATA_DIR / 'dataset.csv', index=False)