Este cuadernillo contiene todo el código empleado para realizar la parte de nuestro TFM relativa al preprocesamiento del conjunto de datos «YouTube Video Trending Dataset». Incluye las siguientes secciones:

- Instalación de subprogramas y librerías
- Preprocesamiento de los datos

## Instalación de subprogramas y librerías

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#import seaborn as sns
import isodate #para manejar formato de duración ISO 8601
import re #para manejar expresiones regulares
from textblob import TextBlob
from lingua import Language, LanguageDetectorBuilder
import cv2
import requests
from io import BytesIO
from PIL import Image
from tqdm import tqdm
import pytesseract
from datetime import datetime

# Activar barra de progreso en pandas
tqdm.pandas()

## Preprocesamiento de datos

### Lectura del archivo y filtrado por suscriptores y vídeos únicos

In [2]:
# Path to the original (large) CSV file
ruta = "youtube_trending_videos_global.csv"

# Number of rows per chunk; the file is read in pieces to avoid exhausting memory
chunksize = 100_000

# For every chunk, keep only the rows whose channel has fewer than 20,000 subscribers
filtrados = [
    chunk.loc[chunk["channel_subscriber_count"] < 20000]
    for chunk in pd.read_csv(ruta, chunksize=chunksize)
]

# Join the filtered pieces and keep the first row seen for each video_id
df = pd.concat(filtrados, ignore_index=True).drop_duplicates(
    subset="video_id", keep="first"
)

print(f"Dataset cargado con {len(df)} vídeos únicos de canales pequeños (< 20.000 suscriptores)")

  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):
  for chunk in pd.read_csv(ruta, chunksize=chunksize):


Dataset cargado con 19866 vídeos únicos de canales pequeños (< 20.000 suscriptores)


In [3]:
# Show the column names of the loaded DataFrame as a plain list
print(list(df.columns))

['video_id', 'video_published_at', 'video_trending__date', 'video_trending_country', 'channel_id', 'video_title', 'video_description', 'video_default_thumbnail', 'video_category_id', 'video_tags', 'video_duration', 'video_dimension', 'video_definition', 'video_licensed_content', 'video_view_count', 'video_like_count', 'video_comment_count', 'channel_title', 'channel_description', 'channel_custom_url', 'channel_published_at', 'channel_country', 'channel_view_count', 'channel_subscriber_count', 'channel_have_hidden_subscribers', 'channel_video_count', 'channel_localized_title', 'channel_localized_description']


### Creación de columnas auxiliares

#### Funciones auxiliares

In [4]:
def duration_to_seconds(d):
    """Convert an ISO 8601 duration string to a number of seconds.

    Args:
        d: Duration value handed to ``isodate.parse_duration``.

    Returns:
        The duration in seconds, or ``np.nan`` when ``d`` cannot be parsed
        (for example a missing value or a malformed string).
    """
    try:
        return isodate.parse_duration(d).total_seconds()
    except Exception:
        # Best-effort conversion: any parsing problem becomes NaN. Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working
        # while this runs inside a long progress_apply.
        return np.nan

def uppercase_ratio(text):
    """Return the share of characters in ``text`` that are uppercase.

    The argument is coerced with ``str()`` first. An empty string yields ``0``;
    otherwise the result is ``uppercase characters / total characters``.
    """
    chars = str(text)
    total = len(chars)
    if total == 0:
        return 0
    n_upper = len([ch for ch in chars if ch.isupper()])
    return n_upper / total

def has_links(text):
    """Tell whether ``text`` contains any of the link markers checked below.

    The value is coerced with ``str()`` and lower-cased, so matching is
    case-insensitive.
    """
    lowered = str(text).lower()
    for marker in ("http", "www", "bit.ly", "youtu.be"):
        if marker in lowered:
            return True
    return False

def convertir_a_hq(url):
    """Rewrite a ``/default.jpg`` thumbnail URL to its ``/hqdefault.jpg`` form.

    Values that are not strings, or strings that do not contain
    ``/default.jpg``, are returned unchanged.
    """
    if not isinstance(url, str):
        return url
    if "/default.jpg" not in url:
        return url
    return url.replace("/default.jpg", "/hqdefault.jpg")

def download_image(url):
    """Download an image and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.

    Returns:
        The decoded image converted to RGB, or ``None`` when the request
        fails, the server does not answer with status 200, or the payload
        cannot be opened as an image.
    """
    try:
        response = requests.get(url, timeout=5)
        # Same acceptance rule as detectar_caras_en_url: only a 200 response
        # is treated as a real thumbnail, so every thumbnail feature is
        # computed (or marked as failed) on the same set of images.
        if response.status_code != 200:
            return None
        return Image.open(BytesIO(response.content)).convert("RGB")
    except Exception:
        # Best-effort download: network and decoding errors map to None.
        # Exception (not a bare except) lets Ctrl-C stop a long run.
        return None

def preprocess_for_ocr(img_pil):
    """Binarise an image before it is handed to OCR.

    The image is converted to grayscale and thresholded with the
    ``THRESH_BINARY + THRESH_OTSU`` flags.

    Args:
        img_pil: PIL image to prepare.

    Returns:
        A new binarised PIL image, or ``img_pil`` itself, unchanged, when any
        step fails (this includes ``img_pil`` being ``None``).
    """
    try:
        img_gray = img_pil.convert("L")
        img_np = np.array(img_gray)
        # NOTE: with THRESH_OTSU OpenCV derives the threshold from the image
        # histogram, so the 120 passed here is not the effective cut-off.
        _, img_thresh = cv2.threshold(img_np, 120, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return Image.fromarray(img_thresh)
    except Exception:
        # Fall back to the untouched input; Exception (not a bare except)
        # keeps KeyboardInterrupt/SystemExit from being swallowed.
        return img_pil

def count_text(img_pil):
    """Count the whitespace-separated words that OCR finds in an image.

    Args:
        img_pil: PIL image, or ``None`` when the image could not be obtained.

    Returns:
        Number of words in the recognised text, or ``-1`` when there is no
        image or OCR fails.
    """
    # A failed download arrives here as None; report it explicitly instead of
    # relying on an exception being raised further down.
    if img_pil is None:
        return -1
    try:
        img_preprocessed = preprocess_for_ocr(img_pil)
        text = pytesseract.image_to_string(img_preprocessed)
        # split() with no argument already ignores leading/trailing whitespace
        return len(text.split())
    except Exception:
        # Exception (not a bare except) so Ctrl-C can stop a long run
        return -1

def calculate_colorfulness(img_pil):
    """Compute an opponent-colour colorfulness score for an RGB image.

    The score combines the spread and the mean of two opponent channels,
    ``|R - G|`` and ``|0.5 * (R + G) - B|``:
    ``sqrt(std_rg**2 + std_yb**2) + 0.3 * sqrt(mean_rg**2 + mean_yb**2)``.

    Args:
        img_pil: Image in RGB channel order (a PIL image as returned by
            ``download_image``, or anything ``np.array`` turns into an
            ``(height, width, 3)`` array). ``None`` is accepted.

    Returns:
        The colorfulness score as a float, or ``-1`` when there is no image
        or it does not have exactly three channels.
    """
    if img_pil is None:
        return -1
    try:
        pixels = np.array(img_pil).astype("float")
        if pixels.ndim != 3 or pixels.shape[-1] != 3:
            return -1
        # The input is RGB, so channel 0 is red and channel 2 is blue.
        # Unpacking the channels as (B, G, R) would swap red and blue in the
        # formula below.
        red, green, blue = pixels[..., 0], pixels[..., 1], pixels[..., 2]
        rg = np.absolute(red - green)
        yb = np.absolute(0.5 * (red + green) - blue)
        std_rg, std_yb = np.std(rg), np.std(yb)
        mean_rg, mean_yb = np.mean(rg), np.mean(yb)
        return np.sqrt(std_rg**2 + std_yb**2) + (0.3 * np.sqrt(mean_rg**2 + mean_yb**2))
    except Exception:
        # Best-effort feature: failures are reported with the -1 sentinel.
        # Exception (not a bare except) keeps Ctrl-C working.
        return -1

_FACE_CASCADE = None


def _get_face_cascade():
    """Return the shared Haar frontal-face classifier, loading it on first use."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(
            cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
        )
    return _FACE_CASCADE


def detectar_caras_en_url(url_imagen):
    """Count the frontal faces detected in the image found at a URL.

    Args:
        url_imagen: Address of the image to analyse.

    Returns:
        Number of faces reported by the Haar cascade detector, or ``-1`` when
        the download fails, the status is not 200, or the payload cannot be
        decoded as an image.
    """
    try:
        # The body is always read in full, so a plain (non-streamed) request
        # is used; the early return below then leaves no unread response.
        resp = requests.get(url_imagen, timeout=5)
        if resp.status_code != 200:
            return -1
        imagen_array = np.frombuffer(resp.content, dtype=np.uint8)
        imagen = cv2.imdecode(imagen_array, cv2.IMREAD_COLOR)
        if imagen is None:
            return -1
        gris = cv2.cvtColor(imagen, cv2.COLOR_BGR2GRAY)
        # The classifier is loaded once and reused; this function is applied
        # to every row of the dataset.
        caras = _get_face_cascade().detectMultiScale(gris, scaleFactor=1.1, minNeighbors=5)
        return len(caras)
    except Exception:
        # Best-effort feature: failures are reported with the -1 sentinel.
        # Exception (not a bare except) keeps Ctrl-C working.
        return -1

#### Creación de las columnas

In [5]:
# TRANSFORMATIONS AND DERIVED FEATURES


def _count_tags(tags):
    """Return the number of comma-separated tags, or 0 for a missing/blank value."""
    if pd.isnull(tags):
        return 0
    tags = str(tags)
    if not tags.strip():
        return 0
    return len(tags.split(","))


def _thumbnail_image_features(url):
    """Download a thumbnail once and return ``(ocr_word_count, colorfulness)``."""
    img = download_image(url)
    return count_text(img), calculate_colorfulness(img)


# Thumbnail URLs: derive the high-quality variant unless it already exists
if "thumbnail_url_hq" not in df.columns:
    df["thumbnail_url_hq"] = df["video_default_thumbnail"].apply(convertir_a_hq)

# Text
# Missing titles/descriptions become empty strings. Without fillna, astype(str)
# would turn NaN into the literal "nan" and give it length 3 and one word.
titles = df["video_title"].fillna("").astype(str)
descriptions = df["video_description"].fillna("").astype(str)

df["title_length"] = titles.progress_apply(len)
df["title_word_count"] = titles.progress_apply(lambda x: len(x.split()))
df["title_has_exclamation"] = titles.progress_apply(lambda x: "!" in x)
df["title_has_question"] = titles.progress_apply(lambda x: "?" in x)
df["title_uppercase_ratio"] = titles.progress_apply(uppercase_ratio)
df["title_sentiment"] = titles.progress_apply(lambda x: TextBlob(x).sentiment.polarity)
df["description_length"] = descriptions.progress_apply(len)
df["description_sentiment"] = descriptions.progress_apply(lambda x: TextBlob(x).sentiment.polarity)
df["has_external_links"] = descriptions.progress_apply(has_links)
# Count on the raw column: after astype(str) a null check can never be false,
# so videos without tags would be counted as having one tag.
df["tag_count"] = df["video_tags"].progress_apply(_count_tags)

# Time (hour and weekday are read from the parsed timestamp as-is)
df["published_at"] = pd.to_datetime(df["video_published_at"], errors="coerce")
df["hour_of_day"] = df["published_at"].dt.hour
df["day_of_week"] = df["published_at"].dt.weekday
df["is_weekend"] = df["day_of_week"].isin([5, 6])
df["is_peak_hour"] = df["hour_of_day"].between(15, 17)

# Duration
df["video_duration_sec"] = df["video_duration"].progress_apply(duration_to_seconds)

# Thumbnail
# Text count and colorfulness share a single download per thumbnail, so both
# are computed from the same image and the request is not repeated.
thumb_features = df["thumbnail_url_hq"].progress_apply(_thumbnail_image_features)
thumb_df = pd.DataFrame(
    thumb_features.tolist(),
    index=df.index,
    columns=["thumbnail_text_count", "thumbnail_colorfulness"],
)
df["thumbnail_text_count"] = thumb_df["thumbnail_text_count"]
df["thumbnail_colorfulness"] = thumb_df["thumbnail_colorfulness"]
df["thumbnail_faces_count"] = df["thumbnail_url_hq"].progress_apply(detectar_caras_en_url)

# Metrics (the +1 in each denominator avoids division by zero)
df["views_per_second"] = df["video_view_count"] / (df["video_duration_sec"] + 1)
df["likes_per_view"] = df["video_like_count"] / (df["video_view_count"] + 1)
df["likes_per_sub"] = df["video_like_count"] / (df["channel_subscriber_count"] + 1)
df["views_per_sub"] = df["video_view_count"] / (df["channel_subscriber_count"] + 1)
df["viral_score"] = df["views_per_sub"] + df["likes_per_view"] + df["likes_per_sub"]
# A video is labelled viral when it has more than 1.5 views per subscriber
df["is_viral"] = (df["views_per_sub"] > 1.5).astype(int)

100%|███████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 1496534.42it/s]
100%|████████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 737159.11it/s]
100%|███████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 1257797.35it/s]
100%|███████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 1316668.40it/s]
100%|████████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 316604.77it/s]
100%|████████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 689391.92it/s]
100%|████████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 112876.15it/s]
100%|████████████████████████████████████████████████████████████████████████| 19866/19866 [00:00<00:00, 545460.78it/s]
100%|███████████████████████████████████

#### Exportación del dataset plus

In [None]:
# Save the DataFrame as it stands at this point, without the index column
output_csv = "videos_virales_final_plus.csv"
df.to_csv(path_or_buf=output_csv, index=False)
print(f"Dataset guardado como '{output_csv}'")

#### Reordenación de columnas y exportación

In [3]:
# Final column selection and order for the exported dataset
ordered_cols = [
"video_id", "video_published_at", "channel_id", "video_title", "video_description", "video_default_thumbnail",
"video_category_id", "video_tags", "video_duration", "video_definition", "video_view_count", "video_like_count",
"video_comment_count", "channel_title", "channel_published_at", "channel_subscriber_count", "channel_video_count",
"thumbnail_url_hq", "thumbnail_faces_count", "title_length", "title_word_count", "title_has_exclamation",
"title_has_question", "title_has_keywords", "title_uppercase_ratio", "title_sentiment", "description_length", 
"description_sentiment", "has_external_links", "tag_count", "video_duration_sec", "published_at", "hour_of_day", 
"day_of_week", "is_weekend", "is_peak_hour","thumbnail_text_count", "thumbnail_colorfulness", "is_viral",
"viral_score", "views_per_second", "likes_per_view", "likes_per_sub", "views_per_sub"
]

# Requested columns that are absent are still skipped, but no longer silently:
# e.g. "title_has_keywords" is listed above yet is not created by the
# feature-engineering cell visible in this notebook.
missing_cols = [col for col in ordered_cols if col not in df.columns]
if missing_cols:
    print(f"Aviso: columnas no encontradas y omitidas: {missing_cols}")

# Keep only the listed columns that exist, in the listed order
df = df[[col for col in ordered_cols if col in df.columns]]

# Export
output_csv = "videos_virales_final.csv"
df.to_csv(output_csv, index=False)
print(f"Dataset guardado como '{output_csv}'")

Dataset guardado como 'videos_virales_final.csv'
