Run this script in Google Colab to process the data and generate new csv files with the additional columns:

- hue
- saturation
- lightness
- contrast
- sharpness
- num_faces

Make sure that the original CSV files are located in your Google Drive under "Data Literacy Project/" (or change the `prefix` variable to point to the correct path).

This script will take a while to run, so be patient.

The new csv files will be saved in the same directory as the original csv files.

In [None]:
import pandas as pd
from google.colab import drive, userdata

# Make the user's Google Drive available under /content/drive.
drive.mount('/content/drive')

# Directory (with trailing slash) that holds the input CSVs and receives the outputs.
prefix = '/content/drive/MyDrive/Data Literacy Project/'

# One CSV per video category.
file_names = [
    "education.csv",
    "entertainment.csv",
    "comedy.csv",
    "howto_style.csv",
    "people-and-blogs.csv",
    "gaming.csv",
    "sports.csv",
    "news-and-politics.csv",
]

def load_file(file_name):
    """Read one CSV into a DataFrame.

    The path is formed by appending ``file_name`` to the module-level
    ``prefix`` string.
    """
    csv_path = prefix + file_name
    frame = pd.read_csv(csv_path)
    return frame

def save_file(df, file_name, index=False):
    """Write ``df`` as a CSV file under the module-level ``prefix`` directory.

    Args:
        df: DataFrame to write.
        file_name: File name appended to ``prefix`` to form the output path.
        index: Whether to write the DataFrame index as the first column.
            Defaults to False: the frames are loaded without an index column,
            so writing the index would add a spurious unnamed column to the
            processed files.
    """
    df.to_csv(prefix + file_name, index=index)

# Load every category CSV up front; df_arr[i] corresponds to file_names[i].
df_arr = list(map(load_file, file_names))

# Quick sanity check of what was loaded.
print("Number of csv files: ", len(df_arr))
first_frame = df_arr[0]
print(first_frame.head())

In [None]:
import time
import os
import cv2
import requests
import numpy as np
from io import BytesIO
from PIL import Image
from tqdm import tqdm
from typing import List, Dict, Any


def time_it(func):
    """
    Decorator to print the time taken for a function to execute.

    The wrapped function's return value is passed through unchanged. If the
    function raises, the exception propagates and no timing is printed.
    """
    # Imported here so the decorator works without touching the import cell.
    import functools

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Time taken for {func.__name__}: {elapsed} seconds")
        return result
    return wrapper


def load_single_image(image_path: str) -> np.ndarray:
    """
    Read the image file at ``image_path`` with ``cv2.imread`` and return the array.

    Raises:
        ValueError: If ``cv2.imread`` returns None, i.e. the file could not be loaded.
    """
    loaded = cv2.imread(image_path)
    if loaded is not None:
        return loaded
    raise ValueError(f"Image at path '{image_path}' could not be loaded.")


def load_images(image_paths: List[str]) -> List[np.ndarray]:
    """
    Load every path in ``image_paths`` via ``load_single_image``.

    The result list is in the same order as the input paths; an unreadable
    path raises whatever ``load_single_image`` raises.
    """
    images = []
    for path in image_paths:
        images.append(load_single_image(path))
    return images


def download_image(url, timeout=30):
    """
    Download an image over HTTP and return it as a PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        The PIL image opened from the response body.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download image from {url}")
    return Image.open(BytesIO(response.content))


import cv2
import numpy as np


def calculate_image_features(image_bgr: np.ndarray) -> dict:
    """
    Summarise the colour and detail statistics of a BGR image.

    Returns a dict with the keys:
        "hue":        circular mean of the per-pixel hue, in degrees [0, 360)
        "saturation": mean HLS saturation scaled to [0, 1]
        "lightness":  mean HLS lightness scaled to [0, 1]
        "contrast":   RMS contrast (standard deviation of grayscale / 255)
        "sharpness":  variance of the Laplacian of the grayscale image
    """
    # Work in float32 HLS; OpenCV's channel order is H, L, S.
    hls = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HLS).astype(np.float32)
    hue_plane = hls[:, :, 0]    # 0-179 in OpenCV
    light_plane = hls[:, :, 1]  # 0-255
    sat_plane = hls[:, :, 2]    # 0-255

    # OpenCV stores hue halved, so doubling gives degrees.
    hue_degrees = (hue_plane * 2) % 360
    sat_unit = sat_plane / 255.0
    light_unit = light_plane / 255.0

    # Hue is an angle: average it on the unit circle, not arithmetically.
    hue_radians = np.deg2rad(hue_degrees.flatten())
    mean_sin = np.mean(np.sin(hue_radians))
    mean_cos = np.mean(np.cos(hue_radians))
    mean_angle = np.arctan2(mean_sin, mean_cos)
    if mean_angle < 0:
        # arctan2 yields (-pi, pi]; shift negatives into [0, 2*pi).
        mean_angle += 2 * np.pi
    mean_hue = np.degrees(mean_angle)

    mean_saturation = np.mean(sat_unit.flatten())
    mean_lightness = np.mean(light_unit.flatten())

    # Contrast and sharpness both come from the grayscale version.
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY).astype(np.float32)
    contrast = np.std(gray / 255.0)
    sharpness = cv2.Laplacian(gray, cv2.CV_32F).var()

    return {
        "hue": round(mean_hue, 2),
        "saturation": round(mean_saturation, 4),
        "lightness": round(mean_lightness, 4),
        "contrast": round(contrast, 4),
        "sharpness": round(sharpness, 4)
    }

In [None]:
!pip install deepface
from deepface import DeepFace

import matplotlib.pyplot as plt
import matplotlib.patches as patches


def detect_faces(image_path: str | np.ndarray, detector_backend: str = "ssd") -> List[Dict[str, Any]]:
    """
    Run ``DeepFace.extract_faces`` on an image and return its result list.

    Detection is not enforced and alignment is disabled. Other code in this
    file reads the 'facial_area' and 'confidence' keys of each returned entry.
    """
    return DeepFace.extract_faces(
        img_path=image_path,
        detector_backend=detector_backend,
        enforce_detection=False,
        align=False,
    )

def count_faces(faces: List[Dict[str, Any]]) -> int:
    """Return how many entries in ``faces`` have a 'confidence' greater than 0."""
    return sum(1 for detection in faces if detection['confidence'] > 0)

def plot_detected_faces(image_path: str | np.ndarray, detector_backend: str = "ssd") -> None:
    """
    Detect faces in an image and plot the image with bounding boxes around detected faces.

    Args:
        image_path (str | np.ndarray): Path to the image file, or an already
            loaded image array (assumed to be in OpenCV's BGR channel order).
        detector_backend (str): Face detector backend to use.
            Options: 'opencv', 'retinaface', 'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8', 'centerface'
            (default is 'ssd')

    Raises:
        ValueError: If ``image_path`` is a path that OpenCV cannot load.
    """
    # Accept both a file path and an in-memory array, as the signature promises.
    if isinstance(image_path, np.ndarray):
        img_bgr = image_path
    else:
        img_bgr = cv2.imread(image_path)
        if img_bgr is None:
            # cv2.imread signals failure with None instead of raising.
            raise ValueError(f"Image at path '{image_path}' could not be loaded.")

    # Matplotlib expects RGB.
    img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    # Detect faces
    faces = detect_faces(image_path, detector_backend)

    # Create figure and axes
    fig, ax = plt.subplots(1)
    ax.imshow(img)

    # Outline each detected face
    for face in faces:
        facial_area = face['facial_area']
        x, y, w, h = facial_area['x'], facial_area['y'], facial_area['w'], facial_area['h']
        rect = patches.Rectangle(
            (x, y), w, h,
            linewidth=2,
            edgecolor='r',
            facecolor='none'
        )
        ax.add_patch(rect)

    plt.axis('off')
    plt.show()

def plot_image_with_faces(img, faces, ax=None, show=False):
    """
    Draw an image, a red box per detected face, and a face-count caption.

    Args:
        img: Image array in RGB format
        faces: List of detected faces (each entry provides a 'facial_area' box)
        ax: Matplotlib axis to draw on; a new figure is created when omitted
        show: Whether to call plt.show() before returning
    """
    if ax is None:
        _, ax = plt.subplots(1)

    ax.imshow(img)

    # One rectangle per entry in faces
    for detection in faces:
        box = detection['facial_area']
        left, top = box['x'], box['y']
        width, height = box['w'], box['h']
        ax.add_patch(
            patches.Rectangle(
                (left, top), width, height,
                linewidth=1,
                edgecolor='r',
                facecolor='none'
            )
        )

    # Caption centred just below the image, in axes coordinates
    caption = f'Faces: {count_faces(faces)}'
    ax.text(0.5, -0.1, caption,
            horizontalalignment='center',
            transform=ax.transAxes)

    ax.axis('off')

    if not show:
        return
    plt.show()

In [None]:
detector_backend = "retinaface"

# Columns added to every dataframe (defined once; identical for all files).
features = ['hue', 'saturation', 'lightness', 'contrast', 'sharpness', 'num_faces']

# Process each dataframe and add new columns
for i, df in enumerate(df_arr):
    # Initialize new columns with None so rows that fail stay empty
    for feature in features:
        df[feature] = None

    # Process each row
    for idx in tqdm(df.index, desc=f"Processing images in {file_names[i]}"):
        try:
            # Download the thumbnail once.
            img_path = df.loc[idx, 'thumbnail-url']
            pil_img = download_image(img_path)

            # PIL decodes to RGB (or palette / greyscale / RGBA depending on
            # the file), while calculate_image_features expects a 3-channel
            # BGR array. Normalise the mode, then swap the channel order.
            img = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)

            # Calculate image features
            img_features = calculate_image_features(img)

            # Detect faces on the already-downloaded array (DeepFace takes BGR
            # arrays) instead of passing the URL and downloading a second time.
            faces = detect_faces(img, detector_backend=detector_backend)
            face_count = count_faces(faces)

            # Update row with new features
            row_values = {**img_features, 'num_faces': face_count}
            for feature in features:
                df.loc[idx, feature] = row_values[feature]

        except Exception as e:
            # Best effort: a single bad thumbnail must not abort the whole run.
            print(f"Error processing row {idx} in {file_names[i]}: {str(e)}")

    # Save the updated dataframe
    save_file(df, f"processed_{file_names[i]}")