# Install Packages

In [1]:
# Uncomment to install the notebook's dependencies (run once per environment).
# %pip installs into the environment of the running kernel; pandas, tqdm and
# requests are listed because the import cell below imports them as well.
# %pip install yt-dlp opencv-python-headless deepface google-api-python-client pandas tqdm requests

# Code to Clean All Images in the Frames Folder (only run if needed)

In [2]:
# Optional clean-up cell: uncomment to delete every extracted frame image.
# NOTE(review): the folder below ("youtube_smile_analysis/frames") is not the
# frames folder used by the rest of this notebook (PROJECT_ROOT / "frames",
# i.e. "Data/frames") - update the path before uncommenting, otherwise the
# frames that the analysis actually reads are left untouched.
# NOTE(review): this cell sits above the import cell, so it keeps its own
# imports in order to run on a fresh kernel.

# import os
# from pathlib import Path

# # Define the path to the frames folder
# FRAME_DIR = Path("youtube_smile_analysis/frames")

# # Remove all .jpg and .png files in the folder
# for file in FRAME_DIR.glob("*"):
#     if file.suffix.lower() in [".jpg", ".jpeg", ".png"]:
#         try:
#             file.unlink()  # Deletes the file
#             print(f"🗑️ Deleted: {file.name}")
#         except Exception as e:
#             print(f"⚠️ Failed to delete {file.name}: {e}")

# Import Packages

In [3]:
import os
from datetime import datetime
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import cv2
import pandas as pd
import requests
import yt_dlp
from deepface import DeepFace
from googleapiclient.discovery import build
from tqdm import tqdm


25-04-21 17:14:53 - Directory C:\Users\fabia\.deepface has been created
25-04-21 17:14:53 - Directory C:\Users\fabia\.deepface\weights has been created


# Setup root path for files

In [4]:
# Every artefact of the pipeline is kept under a single root directory.
PROJECT_ROOT = Path("Data")
VIDEO_DIR = PROJECT_ROOT.joinpath("videos")    # downloaded video files
FRAME_DIR = PROJECT_ROOT.joinpath("frames")    # frames sampled from the videos
META_DIR = PROJECT_ROOT.joinpath("metadata")   # YouTube metadata CSV
RESULT_DIR = PROJECT_ROOT.joinpath("results")  # emotion scores and merged dataset

# Create whatever is missing (parents included); existing folders are kept as-is.
for directory in (VIDEO_DIR, FRAME_DIR, META_DIR, RESULT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [5]:
# API key
# The key is read from the environment so that it never appears in the notebook
# source, its saved outputs, or version control. Set YOUTUBE_API_KEY in the
# environment that launches Jupyter (e.g. `export YOUTUBE_API_KEY=...`).
# NOTE(review): a key used to be hardcoded in this cell; treat that key as
# leaked and revoke/rotate it in the Google Cloud console.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not YOUTUBE_API_KEY:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Export your YouTube Data API v3 key as the "
        "YOUTUBE_API_KEY environment variable and restart the kernel."
    )

# Client for the YouTube Data API v3; the metadata cell further down uses it.
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

# Download Video

##  Worst-Case Scenario: Manually Create a List of Videos

We will build our own list of videos by:

1. **Selecting video URLs manually** from YouTube

2. **Thinking about this list: how should we categorize the videos? How should we design our regression analysis later on?**
   - **Video Channel** (e.g., TEDx, WIRED, individual vloggers)
   - **Video Type** (e.g., interview, vlog, storytime, reaction)


In [6]:
# Videos to analyse.
# Only two GQ videos for now: their interview format looks suitable for this
# project because a large share of the frames contains a human face.
video_urls = [
    "https://www.youtube.com/watch?v=3D1hn3jLO6Q&ab_channel=GQ",
    "https://www.youtube.com/watch?v=kuQv-4nSmXM&ab_channel=GQ",
]

In [7]:
# Options handed to yt-dlp for every download.
ydl_opts = {
    "format": "best[ext=mp4]",                     # format selector: an mp4 file
    "outtmpl": str(VIDEO_DIR / "%(id)s.%(ext)s"),  # file name = <video id>.<ext> inside VIDEO_DIR
    "noplaylist": True,
    "quiet": False,                                # keep yt-dlp's progress log visible
}

# Download every URL in the list; the context manager closes the downloader afterwards.
with yt_dlp.YoutubeDL(ydl_opts) as downloader:
    downloader.download(video_urls)


[youtube] Extracting URL: https://www.youtube.com/watch?v=3D1hn3jLO6Q&ab_channel=GQ
[youtube] 3D1hn3jLO6Q: Downloading webpage
[youtube] 3D1hn3jLO6Q: Downloading tv client config
[youtube] 3D1hn3jLO6Q: Downloading player 9a279502-main
[youtube] 3D1hn3jLO6Q: Downloading tv player API JSON
[youtube] 3D1hn3jLO6Q: Downloading ios player API JSON
[youtube] 3D1hn3jLO6Q: Downloading m3u8 information
[info] 3D1hn3jLO6Q: Downloading 1 format(s): 18
[download] Destination: Data\videos\3D1hn3jLO6Q.mp4
[download] 100% of   26.29MiB in 00:00:02 at 9.69MiB/s     
[youtube] Extracting URL: https://www.youtube.com/watch?v=kuQv-4nSmXM&ab_channel=GQ
[youtube] kuQv-4nSmXM: Downloading webpage
[youtube] kuQv-4nSmXM: Downloading tv client config
[youtube] kuQv-4nSmXM: Downloading tv player API JSON
[youtube] kuQv-4nSmXM: Downloading ios player API JSON
[youtube] kuQv-4nSmXM: Downloading m3u8 information
[info] kuQv-4nSmXM: Downloading 1 format(s): 18
[download] Destination: Data\videos\kuQv-4nSmXM.mp4
[dow

# Function for extracting frames (frame_interval = 2*fps)

In [8]:
def extract_frames(video_path, frame_output_dir, interval=2):
    """Save one JPEG frame of a video roughly every ``interval`` seconds.

    Frames are written to ``frame_output_dir`` as
    ``<video file stem>_frame<frame index>.jpg``. The first frame (index 0) is
    always saved when the video can be read.

    Args:
        video_path: Path (``str`` or ``pathlib.Path``) of the video file.
        frame_output_dir: Directory (``str`` or ``pathlib.Path``) receiving the
            JPEG files. It is created if it does not exist yet.
        interval: Positive number of seconds between two saved frames.

    Returns:
        None. A warning is printed (and nothing is written) when the video
        cannot be opened or reports no usable frame rate.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval!r}")

    # Accept plain strings as well as Path objects.
    video_path = Path(video_path)
    frame_output_dir = Path(frame_output_dir)
    frame_output_dir.mkdir(parents=True, exist_ok=True)

    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            print(f"⚠️ Could not open {video_path.name}; no frames extracted")
            return

        raw_fps = cap.get(cv2.CAP_PROP_FPS)
        # `not raw_fps >= 1` is also true for NaN; a frame rate below 1 would
        # truncate to 0 and make the modulo below divide by zero.
        if not raw_fps >= 1:
            print(f"⚠️ {video_path.name} reports no usable frame rate ({raw_fps}); no frames extracted")
            return

        # The frame rate is truncated to an integer so that, for whole-second
        # intervals, the same frame indices (and file names) are produced as
        # before. Rounding the product keeps the step an integer >= 1 even for
        # fractional intervals.
        fps = int(raw_fps)
        frame_interval = max(1, round(interval * fps))

        count = 0
        success, image = cap.read()
        while success:
            if count % frame_interval == 0:
                frame_filename = frame_output_dir / f"{video_path.stem}_frame{count}.jpg"
                # imwrite reports failure through its return value.
                if not cv2.imwrite(str(frame_filename), image):
                    print(f"⚠️ Could not write {frame_filename.name}")
            success, image = cap.read()
            count += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()

# Sample frames from every .mp4 in the video folder into the frames folder.
for mp4_path in VIDEO_DIR.glob("*.mp4"):
    extract_frames(video_path=mp4_path, frame_output_dir=FRAME_DIR)


# Get Emotion Score with DeepFace and Calculate Average Score

In [9]:
results = []

# Path.glob yields files in no particular order; sorting makes the processing
# order, and therefore the row order of the CSV, reproducible between runs.
frame_paths = sorted(FRAME_DIR.glob("*.jpg"))

for frame in tqdm(frame_paths):
    try:
        # NOTE(review): enforce_detection=False stops DeepFace from raising when
        # it finds no face, so face-less frames presumably still receive scores
        # and enter the averages below - confirm, and consider filtering them.
        analysis = DeepFace.analyze(img_path=str(frame), actions=['emotion'], enforce_detection=False)
        # Only the first result returned for the frame is used.
        emotion_scores = analysis[0]['emotion']

        # Add frame and video ID info. Frame files are named
        # "<video id>_frame<N>.jpg"; splitting from the right keeps the ID intact
        # even when the ID itself contains "_frame".
        emotion_scores['frame'] = frame.name
        emotion_scores['video_id'] = frame.name.rsplit("_frame", 1)[0]

        results.append(emotion_scores)
    except Exception as e:
        # Best effort: one unreadable frame must not abort the whole run.
        print(f"❌ Failed on {frame.name}: {e}")

# Stop with a clear message instead of overwriting earlier results with an
# empty file and then failing on the missing "video_id" column.
if not results:
    raise RuntimeError(
        f"No frame could be analysed in {FRAME_DIR}; extract frames first or check the errors above."
    )

# Save full frame-level results
df_emotions = pd.DataFrame(results)
df_emotions.to_csv(RESULT_DIR / "frame_emotions.csv", index=False)

# Compute video-level average for all emotions
emotion_cols = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
df_avg = df_emotions.groupby("video_id")[emotion_cols].mean().reset_index()
df_avg.to_csv(RESULT_DIR / "video_emotions_avg.csv", index=False)

  0%|          | 0/608 [00:00<?, ?it/s]

25-04-21 17:15:30 - facial_expression_model_weights.h5 will be downloaded...


Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facial_expression_model_weights.h5
To: C:\Users\fabia\.deepface\weights\facial_expression_model_weights.h5

[A%|          | 0.00/5.98M [00:00<?, ?B/s]
[A%|▉         | 524k/5.98M [00:00<00:02, 2.33MB/s]
[A%|█▊        | 1.05M/5.98M [00:00<00:01, 2.70MB/s]
[A%|██▋       | 1.57M/5.98M [00:00<00:01, 3.06MB/s]
[A%|███▌      | 2.10M/5.98M [00:00<00:01, 3.38MB/s]
[A%|████▍     | 2.62M/5.98M [00:00<00:00, 3.74MB/s]
[A%|█████▎    | 3.15M/5.98M [00:00<00:00, 4.06MB/s]
[A%|██████▏   | 3.67M/5.98M [00:01<00:00, 4.30MB/s]
[A%|███████   | 4.19M/5.98M [00:01<00:00, 4.53MB/s]
[A%|████████▊ | 5.24M/5.98M [00:01<00:00, 5.03MB/s]
100%|██████████| 5.98M/5.98M [00:01<00:00, 4.26MB/s]
100%|██████████| 608/608 [01:31<00:00,  6.61it/s]


# Get YouTube Video Stats

In [10]:
def get_video_metadata(video_id):
    """Fetch title, publication date and engagement counts for one video.

    Uses the module-level ``youtube`` API client to call ``videos().list``.

    Args:
        video_id: YouTube video ID to look up.

    Returns:
        dict with the keys ``video_id``, ``title``, ``views``, ``likes``,
        ``comments`` (counts as ``int``) and ``published_at`` (the
        ``publishedAt`` string from the API).

    Raises:
        IndexError: If the API returns no item for ``video_id``.
    """
    # contentDetails is requested as well, although none of its fields is read below.
    request = youtube.videos().list(
        part="snippet,statistics,contentDetails",
        id=video_id
    )
    response = request.execute()

    items = response.get('items', [])
    if not items:
        # Same exception type as the bare `[0]` lookup would raise, but with a
        # message that names the offending video.
        raise IndexError(
            f"No YouTube video found for id {video_id!r} (it may be private, deleted or mistyped)"
        )

    item = items[0]
    stats = item.get('statistics', {})
    snippet = item['snippet']
    # NOTE(review): a count missing from the response is reported as 0, which is
    # indistinguishable from a real zero - presumably the API omits likeCount /
    # commentCount when the owner hides them; confirm before modelling on these.
    return {
        'video_id': video_id,
        'title': snippet['title'],
        'views': int(stats.get('viewCount', 0)),
        'likes': int(stats.get('likeCount', 0)),
        'comments': int(stats.get('commentCount', 0)),
        'published_at': snippet['publishedAt']
    }

def _video_id_from_url(url):
    """Return the video ID contained in a YouTube URL.

    Handles ``watch?v=<id>`` URLs (whatever other query parameters are present)
    as well as ``youtu.be/<id>``, ``/shorts/<id>``, ``/embed/<id>`` and
    ``/live/<id>`` links.

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    parsed = urlparse(url)

    # Regular watch URLs carry the ID in the "v" query parameter. Parsing the
    # query avoids matching "v=" inside another parameter's name or value.
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]

    # The other link styles keep the ID in the path.
    segments = [part for part in parsed.path.split("/") if part]
    host = (parsed.hostname or "").lower()
    if host.endswith("youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in {"shorts", "embed", "live"}:
        return segments[1]

    raise ValueError(f"Could not find a video ID in URL: {url!r}")


video_ids = [_video_id_from_url(url) for url in video_urls]
metadata = [get_video_metadata(vid) for vid in video_ids]
df_meta = pd.DataFrame(metadata)
df_meta.to_csv(META_DIR / "youtube_video_metadata.csv", index=False)


# Merge Metadata With Smile Scores

In [11]:
# Read the per-video emotion averages back from disk. video_id is pinned to str
# so that an ID that happens to look numeric is not converted by type
# inference, which would stop it from matching the string IDs in df_meta.
df_video_emotions = pd.read_csv(RESULT_DIR / "video_emotions_avg.csv", dtype={"video_id": str})

# Inner join: only videos that have both metadata and emotion scores are kept.
df_merged = pd.merge(df_meta, df_video_emotions, on="video_id")

# Report what the inner join left out instead of dropping it silently.
meta_ids = set(df_meta["video_id"])
emotion_ids = set(df_video_emotions["video_id"])
without_emotions = sorted(meta_ids - emotion_ids)
without_metadata = sorted(emotion_ids - meta_ids)
if without_emotions:
    print(f"⚠️ Videos with metadata but no emotion scores (dropped): {without_emotions}")
if without_metadata:
    print(f"⚠️ Videos with emotion scores but no metadata (dropped): {without_metadata}")

df_merged.to_csv(RESULT_DIR / "final_dataset.csv", index=False)
df_merged.head()


Unnamed: 0,video_id,title,views,likes,comments,published_at,angry,disgust,fear,happy,sad,surprise,neutral
0,3D1hn3jLO6Q,10 Things JENNIE Can't Live Without | 10 Essen...,1518478,78631,1450,2025-03-05T17:00:08Z,9.586111,1.291082,10.741376,14.146092,21.476984,1.031442,41.726913
1,kuQv-4nSmXM,Fanum Replies To Fans Online | Actually Me,402388,15296,267,2025-01-28T17:00:02Z,17.474733,0.194227,11.011636,8.953383,12.206915,4.693855,45.465252
