In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install yt-dlp opencv-python-headless deepface google-api-python-client

Collecting yt-dlp
  Downloading yt_dlp-2025.3.31-py3-none-any.whl.metadata (172 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/172.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.2/172.2 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting deepface
  Downloading deepface-0.0.93-py3-none-any.whl.metadata (30 kB)
Collecting flask-cors>=4.0.1 (from deepface)
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting mtcnn>=0.1.0 (from deepface)
  Downloading mtcnn-1.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting retina-face>=0.0.1 (from deepface)
  Downloading retina_face-0.0.17-py3-none-any.whl.metadata (10 kB)
Collecting fire>=0.4.0 (from deepface)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [4]:
import os
import cv2
import yt_dlp
import requests
import time
import pandas as pd
from deepface import DeepFace
from googleapiclient.discovery import build
from pathlib import Path
from datetime import datetime
from tqdm import tqdm

25-04-20 15:40:06 - Directory /root/.deepface has been created
25-04-20 15:40:06 - Directory /root/.deepface/weights has been created


In [5]:
API_KEY = 'AIzaSyC34PDdHRGHIYIT_iqEquq9na1JHgOhMSY'
youtube = build('youtube', 'v3', developerKey=API_KEY)

In [7]:
channel_name = "Casey Neistat" # change here to search for other channel

search_response = youtube.search().list(
    part="snippet",
    q=channel_name,
    type="channel",
    maxResults=1
).execute()

channel_id = search_response['items'][0]['snippet']['channelId']
channel_title = search_response['items'][0]['snippet']['title']
print(f"✅ Found channel: {channel_title} (ID: {channel_id})")

✅ Found channel: CaseyNeistat (ID: UCtinbF-Q-fVthA0qrFQTgXQ)


In [8]:
all_playlists = []
next_page_token = None

while True:
    playlist_response = youtube.playlists().list(
        part="snippet",
        channelId=channel_id,
        maxResults=50,
        pageToken=next_page_token
    ).execute()

    for item in playlist_response['items']:
        all_playlists.append({
            "playlist_id": item['id'],
            "playlist_title": item['snippet']['title']
        })

    next_page_token = playlist_response.get('nextPageToken')
    if not next_page_token:
        break

In [9]:
raw_videos = []

for playlist in tqdm(all_playlists, desc="Fetching playlist videos"):
    playlist_id = playlist["playlist_id"]
    playlist_title = playlist["playlist_title"]
    next_page_token = None

    while True:
        items_response = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_page_token
        ).execute()

        for item in items_response["items"]:
            snippet = item["snippet"]
            video_id = snippet["resourceId"]["videoId"]
            title = snippet["title"]
            published_at = snippet["publishedAt"]
            raw_videos.append({
                "video_id": video_id,
                "video_title": title,
                "published_at": published_at,
                "playlist_name": playlist_title,
                "playlist_id": playlist_id,
                "video_url": f"https://www.youtube.com/watch?v={video_id}"
            })

        next_page_token = items_response.get("nextPageToken")
        if not next_page_token:
            break
        time.sleep(0.5)

Fetching playlist videos: 100%|██████████| 6/6 [00:05<00:00,  1.04it/s]


In [10]:
videos_with_stats = []

video_id_list = [v['video_id'] for v in raw_videos]

for i in tqdm(range(0, len(video_id_list), 50), desc="Fetching video stats"):
    batch_ids = video_id_list[i:i+50]
    stats_response = youtube.videos().list(
        part="snippet,statistics,contentDetails",
        id=",".join(batch_ids)
    ).execute()

    video_stats_map = {item["id"]: item for item in stats_response["items"]}

    for video in raw_videos[i:i+50]:
        video_id = video["video_id"]
        video_data = video_stats_map.get(video_id)

        if video_data:
            snippet = video_data["snippet"]
            stats = video_data.get("statistics", {})
            content = video_data.get("contentDetails", {})

            video.update({
                "channel_title": snippet.get("channelTitle"),
                "views": int(stats.get("viewCount", 0)),
                "likes": int(stats.get("likeCount", 0)),
                "comments": int(stats.get("commentCount", 0)),
                "duration": content.get("duration")
            })

            videos_with_stats.append(video)

    time.sleep(1)

Fetching video stats: 100%|██████████| 11/11 [00:12<00:00,  1.16s/it]


In [11]:
df = pd.DataFrame(videos_with_stats)
df = df.drop_duplicates(subset="video_id", keep="first")

# Sanitize channel title to make it filename-safe (optional but recommended)
safe_channel_title = channel_title.replace(" ", "_").replace("/", "_")

# Save the CSV
output_filename = f"{safe_channel_title}_playlists_with_stats.csv"
df.to_csv(output_filename, index=False)

print(df.head())
print(f"✅ Total unique videos for {channel_title}: {len(df)}")
print(f"📄 Saved to file: {output_filename}")

      video_id                     video_title          published_at  \
0  gnHCw87Enq4                   MY FIRST VLOG  2015-04-02T02:55:32Z   
1  cmLFCfNrHlo  Risky Cliff Jump in St. Barths  2015-04-02T02:55:41Z   
2  pmGOE6yNA98           Nude Beach and a Goat  2015-04-02T02:57:01Z   
3  buIevUkvZaA        Found GoPro in the Ocean  2015-04-02T02:55:26Z   
4  dld7XXbMKDQ   Flying from St. Barths to NYC  2015-04-02T02:55:19Z   

  playlist_name                         playlist_id  \
0          Vlog  PLTHOlLMWEwVy52FUngq91krMkQDQBagYw   
1          Vlog  PLTHOlLMWEwVy52FUngq91krMkQDQBagYw   
2          Vlog  PLTHOlLMWEwVy52FUngq91krMkQDQBagYw   
3          Vlog  PLTHOlLMWEwVy52FUngq91krMkQDQBagYw   
4          Vlog  PLTHOlLMWEwVy52FUngq91krMkQDQBagYw   

                                     video_url channel_title    views  likes  \
0  https://www.youtube.com/watch?v=gnHCw87Enq4  CaseyNeistat  5550974  98236   
1  https://www.youtube.com/watch?v=cmLFCfNrHlo  CaseyNeistat  2069186  33297

In [15]:
# use latest 30 vids as example
df.sort_values(by='published_at', ascending=False, inplace=True)
df_30vids = df.iloc[:30]

video_urls = df_30vids['video_url'].tolist()

['https://www.youtube.com/watch?v=jG7dSXcfVqE',
 'https://www.youtube.com/watch?v=4rBrQKtGICU',
 'https://www.youtube.com/watch?v=5cuxPmmrlv4',
 'https://www.youtube.com/watch?v=MHBI8OU7mq8',
 'https://www.youtube.com/watch?v=GjOeZ2xk96Y',
 'https://www.youtube.com/watch?v=4EQYXH9qT2o',
 'https://www.youtube.com/watch?v=xmhtV4270NU',
 'https://www.youtube.com/watch?v=5kjNdPGTpfI',
 'https://www.youtube.com/watch?v=z0SmeXerHMg',
 'https://www.youtube.com/watch?v=Wpb-gQDkzyo',
 'https://www.youtube.com/watch?v=fftGsCpJBKw',
 'https://www.youtube.com/watch?v=Vp2NnN_5qSQ',
 'https://www.youtube.com/watch?v=jOeZBW9Lwos',
 'https://www.youtube.com/watch?v=ocdj6JEG5aw',
 'https://www.youtube.com/watch?v=PhBQjGKJjcs',
 'https://www.youtube.com/watch?v=vGO8HiQj82E',
 'https://www.youtube.com/watch?v=C3YgRUX97jI',
 'https://www.youtube.com/watch?v=mtTGYXj3htE',
 'https://www.youtube.com/watch?v=oMy9NgZArEM',
 'https://www.youtube.com/watch?v=ALLvROmC-uE',
 'https://www.youtube.com/watch?v=mbGO6W

# Video Download and Analysis

In [16]:
PROJECT_ROOT = Path("Data")
VIDEO_DIR = PROJECT_ROOT / "videos"
FRAME_DIR = PROJECT_ROOT / "frames"
META_DIR = PROJECT_ROOT / "metadata"
RESULT_DIR = PROJECT_ROOT / "results"

# Ensure folders exist
for folder in [VIDEO_DIR, FRAME_DIR, META_DIR, RESULT_DIR]:
    folder.mkdir(parents=True, exist_ok=True)

In [17]:
youtube = build('youtube', 'v3', developerKey=API_KEY)

In [18]:
ydl_opts = {
    'format': 'best[ext=mp4]',
    'outtmpl': str(VIDEO_DIR / '%(id)s.%(ext)s'),
    'noplaylist': True,
    'quiet': False
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(video_urls)


[youtube] Extracting URL: https://www.youtube.com/watch?v=jG7dSXcfVqE
[youtube] jG7dSXcfVqE: Downloading webpage
[youtube] jG7dSXcfVqE: Downloading tv client config
[youtube] jG7dSXcfVqE: Downloading player 9a279502-main
[youtube] jG7dSXcfVqE: Downloading tv player API JSON
[youtube] jG7dSXcfVqE: Downloading ios player API JSON
[youtube] jG7dSXcfVqE: Downloading m3u8 information
[info] jG7dSXcfVqE: Downloading 1 format(s): 18
[download] Destination: Data/videos/jG7dSXcfVqE.mp4
[download] 100% of   16.92MiB in 00:00:00 at 29.35MiB/s  
[youtube] Extracting URL: https://www.youtube.com/watch?v=4rBrQKtGICU
[youtube] 4rBrQKtGICU: Downloading webpage
[youtube] 4rBrQKtGICU: Downloading tv client config
[youtube] 4rBrQKtGICU: Downloading tv player API JSON
[youtube] 4rBrQKtGICU: Downloading ios player API JSON
[youtube] 4rBrQKtGICU: Downloading m3u8 information
[info] 4rBrQKtGICU: Downloading 1 format(s): 18
[download] Destination: Data/videos/4rBrQKtGICU.mp4
[download] 100% of   10.96MiB in 0

In [19]:
def extract_frames(video_path, frame_output_dir, interval=2):
    cap = cv2.VideoCapture(str(video_path))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_interval = interval * fps
    count = 0
    success, image = cap.read()
    while success:
        if count % frame_interval == 0:
            frame_filename = frame_output_dir / f"{video_path.stem}_frame{count}.jpg"
            cv2.imwrite(str(frame_filename), image)
        success, image = cap.read()
        count += 1
    cap.release()

for video_file in VIDEO_DIR.glob("*.mp4"):
    extract_frames(video_file, FRAME_DIR)


In [20]:
results = []

for frame in tqdm(list(FRAME_DIR.glob("*.jpg"))):
    try:
        analysis = DeepFace.analyze(img_path=str(frame), actions=['emotion'], enforce_detection=False)
        emotion_scores = analysis[0]['emotion']

        # Add frame and video ID info
        emotion_scores['frame'] = frame.name
        emotion_scores['video_id'] = frame.name.split("_frame")[0]

        results.append(emotion_scores)
    except Exception as e:
        print(f"❌ Failed on {frame.name}: {e}")

# Save full frame-level results
df_emotions = pd.DataFrame(results)
df_emotions.to_csv(RESULT_DIR / "frame_emotions.csv", index=False)

# Compute video-level average for all emotions
emotion_cols = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
df_avg = df_emotions.groupby("video_id")[emotion_cols].mean().reset_index()
df_avg.to_csv(RESULT_DIR / "video_emotions_avg.csv", index=False)

  0%|          | 0/7873 [00:00<?, ?it/s]Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facial_expression_model_weights.h5
To: /root/.deepface/weights/facial_expression_model_weights.h5


25-04-20 16:01:19 - facial_expression_model_weights.h5 will be downloaded...



100%|██████████| 5.98M/5.98M [00:00<00:00, 271MB/s]
100%|██████████| 7873/7873 [38:22<00:00,  3.42it/s]


In [33]:
combined_df = df_30vids.merge(df_avg, on = 'video_id', how = 'left')
combined_df.head()

Unnamed: 0,video_id,video_title,published_at,playlist_name,playlist_id,video_url,channel_title,views,likes,comments,duration,angry,disgust,fear,happy,sad,surprise,neutral
0,jG7dSXcfVqE,DO WHAT YOU CAN'T,2017-03-30T18:43:28Z,best of CASEY NEISTAT,PLTHOlLMWEwVy2ZNmdrwRlRlVfZ8fiR_ms,https://www.youtube.com/watch?v=jG7dSXcfVqE,CaseyNeistat,16274589,691501,30008,PT3M53S,18.937672,0.109991,22.43918,9.969246,20.106173,4.562572,23.875166
1,4rBrQKtGICU,something i need to say,2016-07-21T14:51:21Z,Vlog,PLTHOlLMWEwVy52FUngq91krMkQDQBagYw,https://www.youtube.com/watch?v=4rBrQKtGICU,CaseyNeistat,3091995,154275,19924,PT3M35S,2.997589,2.112531,16.21987,15.832929,9.405345,4.624678,48.80706
2,5cuxPmmrlv4,$11 McLobster,2016-07-21T14:51:21Z,Vlog,PLTHOlLMWEwVy52FUngq91krMkQDQBagYw,https://www.youtube.com/watch?v=5cuxPmmrlv4,CaseyNeistat,2327870,59017,3455,PT3M11S,16.757627,0.02472,5.654192,18.652498,24.452524,0.644216,33.814224
3,MHBI8OU7mq8,PROM DATE,2016-07-21T14:51:21Z,Vlog,PLTHOlLMWEwVy52FUngq91krMkQDQBagYw,https://www.youtube.com/watch?v=MHBI8OU7mq8,CaseyNeistat,2515798,71672,4374,PT9M25S,12.30231,1.272033,20.888214,16.56142,23.513166,1.518946,23.943909
4,GjOeZ2xk96Y,Pokémon Go IN REAL LIFE,2016-07-21T14:51:21Z,Vlog,PLTHOlLMWEwVy52FUngq91krMkQDQBagYw,https://www.youtube.com/watch?v=GjOeZ2xk96Y,CaseyNeistat,5426784,213149,13119,PT2M43S,12.834095,2.187926,23.008091,13.541709,25.455215,1.437809,21.535154


In [35]:
import numpy as np
import statsmodels.api as sm
import re

df_model = combined_df.copy()
df_model['logviews'] = np.log(df_model['views'])

emotion_cols = ['angry','disgust','fear','happy','sad','surprise','neutral']

X = df_model[emotion_cols]
X = sm.add_constant(X)

# model including only neutral score -> negative impact indicates that emotions generally seem to improve view count
X = df_model['neutral']
X = sm.add_constant(X)
y = df_model['logviews']
model = sm.OLS(y,X).fit()
print('Linear Model Views ~ Neutral Emotion Score: \n', model.summary())

Linear Model Views ~ Emotion Scores: 
                             OLS Regression Results                            
Dep. Variable:               logviews   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                 -0.025
Method:                 Least Squares   F-statistic:                    0.2951
Date:                Sun, 20 Apr 2025   Prob (F-statistic):              0.591
Time:                        17:32:10   Log-Likelihood:                -25.338
No. Observations:                  30   AIC:                             54.68
Df Residuals:                      28   BIC:                             57.48
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        