In [1]:
import pandas as pd
import os

# 1. Load the raw data
# The relative path steps up one directory with '..' and then descends
# into Data/raw to reach the CSV.
file_path = os.path.join("..", "Data", "raw", "h3_podcast_raw.csv")
df = pd.read_csv(file_path)

# Divider line printed between the sections of this report.
section_break = "-" * 30

print(f"Data Loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(section_break)

# 2. Missing values: number of null entries in each column
print("MISSING VALUES:")
null_counts = df.isnull().sum()
print(null_counts)
print(section_break)

# 3. Duplicates: how many rows repeat a video_id already seen in an earlier row
duplicates = df['video_id'].duplicated().sum()
print(f"DUPLICATE VIDEO IDs: {duplicates}")
print(section_break)

# 4. Anomalies (statistical sanity check)
# Scan the 'min' and 'max' rows for implausible values such as negative views.
metric_columns = ['view_count', 'like_count', 'comment_count']
print("STATISTICS:")
metric_summary = df[metric_columns].describe()
print(metric_summary.round(0))

Data Loaded: 200 rows, 7 columns
------------------------------
MISSING VALUES:
video_id         0
title            0
published_at     0
view_count       0
like_count       0
comment_count    0
description      1
dtype: int64
------------------------------
DUPLICATE VIDEO IDs: 0
------------------------------
STATISTICS:
       view_count  like_count  comment_count
count       200.0       200.0          200.0
mean     401011.0      9750.0         1321.0
std      328819.0      6907.0         2127.0
min       15867.0       391.0           17.0
25%       85289.0      3296.0          202.0
50%      392829.0     10990.0         1089.0
75%      625453.0     12705.0         1502.0
max     2026569.0     49131.0        22094.0


In [2]:
# Display every row whose description is missing (null / NaN).
description_missing = df['description'].isnull()
print(df[description_missing])

        video_id                                    title  \

             published_at  view_count  like_count  comment_count description  
128  2025-07-10T19:50:23Z       75014        4456            291         NaN  
