In [2]:
import googleapiclient.discovery
import pandas as pd
import time
import os
from dotenv import load_dotenv

# --- CONFIGURATION ---
# Read the .env file so its entries become visible through os.getenv.
load_dotenv()

# The key is taken from the environment rather than hard-coded in the notebook
# (note the variable name is lowercase 'api_key').
api_key = os.getenv("YOUTUBE_API_KEY")

# Sanity check: report whether a key was found. Execution continues either way.
print(
    "API Key loaded successfully!"
    if api_key
    else "Error: API Key not found. Check your .env file."
)

CHANNEL_ID = "UCLtREJY21xRfCuEKvdki1Kw"  # H3 Podcast Channel
MAX_VIDEOS = 200

def get_h3_dataset(api_key, channel_id, max_results):
    """Collect metadata and statistics for a channel's uploaded videos.

    Pages through the channel's uploads playlist, looks up statistics for each
    page of video IDs, and stops once ``max_results`` rows have been collected,
    the playlist is exhausted, or a request fails.

    Args:
        api_key: YouTube Data API v3 developer key.
        channel_id: Channel ID; its first two characters are replaced with
            "UU" to form the uploads playlist ID.
        max_results: Upper bound on the number of videos to return.

    Returns:
        A pandas DataFrame with at most ``max_results`` rows and the columns
        ``video_id``, ``title``, ``published_at``, ``view_count``,
        ``like_count``, ``comment_count`` and ``description``. If nothing was
        collected the DataFrame is empty (and has no columns).

    Errors raised while talking to the API are printed, not re-raised; the
    rows gathered before the failure are still returned.
    """
    # Largest page size accepted by playlistItems.list (see API reference).
    page_size = 50

    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

    video_data = []
    # NOTE(review): relies on the "UC..." -> "UU..." ID convention for the
    # uploads playlist; confirm it holds for any channel passed in.
    uploads_playlist_id = "UU" + channel_id[2:]
    next_page_token = None

    print(f"Fetching last {max_results} videos...")

    while len(video_data) < max_results:
        # Ask only for what is still missing so the result never overshoots
        # max_results when it is not a multiple of the page size.
        remaining = max_results - len(video_data)
        try:
            response = youtube.playlistItems().list(
                part="snippet,contentDetails",
                playlistId=uploads_playlist_id,
                maxResults=min(page_size, remaining),
                pageToken=next_page_token,
            ).execute()

            vid_ids = [
                item['contentDetails']['videoId']
                for item in response.get('items', [])
            ]
            if not vid_ids:
                # Empty page: an empty id list would only produce an API
                # error, so stop cleanly instead.
                break

            stats_response = youtube.videos().list(
                part="statistics,snippet",
                id=','.join(vid_ids),
            ).execute()

            for item in stats_response.get('items', []):
                stats = item['statistics']
                snippet = item['snippet']

                video_data.append({
                    "video_id": item['id'],
                    "title": snippet['title'],
                    "published_at": snippet['publishedAt'],
                    # Counters may be absent from the response; default to 0.
                    "view_count": int(stats.get('viewCount', 0)),
                    "like_count": int(stats.get('likeCount', 0)),
                    "comment_count": int(stats.get('commentCount', 0)),
                    "description": snippet['description'],
                })

            print(f"Collected {len(video_data)} videos...")

            next_page_token = response.get('nextPageToken')
            if not next_page_token or len(video_data) >= max_results:
                break

            # Short pause between pages; skipped once no further request follows.
            time.sleep(0.5)

        except Exception as e:
            # Deliberate best-effort boundary: report the failure and return
            # whatever has been collected so far.
            print(f"Error: {e}")
            break

    return pd.DataFrame(video_data)

# --- EXECUTE ---
# Uses the lowercase 'api_key' variable read from the environment above.
df = get_h3_dataset(api_key, CHANNEL_ID, MAX_VIDEOS)

# Save to the 'data/raw' folder
save_path = os.path.join("..", "data", "raw", "h3_podcast_raw.csv")

if df.empty:
    # get_h3_dataset prints API errors and returns what it has, so an empty
    # frame means nothing was fetched. Do not overwrite an existing raw file
    # with an empty one or report success.
    print(f"No videos were fetched; nothing written to {save_path}")
else:
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_csv(save_path, index=False)

    print(f"SUCCESS: Saved {len(df)} rows to {save_path}")
    print(df.head())

API Key loaded successfully!
Fetching last 200 videos...
Collected 50 videos...
Collected 100 videos...
Collected 150 videos...
Collected 200 videos...
SUCCESS: Saved 200 rows to ..\data\raw\h3_podcast_raw.csv
      video_id                                              title  \
0  Sh4VlCzrf5I  The Right Is Self Imploding (Candace Owens, Ti...   
1  mY9qhoZRHSU                            They Did Me So Dirty...   
2  UeIWNYtqwpU  The Trisha Paytas Reality Show Is Worse Than W...   
3  wjI0xyDkbJc                         I Have To Talk About This.   
4  1ANasCrCylQ                 Ethan Gets DESTROYED By Lena's Mom   

           published_at  view_count  like_count  comment_count  \
0  2025-12-10T18:39:29Z       40972        2246              0   
1  2025-12-10T00:34:51Z       24432        1232             31   
2  2025-12-09T23:48:20Z      253882        6877            607   
3  2025-12-09T00:25:19Z      385155        8365            849   
4  2025-12-06T20:05:46Z      124268        44