# A.2 Captura de vídeos de <50000 views

In [None]:
#!pip install isodate
#!pip install python-dotenv
#!pip install google-api-python-client

In [None]:
import googleapiclient.discovery
import pandas as pd
from datetime import datetime, timezone, timedelta
import isodate
import os
import time
from dotenv import load_dotenv

# Load the API key from the project's .env file. load_dotenv does not fail
# when the file is absent, so the key is validated explicitly below.
load_dotenv("/Users/danielmunoz/Documents/EDUCACION/DATA_ANALIST/CURSOS/TFM/.env")
DEVELOPER_KEY = os.getenv("API_KEY")
if not DEVELOPER_KEY:
    # Fail fast: without a key the client would be built with developerKey=None.
    raise RuntimeError(
        "API_KEY is not set: define it in the .env file or export it in the environment."
    )


# YouTube Data API v3 client used by every request in this notebook.
youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=DEVELOPER_KEY)


# Search window: videos published between 96 and 72 hours before `now` (UTC).
# `now` is captured once so every age computation uses the same reference.
now = datetime.now(timezone.utc)
published_after = (now - timedelta(hours=96)).isoformat()
published_before = (now - timedelta(hours=72)).isoformat()

# Keyword list used as search queries to obtain Spanish-language videos.
queries = ["cómo", "por qué", "noticias", "vlog", "música", "review", "tutorial", "España"]

# Accumulators filled by the collection loop:
video_data = []                # one dict (CSV row) per accepted video
video_ids_collected = set()    # every video id already seen, to skip duplicates


# ---------------------------------------------------------------------------
# Collect up to TARGET_VIDEOS videos with fewer than MAX_VIEWS views, together
# with statistics of the channel that published each one.
# Reads:   youtube, now, published_after, published_before, queries
# Mutates: video_data (one row dict per accepted video), video_ids_collected
# ---------------------------------------------------------------------------
TARGET_VIDEOS = 300   # stop as soon as this many rows have been collected
MAX_VIEWS = 50000     # videos with this many views or more are skipped
BATCH_SIZE = 50       # page size for search().list and chunk size for videos().list

# Publication window re-checked on every video; hoisted because `now` is fixed.
# NOTE(review): the upper bound here is 36 h, whereas the search request is
# limited by `published_before` (72 h) -- confirm which bound is intended.
oldest_allowed = now - timedelta(hours=96)
newest_allowed = now - timedelta(hours=36)

# channel id -> channel resource, so each channel is requested at most once.
_channel_cache = {}


def _safe_ratio(numerator, denominator, default=0):
    """Return numerator / denominator, or `default` if denominator is not > 0."""
    return numerator / denominator if denominator > 0 else default


def _get_channel(channel_id):
    """Return the channels().list item for `channel_id`, cached per run.

    Raises IndexError when the API returns no item for the id; nothing is
    cached in that case.
    """
    if channel_id not in _channel_cache:
        response = youtube.channels().list(
            part="snippet,statistics",
            id=channel_id
        ).execute()
        _channel_cache[channel_id] = response['items'][0]
    return _channel_cache[channel_id]


def _build_row(item):
    """Convert one videos().list item into a row dict.

    Returns None when the video is outside the publication window or has
    MAX_VIEWS views or more. Any malformed/missing field raises; the caller
    logs the error and skips the video.
    """
    snippet = item['snippet']
    stats = item.get('statistics', {})
    details = item['contentDetails']

    published_at = snippet.get('publishedAt')
    # isodate accepts any number of fractional-second digits and the "Z"
    # suffix, unlike datetime.fromisoformat on Python < 3.11.
    published_datetime = isodate.parse_datetime(published_at)
    if not (oldest_allowed <= published_datetime <= newest_allowed):
        return None

    views = int(stats.get('viewCount', 0))
    if views >= MAX_VIEWS:
        return None

    likes = int(stats.get('likeCount', 0))
    comments = int(stats.get('commentCount', 0))
    interactions = likes + comments
    duration_seconds = isodate.parse_duration(details.get('duration')).total_seconds()
    tags = snippet.get('tags', [])
    channel_id = snippet.get('channelId')
    video_age_days = (now - published_datetime).days

    # Channel data (requested only for videos that passed the filters above).
    channel_info = _get_channel(channel_id)
    channel_stats = channel_info['statistics']
    channel_snippet = channel_info['snippet']

    subscriber_count = int(channel_stats.get('subscriberCount', 0))
    video_count = int(channel_stats.get('videoCount', 0))
    channel_views = int(channel_stats.get('viewCount', 0))
    channel_created = channel_snippet.get('publishedAt')
    channel_age_days = (now - isodate.parse_datetime(channel_created)).days

    return {
        'video_id': item['id'],
        'title': snippet.get('title'),
        'description': snippet.get('description'),
        'published_at': published_at,
        'duration_minutes': duration_seconds / 60,
        'definition': details.get('definition'),
        'licensed_content': details.get('licensedContent'),
        # The value is compared against the string 'true'.
        'has_caption': details.get('caption') == 'true',
        'category_id': snippet.get('categoryId'),
        'views': views,
        'likes': likes,
        'comments': comments,
        'tags': ", ".join(tags),
        'tags_count': len(tags),
        # Videos younger than one full day fall back to their raw view count.
        'views_per_day': _safe_ratio(views, video_age_days, default=views),
        'video_age_days': video_age_days,
        'engagement_rate': _safe_ratio(interactions, views),
        'efficiency': _safe_ratio(interactions, video_count),
        'engagement_subscribers': _safe_ratio(interactions, subscriber_count),
        'subscriber_count': subscriber_count,
        'channel_id': channel_id,
        'channel_title': snippet.get('channelTitle'),
        'channel_description': channel_snippet.get('description', ''),
        'channel_video_count': video_count,
        'channel_views': channel_views,
        'channel_created': channel_created,
        'channel_age_days': channel_age_days,
        'frecuencia_videos': _safe_ratio(video_count, channel_age_days),
        'views_rate_video': _safe_ratio(channel_views, video_count),
    }


def _collect_for_query(query):
    """Page through the search results of `query`, appending rows to video_data.

    Returns when TARGET_VIDEOS rows exist or the search has no further page.
    """
    next_page_token = None

    while len(video_data) < TARGET_VIDEOS:
        search_response = youtube.search().list(
            part="id",
            type="video",
            maxResults=BATCH_SIZE,
            order="relevance",
            publishedAfter=published_after,
            publishedBefore=published_before,
            q=query,
            pageToken=next_page_token
        ).execute()

        # Keep only ids not seen under a previous page or query.
        video_ids = [hit['id']['videoId'] for hit in search_response['items']]
        new_ids = [vid for vid in video_ids if vid not in video_ids_collected]
        video_ids_collected.update(new_ids)

        for start in range(0, len(new_ids), BATCH_SIZE):
            batch_ids = new_ids[start:start + BATCH_SIZE]
            video_response = youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=",".join(batch_ids)
            ).execute()

            for item in video_response['items']:
                # Deliberately broad: one bad video (missing field, failed
                # channel lookup, unparsable value) must not abort the run.
                try:
                    row = _build_row(item)
                except Exception as e:
                    print(f"Error procesando video {item.get('id')}: {e}")
                    continue

                if row is None:
                    continue

                video_data.append(row)
                if len(video_data) >= TARGET_VIDEOS:
                    return

        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            return

        # Short pause between consecutive search pages.
        time.sleep(1)


for query in queries:
    if len(video_data) >= TARGET_VIDEOS:
        break
    _collect_for_query(query)


# Write the collected rows to "<YYMMDD>_no_populares.csv", where the date
# prefix comes from the local (naive) current date.
timestamp = f"{datetime.now():%y%m%d}"
output_path = os.path.expanduser(timestamp + "_no_populares.csv")

# One CSV row per collected video; the DataFrame index is not written.
df = pd.DataFrame(video_data)
df.to_csv(output_path, index=False)
