# In [None]:
import os
import json
import time
from datetime import datetime, timedelta
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment
# (python-dotenv), before the configuration below is evaluated.
load_dotenv()

# --- Configuration ---
# The API key is read from the environment (populated from the .env file by
# load_dotenv() above) instead of being embedded in the source.
# NOTE: any key that was ever committed in this file should be treated as
# compromised and rotated in the Google Cloud console.
API_KEY = os.getenv("YOUTUBE_API_KEY")
query = 'war ukraine'  # You can also use 'zelensky trump' for an AND search
max_results_per_search = 50  # At most 50 per request
max_videos = 2000  # Total number of videos to process
output_json = 'youtube_war_ukraine_2.json'

# --- TIME FILTERS ---
# Specific time ranges can be set (ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ)
published_after = '2024-03-01T00:00:00Z'  # Start of range
published_before = '2025-07-01T23:59:59Z'  # End of range

# --- SETTINGS ---
request_delay = 1.0  # Seconds between search pages, to respect rate limits
comment_score_min = 5  # Minimum like count for a comment to be kept
max_comments_per_video = 200  # Max comments per video, to save quota

def initialize_youtube_api():
    """Build and return the YouTube Data API v3 client.

    The module-level ``API_KEY`` is passed to ``build`` as the developer key.

    Returns:
        The client object returned by ``build('youtube', 'v3', ...)``.

    Raises:
        SystemExit: With exit code 1 when ``API_KEY`` is empty or when the
            client cannot be built. An error message is printed first.
    """
    if not API_KEY:
        print("❌ Errore: YOUTUBE_API_KEY non trovata nel file .env")
        # ``exit`` is a helper injected by the ``site`` module for interactive
        # sessions and is not guaranteed to exist; raise SystemExit directly.
        raise SystemExit(1)

    # Only the client construction is guarded, so the handler cannot mask
    # unrelated failures.
    try:
        youtube = build('youtube', 'v3', developerKey=API_KEY)
    except Exception as e:
        print(f"❌ Errore nell'inizializzare YouTube API: {e}")
        raise SystemExit(1) from e

    print("✅ YouTube API inizializzata con successo!")
    return youtube

def search_videos(youtube, query, published_after=None, published_before=None, max_results=50, page_token=None):
    """Fetch one page of video search results, ordered by date.

    Args:
        youtube: API client exposing ``search().list(...).execute()``.
        query: Search string sent as ``q``.
        published_after: Optional lower bound, sent as ``publishedAfter``.
        published_before: Optional upper bound, sent as ``publishedBefore``.
        max_results: Page size, sent as ``maxResults``.
        page_token: Optional token of the page to fetch.

    Returns:
        The response returned by ``execute()``, or ``None`` when the request
        raised (the error is printed).
    """
    # Optional parameters are only sent when they carry a truthy value.
    optional = {
        'publishedAfter': published_after,
        'publishedBefore': published_before,
        'pageToken': page_token,
    }

    try:
        params = dict(
            q=query,
            part='id,snippet',
            type='video',
            maxResults=max_results,
            order='date',  # newest/oldest ordering by publication date
            regionCode='US',  # can be changed, e.g. to 'IT' for Italian results
        )
        params.update({name: value for name, value in optional.items() if value})
        return youtube.search().list(**params).execute()
    except HttpError as e:
        print(f"❌ Errore HTTP nella ricerca: {e}")
    except Exception as e:
        print(f"❌ Errore generico nella ricerca: {e}")
    return None

def get_video_details(youtube, video_ids):
    """Look up statistics, content details and snippet for a batch of videos.

    Args:
        youtube: API client exposing ``videos().list(...).execute()``.
        video_ids: Iterable of video id strings; they are sent comma-joined.

    Returns:
        The response returned by ``execute()``, or ``None`` when the request
        raised ``HttpError`` (the error is printed).
    """
    id_list = ','.join(video_ids)
    try:
        request = youtube.videos().list(
            part='statistics,contentDetails,snippet',
            id=id_list,
        )
        return request.execute()
    except HttpError as e:
        print(f"❌ Errore nell'ottenere dettagli video: {e}")
    return None

def get_video_comments(youtube, video_id, max_results=100, min_likes=None):
    """Collect well-liked top-level comments (with their replies) of a video.

    Comment threads are requested page by page, ordered by relevance, until
    ``max_results`` comments have passed the like filter or no further page
    exists. Because the filter is applied after fetching, more threads than
    ``max_results`` may be requested in total.

    Args:
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        video_id: Id of the video whose comments are fetched.
        max_results: Maximum number of top-level comments to return.
        min_likes: Minimum ``likeCount`` a comment or reply needs to be kept.
            ``None`` (the default) uses the module-level ``comment_score_min``.

    Returns:
        A list of dicts with ``author``, ``text``, ``like_count``,
        ``published_at`` and ``updated_at``; threads that carry replies also
        get a ``replies`` list (filtered by the same threshold). On an API
        error the comments gathered so far are returned and a message is
        printed.
    """
    if min_likes is None:
        min_likes = comment_score_min

    comments = []
    next_page_token = None

    try:
        while len(comments) < max_results:
            # Ask only for what is still missing, capped at 100 per request.
            remaining = max_results - len(comments)
            request_params = {
                'part': 'snippet,replies',
                'videoId': video_id,
                'maxResults': min(100, remaining),
                'order': 'relevance',  # or 'time' for chronological order
                'textFormat': 'plainText',
            }
            if next_page_token:
                request_params['pageToken'] = next_page_token

            response = youtube.commentThreads().list(**request_params).execute()

            for item in response.get('items', []):
                comment = item['snippet']['topLevelComment']['snippet']

                # Skip threads whose top-level comment is below the threshold;
                # their replies are dropped with them.
                if comment.get('likeCount', 0) < min_likes:
                    continue

                comment_data = {
                    'author': comment.get('authorDisplayName', '[Unknown]'),
                    'text': comment.get('textDisplay', ''),
                    'like_count': comment.get('likeCount', 0),
                    'published_at': comment.get('publishedAt', ''),
                    'updated_at': comment.get('updatedAt', ''),
                }

                # The 'replies' key is only added when the thread has replies.
                if 'replies' in item:
                    reply_snippets = (reply['snippet'] for reply in item['replies']['comments'])
                    comment_data['replies'] = [
                        {
                            'author': snippet.get('authorDisplayName', '[Unknown]'),
                            'text': snippet.get('textDisplay', ''),
                            'like_count': snippet.get('likeCount', 0),
                            'published_at': snippet.get('publishedAt', ''),
                        }
                        for snippet in reply_snippets
                        if snippet.get('likeCount', 0) >= min_likes
                    ]

                comments.append(comment_data)

            # Stop when the API reports no further page.
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

            # Short pause between page requests.
            time.sleep(0.5)

    except HttpError as e:
        if 'commentsDisabled' in str(e):
            print(f"      ℹ️ Commenti disabilitati per video {video_id}")
        else:
            print(f"      ⚠️ Errore nell'ottenere commenti per {video_id}: {e}")
    except Exception as e:
        print(f"      ⚠️ Errore generico commenti per {video_id}: {e}")

    return comments

def _build_video_record(page_number, video, details, comments):
    """Assemble the JSON record stored for one video.

    Args:
        page_number: Search page on which the video was found.
        video: One item of the search response (``id.videoId`` and ``snippet``).
        details: Matching item of the video-details response, or ``{}``.
        comments: Comment dicts collected for the video.

    Returns:
        A flat dict with video metadata, statistics and the comments.
    """
    snippet = video['snippet']
    stats = details.get('statistics', {})
    content_details = details.get('contentDetails', {})
    thumbnails = snippet['thumbnails']

    return {
        'page_number': page_number,
        'video_id': video['id']['videoId'],
        'title': snippet['title'],
        'description': snippet.get('description', '')[:500],  # first 500 chars
        'channel_title': snippet['channelTitle'],
        'channel_id': snippet['channelId'],
        'published_at': snippet['publishedAt'],
        'thumbnail_url': thumbnails['high']['url'] if 'high' in thumbnails else '',

        # Statistics (missing counters default to 0)
        'view_count': int(stats.get('viewCount', 0)),
        'like_count': int(stats.get('likeCount', 0)),
        'comment_count': int(stats.get('commentCount', 0)),
        'duration': content_details.get('duration', ''),

        # Collected comments
        'comments': comments,
        'comments_collected': len(comments),
    }


def main():
    """Run the scrape: search videos, fetch details and comments, save JSON.

    Pages of search results are processed until ``max_videos`` videos were
    handled or the search has no more results. A checkpoint file
    ``temp_<output_json>`` is written every 5 processed pages and removed after
    the final file ``output_json`` has been written. Progress and a rough quota
    estimate are printed along the way.
    """
    print("🎥 YouTube Data Scraper - Avvio raccolta dati...")
    print(f"🔍 Query: '{query}'")
    print(f"📅 Range temporale: {published_after} -> {published_before}")

    # Initialise the API client (exits the program on failure).
    youtube = initialize_youtube_api()

    all_data = []
    total_videos = 0
    total_comments = 0
    next_page_token = None
    page_number = 1       # number of the page about to be requested
    pages_processed = 0   # pages whose videos were actually handled
    temp_file = f"temp_{output_json}"

    # Rough running estimate of the API quota consumed.
    estimated_quota = 0

    while total_videos < max_videos:
        print(f"\n📄 Pagina {page_number} - Ricerca video...")

        # Never ask for more videos than are still needed.
        remaining_videos = max_videos - total_videos
        search_limit = min(max_results_per_search, remaining_videos)

        search_response = search_videos(
            youtube, query, published_after, published_before,
            search_limit, next_page_token
        )

        if not search_response or not search_response.get('items'):
            print("   ℹ️ Nessun video trovato o fine risultati")
            break

        estimated_quota += 100  # cost of a search request
        videos = search_response['items']
        print(f"   📹 Trovati {len(videos)} video")

        # One batched details request for the whole page.
        video_ids = [video['id']['videoId'] for video in videos]
        video_details_response = get_video_details(youtube, video_ids)
        estimated_quota += 1  # cost of the details request

        # Index the details by video id; stays empty if the request failed.
        video_details_map = {}
        if video_details_response:
            video_details_map = {
                detail['id']: detail for detail in video_details_response['items']
            }

        page_videos = 0
        page_comments = 0

        for video in videos:
            video_id = video['id']['videoId']
            print(f"   🎬 Processando: {video['snippet']['title'][:50]}...")

            comments = get_video_comments(youtube, video_id, max_comments_per_video)
            # Estimate only: the real number of comment requests is not known here.
            estimated_quota += max(1, len(comments) // 100)

            all_data.append(
                _build_video_record(
                    page_number, video, video_details_map.get(video_id, {}), comments
                )
            )
            page_videos += 1
            page_comments += len(comments)

            # Small pause between videos.
            time.sleep(0.2)

        pages_processed += 1
        total_videos += page_videos
        total_comments += page_comments

        print(f"   ✅ {page_videos} video, {page_comments} commenti raccolti")
        print(f"   📊 Totale: {total_videos} video, {total_comments} commenti")
        print(f"   📈 Quota stimata usata: {estimated_quota}/10000")

        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            print("   ℹ️ Non ci sono più pagine disponibili")
            break

        page_number += 1

        # Pause between pages to respect rate limits.
        time.sleep(request_delay)

        # Checkpoint every 5 processed pages (counted on completed pages, not
        # on the already-incremented number of the next page).
        if pages_processed % 5 == 0:
            print(f"   💾 Salvataggio progressi...")
            with open(temp_file, 'w', encoding='utf-8') as f:
                json.dump(all_data, f, ensure_ascii=False, indent=2)

    # Final save
    print(f"\n💾 Salvataggio finale in {output_json}...")
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    # Final statistics
    print(f"\n🎉 COMPLETATO!")
    print(f"📊 Statistiche finali:")
    print(f"   📄 Pagine processate: {pages_processed}")
    print(f"   🎬 Video totali: {total_videos}")
    print(f"   💬 Commenti totali: {total_comments}")
    print(f"   📈 Quota stimata usata: {estimated_quota}/10000 ({(estimated_quota/10000)*100:.1f}%)")
    print(f"   📄 File salvato: {output_json}")

    if total_videos > 0:
        print(f"   ⏱️ Media commenti/video: {total_comments/total_videos:.1f}")

    # The checkpoint is redundant once the final file exists.
    if os.path.exists(temp_file):
        os.remove(temp_file)
        print(f"🗑️ File temporaneo rimosso")

    print(f"\n✨ Raccolta completata!")


if __name__ == "__main__":
    main()

# In [None]:
import json
import pandas as pd
from datetime import datetime

# --- Load the JSON file ---
with open('youtube_war_ukraine_2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# --- Statistics ---
# Totals are accumulated in a single pass over the video records.
num_videos = len(data)
num_comments = 0
total_views = 0
total_likes = 0
for video in data:
    num_comments += video['comments_collected']
    total_views += video['view_count']
    total_likes += video['like_count']

print(f"🎥 Numero totale di video: {num_videos}")
print(f"💬 Numero totale di commenti raccolti: {num_comments}")
print(f"👀 Visualizzazioni totali: {total_views:,}")
print(f"👍 Like totali: {total_likes:,}")

# Averages are only meaningful when at least one video was loaded.
if num_videos > 0:
    print(f"📊 Media commenti per video: {num_comments/num_videos:.1f}")
    print(f"📊 Media visualizzazioni per video: {total_views/num_videos:,.0f}")

# --- Build the DataFrame ---
# One row per comment and per reply; a video without comments still gets a
# single row whose comment columns are empty.
rows = []

for video in data:
    description = video['description']

    # Video-level columns, repeated on every row emitted for this video.
    video_fields = {
        'video_id': video['video_id'],
        'video_title': video['title'],
        'channel_title': video['channel_title'],
        'channel_id': video['channel_id'],
        'video_published_at': video['published_at'],
        'video_view_count': video['view_count'],
        'video_like_count': video['like_count'],
        'video_comment_count': video['comment_count'],
        'video_duration': video['duration'],
        # Descriptions longer than 200 characters are cut and marked with '...'.
        'video_description': description[:200] + '...' if len(description) > 200 else description,
    }

    if not video['comments']:
        # Placeholder row so the video is not lost from the table.
        rows.append({
            **video_fields,
            'comment_author': None,
            'comment_text': None,
            'comment_like_count': None,
            'comment_published_at': None,
            'comment_updated_at': None,
            'comment_replies_count': None,
            'is_reply': False,
        })
        continue

    for comment in video['comments']:
        replies = comment.get('replies', [])

        # Top-level comment
        rows.append({
            **video_fields,
            'comment_author': comment['author'],
            'comment_text': comment['text'],
            'comment_like_count': comment['like_count'],
            'comment_published_at': comment['published_at'],
            'comment_updated_at': comment['updated_at'],
            'comment_replies_count': len(replies),
            'is_reply': False,
        })

        # Replies, if any
        for reply in replies:
            rows.append({
                **video_fields,
                'comment_author': reply['author'],
                'comment_text': reply['text'],
                'comment_like_count': reply['like_count'],
                'comment_published_at': reply['published_at'],
                # Replies may lack 'updated_at'; fall back to the publish time.
                'comment_updated_at': reply.get('updated_at', reply['published_at']),
                'comment_replies_count': 0,  # replies have no sub-replies
                'is_reply': True,
            })

# Create the DataFrame
df = pd.DataFrame(rows)

# --- Additional processing ---
if not df.empty:
    # Parse the timestamp strings so they can be used for time-based analysis.
    df['video_published_at'] = pd.to_datetime(df['video_published_at'])
    df['comment_published_at'] = pd.to_datetime(df['comment_published_at'])

    # Drop the timezone info so the column can be subtracted from a naive "now".
    df['video_published_at'] = df['video_published_at'].dt.tz_localize(None)

    # Derived columns useful for the analysis.
    # The publish timestamps carry a 'Z' (UTC) suffix, so "now" must be naive
    # UTC as well; local datetime.now() would skew the age by the UTC offset.
    now_utc = pd.Timestamp.now(tz='UTC').tz_localize(None)
    df['video_age_days'] = (now_utc - df['video_published_at']).dt.days
    df['comment_length'] = df['comment_text'].str.len()
    
    # Estrai durata in secondi (da formato PT1M30S)
    def parse_duration(duration_str):
        """Convert an ISO 8601 duration such as ``PT1M30S`` to whole seconds.

        Week, day, hour, minute and second designators are supported (for
        example ``P1DT2H`` is 93600 seconds); each component is optional.

        Args:
            duration_str: The duration string; may be empty or ``None``.

        Returns:
            The duration in seconds as an ``int``, or 0 when the value is
            missing, is not a string, or does not match the expected format.
        """
        import re  # local import: the module is not imported at file level

        if not duration_str or not isinstance(duration_str, str):
            return 0

        # One anchored pattern instead of successive splits, so unexpected
        # input is rejected as a whole rather than caught by a bare except.
        match = re.fullmatch(
            r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
            duration_str,
        )
        if match is None:
            return 0

        weeks, days, hours, minutes, seconds = (
            int(group) if group else 0 for group in match.groups()
        )
        total_hours = (weeks * 7 + days) * 24 + hours
        return (total_hours * 60 + minutes) * 60 + seconds
    
    # Video duration per row: first in seconds, then converted to minutes.
    df['video_duration_seconds'] = df['video_duration'].map(parse_duration)
    df['video_duration_minutes'] = df['video_duration_seconds'].div(60)

# --- DataFrame overview ---
print(f"\n🧾 DataFrame creato con {len(df)} righe")
print(f"📊 Colonne disponibili: {list(df.columns)}")

if not df.empty:
    print("\n📋 Esempio del DataFrame:")
    print(df[['video_title', 'channel_title', 'comment_author', 'comment_like_count', 'is_reply']].head())

    # Videos without comments contribute a placeholder row (null comment_text,
    # is_reply False). Those rows must not be counted as top-level comments.
    reply_mask = df['is_reply'].astype(bool)
    comment_mask = df['comment_text'].notna()

    print(f"\n📈 Statistiche rapide:")
    print(f"   🎬 Video unici: {df['video_id'].nunique()}")
    print(f"   📺 Canali unici: {df['channel_id'].nunique()}")
    print(f"   👤 Autori commenti unici: {df['comment_author'].nunique()}")
    print(f"   💬 Commenti principali: {int((comment_mask & ~reply_mask).sum())}")
    print(f"   🔄 Risposte: {int(reply_mask.sum())}")

    # Like statistics only make sense when at least one comment exists.
    if df['comment_like_count'].notna().any():
        print(f"   👍 Like medio per commento: {df['comment_like_count'].mean():.1f}")
        print(f"   👍 Commento con più like: {df['comment_like_count'].max()}")

    # Top channels by number of distinct videos
    print(f"\n🏆 Top 5 canali per numero di video:")
    top_channels = df.groupby('channel_title')['video_id'].nunique().sort_values(ascending=False).head()
    for channel, count in top_channels.items():
        print(f"   📺 {channel}: {count} video")
# --- Saving ---
if not df.empty:
    # Full table as CSV
    csv_filename = 'youtube_war_ukraine_data_2.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"\n💾 DataFrame salvato come: {csv_filename}")

    # Comment-only dataset: rows of videos without comments are dropped.
    df_comments_only = df.dropna(subset=['comment_text'])
    if not df_comments_only.empty:
        comments_csv = 'youtube_war_ukraine_comments_only_2.csv'
        df_comments_only.to_csv(comments_csv, index=False, encoding='utf-8')
        print(f"💾 Solo commenti salvati come: {comments_csv}")

    # Per-video summary; named aggregation yields the final column names
    # directly, in the order they are listed here.
    video_summary = (
        df.groupby(['video_id', 'video_title', 'channel_title'])
        .agg(
            view_count=('video_view_count', 'first'),
            like_count=('video_like_count', 'first'),
            total_comments=('video_comment_count', 'first'),
            duration_minutes=('video_duration_minutes', 'first'),
            comments_collected=('comment_like_count', 'count'),
            total_comment_likes=('comment_like_count', 'sum'),
            avg_comment_likes=('comment_like_count', 'mean'),
            published_at=('video_published_at', 'first'),
        )
        .reset_index()
    )

    video_summary_csv = 'youtube_war_ukraine_video_summary_2.csv'
    video_summary.to_csv(video_summary_csv, index=False, encoding='utf-8')
    print(f"💾 Summary video salvato come: {video_summary_csv}")

print(f"\n✨ Elaborazione completata!")