In [None]:
import requests
import csv
from datetime import datetime, timedelta
import time
import json
from dotenv import load_dotenv
import os

load_dotenv()

# --- Configuration ---
# Credentials are read from the environment (populated from a local .env file by
# load_dotenv() above) so that no secret is stored in the notebook itself.
# Required variables: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME,
# REDDIT_PASSWORD. Optional: REDDIT_USER_AGENT (defaults to the username).
# NOTE(review): the credentials that used to be hard-coded here must be treated
# as compromised and rotated.
_required_env = ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USERNAME", "REDDIT_PASSWORD")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    print(f"❌ Variabili d'ambiente mancanti: {', '.join(_missing_env)}")
    raise SystemExit(1)

client_id = os.environ["REDDIT_CLIENT_ID"]
client_secret = os.environ["REDDIT_CLIENT_SECRET"]
username = os.environ["REDDIT_USERNAME"]
password = os.environ["REDDIT_PASSWORD"]
user_agent = os.getenv("REDDIT_USER_AGENT", username)
subreddit = 'politics'
query = 'russian AND invasion'
comment_score_min = 0
output_json = 'russian_invasion_rel_p.json'

# --- Pagination settings ---
max_pages = 50  # Maximum number of pages to fetch (100 posts per page = 5000 posts max)
request_delay = 1.5  # Seconds to wait between search requests
limit_per_page = 100  # Reddit listings return at most 100 items per request

# --- Step 1: OAuth2 token ---
print("🔐 Ottenendo token di accesso...")
auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
data = {'grant_type': 'password', 'username': username, 'password': password}
headers = {'User-Agent': user_agent}

try:
    res = requests.post(
        'https://www.reddit.com/api/v1/access_token',
        auth=auth,
        data=data,
        headers=headers,
        timeout=30,  # never let a stalled connection hang the cell
    )
    res.raise_for_status()
    token_payload = res.json()
    if 'access_token' not in token_payload:
        # Surface the server's own explanation instead of a bare KeyError.
        raise RuntimeError(token_payload.get('error', token_payload))
    token = token_payload['access_token']
    headers['Authorization'] = f'bearer {token}'
    print("✅ Token ottenuto con successo!")
except Exception as e:
    print(f"❌ Errore nell'ottenere il token: {e}")
    # SystemExit stops the cell (and a plain script) without relying on the
    # interactive-only `exit` helper.
    raise SystemExit(1)

# --- Helper: recursive comment extractor ---
def extract_comments(children, threshold):
    """Flatten a Reddit comment tree into a list of comment records.

    Args:
        children: iterable of Reddit "thing" dicts (each with 'kind' and
            'data' keys), as found under ``['data']['children']`` of a
            comment listing. ``None`` is treated as an empty list.
        threshold: minimum score (inclusive) a comment needs to be kept.

    Returns:
        A list of dicts with keys 'author', 'score', 'body' and
        'created_utc' (naive ISO-8601 string in UTC, or None when the
        comment carries no timestamp). A parent comes before its replies.

    Only entries of kind 't1' are considered. Comments whose body is empty,
    whitespace-only, or a "[deleted…" / "[removed…" placeholder are skipped,
    but their replies are still visited.
    """
    # Local import: the file header only imports datetime and timedelta.
    from datetime import timezone

    placeholder_prefixes = ('[deleted', '[removed')
    results = []
    for child in children or []:
        if child.get('kind') != 't1':
            continue
        data = child.get('data') or {}
        # `or` also covers keys that are present but set to None.
        score = data.get('score') or 0
        body = data.get('body') or ''

        is_placeholder = body.lower().startswith(placeholder_prefixes)
        if score >= threshold and body.strip() and not is_placeholder:
            created = data.get('created_utc')
            if created is None:
                created_iso = None
            else:
                # Timezone-aware conversion replaces the deprecated
                # utcfromtimestamp(); tzinfo is dropped afterwards so the
                # string keeps the same offset-free format as before.
                created_iso = (
                    datetime.fromtimestamp(created, tz=timezone.utc)
                    .replace(tzinfo=None)
                    .isoformat()
                )
            results.append({
                'author': data.get('author'),
                'score': score,
                'body': body,
                'created_utc': created_iso
            })

        # Only a dict value of 'replies' is descended into.
        replies = data.get('replies')
        if isinstance(replies, dict):
            nested = (replies.get('data') or {}).get('children') or []
            results.extend(extract_comments(nested, threshold))
    return results

# --- Main loop: pagination via the "after" cursor ---
# Fetches up to max_pages pages of search results; for every post it also
# downloads the comment tree and keeps the comments selected by
# extract_comments(). Results accumulate in all_data (one dict per post).
all_data = []
total_posts = 0
total_comments = 0
current_page = 1
after_token = None  # Cursor returned by the previous listing; None on the first request
request_timeout = 30  # Seconds; without a timeout a stalled connection would block the cell forever
checkpoint_every = 10  # Snapshot all_data to a temp file after this many completed pages

print(f"\n📄 Inizio raccolta dati con paginazione (max {max_pages} pagine)...")
print(f"🔍 Query: '{query}' in r/{subreddit}")

while current_page <= max_pages:
    print(f"\n📃 Pagina {current_page}/{max_pages}...")

    # Search parameters
    search_url = f'https://oauth.reddit.com/r/{subreddit}/search'
    params = {
        'q': query,
        'limit': limit_per_page,
        'sort': 'relevance',  # Keep relevance ordering, as requested
        'restrict_sr': True
    }

    # Continue from where the previous page stopped
    if after_token:
        params['after'] = after_token

    try:
        resp = requests.get(search_url, headers=headers, params=params, timeout=request_timeout)

        if resp.status_code != 200:
            print(f"⚠️ Errore API: {resp.status_code}")
            print(f"Response: {resp.text}")
            break

        data = resp.json()
        posts = data['data']['children']
        after_token = data['data'].get('after')  # Cursor for the next page

        print(f"   📝 Trovati {len(posts)} posts in questa pagina")

        # Nothing left to collect
        if not posts:
            print("   ℹ️ Nessun post trovato, fine raccolta")
            break

        # No cursor means this is the last page; it is still processed below
        if not after_token:
            print("   ℹ️ Ultima pagina raggiunta (nessun 'after' token)")

        page_posts = 0
        page_comments = 0

        # Process every post of the page
        for post in posts:
            post_data = post['data']
            post_id = post_data['id']
            title = post_data['title']
            selftext = post_data.get('selftext', '')
            score = post_data['score']
            author = post_data.get('author', '[deleted]')
            created_utc = datetime.utcfromtimestamp(post_data['created_utc']).isoformat()
            num_comments = post_data['num_comments']

            # Fetch the comment tree of this post; a failure here only costs
            # the comments of this post, not the whole page.
            comment_url = f'https://oauth.reddit.com/comments/{post_id}.json'
            try:
                response = requests.get(
                    comment_url,
                    headers=headers,
                    params={'depth': 10, 'limit': 500},
                    timeout=request_timeout,
                )
                if response.status_code == 200:
                    # Element 0 is the post itself, element 1 the comment listing
                    comment_blob = response.json()[1]['data']['children']
                    high_comments = extract_comments(comment_blob, comment_score_min)
                else:
                    print(f"      ⚠️ Errore {response.status_code} per comments post {post_id}")
                    high_comments = []
            except Exception as e:
                print(f"      ⚠️ Errore comments per post {post_id}: {e}")
                high_comments = []

            # Build the record
            post_record = {
                'page_number': current_page,
                'post_id': post_id,
                'title': title,
                'author': author,
                'score': score,
                'created_utc': created_utc,
                'selftext': selftext,
                'num_comments': num_comments,
                'high_score_comments': high_comments
            }

            all_data.append(post_record)
            page_posts += 1
            page_comments += len(high_comments)
            # Running totals are updated per post so they stay in sync with
            # all_data even if a later post aborts the page.
            total_posts += 1
            total_comments += len(high_comments)

            # Short pause between posts
            time.sleep(0.1)

        print(f"   ✅ {page_posts} posts, {page_comments} commenti raccolti da pagina {current_page}")
        print(f"   📊 Totale finora: {total_posts} posts, {total_comments} commenti")

        # Without a cursor there is no next page
        if not after_token:
            break

    except Exception as e:
        print(f"❌ Errore generale pagina {current_page}: {e}")
        break

    # Move on to the next page, remembering which one was just completed
    completed_page = current_page
    current_page += 1

    # Pause between search requests
    time.sleep(request_delay)

    # Checkpoint after every `checkpoint_every` completed pages (10, 20, ...)
    if completed_page % checkpoint_every == 0:
        print(f"💾 Salvataggio progressi... ({total_posts} posts totali finora)")
        with open(f"temp_{output_json}", 'w', encoding='utf-8') as f:
            json.dump(all_data, f, ensure_ascii=False, indent=2)
# --- Final save ---
print("\n💾 Salvataggio finale...")
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)

# Final statistics, derived from what was actually saved.
# `current_page - 1` undercounts by one whenever the loop ends via `break` on
# its last page (the counter is only incremented when another page is about to
# be requested), so the pages are counted from the records themselves.
pages_processed = len({record['page_number'] for record in all_data})
saved_posts = len(all_data)
saved_comments = sum(len(record['high_score_comments']) for record in all_data)

print("\n🎉 COMPLETATO!")
print("📊 Statistiche finali:")
print(f"   📄 Pagine processate: {pages_processed}")
print(f"   📝 Posts totali: {saved_posts}")
print(f"   💬 Commenti totali: {saved_comments}")
print(f"   📄 File salvato: {output_json}")
if pages_processed:
    print(f"   ⏱️ Media posts/pagina: {saved_posts/pages_processed:.1f}")

# Remove the checkpoint file, now superseded by the final output
if os.path.exists(f"temp_{output_json}"):
    os.remove(f"temp_{output_json}")
    print("🗑️ File temporaneo rimosso")

print(f"\n✨ Raccolta completata! Trovati dati da {len(set(record['post_id'] for record in all_data))} post unici.")

In [1]:
import json
import pandas as pd

NOME_FILE = "russian_invasion_rel_p"

# --- Load the JSON file ---
# Expected content: a list of post records, each with a 'high_score_comments' list.
with open(f'{NOME_FILE}.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# --- Statistics ---
num_posts = len(data)
num_comments = sum(len(post['high_score_comments']) for post in data)

print(f"📌 Numero totale di post: {num_posts}")
print(f"💬 Numero totale di commenti ad alto punteggio: {num_comments}")

# --- Build the DataFrame: one row per (post, comment) pair ---
# Posts without any retained comment contribute no rows.
rows = []
for post in data:
    post_id = post['post_id']
    post_title = post['title']
    post_author = post['author']
    post_score = post['score']
    post_created_utc = post['created_utc']

    for comment in post['high_score_comments']:
        rows.append({
            'post_id': post_id,
            'post_title': post_title,
            'post_author': post_author,
            'post_score': post_score,
            'post_created_utc': post_created_utc,
            'comment_author': comment['author'],
            'comment_score': comment['score'],
            'comment_body': comment['body'],
            'comment_created_utc': comment['created_utc']
        })

df = pd.DataFrame(rows)

# Export to CSV first: a notebook only renders the LAST expression of a cell,
# so the preview below must come after this statement to be displayed.
df.to_csv(f'{NOME_FILE}.csv', index=False)

# --- Preview of the DataFrame ---
print("\n🧾 Esempio del DataFrame:")
df.tail()


📌 Numero totale di post: 197
💬 Numero totale di commenti ad alto punteggio: 17370

🧾 Esempio del DataFrame:
