In [113]:
# Social Media Mining - Garlasco case
# Goal: collect and save comments on YouTube videos about the reopening of the Chiara Poggi (Garlasco) case, published between 25 February 2025 and 8 June 2025 (the begin_date/end_date window configured below).
from googleapiclient.discovery import build
import csv
from datetime import datetime
import json
import os
from dotenv import load_dotenv

In [114]:
# Optional cleanup: uncomment to remove the "yt_key" variable from this process's environment
#os.environ.pop("yt_key", None)  

In [115]:
# YouTube API access: the key is read from the "yt_key" environment variable
# (load_dotenv() runs first, before the lookup)
load_dotenv()
DEVELOPER_KEY = os.getenv('yt_key')
if not DEVELOPER_KEY:
    # Missing or empty key: stop here instead of building an unusable client
    raise ValueError("Chiave API non trovata")
youtube = build(
    "youtube",
    "v3",
    developerKey=DEVELOPER_KEY,
)

In [116]:
# Target channels whose videos are collected (each entry is a channel name)
channels = [
    "Fanpage.it",
    "La Repubblica",
    "FABRIZIO CORONA",
    "Gianluca Spina",
    "Gianmarco Zagato",
    "La7 Attualità",
    "Rai",
    "Bugalalla Crime",
    "DarkSide - Storia Segreta d'Italia"
]

# Time window for the videos (from two weeks before the case was reopened up to the present day).
# Both bounds are naive datetimes at 00:00 of the given day.
begin_date = datetime(2025, 2, 25)
end_date = datetime(2025, 6, 8)

In [117]:
# Resolve a channel ID from a channel name
def getIDfromName(name):
    """Return the channel ID of the first search hit for ``name``.

    Sends a ``search().list`` request restricted to ``type="channel"`` with
    ``maxResults=1`` and returns the ``channelId`` of the single hit. The hit
    is not compared against ``name``, so whatever the API returns first is
    what the caller gets.

    Args:
        name: Search query, normally the channel name.

    Returns:
        The ``channelId`` string of the first result.

    Raises:
        IndexError: If the search returns an empty result list. The type is
            the one that indexing the empty list raised before, so existing
            handlers still match; the message now names the query.
    """
    request = youtube.search().list(
        part="snippet",
        q=name,
        type="channel",
        maxResults=1
    )
    response = request.execute()
    items = response['items']
    if not items:
        # Fail with a message that identifies the query instead of a bare
        # "list index out of range"
        raise IndexError(f"Nessun canale trovato per la ricerca '{name}'")
    return items[0]['id']['channelId']

# Look up the uploads playlist of a channel
def getChannelPlaylist(channel_id):
    """Return the uploads playlist ID of the given channel.

    Args:
        channel_id: Channel ID sent as ``id`` to ``channels().list``.

    Returns:
        The value found at ``contentDetails.relatedPlaylists.uploads`` of the
        first item in the response.
    """
    channel_info = youtube.channels().list(part="contentDetails", id=channel_id).execute()
    first_channel = channel_info['items'][0]
    related_playlists = first_channel['contentDetails']['relatedPlaylists']
    return related_playlists['uploads']

# Pull videos from a playlist and keep the ones related to the Garlasco case
def get_videos_from_channel(playlist_id, channel_name, keyword="garlasco"):
    """Collect the playlist's videos inside the date window that mention ``keyword``.

    Pages through ``playlistItems().list`` (50 items per page). An item is kept when:

    * the day of its ``publishedAt`` timestamp lies between the module-level
      ``begin_date`` and ``end_date``, both days included;
    * ``keyword`` occurs in its lower-cased title or description;
    * a follow-up ``videos().list`` request returns statistics containing
      ``commentCount``.

    Args:
        playlist_id: ID of the playlist to scan.
        channel_name: Label stored in the ``channel`` field of every result.
        keyword: Case-insensitive substring to search for (default "garlasco").

    Returns:
        A list of dicts with the keys ``video_id``, ``published_at``
        (``YYYY-MM-DD``), ``title`` and ``channel``.

    Items that cannot be parsed, or whose statistics request fails, are
    reported with a printed message and skipped.
    """
    needle = keyword.lower()
    # Compare calendar days: end_date is a midnight timestamp, so comparing
    # full datetimes would drop everything published later on the end day.
    first_day = begin_date.date()
    last_day = end_date.date()

    video_data = []
    next_page_token = None

    while True:
        request = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        for item in response['items']:
            try:
                snippet = item['snippet']
                video_date = datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
                if not first_day <= video_date.date() <= last_day:
                    continue

                # The space keeps a match from spanning the end of the title
                # and the start of the description.
                description = snippet.get('description') or ''
                combined_text = f"{snippet['title']} {description}".lower()
                if needle not in combined_text:
                    continue

                video_id = snippet['resourceId']['videoId']
                video_res = youtube.videos().list(
                    part="statistics",
                    id=video_id
                ).execute()

                # No item returned for this ID: nothing to record
                if not video_res['items']:
                    continue

                # Keep only videos whose statistics expose a comment count
                stats = video_res['items'][0].get('statistics', {})
                if 'commentCount' not in stats:
                    continue

                video_data.append({
                    "video_id": video_id,
                    "published_at": video_date.strftime("%Y-%m-%d"),
                    "title": snippet['title'],
                    "channel": channel_name
                })
            except Exception as e:
                # Best effort: one bad item must not stop the whole scan
                print(f"[Errore parsing video]: {e}")
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
    return video_data

# Main entry point: merge the results of every channel and save them to one JSON file
def get_videos_json(channels, output_path="garlasco_videos.json"):
    """Collect matching videos for every channel and write them to ``output_path``.

    For each channel name the channel ID is resolved, then its uploads
    playlist, then the matching videos. Any exception raised while handling a
    channel is printed and that channel is skipped.

    Args:
        channels: Iterable of channel names.
        output_path: Destination JSON file; it is overwritten.
    """
    collected = []
    for channel_name in channels:
        try:
            print(f"\n>>> Ricerca nel canale: {channel_name}")
            uploads_id = getChannelPlaylist(getIDfromName(channel_name))
            collected += get_videos_from_channel(uploads_id, channel_name)
        except Exception as err:
            print(f"[Errore canale {channel_name}]: {err}")

    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(collected, out_file, ensure_ascii=False, indent=2)

    print(f"\nSalvati {len(collected)} video in '{output_path}'")

In [118]:
# Start the video collection for the configured channels
# (the result is written to the function's default output file, garlasco_videos.json)
get_videos_json(channels)


>>> Ricerca nel canale: Fanpage.it

>>> Ricerca nel canale: La Repubblica

>>> Ricerca nel canale: FABRIZIO CORONA

>>> Ricerca nel canale: Gianluca Spina

>>> Ricerca nel canale: Gianmarco Zagato

>>> Ricerca nel canale: La7 Attualità

>>> Ricerca nel canale: Rai

>>> Ricerca nel canale: Bugalalla Crime

>>> Ricerca nel canale: DarkSide - Storia Segreta d'Italia

Salvati 239 video in 'garlasco_videos.json'


In [133]:
# Comment record
class Comment:
    """One comment or reply, as handled by the rest of this notebook.

    Args:
        id: Comment identifier.
        video_id: Identifier of the video the comment belongs to.
        content: Comment text.
        author: Author name.
        date: Either a ``datetime`` (stored unchanged) or a string in
            ``YYYY-MM-DD`` or ``YYYY-MM-DDTHH:MM:SSZ`` form.
        likes: Like count; converted with ``int()``.
        reply_to_id: Identifier of the parent comment, or ``None``.

    Raises:
        ValueError: If ``date`` is a string in neither accepted form, or
            ``likes`` cannot be converted to ``int``.
        TypeError: If ``date`` is neither a ``datetime`` nor a string.
    """

    # Textual date layouts, tried in this order
    _DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%SZ")

    def __init__(self, id, video_id, content, author, date, likes, reply_to_id=None):
        self.id = id
        self.video_id = video_id
        self.content = content
        self.author = author
        self.date = self._parse_date(date)
        self.likes = int(likes)
        self.reply_to_id = reply_to_id

    @classmethod
    def _parse_date(cls, value):
        """Return ``value`` as a ``datetime``, parsing it when it is a string."""
        # Callers building comments from API data already hold a datetime;
        # callers reloading saved JSON hold a "YYYY-MM-DD" string.
        if isinstance(value, datetime):
            return value
        for fmt in cls._DATE_FORMATS:
            try:
                return datetime.strptime(value, fmt)
            except ValueError:
                continue
        raise ValueError(f"Formato data non riconosciuto: {value!r}")

    def __repr__(self):
        return f"<Comment by {self.author} on {self.date.strftime('%Y-%m-%d')}>"


In [120]:
# Build Comment objects from the items of an API response
def _build_comment(resource, video_id, parent_id):
    """Turn one comment resource (top-level comment or reply) into a Comment."""
    details = resource["snippet"]
    published = datetime.strptime(details["publishedAt"], "%Y-%m-%dT%H:%M:%SZ")
    return Comment(
        id=resource["id"],
        video_id=video_id,
        content=details["textDisplay"],
        author=details["authorDisplayName"],
        # Comment's constructor accepts a "%Y-%m-%d" string, so the validated
        # timestamp is handed over in that form rather than as a datetime.
        date=published.strftime("%Y-%m-%d"),
        likes=details["likeCount"],
        reply_to_id=parent_id,
    )


def get_comments_from_response(items):
    """Convert comment-thread items into a flat list of Comment objects.

    Each item contributes its top-level comment followed by the replies found
    under ``item["replies"]["comments"]``. Replies carry the ID of their
    top-level comment in ``reply_to_id``.

    Args:
        items: Iterable of thread dicts (the ``items`` list of a
            ``commentThreads().list`` response).

    Returns:
        A list of Comment objects in the order they were encountered.

    A thread whose top-level comment cannot be parsed is reported and skipped
    together with its replies. A reply that cannot be parsed is reported and
    skipped on its own, so the remaining replies of the thread are kept.
    """
    comments = []
    for item in items:
        try:
            thread = item["snippet"]
            top = thread["topLevelComment"]
            video_id = thread["videoId"]
            parent_id = top["id"]
            comments.append(_build_comment(top, video_id, None))
            replies = item["replies"]["comments"] if "replies" in item else []
        except (KeyError, TypeError, ValueError) as e:
            print(f"[Errore parsing commento]: {e}")
            continue

        for reply in replies:
            try:
                comments.append(_build_comment(reply, video_id, parent_id))
            except (KeyError, TypeError, ValueError) as e:
                print(f"[Errore parsing commento]: {e}")
    return comments

# Fetch the comments of a single video, following pagination
def get_comments_one_vid(video_id):
    """Return the comments of ``video_id`` gathered from every result page.

    Requests ``commentThreads().list`` with ``part="snippet,replies"`` and 100
    threads per page, and keeps requesting while the response carries a
    ``nextPageToken``. If any exception is raised, it is printed and the
    comments gathered so far are returned.

    Args:
        video_id: ID of the video whose comment threads are requested.

    Returns:
        A list with the parsed comments of every page that was read.
    """
    collected = []
    query = {
        "part": "snippet,replies",
        "videoId": video_id,
        "textFormat": "plainText",
        "maxResults": 100,
    }
    try:
        while True:
            page = youtube.commentThreads().list(**query).execute()
            collected.extend(get_comments_from_response(page["items"]))
            token = page.get("nextPageToken")
            if not token:
                break
            # Only follow-up requests carry a page token
            query["pageToken"] = token
    except Exception as e:
        print(f"[Errore] Estrazione commenti video {video_id}: {e}")
    return collected

# Read the video IDs from a JSON file and gather all their comments into one JSON file
def get_all_comments(video_json_path, output_json_path="garlasco_comments.json"):
    """Download the comments of every listed video and save them as JSON.

    Args:
        video_json_path: JSON file holding a list of objects with a
            ``video_id`` key.
        output_json_path: Destination JSON file; it is overwritten once all
            videos have been processed.

    Each saved record has the keys ``comment_id``, ``video_id``, ``author``,
    ``content``, ``date`` (``YYYY-MM-DD``), ``likes`` and ``reply_to_id``.
    """
    with open(video_json_path, "r", encoding="utf-8") as source:
        video_ids = [entry["video_id"] for entry in json.load(source)]

    total = len(video_ids)
    rows = []
    for position, vid in enumerate(video_ids, start=1):
        print(f"[{position}/{total}] Estrazione commenti video {vid}")
        rows.extend(
            {
                "comment_id": comment.id,
                "video_id": comment.video_id,
                "author": comment.author,
                "content": comment.content,
                "date": comment.date.strftime("%Y-%m-%d"),
                "likes": comment.likes,
                "reply_to_id": comment.reply_to_id,
            }
            for comment in get_comments_one_vid(vid)
        )

    with open(output_json_path, "w", encoding="utf-8") as target:
        json.dump(rows, target, ensure_ascii=False, indent=2)

    print(f"\nSalvati {len(rows)} commenti in '{output_json_path}'")

In [121]:
# Extract the comments of every video listed in garlasco_videos.json
# and write them to garlasco_comments.json
get_all_comments("garlasco_videos.json", "garlasco_comments.json")

[1/239] Estrazione commenti video 0E2znss1aPI
[2/239] Estrazione commenti video oaS-KJ0mz20
[3/239] Estrazione commenti video hMD5rLYDUrk
[4/239] Estrazione commenti video 9jNySIytgp4
[5/239] Estrazione commenti video -tIRGBdPabA
[6/239] Estrazione commenti video MkJULnHa9Ug
[7/239] Estrazione commenti video rGQlVoynlWo
[8/239] Estrazione commenti video jcym9wXec3c
[9/239] Estrazione commenti video ibyXAAz-7eg
[10/239] Estrazione commenti video s-HdzEdrcQQ
[11/239] Estrazione commenti video PK-Q-YpAASU
[12/239] Estrazione commenti video HoJIDggfkDU
[13/239] Estrazione commenti video PXOOzIBVgKo
[14/239] Estrazione commenti video 46FI8BwAQts
[15/239] Estrazione commenti video QyZRLDgKGpM
[16/239] Estrazione commenti video fqALwc2qNKE
[17/239] Estrazione commenti video Uon0UXL9YtI
[18/239] Estrazione commenti video 3KLuu0X2wu8
[19/239] Estrazione commenti video Vr-ijcH1WUc
[20/239] Estrazione commenti video FBX4eUjjnuY
[21/239] Estrazione commenti video gX3_L0Y0Qj4
[22/239] Estrazione co

In [135]:
from collections import defaultdict

# Load the consolidated JSON files
with open("garlasco_comments.json", "r", encoding="utf-8") as f:
    comments = json.load(f)

with open("garlasco_videos.json", "r", encoding="utf-8") as f:
    videos = json.load(f)

# Per-video lookups, filled in a single pass:
#   Channel_of:    video_id -> channel
#   Videos_of:     channel  -> list of video_id
#   Date_of_video: video_id -> published_at
Channel_of = {}
Videos_of = defaultdict(list)
Date_of_video = {}
for v in videos:
    Channel_of[v["video_id"]] = v["channel"]
    Videos_of[v["channel"]].append(v["video_id"])
    Date_of_video[v["video_id"]] = v["published_at"]

# Auxiliary indexes, filled from the comments below
Commenters_of_video = defaultdict(set)
Videos_commented_by = defaultdict(set)
Channels_commented_by = defaultdict(set)
Commenters_of_channel = defaultdict(set)
Comments_of_channel = defaultdict(list)
Comments_by_user = defaultdict(list)

# Populate the indexes from the comments
for c in comments:
    video_id, author = c["video_id"], c["author"]
    # Comments on a video missing from the video file are filed under "Unknown"
    channel = Channel_of.get(video_id, "Unknown")

    comment_obj = Comment(
        id=c["comment_id"],
        video_id=video_id,
        content=c["content"],
        author=author,
        date=c["date"],
        likes=c["likes"],
        reply_to_id=c.get("reply_to_id"),
    )

    # Set-valued indexes
    Commenters_of_video[video_id].add(author)
    Videos_commented_by[author].add(video_id)
    Channels_commented_by[author].add(channel)
    Commenters_of_channel[channel].add(author)
    # List-valued indexes
    Comments_of_channel[channel].append(comment_obj)
    Comments_by_user[author].append(comment_obj)

# Debug: short summary
for label, size in (
    ("Video totali", len(Channel_of)),
    ("Utenti unici", len(Comments_by_user)),
    ("Canali totali", len(Videos_of)),
):
    print(f"{label}: {size}")


Video totali: 239
Utenti unici: 24851
Canali totali: 8


In [144]:
# Bipartite graph: users and videos
import networkx as nx

# Load the data
with open("garlasco_comments.json", "r", encoding="utf-8") as f:
    comments = json.load(f)

# Build the bipartite graph: users and videos
G_user_video = nx.Graph()
user_video_counts = defaultdict(lambda: defaultdict(int))

# Count the comments of every (user, video) pair; blank names/IDs are ignored
for c in comments:
    user = c["author"].strip()
    video = c["video_id"].strip()
    if user and video:
        user_video_counts[user][video] += 1

# NOTE(review): users and videos share one node-ID namespace, so a user whose
# name equals a video ID would end up on the same node. Consider prefixing the
# IDs if that can happen in the data.

# User nodes
for user in user_video_counts:
    G_user_video.add_node(user, type="user")

# Video nodes. The set gets its own name so that `videos` is not rebound from
# the list of video records to a set of ID strings.
video_nodes = {video_id for per_user in user_video_counts.values() for video_id in per_user}
for video in video_nodes:
    G_user_video.add_node(video, type="video")

# Weighted edges: weight = number of comments the user left on the video
for user, video_dict in user_video_counts.items():
    for video, count in video_dict.items():
        G_user_video.add_edge(user, video, weight=count, relation="commented")

# Export to GEXF
nx.write_gexf(G_user_video, "garlasco_user_video_bipartite.gexf")
print(f"Grafo Utente-Video salvato con {G_user_video.number_of_nodes()} nodi e {G_user_video.number_of_edges()} archi.")

Grafo Utente-Video salvato con 25089 nodi e 70760 archi.


In [145]:
# Bipartite graph: users and channels
with open("garlasco_videos.json", "r", encoding="utf-8") as f:
    videos = json.load(f)

# Map video_id -> channel
Channel_of = {v["video_id"]: v["channel"] for v in videos}

G_user_channel = nx.Graph()
user_channel_counts = defaultdict(lambda: defaultdict(int))

# Count the comments of every (user, channel) pair; comments on videos that
# are missing from the video file are attributed to "Unknown"
for c in comments:
    user = c["author"].strip()
    video = c["video_id"].strip()
    channel = Channel_of.get(video, "Unknown").strip()
    if user and channel:
        user_channel_counts[user][channel] += 1

# NOTE(review): users and channels share one node-ID namespace, so a user whose
# name equals a channel name would end up on the channel's node. Consider
# prefixing the IDs if that can happen in the data.

# User nodes
for user in user_channel_counts:
    G_user_channel.add_node(user, type="user")

# Channel nodes. The set gets its own name so that the `channels` configuration
# list is not replaced by a set of node names (which may include "Unknown").
channel_nodes = {name for per_user in user_channel_counts.values() for name in per_user}
for channel in channel_nodes:
    G_user_channel.add_node(channel, type="channel")

# Weighted edges: weight = number of comments the user left on the channel's videos
for user, channel_dict in user_channel_counts.items():
    for channel, count in channel_dict.items():
        G_user_channel.add_edge(user, channel, weight=count, relation="commented")

# Export to GEXF
nx.write_gexf(G_user_channel, "garlasco_user_channel_bipartite.gexf")
print(f"Grafo Utente-Canale salvato con {G_user_channel.number_of_nodes()} nodi e {G_user_channel.number_of_edges()} archi.")

Grafo Utente-Canale salvato con 24858 nodi e 35533 archi.
