In [None]:
import pandas as pd
import json
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict

# Load the guest table: each row pairs a guest ('Nome') with a video ('video_id')
guest_df = pd.read_csv('../data/guest_data_with_podcast_score.csv')

# Load the commenters of each video (mapping keyed by video id).
# The encoding is set explicitly so decoding does not depend on the platform's
# locale default.
with open('../data/video_commenters_final.json', 'r', encoding='utf-8') as f:
    video_commenters = json.load(f)

# Build both lookup tables (video -> guests, guest -> videos) in one pass over
# the two relevant columns instead of running iterrows() twice.
video_to_guests = defaultdict(list)
guest_to_videos = defaultdict(list)
for guest_name, video_id in zip(guest_df['Nome'], guest_df['video_id']):
    video_to_guests[video_id].append(guest_name)
    guest_to_videos[guest_name].append(video_id)

# Collect, for every guest, the union of commenters over all of their videos.
# Videos with no entry in the commenters file contribute nothing.
guest_to_commenters = defaultdict(set)
for guest_name, videos in guest_to_videos.items():
    for video_id in videos:
        if video_id in video_commenters:
            guest_to_commenters[guest_name].update(video_commenters[video_id])

# Identify guest pairs who appeared in the same video (to exclude).
# Pairs are stored as sorted tuples so that (a, b) and (b, a) are the same key.
guests_who_appeared_together = set()
for video_id, guests in video_to_guests.items():
    for i, first_guest in enumerate(guests):
        for second_guest in guests[i + 1:]:
            guests_who_appeared_together.add(tuple(sorted([first_guest, second_guest])))

# Create the graph with one node per guest
G = nx.Graph()
G.add_nodes_from(guest_to_videos)

# Add an edge between every pair of guests that share at least one commenter.
# The name list is materialised once, rather than once per outer iteration.
guest_names = list(guest_to_videos)
for i, guest1 in enumerate(guest_names):
    for guest2 in guest_names[i + 1:]:
        # Skip if they appeared together in a video
        if tuple(sorted([guest1, guest2])) in guests_who_appeared_together:
            continue

        # Calculate shared commenters
        shared_commenters = guest_to_commenters[guest1].intersection(guest_to_commenters[guest2])
        num_shared = len(shared_commenters)

        # The edge weight is the number of shared commenters
        if num_shared > 0:
            G.add_edge(guest1, guest2, weight=num_shared)


# Print headline statistics for a weighted guest graph
def analyze_graph(G, top_n=10):
    """Print summary statistics for a weighted guest graph.

    Prints the node and edge counts, then the ``top_n`` edges with the largest
    ``weight`` attribute, then the ``top_n`` nodes with the highest degree.

    Args:
        G: Graph whose edges each carry a ``weight`` attribute.
        top_n: Number of entries to list in each ranking (default 10).

    Raises:
        ValueError: If ``top_n`` is negative.
        KeyError: If an edge has no ``weight`` attribute.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    print(f"Number of nodes (guests): {G.number_of_nodes()}")
    print(f"Number of edges (connections): {G.number_of_edges()}")

    # Rank numerically. The GraphML export code in this notebook rewrites
    # weights as strings, and comparing strings would rank '9' above '25365'.
    top_edges = sorted(
        G.edges(data=True),
        key=lambda edge: float(edge[2]['weight']),
        reverse=True,
    )[:top_n]
    print(f"\nTop {top_n} guest pairs with most shared commenters:")
    for guest1, guest2, data in top_edges:
        print(f"{guest1} - {guest2}: {data['weight']} shared commenters")

    # Guests with most connections (degree = number of neighbouring guests)
    top_degree = sorted(G.degree, key=lambda item: item[1], reverse=True)[:top_n]
    print(f"\nTop {top_n} most connected guests:")
    for guest, degree in top_degree:
        print(f"{guest}: connected to {degree} other guests")
        
# Save the graph in GraphML format
def save_graph_to_graphml(G, filename):
    """Write ``G`` to ``filename`` as GraphML with edge weights stored as strings.

    The ``weight`` attribute of every edge is temporarily replaced by its
    string form for the export. The original values (with their original
    types) are put back afterwards, even if writing fails, so ``G`` is left
    unchanged for later use. Edges without a ``weight`` attribute are left as
    they are.

    Args:
        G: Graph to export; edges may carry a ``weight`` attribute.
        filename: Destination passed through to ``nx.write_graphml``.
    """
    # Remember the exact original values so they can be restored without a
    # lossy str -> int round trip (which would fail on a weight such as 1.5).
    original_weights = [
        (u, v, d['weight']) for u, v, d in G.edges(data=True) if 'weight' in d
    ]

    try:
        for u, v, weight in original_weights:
            G[u][v]['weight'] = str(weight)

        nx.write_graphml(G, filename)
        print(f"Grafo salvato con successo in: {filename}")
    finally:
        # Always undo the temporary stringification, including on write errors
        for u, v, weight in original_weights:
            G[u][v]['weight'] = weight


# Print the summary statistics (node/edge counts and both rankings) for graph G
analyze_graph(G)

Number of nodes (guests): 989
Number of edges (connections): 361775

Top 10 guest pairs with most shared commenters:
Donald Trump - Elon Musk: 25365 shared commenters
Donald Trump - JD Vance: 25305 shared commenters
Tim Dillon - Joey Diaz: 15165 shared commenters
Duncan Trussell - Tim Dillon: 15018 shared commenters
Elon Musk - Tim Dillon: 15016 shared commenters
Donald Trump - Tim Dillon: 14297 shared commenters
Tim Dillon - Jeremy Corbell: 14083 shared commenters
Tim Dillon - Tim Pool: 13031 shared commenters
Donald Trump - Terrence Howard: 12952 shared commenters
Tim Dillon - Graham Hancock: 12899 shared commenters

Top 10 most connected guests:
Donald Trump: connected to 974 other guests
Dave Smith: connected to 972 other guests
Eric Weinstein: connected to 971 other guests
Tim Dillon: connected to 969 other guests
Terrence Howard: connected to 968 other guests
Mike Baker: connected to 966 other guests
Elon Musk: connected to 965 other guests
Jordan Peterson: connected to 965 other guests

In [15]:
# Add guest attributes to the nodes and save the graph in GraphML format
def add_guest_attributes_and_save_graphml(G, guest_df, filename):
    """Annotate the guest nodes of ``G`` from ``guest_df`` and write ``G`` as GraphML.

    For every node whose name appears in the ``Nome`` column, these node
    attributes are set:

    * ``Nazionalità``, ``Sesso``, ``Professione``, ``Notorietà``: the first
      non-null value among the guest's rows.
    * ``Età`` and ``Follower`` (from ``Numero di follower social``): the first
      non-null value, stored as a string.
    * ``Topics``: the guest's distinct non-null ``Topic video`` values joined
      with ``", "``.
    * ``podcast_score_avg``: the mean of ``podcast_score``, stored as a string.
    * ``apparizioni``: the number of rows for the guest, stored as a string.

    Nodes with no matching row are left untouched. Edge weights are written to
    the file as strings and then restored to their original values, even if
    writing fails, so ``G`` stays usable for numeric work afterwards.

    Args:
        G: Graph whose nodes are guest names; edges may carry ``weight``.
        guest_df: DataFrame with the columns named above.
        filename: Destination passed through to ``nx.write_graphml``.
    """
    def first_non_null(column):
        # Prefer the first non-null entry. If every entry is null, fall back
        # to the first row's value so the attribute is still set.
        non_null = column.dropna()
        return non_null.iloc[0] if len(non_null) else column.iloc[0]

    # Split the table by guest once instead of filtering it for every node
    rows_by_guest = {
        name: rows for name, rows in guest_df.groupby('Nome', sort=False)
    }

    # Build the attribute dictionary for every node that has guest data
    guest_attributes = {}
    for guest_name in G.nodes():
        guest_rows = rows_by_guest.get(guest_name)
        if guest_rows is None:
            continue

        # Attributes that do not change between appearances
        attributes = {
            'Nazionalità': first_non_null(guest_rows['Nazionalità']),
            'Sesso': first_non_null(guest_rows['Sesso']),
            'Età': str(first_non_null(guest_rows['Età'])),
            'Professione': first_non_null(guest_rows['Professione']),
            'Notorietà': first_non_null(guest_rows['Notorietà']),
            'Follower': str(first_non_null(guest_rows['Numero di follower social']))
        }

        # Attributes that can vary between appearances.
        # Video topics: join the distinct topics, skipping missing values
        # (a NaN entry would make str.join raise TypeError).
        topics = guest_rows['Topic video'].dropna().unique()
        attributes['Topics'] = ', '.join(str(topic) for topic in topics)

        # Score: average over all appearances
        attributes['podcast_score_avg'] = str(guest_rows['podcast_score'].mean())

        # Number of appearances
        attributes['apparizioni'] = str(len(guest_rows))

        guest_attributes[guest_name] = attributes

    # Attach the attributes to the graph nodes
    for guest_name, attrs in guest_attributes.items():
        for key, value in attrs.items():
            G.nodes[guest_name][key] = value

    # Remember the original edge weights so the temporary string conversion
    # needed for the export can be undone afterwards.
    original_weights = [
        (u, v, d['weight']) for u, v, d in G.edges(data=True) if 'weight' in d
    ]

    try:
        for u, v, weight in original_weights:
            G[u][v]['weight'] = str(weight)

        # Save the graph
        nx.write_graphml(G, filename)
        print(f"Grafo con attributi degli ospiti salvato con successo in: {filename}")
    finally:
        # Restore the original weights, including when the write fails
        for u, v, weight in original_weights:
            G[u][v]['weight'] = weight

    # Legacy cleanup: drop any 'weight_numeric' edge attribute. Nothing in this
    # function sets it; kept so existing callers see the same behavior.
    for u, v, d in G.edges(data=True):
        if 'weight_numeric' in d:
            del d['weight_numeric']

# Annotate the nodes with the guest attributes and export the graph to GraphML.
# NOTE(review): presumably the ./graph_data directory must already exist — confirm.
add_guest_attributes_and_save_graphml(G, guest_df, './graph_data/guest_network_with_attributes.graphml')

Grafo con attributi degli ospiti salvato con successo in: ./graph_data/guest_network_with_attributes.graphml
