In this notebook you will find:
- SNA Metrics
- Community Detection

## Libraries

In [None]:
!pip install pyvis cv2 gdown spicy



ERROR: Could not find a version that satisfies the requirement cv2 (from versions: none)
ERROR: No matching distribution found for cv2
You should consider upgrading via the 'C:\Users\Paola\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [None]:
import lzma
import pickle
import pandas as pd
import os
import networkx as nx
import networkx.algorithms.community as nx_comm
import matplotlib.pyplot as plt
import gdown
import cv2 as cv
from shutil import copyfile
import zipfile
from pyvis.network import Network
from wordcloud import WordCloud
import numpy as np
from imageio import imread
import plotly
import plotly.express as px
from nrclex import NRCLex
import nltk
nltk.download('punkt')

# For centrality metrics
from networkx.algorithms.centrality import (closeness_centrality,
                                            betweenness_centrality,
                                            degree_centrality)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Paola\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Data

In [None]:
import lzma
import pandas as pd
import pickle
import networkx as nx
import random

def load_data(graph_names, start_year, end_year, max_total_nodes=300):
    """Load yearly graphs from disk and keep a size-capped subgraph of each.

    For every ``year`` in ``[start_year, end_year]`` and every ``name`` in
    ``graph_names`` the file ``"<name>_<year>.lzma"`` is unpickled, reduced to
    at most ``max_total_nodes`` nodes and stored in this module's globals under
    the key ``"<name>_<year>"``.

    The reduction adds whole strongly connected components, largest first,
    and stops at the first component that would exceed the cap. Only edges
    inside each selected component are copied. If nothing could be added
    (the largest component alone exceeds the cap), a random node sample of
    the graph is used instead.

    Args:
        graph_names: Iterable of file/variable name prefixes.
        start_year: First year to load (inclusive).
        end_year: Last year to load (inclusive).
        max_total_nodes: Upper bound on the number of nodes kept per graph.

    Returns:
        None. Results are published through ``globals()``.
    """
    global_vars = globals()

    for year in range(start_year, end_year + 1):
        for name in graph_names:
            # SECURITY: pickle.load can execute arbitrary code; only open
            # files that come from a trusted source.
            with lzma.open(name + "_" + str(year) + '.lzma', 'rb') as file:
                G = pickle.load(file)

            # Strongly connected components, largest first.
            sorted_components = sorted(nx.strongly_connected_components(G), key=len, reverse=True)

            subgraph = nx.DiGraph()
            for component in sorted_components:
                if len(subgraph) + len(component) > max_total_nodes:
                    # Stop at the first component that does not fit; smaller
                    # components further down the list are not tried.
                    break

                subgraph.add_nodes_from(component)
                # Only intra-component edges are copied.
                subgraph.add_edges_from(G.subgraph(component).edges())
                print(f"Nodi nel subgrafo: {len(subgraph)}, Archi nel subgrafo: {subgraph.number_of_edges()}")

            if len(subgraph) == 0:
                # random.sample needs a real sequence (node views are rejected
                # on Python >= 3.11) and a sample size no larger than the
                # population.
                all_nodes = list(G.nodes())
                random_nodes = random.sample(all_nodes, min(max_total_nodes, len(all_nodes)))
                # .copy() detaches the result from G so the full graph can be
                # garbage-collected instead of living on behind a view.
                subgraph = G.subgraph(random_nodes).copy()
                print(f"Subgrafo casuale: Nodi nel subgrafo: {len(subgraph)}, Archi nel subgrafo: {subgraph.number_of_edges()}")

            global_vars[name + "_" + str(year)] = subgraph


# SNA metrics

In [None]:
import tqdm

def write_to_file(filename, text):
    """Append ``text`` followed by a newline to ``filename``.

    The file is opened as UTF-8 explicitly so that non-ASCII content does not
    depend on the platform's default encoding.

    Args:
        filename: Path of the file to append to (created if missing).
        text: String to write; a trailing ``'\\n'`` is added.
    """
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text + '\n')

def get_metrics(G, year):
    """Compute whole-graph statistics for ``G``, print them and log them.

    The report holds node/edge counts, degree assortativity, node and edge
    connectivity, density, overall reciprocity, the number of strongly
    connected components and the average degree. It is appended to
    ``SNA_metrics.txt``.

    Args:
        G: networkx graph to analyse.
        year: Label written on the first line of the report.
    """
    assortativity = nx.degree_assortativity_coefficient(G)
    node_conn = nx.node_connectivity(G)
    edge_conn = nx.edge_connectivity(G)
    density = nx.density(G)
    overall_reciprocity = nx.overall_reciprocity(G)
    scc = nx.number_strongly_connected_components(G)
    nnodes = G.number_of_nodes()

    # Degree sum divided by twice the node count.
    total_degree = sum(degree for _, degree in G.degree)
    deg = total_degree / (2 * float(nnodes))

    report_lines = [
        f'Year: {year}',
        f'Number of nodes: {nnodes}',
        f'Number of edges: {G.number_of_edges()}',
        f'Assortativity: {round(assortativity,4)}',
        f'Node connectivity: {node_conn}',
        f'Edge connectivity: {edge_conn}',
        f'Graph density: {round(density,6)}',
        f'Overall_reciprocity: {round(overall_reciprocity,5)}',
        f'Number of strongly connected component: {scc}',
        f'Average degree: {deg}',
    ]
    metrics = '\n'.join(report_lines) + '\n'

    print(metrics)
    write_to_file('SNA_metrics.txt', metrics)

def _report_top_centrality(header, scores, top_n=10):
    """Print and log the ``top_n`` highest-scoring entries of ``scores``.

    Args:
        header: First line of the report (written verbatim).
        scores: Mapping of node -> centrality value.
        top_n: Number of entries to list, highest value first. Ties keep
            the mapping's original order because the sort is stable.
    """
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_n]
    report = header + ''.join(f'\n{node}: {score}' for node, score in ranked)
    print(report)
    write_to_file('SNA_metrics.txt', report)

def get_degree_centrality(G, year):
    """Report the ten nodes of ``G`` with the highest degree centrality.

    ``year`` is not used; it is kept so all report functions share a signature.
    """
    _report_top_centrality('\nTop 10 degree centrality connected component:',
                           dict(nx.degree_centrality(G)))

def get_clos_centrality(G, year):
    """Report the ten nodes of ``G`` with the highest closeness centrality.

    ``year`` is not used; it is kept so all report functions share a signature.
    """
    _report_top_centrality('\nTop 10 closeness centrality connected component:',
                           dict(nx.closeness_centrality(G)))

def get_bet_centrality(G, year):
    """Report the ten nodes of ``G`` with the highest betweenness centrality.

    ``year`` is not used; it is kept so all report functions share a signature.
    """
    _report_top_centrality('\nTop 10 betweenness centrality connected component:',
                           dict(nx.betweenness_centrality(G)))


In [None]:
def SNA_metrics(graph_names, start_year, end_year):
    """Run every SNA report on each loaded graph in the year range.

    Graphs are looked up in this module's globals under ``"<name>_<year>"``,
    so they must have been loaded beforehand.

    Args:
        graph_names: Iterable of graph name prefixes.
        start_year: First year to process (inclusive).
        end_year: Last year to process (inclusive).
    """
    years = range(start_year, end_year + 1)
    for year in tqdm.tqdm(years, desc='Processing years'):
        for name in graph_names:
            G = globals()[f"{name}_{year}"]

            # Whole-graph statistics first, then the three centrality rankings.
            reports = (get_metrics, get_degree_centrality, get_clos_centrality, get_bet_centrality)
            for report in reports:
                report(G, year)


# Community Detection

In [None]:
def find_communities(G):
  """Detect communities in ``G`` by greedy modularity maximisation.

  Prints how many communities were found and returns them as produced by
  ``nx.community.greedy_modularity_communities``.
  """
  communities = nx.community.greedy_modularity_communities(G)
  community_count = len(communities)
  print('Numbers of communities:', community_count)
  return communities

def plot_communities(G, c, seed=None):
  """Draw ``G`` with each node coloured by the community it belongs to.

  Args:
    G: networkx graph to draw.
    c: Iterable of communities (each an iterable of nodes of ``G``). Every
      node of ``G`` must appear in one of them, otherwise ``KeyError`` is
      raised.
    seed: Optional seed forwarded to ``nx.spring_layout`` for a reproducible
      layout. ``None`` keeps the layout random.
  """
  partition = {node: index for index, members in enumerate(c) for node in members}

  # Colours are matched to nodes by position, so they must follow G's own
  # node order rather than the community-by-community order of `partition`.
  node_colors = [partition[node] for node in G.nodes()]

  plt.figure(figsize=(15, 15))
  plt.axis('off')
  nx.draw_networkx(G, pos=nx.spring_layout(G, seed=seed), cmap=plt.cm.viridis, node_color=node_colors, with_labels=False)

def get_mod_score(G, c):
  """Print the modularity of the community partition ``c`` on graph ``G``."""
  score = nx_comm.modularity(G, c)
  print('Modularity score:', score)

def get_top_community(c):
  """Print the sizes of the three largest communities.

  Args:
    c: Iterable of communities, each an iterable of hashable nodes.

  Prints a dict ``{community_index: size}`` holding at most three entries,
  largest first. Ties keep their original community order.
  """
  # node -> index of the community listing it; a node listed more than once
  # is attributed to the last community that contains it.
  membership = {node: index for index, members in enumerate(c) for node in members}

  # Tally how many nodes ended up in each community.
  sizes = {}
  for index in membership.values():
    sizes[index] = sizes.get(index, 0) + 1

  ranked = sorted(sizes.items(), key=lambda pair: pair[1], reverse=True)
  top_communities = dict(ranked[:3])
  print(top_communities)

def p_viz(G, communities, year=None):
  """Render ``G`` as an interactive pyvis page, coloured by community.

  Nodes in communities with more than two members get a ``'group'``
  attribute (community index + 1) written onto ``G`` itself. The page is
  saved as ``community_<year>.html``.

  Args:
    G: networkx graph to visualise; its node attributes are modified.
    communities: Iterable of communities (each a sized iterable of nodes).
    year: Value interpolated into the output file name.

  Returns:
    Whatever ``Network.show`` returns for the saved page.
    NOTE(review): pyvis documents this as an IFrame in notebook mode; confirm
    for the installed version, and display it from the calling cell.
  """
  for index, members in enumerate(communities):
    if len(members) > 2: # coloring only communities with more than 2 users
      for node in members:
        # Add 1 to save 0 for external edges
        G.nodes[node]['group'] = index + 1

  net = Network(height = '1024px', width = "100%", directed=True, notebook=True, cdn_resources='remote')
  net.set_options("""
  var options = {
    "edges": {
      "color": {
        "inherit": true
      },
      "smooth": {
        "type": "continuous",
        "forceDirection": "none",
        "roundness": 0.65
      }
    },
    "physics": {
      "forceAtlas2Based": {
        "springLength": 100
      },
      "minVelocity": 0.75,
      "solver": "forceAtlas2Based"
    },
    "interaction": {
      "navigationButtons": true
    }
  }
  """)

  net.from_nx(G)

  html_file_path = f'community_{year}.html'
  # Show the per-year page itself (show() also writes it) instead of a shared
  # 'community.html' that every call would overwrite.
  return net.show(html_file_path)



In [None]:
def Community_Detection(graph_names, start_year, end_year):
    """Run community detection and its reports on each loaded graph.

    Graphs are looked up in this module's globals under ``"<name>_<year>"``,
    so they must have been loaded beforehand.

    Args:
        graph_names: Iterable of graph name prefixes.
        start_year: First year to process (inclusive).
        end_year: Last year to process (inclusive).
    """
    for year in tqdm.tqdm(range(start_year, end_year + 1), desc='Processing years'):
        # Lazy lookup: each graph is fetched right before it is processed.
        graphs = (globals()[f"{name}_{year}"] for name in graph_names)
        for G in graphs:
            communities = find_communities(G)
            plot_communities(G, communities)
            get_mod_score(G, communities)
            get_top_community(communities)
            p_viz(G, communities, year=year)

## Pipeline function

In [None]:
import gc

def pipeline(graph_names, start_year, end_year):
    """Process the graphs one year at a time: load, analyse, then free.

    Any existing ``SNA_metrics.txt`` is deleted first so the report starts
    empty. For each year the graphs are loaded, the SNA metrics and community
    detection steps run, and the year's graphs are removed from the module
    globals before a garbage-collection pass.

    Args:
        graph_names: Iterable of graph name prefixes.
        start_year: First year to process (inclusive).
        end_year: Last year to process (inclusive).
    """
    metrics_path = 'SNA_metrics.txt'
    # Remove the old report so new results are not appended to stale data.
    if os.path.exists(metrics_path):
        os.remove(metrics_path)

    years = range(start_year, end_year + 1)
    for year in tqdm.tqdm(years, desc='Processing years'):
        load_data(graph_names, year, year)

        SNA_metrics(graph_names, year, year)
        Community_Detection(graph_names, year, year)

        # Drop this year's graphs from the module namespace.
        module_globals = globals()
        for name in graph_names:
            del module_globals[f"{name}_{year}"]

        # Ask the garbage collector to reclaim the freed graphs now.
        gc.collect()


In [None]:
# Example run: process the yearly "graph_<year>.lzma" files for 2007 to 2009.
df_names = ["politics_comments"]
graph_names = ["graph"]

start_year, end_year = 2007, 2009

pipeline(graph_names, start_year, end_year)