# Warning

Don't run all cells at once: the visualizations take up a lot of space.

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import sys
from matplotlib import pyplot as plt
from functools import reduce
import os

sys.path.append("../src/")
from features.graph_embed import GraphEmbed
from models.cluster_graphs import ClusterGraphs
from visualization.visualize import Visualize

In [None]:
# File name shared by every clustered-graph pickle read further down.
cluster_f_name = 'pheme_graphs_clustered.pkl'

# Raw PHEME events table, read from the repository's data folder.
raw_data_path = '../data/raw/pheme/pheme_all_events.csv'
raw_data = pd.read_csv(raw_data_path)

In [None]:
default_a_type = 'user_level'
archive_dir = os.path.join("..", "data", "archived")


def _load_analysis(a_type, pickle_path):
    """Load one clustered-graph pickle and the graphs it refers to.

    Args:
        a_type: Analysis-type name written into the ``analysis_type`` column.
        pickle_path: Path of the pickled cluster table.

    Returns:
        A ``(clusters, graphs)`` pair: the tagged table and whatever
        ``GraphEmbed.read_graphs`` returns for it.
    """
    frame = pd.read_pickle(pickle_path)
    frame['analysis_type'] = a_type
    return frame, GraphEmbed.read_graphs(frame)


all_clusters = {}
all_graphs = {}

# Each sub-directory of the archive holds one analysis type. os.listdir returns
# entries in arbitrary order, so sort them to keep `all_clusters` (and anything
# indexing into its keys) reproducible between runs.
for entry in sorted(os.listdir(archive_dir)):
    entry_path = os.path.join(archive_dir, entry)
    # Hidden entries and stray plain files are not analysis folders; joining
    # the pickle name onto a plain file would make read_pickle fail.
    if entry.startswith(".") or not os.path.isdir(entry_path):
        continue
    all_clusters[entry], all_graphs[entry] = _load_analysis(
        entry, os.path.join(entry_path, cluster_f_name)
    )

# The current (non-archived) clustering lives under ../models. `clusters` and
# `graphs` stay bound to it because later cells use them directly.
clusters, graphs = _load_analysis(
    default_a_type, os.path.join("..", "models", cluster_f_name)
)
all_clusters[default_a_type] = clusters
all_graphs[default_a_type] = graphs

In [None]:
# Names of every loaded analysis type, in the order they were added to all_clusters.
analysis_types = [*all_clusters]

In [None]:
# Build the project's Visualize helper from all loaded cluster tables and graph
# collections (both dicts are keyed by analysis type).
viz = Visualize(all_clusters, all_graphs)

In [None]:
# Show which columns the first analysis type's cluster table carries.
first_a_type = analysis_types[0]
all_clusters[first_a_type].columns

In [None]:
# Count the distinct values in the raw data's "title" column.
raw_data["title"].nunique()

In [None]:
# Starting cluster count. NOTE: the cell that calls
# cl.choose_clust_num_k_means() rebinds num_clusters when it is run.
num_clusters = 3

In [None]:
# Ask the project's ClusterGraphs helper for a cluster count on the frame
# currently bound to `clusters` (presumably a k-means based selection, going by
# the method name). This rebinds num_clusters; the second returned value is
# kept as `inertias` for plotting.
cl = ClusterGraphs(clusters)
num_clusters, inertias = cl.choose_clust_num_k_means()

In [None]:
# Plot the `inertias` values returned by choose_clust_num_k_means.
plt.plot(inertias)

In [None]:
# Display the cluster count currently in use.
num_clusters

In [None]:
# Number of rows in the frame currently bound to `clusters`.
len(clusters)

In [None]:
# For every analysis type, split its cluster table into one frame per label.
# Result: {analysis_type: [rows with label 0, rows with label 1, ...]}.
ind_clusts = {
    a_type: [table.loc[table.label == label] for label in range(num_clusters)]
    for a_type, table in all_clusters.items()
}

## Exploring Differences between Clusters

### Central Network of each cluster

In [None]:
# Draw the graphs whose rows are flagged `is_mean_vec` in the current table.
# The explicit `== True` comparison is kept so rows are selected exactly as before.
central_rows = clusters.loc[clusters.is_mean_vec == True]  # noqa: E712
central_ids = central_rows.id.to_list()
viz.viz_graphs(central_ids)

### Cluster Size

In [None]:
# Cluster-size plot from the Visualize helper (no arguments are passed here).
viz.plot_cluster_size()

In [None]:
# Compare clusters on "num_nodes"; the dict value "Number of Nodes" is presumably its display label.
viz.graph_point_range_cluster_info(True, {"num_nodes": "Number of Nodes"}, 200, 250, 3)

In [None]:
# Compare clusters on "num_edges"; the dict value "Number of Edges" is presumably its display label.
viz.graph_point_range_cluster_info(True, {"num_edges": "Number of Edges"}, 200, 250, 3)

In [None]:
# Compare clusters on "num_wcc"; the dict value "Number of Components" is presumably its display label.
# NOTE(review): the last positional argument is 1 here, while the other calls in this notebook pass 3 — confirm that is intended.
viz.graph_point_range_cluster_info(True, {"num_wcc": "Number of Components"}, 200, 250, 1)

In [None]:
# Compare clusters on "largest_wcc"; the dict value "Largest Component" is presumably its display label.
viz.graph_point_range_cluster_info(True, {'largest_wcc': "Largest Component"}, 200, 250, 3)

In [None]:
# Compare clusters on the "diameter_largest_wcc" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['diameter_largest_wcc'], 200, 250, 3)

In [None]:
# Compare clusters on the "max_out_degree" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['max_out_degree'], 200, 250, 3)

In [None]:
# Compare clusters on the "max_in_degree" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['max_in_degree'], 200, 250, 3)

In [None]:
# Compare clusters on two columns in one call: "mean_out_degree" and "mean_in_degree".
viz.graph_point_range_cluster_info(False, ['mean_out_degree', "mean_in_degree"], 200, 250, 3)

In [None]:
# Compare clusters on the "wiener_index" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['wiener_index'], 200, 250, 3)

In [None]:
# Compare clusters on the "sentiment_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['sentiment_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "user_friends_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['user_friends_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "user_follower_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['user_follower_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "user_tweet_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['user_tweet_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "mentions_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['mentions_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "favorite_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['favorite_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "retweet_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['retweet_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "hashtags_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['hashtags_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "media_count_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['media_count_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "urls_mean" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['urls_mean'], 200, 250, 3)

In [None]:
# Compare clusters on the "mean_user_mentions" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['mean_user_mentions'], 200, 250, 3)

In [None]:
# Compare clusters on the "mean_tweet_len" column (passed as a plain list, with no label mapping).
viz.graph_point_range_cluster_info(False, ['mean_tweet_len'], 200, 250, 3)

In [None]:
# Share of each `truth` value inside every cluster of the default analysis.
# ind_clusts is keyed by analysis type and each value is a list of per-label
# frames, so it must be indexed by type first; indexing it with the integer
# label directly raises KeyError. Iterating the list also covers every cluster
# instead of assuming there are exactly three.
for i, cluster_rows in enumerate(ind_clusts[default_a_type]):
    print("cluster", i)
    print(cluster_rows.truth.value_counts() / len(cluster_rows))

In [None]:
# Share of each `event` value inside every cluster of the default analysis.
# ind_clusts is keyed by analysis type and each value is a list of per-label
# frames, so it must be indexed by type first; indexing it with the integer
# label directly raises KeyError. Iterating the list also covers every cluster
# instead of assuming there are exactly three.
for i, cluster_rows in enumerate(ind_clusts[default_a_type]):
    print("cluster", i)
    print(cluster_rows.event.value_counts() / len(cluster_rows))

In [None]:
# Cluster view keyed on the "truth" attribute (what exactly is drawn is decided inside Visualize.viz_type_clusters).
viz.viz_type_clusters("truth")

In [None]:
# Cluster view keyed on the "event" attribute (what exactly is drawn is decided inside Visualize.viz_type_clusters).
viz.viz_type_clusters("event")

In [None]:
# Reduced-dimension plot of the networks. The column list, the two 400 values
# and the title are handed to the Visualize helper unchanged.
plot_columns = ["id", "num_nodes", "title", 'truth', 'event']
plot_title = "How Diffusion Networks Vary"
p_components = viz.graph_reduced_dimensions(plot_columns, 400, 400, plot_title)
p_components

In [None]:
# Draw one randomly sampled graph from each cluster of the default analysis.
# ind_clusts maps analysis type -> list of per-label frames, so the list for
# `default_a_type` is selected first; indexing ind_clusts with 0/1/2 directly
# raises KeyError.
ids = [
    cluster_rows.sample(1).id.values[0]
    for cluster_rows in ind_clusts[default_a_type]
]
viz.viz_graphs(ids)

In [None]:
# Correlation heat map restricted to the listed integer/float dtypes of the
# current cluster table.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_columns = clusters.select_dtypes(include=numeric_dtypes).columns
viz.get_corr_heat_map(numeric_columns)