# Warning

Don't run all cells at once; the visualizations take up a lot of space.

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import sys
from matplotlib import pyplot as plt
from functools import reduce
import os
import altair as alt

sys.path.append("../src/")
from features.graph_embed import GraphEmbed
from models.cluster_graphs import ClusterGraphs
from visualization.visualize import Visualize
print("import done")

## Compiling all analysis types

In [None]:
# Raw PHEME events table, read from the CSV under ../data/raw/pheme/.
raw_data = pd.read_csv('../data/raw/pheme/pheme_all_events.csv')
# File name shared by every clustered-graphs pickle this notebook loads.
cluster_f_name = 'pheme_graphs_clustered.pkl'

In [None]:
default_a_type = 'user_level'
all_clusters = {}
all_graphs = {}
all_viz = {}


def _register_analysis(a_type, pickle_path):
    """Load one clustered-graphs pickle and record it under ``a_type``.

    Reads the DataFrame stored at ``pickle_path``, writes ``a_type`` into its
    ``analysis_type`` column, then stores the frame, the result of
    ``GraphEmbed.read_graphs(frame)`` and a ``Visualize(frame, graphs)``
    instance in ``all_clusters``, ``all_graphs`` and ``all_viz``.

    Returns the ``(frame, graphs)`` pair so the caller can bind them.
    """
    frame = pd.read_pickle(pickle_path)
    frame['analysis_type'] = a_type
    all_clusters[a_type] = frame

    loaded_graphs = GraphEmbed.read_graphs(frame)
    all_graphs[a_type] = loaded_graphs
    all_viz[a_type] = Visualize(frame, loaded_graphs)
    return frame, loaded_graphs


archive_dir = os.path.join("..", "data", "archived")
# os.listdir gives entries in arbitrary order; sorting keeps the insertion
# order of all_clusters (and anything indexed from it) reproducible.
for f in sorted(os.listdir(archive_dir)):
    # Skip hidden entries and stray files: each analysis lives in its own
    # sub-directory named after the analysis type.
    if f.startswith(".") or not os.path.isdir(os.path.join(archive_dir, f)):
        continue
    _register_analysis(f, os.path.join(archive_dir, f, cluster_f_name))

# The default analysis is loaded last from ../models/; `clusters` and `graphs`
# stay bound to it for the cells that follow.
clusters, graphs = _register_analysis(
    default_a_type, os.path.join("..", "models", cluster_f_name)
)

In [None]:
# Names of every loaded analysis, in the dict's insertion order.
analysis_types = list(all_clusters)
analysis_types  # bare expression so the notebook echoes the list

In [None]:
all_clusters[analysis_types[0]].columns

In [None]:
raw_data.title.nunique()

In [None]:
# Initial cluster count; the choose_clust_num_k_means() cell rebinds
# num_clusters when it is run.
num_clusters = 3

In [None]:
# Build a ClusterGraphs instance over the frame currently bound to `clusters`.
cl = ClusterGraphs(clusters)
k_means_choice = cl.choose_clust_num_k_means()
# First element replaces num_clusters; the second is kept as inertias.
num_clusters, inertias = k_means_choice

In [None]:
plt.plot(inertias)

In [None]:
num_clusters

In [None]:
len(clusters)

In [None]:
# For every analysis type, a list whose i-th entry holds the rows of that
# analysis frame with label == i, for i in range(num_clusters).
ind_clusts = {
    a_type: [frame.loc[frame.label == label_id] for label_id in range(num_clusters)]
    for a_type, frame in all_clusters.items()
}

## Exploring Differences between Clusters

### Central Network of each cluster

#### User Level

In [None]:
# Read the user-level frame from all_clusters (as the tweet-level cell does)
# instead of the notebook-global `clusters`, which other cells may rebind.
user_l = all_clusters['user_level']
# `== True` is kept so the selection is an explicit boolean comparison on the
# is_mean_vec column; the ids of the matching rows are passed to viz_graphs.
all_viz['user_level'].viz_graphs(user_l.loc[user_l.is_mean_vec == True].id.to_list())

#### Tweet Level

In [None]:
# Tweet-level frame; select the rows whose is_mean_vec compares equal to True
# and hand their ids to viz_graphs.
tweet_l = all_clusters['tweet_level']
is_mean_row = tweet_l.is_mean_vec == True
mean_vec_ids = tweet_l.loc[is_mean_row].id.to_list()
all_viz['tweet_level'].viz_graphs(mean_vec_ids)

### Cluster Size

In [None]:
all_viz['user_level'].plot_cluster_size()

In [None]:
all_viz['tweet_level'].plot_cluster_size()

In [None]:
all_viz['user_level_no_unverified'].plot_cluster_size()

In [None]:
all_viz['tweet_level_no_unverified'].plot_cluster_size()

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {"num_nodes": "Number of Nodes"}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {"num_nodes": "Number of Nodes"}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {"num_wcc": "Number of Components user level"}, 200, 250, 1)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {"num_wcc": "Number of Components tweet level"}, 200, 250, 1)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'largest_wcc': "Largest Component user level"}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'largest_wcc': "Largest Component tweet level"}, 200, 250, 3)

In [None]:
# NOTE(review): the key is diameter_largest_wcc, but the text paired with it
# only says "largest component" (no mention of diameter) and so reads like the
# largest_wcc cell's text — confirm the intended wording.
all_viz['user_level'].graph_point_range_cluster_info(True, {'diameter_largest_wcc': 'largest component user level'}, 200, 250, 3)

In [None]:
# NOTE(review): the key is diameter_largest_wcc, but the text paired with it
# only says "largest component" (no mention of diameter) and so reads like the
# largest_wcc cell's text — confirm the intended wording.
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'diameter_largest_wcc': 'largest component tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'wiener_index': 'wiener index user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'wiener_index': 'wiener index tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'sentiment_mean': 'sentiment mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'sentiment_mean': 'sentiment mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'user_friends_count_mean': 'user friend count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'user_friends_count_mean': 'user friend count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'user_follower_count_mean': 'user follower count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'user_follower_count_mean': 'user follower count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'user_tweet_count_mean': 'user tweet count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'user_tweet_count_mean': 'user tweet count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'mentions_count_mean': 'mentions count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'mentions_count_mean': 'mentions count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'favorite_count_mean': 'favorite count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'favorite_count_mean': 'favorite count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'retweet_count_mean': 'retweet count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'retweet_count_mean': 'retweet count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'hashtags_count_mean': 'hashtag count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'hashtags_count_mean': 'hashtag count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'media_count_mean': 'media count mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'media_count_mean': 'media count mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'urls_mean': 'urls mean user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'urls_mean': 'urls mean tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'mean_user_mentions': 'mean_user_mentions user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'mean_user_mentions': 'mean_user_mentions tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].graph_point_range_cluster_info(True, {'mean_tweet_len': 'mean tweet len user level'}, 200, 250, 3)

In [None]:
all_viz['tweet_level'].graph_point_range_cluster_info(True, {'mean_tweet_len': 'mean tweet len tweet level'}, 200, 250, 3)

In [None]:
all_viz['user_level'].viz_ind_cluster_truth()

In [None]:
all_viz['tweet_level'].viz_ind_cluster_truth()

In [None]:
all_viz['tweet_level_no_unverified'].viz_ind_cluster_truth()

In [None]:
all_viz['user_level'].viz_type_clusters("truth")

In [None]:
all_viz['tweet_level'].viz_type_clusters("truth")

In [None]:
all_viz['user_level'].viz_type_clusters("event")

In [None]:
all_viz['tweet_level'].viz_type_clusters("event")

In [None]:
all_viz['user_level'].graph_reduced_dimensions(["id", "num_nodes", "title", 'truth', 'event'], 400, 400, "How Diffusion Networks Vary")

In [None]:
all_viz['tweet_level'].graph_reduced_dimensions(["id", "num_nodes", "title", 'truth', 'event'], 400, 400, "How Diffusion Networks Vary")