In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout
import sys
from matplotlib import pyplot as plt
from functools import reduce
import os
import altair as alt

# Make the project's src/ directory importable so the local packages below resolve.
# The path is relative, so this depends on the working directory the kernel was started in.
sys.path.append("../src/")
from features.graph_embed import GraphEmbed
from models.cluster_graphs import ClusterGraphs
from visualization.visualize import Visualize
print("import done")

In [None]:
# Toggle read by the preprocessing cell: when True, events with few networks are dropped.
drop_small_events = False

# Location of the pickled, already-clustered graph table (relative to this notebook).
cluster_f_name = 'pheme_graphs_clustered.pkl'
cluster_path = os.path.join("..", "models", cluster_f_name)

# Raw PHEME events CSV and the clustered table.
# NOTE: read_pickle unpickles arbitrary objects, so only load files you trust.
raw_data = pd.read_csv('../data/raw/pheme/pheme_all_events.csv')
clusters = pd.read_pickle(cluster_path)

In [None]:
# Derived columns (added in this order so downstream column order is unchanged):
#   nodes_per_thread - num_nodes divided by num_threads for each row
#   truth_val        - 1 where truth == 'true', otherwise 0
clusters['nodes_per_thread'] = clusters['num_nodes'].div(clusters['num_threads'])
is_true_rumour = clusters['truth'] == 'true'
clusters['truth_val'] = is_true_rumour.astype('int64')

# Optionally keep only events that contribute more than 6 rows.
if drop_small_events:
    clusters = clusters.groupby('event').filter(lambda event_rows: len(event_rows) > 6)

# Load the graphs for the (possibly filtered) table and build the plotting helper
# that the rest of the notebook uses.
graphs = GraphEmbed.read_graphs(clusters)
viz = Visualize(clusters, graphs)

# Overview

Data: Each network combines all rumour threads relating to a specific rumour. All PHEME rumours are included except those whose networks have no edges.

In [None]:
# Signed-integer and float dtypes treated as "numeric" for the correlation analysis.
int_dtypes = ['int16', 'int32', 'int64']
float_dtypes = ['float16', 'float32', 'float64']
numerics = int_dtypes + float_dtypes

# Independent copy of just the numeric columns, so later edits cannot touch `clusters`.
numeric_features = clusters.select_dtypes(include=numerics).copy()
numeric_features.columns

## Clustering stats

In [None]:
# Run the project's k-means cluster-count selection on the cluster table.
# The call returns a pair, unpacked here as (num_clusters, inertias).
cl = ClusterGraphs(clusters)
num_clusters, inertias = cl.choose_clust_num_k_means()

# Plot with the explicit Axes interface and label the figure so it can be read on its own.
# NOTE(review): the x axis is the position within `inertias`; which cluster count each
# position corresponds to is decided inside ClusterGraphs - confirm before relabelling it "k".
fig, ax = plt.subplots()
ax.plot(inertias)
ax.set(
    title="K-means inertia per candidate",
    xlabel="Position in inertias",
    ylabel="Inertia",
);

In [None]:
# Display the cluster count returned by choose_clust_num_k_means() in the previous cell.
num_clusters

In [None]:
# Row count of the cluster table.
clusters.shape[0]

## Descriptive Stats

In [None]:
# Columns summarised in the descriptive-statistics table.
descriptive_cols = [
    'num_threads', 'num_nodes', 'num_edges', 'num_wcc', 'largest_wcc',
    'diameter_largest_wcc', 'max_out_degree', 'max_in_degree',
    'mean_out_degree', 'mean_in_degree', 'wiener_index', 'nodes_per_thread',
]
clusters[descriptive_cols].describe()

In [None]:
# Number of rows per event.
clusters['event'].value_counts()

In [None]:
# Number of rows per truth value.
clusters['truth'].value_counts()

In [None]:
# Plot 'num_threads' broken down by 'event' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 300, 250, 1) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['num_threads'], 300, 250, 1, 'event')

In [None]:
# Plot 'num_nodes' broken down by 'event' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 300, 250, 1) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['num_nodes'], 300, 250, 1, 'event')

In [None]:
# Plot 'nodes_per_thread' (derived earlier as num_nodes / num_threads) broken down by
# 'event' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 300, 250, 1) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['nodes_per_thread'], 300, 250, 1, 'event')

# Exploring Differences between Clusters

## Central Network of Each Cluster

In [None]:
# Collect the ids of the rows flagged with is_mean_vec and draw those graphs.
central_ids = clusters.loc[clusters.is_mean_vec.eq(True), 'id'].to_list()
viz.viz_graphs(central_ids)

## Cluster Size

In [None]:
# Cluster-size plot from the project's Visualize helper.
# NOTE(review): what is drawn and returned is defined in Visualize.plot_cluster_size.
viz.plot_cluster_size()

## Truth Composition of Each Cluster

In [None]:
# Per-cluster truth breakdown from the project's Visualize helper.
# NOTE(review): what is drawn and returned is defined in Visualize.viz_ind_cluster_truth.
viz.viz_ind_cluster_truth()

## Topological Features

In [None]:
# Plot 'num_nodes' and 'num_edges' broken down by 'label' (presumably the cluster label;
# the grouping role of the last argument is also presumed).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['num_nodes', 'num_edges'], 200, 250, 3, 'label')

In [None]:
# Plot 'num_nodes' and 'num_edges' broken down by 'truth' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['num_nodes', 'num_edges'], 200, 250, 3, 'truth')

In [None]:
# Plot 'num_wcc' and 'num_threads' broken down by 'label' (presumably the cluster label;
# the grouping role of the last argument is also presumed).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['num_wcc', 'num_threads'], 200, 250, 3, 'label')

In [None]:
# Plot 'num_wcc' and 'num_threads' broken down by 'truth' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['num_wcc', 'num_threads'], 200, 250, 3, 'truth')

In [None]:
# Plot 'largest_wcc' broken down by 'label' (presumably the cluster label; the grouping
# role of the last argument is also presumed).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['largest_wcc'], 200, 250, 3, 'label')

In [None]:
# Plot 'largest_wcc' broken down by 'truth' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['largest_wcc'], 200, 250, 3, 'truth')

In [None]:
# Plot 'diameter_largest_wcc' broken down by 'label' (presumably the cluster label; the
# grouping role of the last argument is also presumed).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['diameter_largest_wcc'], 200, 250, 3, 'label')

In [None]:
# Plot 'diameter_largest_wcc' broken down by 'truth' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['diameter_largest_wcc'], 200, 250, 3, 'truth')

In [None]:
# Plot 'wiener_index' broken down by 'label' (presumably the cluster label; the grouping
# role of the last argument is also presumed).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['wiener_index'], 200, 250, 3, 'label')

In [None]:
# Plot 'wiener_index' broken down by 'truth' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['wiener_index'], 200, 250, 3, 'truth')

In [None]:
# Plot 'nodes_per_thread' (derived earlier as num_nodes / num_threads) broken down by
# 'label' (presumably the cluster label; the grouping role of the last argument is also presumed).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['nodes_per_thread'], 200, 250, 3, 'label')

In [None]:
# Plot 'nodes_per_thread' (derived earlier as num_nodes / num_threads) broken down by
# 'truth' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['nodes_per_thread'], 200, 250, 3, 'truth')

## Text Features

In [None]:
# Plot 'sentiment_mean' broken down by 'label' (presumably the cluster label; the grouping
# role of the last argument is also presumed).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['sentiment_mean'], 200, 250, 3, 'label')

In [None]:
# Plot 'sentiment_mean' broken down by 'truth' (presumed role of the last argument).
# NOTE(review): the meaning of the other positional arguments (False, 200, 250, 3) is not
# visible here - confirm in Visualize.graph_point_range_cluster_info.
viz.graph_point_range_cluster_info(False, ['sentiment_mean'], 200, 250, 3, 'truth')

## Social Features

## t-SNE Plot

In [None]:
# Reduced-dimension plot of the networks from the project's Visualize helper.
# NOTE(review): the list presumably names the columns attached to each point, 400/400 the
# chart dimensions, and the string the chart title - confirm in
# Visualize.graph_reduced_dimensions.
viz.graph_reduced_dimensions(["id", "num_nodes", "title", 'truth', 'event'], 400, 400, "How Threads Networks Vary")

## Correlations

In [None]:
# Show the numeric feature table whose column names are passed to the correlation heat map.
numeric_features

In [None]:
# Pass the names of all numeric columns to the correlation heat map.
numeric_cols = list(numeric_features.columns)
viz.get_corr_heat_map(numeric_cols)