In [221]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout
import sys
from matplotlib import pyplot as plt
from functools import reduce
import os
import altair as alt

sys.path.append("../src/")
from features.graph_embed import GraphEmbed
from models.cluster_graphs import ClusterGraphs
from visualization.visualize import Visualize
print("import done")

import done


In [231]:
# Inputs: the raw PHEME events CSV and the pickled, already-clustered graph table.
cluster_f_name = 'pheme_graphs_clustered.pkl'
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the warning this cell emitted is presumably the mixed-dtype
# DtypeWarning - confirm, or pin explicit dtypes instead.
raw_data = pd.read_csv('../data/raw/pheme/pheme_all_events.csv', low_memory=False)
# NOTE(review): read_pickle can execute arbitrary code while loading; only
# point this at pickle files produced by this project.
clusters = pd.read_pickle(os.path.join("..", "models", cluster_f_name))

# When True, events contributing 6 or fewer rows to `clusters` are dropped
# before the graphs are built.
drop_small_events = True

  raw_data = pd.read_csv('../data/raw/pheme/pheme_all_events.csv')


In [208]:
if drop_small_events:
    # Keep only the events that contribute more than 6 rows.
    clusters = clusters.groupby('event').filter(
        lambda event_rows: event_rows.shape[0] > 6
    )

# Build the graphs from the (possibly filtered) table via GraphEmbed.read_graphs
# and hand both to the Visualize helper used by the plotting cells.
graphs = GraphEmbed.read_graphs(clusters)
viz = Visualize(clusters, graphs)

# Clustering test

In [46]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.manifold import TSNE

In [47]:
def choose_clust_num_k_means(graph_vecs, end=11):
    """Pick a k-means cluster count with the elbow method.

    Fits one ``KMeans`` model (``random_state=0``) for every cluster count in
    ``range(1, end)`` and passes the resulting inertia curve to
    ``KneeLocator``.

    Args:
        graph_vecs: Feature matrix passed unchanged to ``KMeans.fit``.
        end: Exclusive upper bound on the cluster counts that are tried.

    Returns:
        A ``(elbow, inertias)`` tuple: ``kneedle.elbow`` as reported by
        ``KneeLocator`` and the list of inertias, one per cluster count tried.
    """
    candidate_ks = range(1, end)
    inertias = [
        KMeans(n_clusters=k, random_state=0).fit(graph_vecs).inertia_
        for k in candidate_ks
    ]

    # Locate the knee of the convex, decreasing inertia curve.
    kneedle = KneeLocator(
        candidate_ks, inertias, S=1.0, curve="convex", direction="decreasing"
    )
    return kneedle.elbow, inertias
    
def cluster_k_means_clusters(graph_vecs, n_clusters=None):
    """Cluster the rows of ``graph_vecs`` with k-means and return their labels.

    Args:
        graph_vecs: Feature matrix passed to ``KMeans.fit``.
        n_clusters: Number of clusters to fit. When ``None`` the count is
            chosen by ``choose_clust_num_k_means``.

    Returns:
        A ``pd.Series`` (default integer index) holding the fitted
        ``labels_``, one entry per row of ``graph_vecs``.

    Raises:
        ValueError: If ``n_clusters`` was not given and the elbow search did
            not produce a cluster count.
    """
    if n_clusters is None:
        n_clusters, _ = choose_clust_num_k_means(graph_vecs)
        if n_clusters is None:
            # The elbow search can come back empty; fail with a clear message
            # here rather than letting KMeans reject n_clusters=None.
            raise ValueError(
                "Could not determine the number of clusters automatically; "
                "pass n_clusters explicitly."
            )

    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(graph_vecs)

    return pd.Series(kmeans.labels_)

def graph_reduced_dimensions(X, labels, width, height, title, random_state=0):
    """Project ``X`` to two dimensions with t-SNE and plot it coloured by label.

    Args:
        X: Feature matrix passed to ``TSNE.fit_transform``.
        labels: One label per row of ``X``, in row order. Labels are matched
            to rows by position, so any index carried by a Series is ignored.
        width: Chart width passed to ``properties``.
        height: Chart height passed to ``properties``.
        title: Chart title.
        random_state: Seed forwarded to ``TSNE`` so repeated runs produce the
            same embedding. Pass ``None`` for an unseeded run.

    Returns:
        The configured, interactive Altair chart.
    """
    tsne = TSNE(n_components=2, random_state=random_state)
    two_d = tsne.fit_transform(X)

    components = pd.DataFrame(two_d, columns=['dimension 1', 'dimension 2'])
    # Assign positionally: a Series with a non-default index would otherwise be
    # aligned on that index and silently produce NaN labels.
    components['label'] = np.asarray(labels)

    chart = alt.Chart(components).mark_circle(size=60).encode(
        x='dimension 1',
        y='dimension 2',
        color='label:N'
    ).properties(title=title, width=width, height=height).interactive()

    chart = chart.configure_title(fontSize=25, fontWeight='bold')
    chart = chart.configure_header(titleFontSize=25, titleFontWeight='bold')
    chart = chart.configure_legend(titleFontSize=25, labelFontSize=20, labelFontWeight='bold', titleFontWeight='bold')
    chart = chart.configure_axis(grid=False, titleFontSize=20, labelFontSize=15, labelAngle=0)

    return chart

In [223]:
clusters.columns

Index(['graph_embedding', 'id', 'truth', 'title', 'event', 'mean_tweet_len',
       'mean_user_mentions', 'urls_mean', 'media_count_mean',
       'hashtags_count_mean', 'retweet_count_mean', 'favorite_count_mean',
       'mentions_count_mean', 'user_tweet_count_mean',
       'user_follower_count_mean', 'user_friends_count_mean', 'sentiment_mean',
       'total_time', 'unverified', 'truth_val', 'num_threads', 'edges',
       'num_nodes', 'num_edges', 'num_wcc', 'largest_wcc',
       'diameter_largest_wcc', 'max_out_degree', 'max_in_degree',
       'mean_out_degree', 'mean_in_degree', 'wiener_index', 'time_per_node',
       'nodes_per_thread', 'label', 'is_mean_vec'],
      dtype='object')

In [232]:
# Hand-picked feature columns used for this clustering experiment.
feature_cols = [
    'time_per_node',
    'nodes_per_thread',
    'largest_wcc',
    'diameter_largest_wcc',
    'num_nodes',
    'sentiment_mean',
    'user_tweet_count_mean',
    'urls_mean',
    'media_count_mean',
    'user_follower_count_mean',
    'user_friends_count_mean',
    'num_edges',
    'unverified',
]

# Standardise the selected columns, cluster into 3 groups, then plot the
# t-SNE projection coloured by cluster label.
scaler = StandardScaler()
X = scaler.fit_transform(clusters.loc[:, feature_cols])
labels = cluster_k_means_clusters(X, n_clusters=3)
graph_reduced_dimensions(X, labels, 200, 300, 'test')

In [233]:
labels.value_counts()

1    117
0     91
2     27
dtype: int64

# Overview

Data: Each network combines all rumour threads relating to a specific rumour. All PHEME rumours are included except those that don't have any edges.

In [None]:
# Dtypes treated as numeric when selecting feature columns.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Take a copy so later changes to the selection do not touch `clusters`.
numeric_features = clusters.select_dtypes(include=numerics).copy()
numeric_features.columns

## Clustering stats

In [None]:
# Run the project's elbow search on the cluster table and plot the inertia curve.
cl = ClusterGraphs(clusters)
num_clusters, inertias = cl.choose_clust_num_k_means()
# NOTE(review): the x-axis is the 0-based list index, not the cluster count;
# presumably index 0 corresponds to k=1 - confirm against ClusterGraphs.
plt.plot(inertias)

In [None]:
num_clusters

In [None]:
len(clusters)

## Descriptive Stats

In [None]:
clusters.loc[:, ['num_threads', 'num_nodes', 'num_edges', 'num_wcc', 'largest_wcc',
       'diameter_largest_wcc', 'max_out_degree', 'max_in_degree',
       'mean_out_degree', 'mean_in_degree', 'wiener_index', 'nodes_per_thread']].describe()

In [None]:
clusters.event.value_counts()

In [None]:
clusters.truth.value_counts()

In [None]:
viz.graph_point_range_cluster_info(False, ['num_threads'], 300, 250, 1, 'event')

In [None]:
viz.graph_point_range_cluster_info(False, ['num_nodes'], 300, 250, 1, 'event')

In [None]:
viz.graph_point_range_cluster_info(False, ['nodes_per_thread'], 300, 250, 1, 'event')

# Exploring Differences between Clusters

## Central Network of Each cluster

In [None]:
viz.viz_graphs(clusters.loc[clusters.is_mean_vec == True].id.to_list())

## Cluster Size

In [None]:
viz.plot_cluster_size()

## Truth Composition of Each cluster

In [None]:
viz.viz_ind_cluster_truth()

## Topological Features

In [None]:
viz.graph_point_range_cluster_info(False, ['num_nodes', 'num_edges'], 200, 250, 3, 'label')

In [None]:
viz.graph_point_range_cluster_info(False, ['num_nodes', 'num_edges'], 200, 250, 3, 'truth')

In [None]:
viz.graph_point_range_cluster_info(False, ['num_wcc', 'num_threads'], 200, 250, 3, 'label')

In [None]:
viz.graph_point_range_cluster_info(False, ['num_wcc', 'num_threads'], 200, 250, 3, 'truth')

In [None]:
viz.graph_point_range_cluster_info(False, ['largest_wcc'], 200, 250, 3, 'label')

In [None]:
viz.graph_point_range_cluster_info(False, ['largest_wcc'], 200, 250, 3, 'truth')

In [None]:
viz.graph_point_range_cluster_info(False, ['diameter_largest_wcc'], 200, 250, 3, 'label')

In [None]:
viz.graph_point_range_cluster_info(False, ['diameter_largest_wcc'], 200, 250, 3, 'truth')

In [None]:
viz.graph_point_range_cluster_info(False, ['wiener_index'], 200, 250, 3, 'label')

In [None]:
viz.graph_point_range_cluster_info(False, ['wiener_index'], 200, 250, 3, 'truth')

In [None]:
viz.graph_point_range_cluster_info(False, ['nodes_per_thread'], 200, 250, 3, 'label')

In [None]:
viz.graph_point_range_cluster_info(False, ['nodes_per_thread'], 200, 250, 3, 'truth')

## Text Features

In [None]:
viz.graph_point_range_cluster_info(False, ['sentiment_mean'], 200, 250, 3, 'label')

In [None]:
viz.graph_point_range_cluster_info(False, ['sentiment_mean'], 200, 250, 3, 'truth')

## Social Features

## TSNE Plot

In [None]:
viz.graph_reduced_dimensions(["id", "num_nodes", "title", 'truth', 'event'], 400, 400, "How Threads Networks Vary")

## Correlations

In [None]:
numeric_features

In [None]:
viz.get_corr_heat_map(numeric_features.columns.to_list())