In [None]:
import sys

In [None]:
sys.path.insert(0,"../python/")
import analyzer_utils as au

In [None]:
import networkx as nx
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

# Get collections

In [None]:
# Name of the collection analysed throughout this notebook.
coll_name = "grenfell_fire"
# Project helper from analyzer_utils (not shown here); it returns two handles that are
# used below as the collection (`coll.find_one()`) and its database (`db.command(...)`).
coll, db = au.get_coll(coll_name)

#### Number of documents in the collection

In [None]:
# Run the "collstats" database command for this collection and keep only its "count" field.
coll_size = db.command("collstats", coll_name)["count"]

In [None]:
# Report the document count next to the collection name (%i formats the count as an integer).
print("Number of docs in '%s': %i" % (coll_name, coll_size))

# Sample event

In [None]:
# Fetch a single document from the collection as a sample of the stored records.
doc = coll.find_one()

# Bare expression so that the notebook displays the document.
doc

# Upper and lower bound of the available dataset

In [None]:
# Project helper call: 10 documents, with a sort specification on "id" and direction 1.
# NOTE(review): presumably ascending by id, i.e. the lower bound named in the heading — confirm in analyzer_utils.
au.find_some_docs(coll,sort_params=[("id",1)],limit=10)

In [None]:
# Same helper with its default sort_params (not visible from this notebook) and 5 documents.
# NOTE(review): presumably the default ordering shows the other end of the dataset — confirm in analyzer_utils.
au.find_some_docs(coll,limit=5)

In [None]:
def get_hashtags_from_tweets(coll,limit=None):
    """Count hashtag occurrences over the documents of a collection.

    Parameters
    ----------
    coll : collection-like
        Object whose ``find()`` returns an iterable of tweet documents
        (dict-like) that also supports ``.limit(n)``.
    limit : int, optional
        When given, it is passed to ``.limit()`` on the result of ``find()``
        to restrict how many documents are scanned. ``None`` (the default)
        scans every document.

    Returns
    -------
    dict
        Maps each hashtag text to the number of times it occurs.

    Notes
    -----
    Documents whose ``text`` starts with ``"RT "`` (the conventional retweet
    prefix) are skipped. Documents that lack ``text`` or
    ``entities``/``hashtags`` simply contribute no hashtags instead of
    aborting the whole scan with a ``KeyError``.
    """
    cursor = coll.find()
    if limit is not None:
        cursor = cursor.limit(limit)

    hashtags = {}
    for item in cursor:
        # Tolerate records without a text field (treated as "not a retweet").
        text = item.get('text') or ""
        if text.startswith("RT "):
            continue
        # Tolerate records without entities / hashtags (nothing to count).
        entities = item.get('entities') or {}
        for htag in entities.get('hashtags') or []:
            htag_name = htag['text']
            hashtags[htag_name] = hashtags.get(htag_name, 0) + 1
    return hashtags

In [None]:
# Count hashtags over the whole collection (no document limit is passed).
hashtags = get_hashtags_from_tweets(coll)

In [None]:
from collections import Counter

In [None]:
# Wrap the hashtag -> count mapping in a Counter; this cell only stores it in `cc` and displays nothing.
cc = Counter(hashtags)

# Extract mention networks

## a.) Extract mentions from multiple collections

In [None]:
# Project helper from analyzer_utils (not shown here). Judging by how the results are used below:
#   mentions                  -> records for a frame with columns epoch / src / trg / text
#   user_names                -> lookup indexed by the src / trg ids
#   num_tweets, num_retweets  -> integer counts that are printed in the summary
mentions, user_names, num_tweets, num_retweets = au.get_mentions(coll)

In [None]:
# Tabular view of the extracted mentions with the four named columns.
mentions_df = pd.DataFrame(mentions,columns=["epoch","src","trg","text"])
# Number of rows, i.e. number of extracted mentions.
print(len(mentions_df))

In [None]:
# Add readable name columns by looking every src / trg id up in `user_names`.
mentions_df["src_str"] = mentions_df["src"].apply(
    lambda user_id: user_names[user_id]
)
mentions_df["trg_str"] = mentions_df["trg"].apply(
    lambda user_id: user_names[user_id]
)

### 1. Number of mentions

In [None]:
# Summary counts: the first two come from au.get_mentions, the last is the row count of mentions_df.
print("Number of tweets: %i" % num_tweets)
print("Number of retweets: %i" % num_retweets)
print("Number of mentions extracted from tweets: %i" % len(mentions_df))

#### Export mentions to .csv

In [None]:
# Write the mention list to disk as a "|"-separated file without the row index.
# Only the columns named in `cols_to_export` are written; "text" is not one of them.
cols_to_export = ["epoch", "src", "trg", "src_str", "trg_str"]
mentions_df.to_csv(
    "/mnt/idms/fberes/network/grenfell_fire//data/gff17_mentions.csv",
    sep="|",
    index=False,
    columns=cols_to_export,
)

#### Read from .csv

In [None]:
# Reload the mentions from the "|"-separated file; this replaces the in-memory frame.
# If the file was written by the export cell above, the reloaded frame has no "text" column.
mentions_df = pd.read_csv("/mnt/idms/fberes/network/grenfell_fire/data/gff17_mentions.csv",sep="|")
# Display the first rows of the reloaded frame.
mentions_df.head()

In [None]:
# Rebuild the id -> name lookup from the frame itself, so the cells below also work
# when the mentions were loaded from the CSV instead of extracted from the collection.
src_map = dict(zip(mentions_df["src"],mentions_df["src_str"]))
trg_map = dict(zip(mentions_df["trg"],mentions_df["trg_str"]))
# Merge in place: for an id present on both sides, the name from the trg columns wins.
src_map.update(trg_map)
# `user_names` is the same dict object as `src_map`, not a copy.
user_names = src_map

### 2. Popular source nodes

In [None]:
# Project helper called on the "src" column with k=10.
# NOTE(review): presumably lists the 10 most frequent source ids with their names — confirm in analyzer_utils.
au.show_frequent_items(mentions_df,user_names,"src",k=10)

### 3. Popular target nodes

In [None]:
# Project helper called on the "trg" column with k=10.
# NOTE(review): presumably lists the 10 most frequent target ids with their names — confirm in analyzer_utils.
au.show_frequent_items(mentions_df,user_names,"trg",k=10)

### 4. Event distribution in time

In [None]:
import time

In [None]:
# Day-level alternative, kept for reference (currently disabled):
#mentions_df["date"] = mentions_df["epoch"].apply(lambda x: time.strftime('%Y-%m-%d', time.localtime(x)))
# NOTE: despite its name, "date" holds an hour-resolution bucket formatted as 'YYYY-MM-DD HH'.
# time.localtime interprets the epoch value as seconds and converts it using the
# timezone of the machine running the kernel, so the buckets are not timezone-independent.
mentions_df["date"] = mentions_df["epoch"].apply(lambda x: time.strftime('%Y-%m-%d %H', time.localtime(x)))
# Clock time (HH:MM:SS) of each mention, using the same local-time conversion.
mentions_df["time"] = mentions_df["epoch"].apply(lambda x: time.strftime('%H:%M:%S', time.localtime(x)))

In [None]:
# Display the first rows to check the "date" and "time" columns.
mentions_df.head()

In [None]:
# Time distribution of the mentions: histogram of the raw epoch values in 500 bins.
plt.figure(figsize=(15, 5))
mentions_df["epoch"].hist(bins=500)
plt.title("Number of mentions in time")
plt.show()

In [None]:
# Count the mentions per distinct "date" value and order the counts by that key.
# NOTE: "date" is formatted as '%Y-%m-%d %H' where it is created, so despite the
# variable name these are hourly counts.
num_of_mentions_by_day = mentions_df["date"].value_counts().sort_index()

In [None]:
# Mentions per time bucket. The "date" values are built with the '%Y-%m-%d %H'
# format, so every point of this curve covers one hour; the title says so
# (the variable keeps its historical "by_day" name).
plt.figure(figsize=(15,5))
plt.title("Number of mentions per hour")
plt.plot(num_of_mentions_by_day.values)
# One tick per bucket, labelled with the bucket key and rotated to stay readable.
plt.xticks(range(len(num_of_mentions_by_day)),num_of_mentions_by_day.index,rotation='vertical')
plt.show()

### 5. Graph information

In [None]:
# Sorted list of the distinct values found in the "date" column.
date_splits = sorted(mentions_df["date"].unique())

In [None]:
# Cumulative snapshots: for every bucket boundary after the first one, compute the
# statistics over all mentions whose "date" is strictly smaller than that boundary.
# Mentions that fall into the last bucket are therefore never part of a snapshot.
graph_stats = []
for split in date_splits[1:]:
    partial_df = mentions_df[mentions_df["date"] < split]
    graph_stats.append(au.get_graph_stats(partial_df))

# One row per snapshot, in the same order as the boundaries.
graph_stats_df = pd.DataFrame(
    graph_stats,
    columns=["nodes", "edges", "weak_components", "strong_components"],
)

In [None]:
# Display the last three rows of the graph statistics table.
graph_stats_df.tail(3)

In [None]:
# Two-panel figure of the aggregated mention graph over time.
# Start from an empty figure: plt.subplots() would already add a full-size Axes,
# and recent Matplotlib versions no longer remove it when plt.subplot() places
# the two panels on top of it, leaving a stray empty Axes behind the plots.
plt.figure(figsize=(16,5))

# Left panel: node and edge counts of each cumulative snapshot.
plt.subplot(1,2,1)
plt.title('Aggregated mention graph size')
plt.plot(graph_stats_df["nodes"],label="nodes")
plt.plot(graph_stats_df["edges"],label="edges")
# Snapshot i is labelled with bucket i; the last bucket has no snapshot.
plt.xticks(range(len(graph_stats_df)),date_splits[:-1],rotation='vertical')
plt.legend()

# Right panel: number of weakly connected components of each snapshot.
plt.subplot(1,2,2)
plt.title('Number of weak components in the aggregated mention graph')
plt.plot(graph_stats_df["weak_components"],label="number of ",c="r")
plt.xticks(range(len(graph_stats_df)),date_splits[:-1],rotation='vertical')
plt.show()