In [None]:
import sys, time

In [None]:
sys.path.insert(0, "../python/")
import analyzer_utils as au

In [None]:
import networkx as nx
import pandas as pd
import numpy as np

In [None]:
from datetime import datetime

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
from bokeh.io import output_notebook
output_notebook()

# 1. Get collections

In [None]:
# Each au.get_coll(...) result is unpacked into two values. The second value of
# the first call is kept as `db` (used below for the "collstats" command); the
# second value of the other two calls is discarded.
fina_eng_coll, db = au.get_coll("fina_bp_eng")
fina_hun_coll, _ = au.get_coll("fina_bp_hun")
bp_coll, _ = au.get_coll("budapest")

In [None]:
# Map every collection name to its collection handle (insertion order is kept).
colls = dict([
    ("fina_bp_eng", fina_eng_coll),
    ("fina_bp_hun", fina_hun_coll),
    ("budapest", bp_coll),
])

#### Number of documents in the collections

In [None]:
# Report the "count" field of the "collstats" command for every collection.
for coll_key in colls.keys():
    coll_stats = db.command("collstats", coll_key)
    print("Number of docs in %s: %i" % (coll_key, coll_stats["count"]))

## Sample event

In [None]:
# Fetch a single document (no filter given) from fina_eng_coll for inspection.
doc = fina_eng_coll.find_one()

In [None]:
# Display the sampled document.
doc

In [None]:
# Display the keys of the sampled document.
doc.keys()

## Upper and lower bound of the available datasets

In [None]:
# For every collection show three documents sorted ascending by "id" and three
# documents in the default ordering of au.find_some_docs.
for coll_key, coll_handle in colls.items():
    print()
    print("### %s ###" % coll_key)
    print()

    print("Oldest events:")
    au.find_some_docs(coll_handle, sort_params=[("id", 1)], limit=3)

    print()
    print("Latest events:")
    au.find_some_docs(coll_handle, limit=3)

# 2. Analyze events

In [None]:
# Remove the "fina_bp_hun" entry so that the loops below only visit the
# remaining collections.
del colls["fina_bp_hun"]

## a.) Tweet, retweet, mention counts

In [None]:
# Per-collection results keyed by collection name; filled in by the next cell.
mention_infos = {}

In [None]:
# Extract the mention records of every collection, attach the readable user
# names of both endpoints and export the result as a "|"-separated CSV file.
for coll_name, coll_handle in colls.items():
    mentions, user_names, user_screen_names, num_tweets, num_retweets = au.get_mentions(coll_handle)
    mentions_df = pd.DataFrame(mentions, columns=["epoch", "src", "trg", "lang", "text"])

    # Translate the user ids of both endpoints through the user_names lookup.
    for id_col, name_col in (("src", "src_str"), ("trg", "trg_str")):
        mentions_df[name_col] = mentions_df[id_col].apply(lambda user_id: user_names[user_id])

    cols_to_export = ["epoch", "src", "trg", "src_str", "trg_str"]
    csv_path = "/mnt/idms/fberes/network/fina_bp/data/%s_mentions_with_names.csv" % coll_name
    mentions_df.to_csv(csv_path, columns=cols_to_export, sep="|", index=False)

    mention_infos[coll_name] = [mentions_df, user_names, user_screen_names, num_tweets, num_retweets]
    print(coll_name)

In [None]:
# Summarise the tweet, retweet and mention counts of every collection.
for coll_name in colls:
    info = mention_infos[coll_name]
    num_tweets, num_retweets, num_mentions = info[3], info[4], len(info[0])
    print("%s: #tweets: %i, #retweets: %i, #mentions: %i" % (coll_name, num_tweets, num_retweets, num_mentions))

## b.) reload collections

In [None]:
# Re-read the exported CSV files and rebuild the id -> name lookup from the
# source and target columns (target entries overwrite source entries).
# Note: mention_infos entries become (mentions_df, user_names) pairs here.
for coll_name in colls:
    csv_path = "/mnt/idms/fberes/network/fina_bp/data/%s_mentions_with_names.csv" % coll_name
    mentions_df = pd.read_csv(csv_path, sep="|")

    user_names = dict(zip(mentions_df["src"], mentions_df["src_str"]))
    user_names.update(zip(mentions_df["trg"], mentions_df["trg_str"]))

    mention_infos[coll_name] = (mentions_df, user_names)

In [None]:
# Number of rows in the reloaded "fina_bp_eng" mentions frame.
len(mention_infos["fina_bp_eng"][0])

## c.) Popular nodes

### i.) popular sources

In [None]:
# Print what au.show_frequent_items reports for the "src" column (k=10) of
# every collection.
for coll_name in colls:
    frame, names = mention_infos[coll_name][0], mention_infos[coll_name][1]
    print("### %s ###" % coll_name)
    top_sources = au.show_frequent_items(frame, names, "src", k=10)
    print(top_sources)
    print()

### ii.) popular targets

In [None]:
# Print what au.show_frequent_items reports for the "trg" column (k=10) of
# every collection.
for coll_name in colls:
    info = mention_infos[coll_name]
    header = "### %s ###" % coll_name
    print(header)
    print(au.show_frequent_items(info[0], info[1], "trg", k=10))
    print()

## d.) Event distribution in time

In [None]:
# Derive three local-time string columns from the "epoch" column of every
# mentions frame (the frames are modified in place).
for coll_name in colls:
    frame = mention_infos[coll_name][0]
    for column, fmt in (("date", '%Y-%m-%d'), ("date_hour", '%Y-%m-%d %H'), ("time", '%H:%M:%S')):
        # Bind fmt as a default argument so each lambda keeps its own format.
        frame[column] = frame["epoch"].apply(lambda ts, fmt=fmt: time.strftime(fmt, time.localtime(ts)))

In [None]:
def plot_dense_timeline(coll_name):
    """Plot a 500-bin histogram of the mention epochs of one collection.

    Reads the mentions frame from the module-level ``mention_infos`` and
    relabels the x ticks with local-time ``YYYY-mm-dd HH:MM`` strings.
    """
    plt.figure(figsize=(15, 2))
    plt.title("%s: Number of mentions in time" % coll_name)
    epochs = mention_infos[coll_name][0]["epoch"]
    epochs.hist(bins=500)
    # Query the tick positions chosen by matplotlib, then replace their labels.
    tick_positions, _ = plt.xticks()
    tick_labels = [time.strftime('%Y-%m-%d %H:%M', time.localtime(pos)) for pos in tick_positions]
    plt.xticks(tick_positions, tick_labels, rotation="vertical")
    plt.show()
    
def plot_daily_timeline(coll_name):
    """Plot the number of mentions per value of the "date" column.

    Reads the mentions frame from the module-level ``mention_infos``; the
    "date" column must already have been added to it.
    """
    dates = mention_infos[coll_name][0]["date"]
    daily_counts = dates.value_counts()
    daily_counts = daily_counts.sort_index()
    plt.figure(figsize=(15, 2))
    plt.title("%s: Number of mentions per day" % coll_name)
    plt.plot(daily_counts.values)
    # One tick per day, labelled with the date string.
    tick_positions = range(len(daily_counts))
    plt.xticks(tick_positions, daily_counts.index, rotation='vertical')
    plt.show()

### budapest

In [None]:
# Histogram of mention epochs for the "budapest" collection.
plot_dense_timeline("budapest")

In [None]:
# Mentions per day for the "budapest" collection.
plot_daily_timeline("budapest")

### fina_bp_eng

In [None]:
# Histogram of mention epochs for the "fina_bp_eng" collection.
plot_dense_timeline("fina_bp_eng")

In [None]:
# Mentions per day for the "fina_bp_eng" collection.
plot_daily_timeline("fina_bp_eng")

### Drops in the timeline looked suspicious (e.g. on 2017-07-17); checking the distribution of tweets

In [None]:
def _clock_to_seconds(clock):
    """Convert an 'HH:MM:SS' string into the number of seconds since midnight."""
    return 3600*int(clock[:2]) + 60*int(clock[3:5]) + int(clock[-2:])


# Build one tweets frame per collection, with local-time string columns and
# the second-of-day of every tweet.
tweet_infos = {}
for coll_name in colls:
    print(coll_name)
    tweets = au.get_tweets(colls[coll_name])
    tweets_df = pd.DataFrame(tweets, columns=["epoch", "src_id", "src_name", "lang", "msg"])
    for column, fmt in (("date", '%Y-%m-%d'), ("date_hour", '%Y-%m-%d %H'), ("time", '%H:%M:%S')):
        # Bind fmt as a default argument so each lambda keeps its own format.
        tweets_df[column] = tweets_df["epoch"].apply(lambda ts, fmt=fmt: time.strftime(fmt, time.localtime(ts)))
    tweets_df["time_sec"] = tweets_df["time"].apply(_clock_to_seconds)
    tweet_infos[coll_name] = tweets_df

In [None]:
# Time-of-day histograms around the suspicious drop: one row per calendar day
# and one column per collection.
#
# Rows are keyed by the date itself (not by the order in which dates happen to
# be present in a collection), so a day without any tweets keeps an empty
# panel instead of shifting the later days upwards and misaligning the
# columns. Columns are keyed by the collection's position in `colls`, so the
# grid also works for a number of collections other than two.
zoom_days = ['2017-07-14', '2017-07-15', '2017-07-16', '2017-07-17', '2017-07-18']

# squeeze=False keeps `axes` two-dimensional even with a single collection.
fig, axes = plt.subplots(len(zoom_days), len(colls), figsize=(30, 25), squeeze=False)

for col_idx, coll_name in enumerate(colls):
    coll_tweets = tweet_infos[coll_name]
    for row_idx, d in enumerate(zoom_days):
        ax = axes[row_idx, col_idx]
        df = coll_tweets[coll_tweets["date"] == d]

        # Nothing to draw for a day without tweets; the titled empty panel
        # makes the gap visible.
        if len(df) > 0:
            df["time_sec"].hist(bins=250, ax=ax)
        ax.set_xlim(0, 86400)
        ax.set_title("%s %s" % (coll_name, d))

        # Relabel the second-of-day ticks as HH:MM:SS.
        locs = ax.get_xticks()
        labels = list(map(lambda sec: "%02i:%02i:%02i" % (sec / 3600, (sec%3600)/60, (sec%3600)%60), locs))
        ax.set_xticks(locs)
        ax.set_xticklabels(labels)

In [None]:
# Plot the number of tweets per day between 2017-07-14 and 2017-07-18
# (both inclusive) for every collection.
for coll_name in colls:
    tweets_in_window = tweet_infos[coll_name].query("date >= '2017-07-14' & date <= '2017-07-18'")
    daily_counts = tweets_in_window["date"].value_counts()
    daily_counts = daily_counts.sort_index()

    plt.figure(figsize=(15, 2))
    plt.title("%s: Number of tweets per day" % coll_name)
    plt.plot(daily_counts.values)
    # One tick per day, labelled with the date string.
    tick_positions = range(len(daily_counts))
    plt.xticks(tick_positions, daily_counts.index, rotation='vertical')
    plt.show()

## e.) Changes in popular items

In [None]:
# Days analysed below: 2017-07-01 through 2017-07-15, as zero-padded date strings.
# Disabled in the original: day_list += ['2017-06-%i' % i for i in range(25, 30)]
day_list = ['2017-07-%.2i' % day for day in range(1, 16)]
day_list

In [None]:
# Collect, for every day in day_list, what au.show_frequent_items reports for
# the "trg" column (k entries at most) of the "budapest" mentions, together
# with each entry's share of that day's mentions.
coll_name = "budapest"
mentions_df = mention_infos[coll_name][0]
user_names = mention_infos[coll_name][1]

k = 20
popular_trgs = []
for i, day in enumerate(day_list):
    daily_df = mentions_df[mentions_df["date"] == day]
    freq_trg = au.show_frequent_items(daily_df, user_names, "trg", k=k)
    # zip stops at the shortest input, so a day with fewer than k entries
    # simply contributes fewer rows; day_idx stays a float as before.
    popular_trgs += list(zip(np.ones(k)*i, freq_trg["name"], freq_trg["count"], freq_trg["count"]/len(daily_df)))

# Build the frame once, after all days have been collected. The original
# rebuilt it inside the loop and left it undefined for an empty day_list.
popular_trg_df = pd.DataFrame(popular_trgs, columns=["day_idx", "name", "count", "dominance"])

In [None]:
# Filter the daily popular-target rows with au.filter_for_support(min_times=4).
# NOTE(review): presumably keeps names that occur on at least 4 days — confirm
# in analyzer_utils.
very_pop_df = au.filter_for_support(popular_trg_df, min_times=4)

In [None]:
# Plot the filtered rows with au.plot_user_popularity; day_list is passed along
# (presumably for labelling the day_idx axis — confirm in analyzer_utils).
au.plot_user_popularity(very_pop_df, day_list)

In [None]:
# Plot the filtered rows with au.plot_user_dominance (presumably using the
# "dominance" column computed above — confirm in analyzer_utils).
au.plot_user_dominance(very_pop_df)

In [None]:
# Leftover cell: evaluates the literal 0 and has no effect on the analysis above.
0