In [None]:
import sys

In [None]:
# Put ../python/ at the front of the module search path so that the project's
# analyzer_utils helper module (aliased as `au`) can be imported from there.
sys.path.insert(0,"../python/")
import analyzer_utils as au

In [None]:
import networkx as nx
import pandas as pd

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

# Get collections

In [None]:
# Open the two collections used throughout the notebook. Each get_coll call is
# unpacked into two values; the first call keeps both (collection handle and
# `db`), the second keeps only the collection handle.
# NOTE(review): "raw_begining" is the collection name as spelled in the
# database, so the spelling must not be "corrected" here.
raw_coll, db = au.get_coll("raw")
raw_begin_coll, _ = au.get_coll("raw_begining")

#### Number of documents in the collections

In [None]:
# Ask the database for the statistics of each collection and keep only the
# "count" entry of each reply.
raw_stats = db.command("collstats", "raw")
raw_begin_stats = db.command("collstats", "raw_begining")
raw_size = raw_stats["count"]
raw_begin_size = raw_begin_stats["count"]

In [None]:
# Report the document count of each collection, one line per collection.
raw_size_msg = "Number of docs in 'raw': %i" % raw_size
raw_begin_size_msg = "Number of docs in 'raw_begining': %i" % raw_begin_size
print(raw_size_msg)
print(raw_begin_size_msg)

# Sample event

In [None]:
# Fetch a single document from the 'raw' collection as an example of what one
# stored event looks like.
doc = raw_coll.find_one()

doc

# Upper and lower bound of the available dataset

In [None]:
# Three documents of 'raw_begining' ordered by the ("id", 1) sort key
# (presumably ascending id, i.e. the oldest records — confirm in analyzer_utils).
au.find_some_docs(
    raw_begin_coll,
    sort_params=[("id", 1)],
    limit=3,
)

In [None]:
# Three documents of 'raw_begining' using find_some_docs' default ordering
# (presumably the opposite end of the collection — confirm in analyzer_utils).
au.find_some_docs(
    raw_begin_coll,
    limit=3,
)

In [None]:
# Three documents of 'raw' ordered by the ("id", 1) sort key
# (presumably ascending id, i.e. the oldest records — confirm in analyzer_utils).
au.find_some_docs(
    raw_coll,
    sort_params=[("id", 1)],
    limit=3,
)

In [None]:
# Three documents of 'raw' using find_some_docs' default ordering
# (presumably the opposite end of the collection — confirm in analyzer_utils).
au.find_some_docs(
    raw_coll,
    limit=3,
)

# Extract mention networks

## a.) Extract mentions from multiple collections

In [None]:
# Extract mention data from the 'raw_begining' collection. get_mentions is
# unpacked into four values: mention records, a user-name lookup, and the
# tweet / retweet totals (meanings taken from the variable names and how the
# values are used elsewhere in this notebook).
(
    mentions_begin,
    user_names_begin,
    num_tweets_begin,
    num_retweets_begin,
) = au.get_mentions(raw_begin_coll)

In [None]:
# Tabulate the 'raw_begining' mention records and print how many rows there are.
mention_columns = ["epoch", "src", "trg", "text"]
mentions_df_begin = pd.DataFrame(mentions_begin, columns=mention_columns)
print(mentions_df_begin.shape[0])

In [None]:
# Extract mention data from the 'raw' collection. get_mentions is unpacked
# into four values: mention records, a user-name lookup, and the tweet /
# retweet totals (meanings taken from the variable names and how the values
# are used elsewhere in this notebook).
(
    mentions_raw,
    user_names_raw,
    num_tweets_raw,
    num_retweets_raw,
) = au.get_mentions(raw_coll)

In [None]:
# Tabulate the 'raw' mention records and print how many rows there are.
mention_columns = ["epoch", "src", "trg", "text"]
mentions_df_raw = pd.DataFrame(mentions_raw, columns=mention_columns)
print(mentions_df_raw.shape[0])

## b.) Roland Garros mention network (concatenated)
   * first part: May 23 16:51 -> May 31
   * second part: June 01 -> WIP

In [None]:
# Stack the two mention tables (first part, then second part) and renumber
# the rows 0..n-1 in one step.
mentions_df = pd.concat([mentions_df_begin, mentions_df_raw], ignore_index=True)

# Merge the user-name lookups. NOTE: `user_names` is the same object as
# `user_names_begin`, so update() also modifies `user_names_begin` in place;
# entries from `user_names_raw` win when a key exists in both.
user_names = user_names_begin
user_names.update(user_names_raw)

# Totals over both collections.
num_tweets = num_tweets_begin + num_tweets_raw
num_retweets = num_retweets_begin + num_retweets_raw

In [None]:
# Add a readable name column next to each id column by looking every id up in
# `user_names`; an id that is missing from the lookup raises KeyError.
for id_col, name_col in (("src", "src_str"), ("trg", "trg_str")):
    mentions_df[name_col] = mentions_df[id_col].apply(lambda user_id: user_names[user_id])

### 1. Number of mentions

In [None]:
# Summary counts for the concatenated dataset.
mention_count = len(mentions_df)
print("Number of tweets: %i" % num_tweets)
print("Number of retweets: %i" % num_retweets)
print("Number of mentions extracted from tweets: %i" % mention_count)

#### Export mentions to .csv

In [None]:
# Write the selected columns to a pipe-separated CSV file without the row index.
# NOTE(review): the destination is an absolute, machine-specific path.
export_path = "/mnt/idms/fberes/network/roland_garros/data/rg17_mentions.csv"
cols_to_export = ["epoch", "src", "trg", "src_str", "trg_str"]
mentions_df.to_csv(
    export_path,
    sep="|",
    columns=cols_to_export,
    index=False,
)

### 2. Popular source nodes

In [None]:
# Frequent values of the "src" column with k=20
# (presumably the 20 most active mentioning accounts — confirm in analyzer_utils).
au.show_frequent_items(
    mentions_df,
    user_names,
    "src",
    k=20,
)

### 3. Popular target nodes

In [None]:
# Frequent values of the "trg" column with k=20
# (presumably the 20 most mentioned accounts — confirm in analyzer_utils).
au.show_frequent_items(
    mentions_df,
    user_names,
    "trg",
    k=20,
)

### 4. Event distribution in time

In [None]:
import time

In [None]:
def _format_epoch(epoch, pattern):
    """Render an epoch timestamp with `pattern` in the machine's local time zone."""
    return time.strftime(pattern, time.localtime(epoch))


# Split every epoch into a calendar-date string and a time-of-day string.
# Both use local time, so the results depend on the time zone of the machine
# running the notebook.
mentions_df["date"] = mentions_df["epoch"].apply(_format_epoch, args=('%Y-%m-%d',))
mentions_df["time"] = mentions_df["epoch"].apply(_format_epoch, args=('%H:%M:%S',))

In [None]:
# Preview the first five rows of the mention table.
mentions_df.head(n=5)

In [None]:
# Histogram of the raw epoch timestamps using 100 bins.
epoch_series = mentions_df["epoch"]
epoch_series.hist(bins=100)

In [None]:
# Count mentions per date string, then order the counts by date instead of by
# frequency.
mentions_per_date = mentions_df["date"].value_counts()
num_of_mentions_by_day = mentions_per_date.sort_index()

In [None]:
# Line plot of the daily mention counts with one vertical tick label per date.
day_labels = num_of_mentions_by_day.index
day_positions = range(len(num_of_mentions_by_day))

plt.figure()
plt.plot(num_of_mentions_by_day.values)
plt.xticks(day_positions, day_labels, rotation='vertical')
plt.show()

### 5. Graph information

In [None]:
# Distinct date strings present in the data, in ascending order.
date_splits = sorted(mentions_df["date"].unique())

In [None]:
# Statistics of the cumulative mention graph, one row per cut-off date.
# For every date after the first, only mentions dated strictly before that
# date are kept, so the last date in `date_splits` never contributes to a row.
graph_stats = []
for split in date_splits[1:]:
    partial_df = mentions_df.loc[mentions_df["date"] < split]
    graph_stats.append(au.get_graph_stats(partial_df))

graph_stats_df = pd.DataFrame(
    graph_stats,
    columns=["nodes", "edges", "weak_components", "strong_components"],
)

In [None]:
# Display the table of cumulative graph statistics.
graph_stats_df

In [None]:
# Two side-by-side panels showing how the aggregated mention graph evolves.
#
# The figure and both panels are created by a single plt.subplots call.
# Creating a one-Axes figure first and then adding panels with plt.subplot
# leaves that initial full-figure Axes underneath the panels on Matplotlib
# versions that no longer auto-remove overlapping Axes, so its frame and ticks
# show through behind both plots.
fig, (size_ax, components_ax) = plt.subplots(1, 2, figsize=(16, 5))

# One tick per row of graph_stats_df, labelled with date_splits[:-1].
tick_positions = list(range(len(graph_stats_df)))
tick_labels = date_splits[:-1]

# Left panel: number of nodes and edges.
size_ax.set_title('Aggregated mention graph size')
size_ax.plot(graph_stats_df["nodes"], label="nodes")
size_ax.plot(graph_stats_df["edges"], label="edges")
size_ax.set_xticks(tick_positions)
size_ax.set_xticklabels(tick_labels, rotation='vertical')
size_ax.legend()

# Right panel: number of weakly connected components (no legend is drawn, so
# the line label is not shown).
components_ax.set_title('Number of weak components in the aggregated mention graph')
components_ax.plot(graph_stats_df["weak_components"], label="number of ", c="r")
components_ax.set_xticks(tick_positions)
components_ax.set_xticklabels(tick_labels, rotation='vertical')

plt.show()