In [None]:
import sys, os, time

In [None]:
sys.path.insert(0,"../python/")
import analyzer_utils as au

In [None]:
import networkx as nx
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
from bokeh.io import output_notebook
output_notebook()

# Get collections

In [None]:
# au.get_coll returns a pair for each collection name. The second element of the
# first call is kept as `db` (later used for db.command); the second call's is discarded.
uso17_coll, db = au.get_coll("usopen17")
uso17_qual_coll, _ = au.get_coll("usopen17_qual")

#### Number of documents in the collections

In [None]:
# Document counts as reported by the "collstats" command, one per collection.
uso17_size, uso17_qual_size = [
    db.command("collstats", coll_name)["count"]
    for coll_name in ("usopen17", "usopen17_qual")
]

In [None]:
# Report the size of both collections, one line per collection.
size_report = [
    "Number of docs in 'uso17': %i" % uso17_size,
    "Number of docs in 'uso17_qual': %i" % uso17_qual_size,
]
print("\n".join(size_report))

# Sample event

In [None]:
# Fetch a single document from the "usopen17" collection to inspect what a stored event looks like.
doc = uso17_coll.find_one()

doc

# Upper and lower bound of the available dataset

   * Qualifying days: Aug 22 - Aug 25
   * Normal contest: Aug 28 - Sept 10 

### Qualifications

In [None]:
# Three documents of the qualifying collection, requested with sort spec ("id", 1).
# Presumably ascending by id, i.e. the lower bound of the dataset — TODO confirm in analyzer_utils.
au.find_some_docs(uso17_qual_coll,sort_params=[("id",1)],limit=3)

In [None]:
# Same query without sort_params: the ordering is whatever find_some_docs uses by default
# (not visible here; presumably the opposite end of the id range — TODO confirm).
au.find_some_docs(uso17_qual_coll,limit=3)

### Days

In [None]:
# Three documents of the main collection, requested with sort spec ("id", 1).
# Presumably ascending by id — TODO confirm in analyzer_utils.
au.find_some_docs(uso17_coll,sort_params=[("id",1)],limit=3)

In [None]:
# Same query on the main collection with find_some_docs' default ordering (not visible here).
au.find_some_docs(uso17_coll,limit=3)

#### Search notes:

   * when using since_id=903184720040783872 for the search, it could not find all (it was shut down...)
   * so:
   903184720040783872 - "Thu Aug 31 09:16:32 +0000 2017"
   903489402940227585 - "Fri Sep 01 05:27:14 +0000 2017"

#### max_id, 1st attempt: it started with duplicates, which is good... (I did not start collecting from too far back)

max_id_guess = 903489402940227585 - (903489402940227585 - 903184720040783872) // 3  # one third of the way back from the upper ID; `//` keeps the 18-digit ID exact (float `/` rounds it)
max_id_guess  # display the candidate max_id

# Extract tweet text

In [None]:
# Tweet records extracted from the "usopen17" collection (row layout is defined by au.get_tweets).
tweet_info_raw = au.get_tweets(uso17_coll)

In [None]:
# Tabulate the records; the column names assume get_tweets yields 5-field rows in this order — TODO confirm.
tweet_info_df_raw = pd.DataFrame(tweet_info_raw, columns=["time","source_id","source_name","language","text"])

In [None]:
# Same extraction for the qualifying collection ("usopen17_qual").
tweet_info_begin = au.get_tweets(uso17_qual_coll)

In [None]:
# Tabulate the qualifying records with the same five column names as the main frame.
tweet_info_df_begin = pd.DataFrame(tweet_info_begin, columns=["time","source_id","source_name","language","text"])

In [None]:
# Stack main and qualifying tweets; the original row labels are kept (no ignore_index),
# so the combined index may contain duplicates.
tweet_info_df = pd.concat([tweet_info_df_raw, tweet_info_df_begin])

In [None]:
# Order the rows by the "time" column.
tweet_info_df = tweet_info_df.sort_values("time")

In [None]:
# Persist all tweets as a "|"-separated file without the index column.
# NOTE(review): absolute, machine-specific output path.
tweet_info_df.to_csv("/mnt/idms/fberes/network/usopen/data/uso17_tweets.csv",index=False,sep="|")

In [None]:
# Keep only the rows whose "language" is "en" and export them as a separate "|"-separated file.
english_only = tweet_info_df["language"] == "en"
tweet_info_df_en = tweet_info_df[english_only]
tweet_info_df_en.to_csv(
    "/mnt/idms/fberes/network/usopen/data/uso17_tweets_eng.csv",
    sep="|",
    index=False,
)

# Extract mention networks

## a.) Extract mentions from multiple collections

In [None]:
# Extract mentions from the qualifying collection. get_mentions returns five values:
# the mention records, two user lookups (names and screen names, later indexed by the
# src/trg ids) and two counters (tweets, retweets).
mentions_begin, user_names_begin, user_screen_names_begin, num_tweets_begin, num_retweets_begin = au.get_mentions(uso17_qual_coll)

In [None]:
# Tabulate the qualifying mentions (the column names assume 5-field records in this order — TODO confirm)
# and print how many mention rows were extracted.
mentions_df_begin = pd.DataFrame(mentions_begin,columns=["epoch","src","trg","lang","text"])
print(len(mentions_df_begin))

In [None]:
# Preview the first rows of the qualifying mention table.
mentions_df_begin.head()

In [None]:
# Same extraction for the main "usopen17" collection (five return values, as for the qualifying one).
mentions_raw, user_names_raw, user_screen_names_raw, num_tweets_raw, num_retweets_raw = au.get_mentions(uso17_coll)

In [None]:
# Tabulate the main-collection mentions with the same five columns and print the row count.
mentions_df_raw = pd.DataFrame(mentions_raw,columns=["epoch","src","trg","lang","text"])
print(len(mentions_df_raw))

## b.) USOpen17 mention network (concatenated)

In [None]:
# Stack qualifying and main mentions, renumber the rows, then order them by epoch.
mentions_df = (
    pd.concat([mentions_df_begin, mentions_df_raw])
    .reset_index(drop=True)
    .sort_values("epoch")
)

# Merge the user lookups in place: the *_begin objects are extended with the *_raw
# entries and then exposed under the combined names (same objects, not copies).
user_names = user_names_begin
user_names.update(user_names_raw)
user_screen_names = user_screen_names_begin
user_screen_names.update(user_screen_names_raw)

# Totals over both collections.
num_tweets = num_tweets_begin + num_tweets_raw
num_retweets = num_retweets_begin + num_retweets_raw

In [None]:
# Preview the combined, epoch-sorted mention table.
mentions_df.head()

In [None]:
# Attach readable names to both endpoints of every mention.
# Each entry is (new column, id column, lookup); a missing id raises KeyError.
name_columns = [
    ("src_str", "src", user_names),
    ("trg_str", "trg", user_names),
    ("src_screen_str", "src", user_screen_names),
    ("trg_screen_str", "trg", user_screen_names),
]
for new_col, id_col, lookup in name_columns:
    # bind the lookup as a default argument so each lambda uses its own table
    mentions_df[new_col] = mentions_df[id_col].apply(lambda uid, table=lookup: table[uid])

### 1. Number of mentions

In [None]:
# Summary counters: tweets, retweets and extracted mention rows, one line each.
count_report = [
    "Number of tweets: %i" % num_tweets,
    "Number of retweets: %i" % num_retweets,
    "Number of mentions extracted from tweets: %i" % len(mentions_df),
]
print("\n".join(count_report))

In [None]:
# Number of mention rows per value of the "lang" column, most frequent first.
mentions_df["lang"].value_counts()

### Export mentions to .csv

#### with recoded ID-s

file_path = "/mnt/idms/temporalNodeRanking/data/filtered_timeline_data/tsv/usopen/usopen_mentions.csv"  # NOTE(review): absolute, machine-specific output path
au.recode_and_export_mentions(file_path,mentions_df,user_names)  # export with recoded ids; the recoding itself is defined in analyzer_utils

#### with original ID-s

In [None]:
# Export the mention edges with original ids and both name variants ("|"-separated, no index).
# "lang" and "text" are deliberately not part of this file.
cols_to_export = ["epoch","src","trg","src_str","src_screen_str","trg_str","trg_screen_str"]
mentions_df.to_csv("/mnt/idms/fberes/network/usopen/data/uso17_mentions_with_names.csv",columns=cols_to_export,sep="|",index=False)

In [None]:
# Same export, additionally carrying the "lang" and "text" columns, written to a separate file.
cols_to_export = ["epoch","src","trg","src_str","src_screen_str","trg_str","trg_screen_str","lang","text"]
mentions_df.to_csv("/mnt/idms/fberes/network/usopen/data/uso17_mentions_with_names_and_text.csv",columns=cols_to_export,sep="|",index=False)

### Read from .csv

mentions_df = pd.read_csv("/mnt/idms/fberes/network/usopen/data/uso17_mentions_with_names.csv",sep="|")  # reload the exported edge list; this file has no "lang"/"text" columns
mentions_df.head()  # preview the reloaded table

In [None]:
# Rebuild the id -> name lookup from the table itself: start from the source
# endpoints, then add the target endpoints (a target entry overrides a source entry
# with the same id).
user_names = dict(zip(mentions_df["src"], mentions_df["src_str"]))
user_names.update(zip(mentions_df["trg"], mentions_df["trg_str"]))

### 2. Popular source nodes

In [None]:
# Frequent values of the "src" column with k=10 (presumably the 10 most active mentioning users — see analyzer_utils).
au.show_frequent_items(mentions_df,user_names,"src",k=10)

### 3. Popular target nodes

In [None]:
# Frequent values of the "trg" column with k=10 (presumably the 10 most mentioned users — see analyzer_utils).
au.show_frequent_items(mentions_df,user_names,"trg",k=10)

### 4. Event distribution in time

In [None]:
import time

In [None]:
def _local_stamp(pattern):
    """Return a function that formats an epoch value as a local-time string using `pattern`."""
    def _format(epoch_value):
        return time.strftime(pattern, time.localtime(epoch_value))
    return _format

# Split each epoch into a calendar-day string and a time-of-day string.
# NOTE(review): time.localtime uses the timezone of the machine running the kernel,
# so the day buckets depend on where the notebook is executed.
mentions_df["date"] = mentions_df["epoch"].apply(_local_stamp('%Y-%m-%d'))
mentions_df["time"] = mentions_df["epoch"].apply(_local_stamp('%H:%M:%S'))

In [None]:
# Preview the table with the new "date" and "time" columns.
mentions_df.head()

In [None]:
# Histogram of the mention epochs using 500 bins, drawn on an explicit axes.
hist_ax = plt.figure(figsize=(15,5)).gca()
hist_ax.set_title("Number of mentions in time")
mentions_df["epoch"].hist(bins=500, ax=hist_ax)
plt.show()

In [None]:
# Mention rows per "date" value, ordered by the date string (the index) rather than by count.
num_of_mentions_by_day = mentions_df["date"].value_counts().sort_index()

In [None]:
# Line plot of the daily mention counts with one vertical tick label per day.
day_ax = plt.figure(figsize=(15,5)).gca()
day_ax.set_title("Number of mentions per day")
day_ax.plot(num_of_mentions_by_day.values)
day_ax.set_xticks(range(len(num_of_mentions_by_day)))
day_ax.set_xticklabels(num_of_mentions_by_day.index, rotation='vertical')
plt.show()

### 5. Graph information

In [None]:
# Distinct values of the "date" column in ascending (string) order.
date_splits = sorted(mentions_df["date"].unique())

In [None]:
# For every day boundary after the first, call get_graph_stats on all mentions dated
# strictly before that boundary. This yields one row per cumulative prefix of days;
# the last day in date_splits is never part of a prefix.
graph_stats = [
    au.get_graph_stats(mentions_df[mentions_df["date"] < day_bound])
    for day_bound in date_splits[1:]
]
graph_stats_df = pd.DataFrame(
    graph_stats,
    columns=["nodes", "edges", "weak_components", "strong_components"],
)

In [None]:
# Preview the first three rows of the cumulative graph statistics.
graph_stats_df.head(3)

In [None]:
# Two side-by-side panels for the aggregated mention graph: size (nodes / edges) on
# the left, number of weak components on the right.
#
# Both axes come from a single plt.subplots call. Creating a full-figure axes with
# plt.subplots(figsize=...) and then adding plt.subplot(1,2,k) panels on top of it
# leaves a stray empty axes (with its own ticks) behind the panels on Matplotlib
# versions that no longer auto-remove overlapping axes.
fig, (ax_size, ax_comp) = plt.subplots(1, 2, figsize=(16,5))

# One tick per row of graph_stats_df, labelled with date_splits[:-1] as before.
tick_positions = list(range(len(graph_stats_df)))
tick_labels = date_splits[:-1]

ax_size.set_title('Aggregated mention graph size')
ax_size.plot(graph_stats_df["nodes"],label="nodes")
ax_size.plot(graph_stats_df["edges"],label="edges")
ax_size.set_xticks(tick_positions)
ax_size.set_xticklabels(tick_labels,rotation='vertical')
ax_size.legend()

ax_comp.set_title('Number of weak components in the aggregated mention graph')
ax_comp.plot(graph_stats_df["weak_components"],label="number of ",c="r")
ax_comp.set_xticks(tick_positions)
ax_comp.set_xticklabels(tick_labels,rotation='vertical')
plt.show()

### 6. Changes in popular items

In [None]:
# Calendar days used for the per-day popularity analysis: Aug 20-31 followed by
# Sep 1-10 (2017), as 'YYYY-MM-DD' strings with zero-padded September days.
august_days = ['2017-08-%i' % day for day in range(20, 32)]
september_days = ['2017-09-%.2i' % day for day in range(1, 11)]
day_list = august_days + september_days
day_list

In [None]:
# For every day in day_list, collect the frequent mention targets (k=20) together
# with their count and their share of that day's mention rows ("dominance").
k = 20
popular_trgs = []
for day_idx, day in enumerate(day_list):
    daily_df = mentions_df[mentions_df["date"] == day]
    freq_trg = au.show_frequent_items(daily_df, user_names, "trg", k=k)
    dominance = freq_trg["count"] / len(daily_df)
    # zip stops at the shortest input, so days with fewer than k targets add fewer rows;
    # np.ones(k) * day_idx makes the day index a float
    popular_trgs.extend(
        zip(np.ones(k) * day_idx, freq_trg["name"], freq_trg["count"], dominance)
    )
popular_trg_df = pd.DataFrame(
    popular_trgs,
    columns=["day_idx", "name", "count", "dominance"],
)

In [None]:
# Drop the rows whose name is "US Open Tennis" from the popularity table.
not_official_account = popular_trg_df["name"] != "US Open Tennis"
popular_trg_df = popular_trg_df[not_official_account]

#### At least 3 times in top_20

In [None]:
# Popularity plot of the targets kept by filter_for_support with min_times=5.
# NOTE(review): the section heading says "at least 3 times" while the code passes
# min_times=5 — confirm which one is intended (the bound's semantics live in analyzer_utils).
au.plot_user_popularity(au.filter_for_support(popular_trg_df, min_times=5),day_list)

#### At least 8 times in top_20

In [None]:
# Dominance plot of the targets kept by filter_for_support with min_times=7.
# NOTE(review): the section heading says "at least 8 times"; this matches only if
# min_times is an exclusive bound — confirm in analyzer_utils.
very_pop_df = au.filter_for_support(popular_trg_df, min_times=7)
#au.plot_user_popularity(very_pop_df, day_list)
au.plot_user_dominance(very_pop_df)

#### At least 4 times in top_20 but less than 8 times

In [None]:
# Dominance plot of the targets kept by filter_for_support with min_times=3 and max_times=5.
# This reuses the name very_pop_df from the previous cell for a different selection.
# NOTE(review): the section heading describes a range of 4 up to (but not including) 8
# appearances, which does not obviously match max_times=5 — confirm the intended bounds.
very_pop_df = au.filter_for_support(popular_trg_df, max_times=5, min_times=3)
#plot_user_popularity(very_pop_df,day_list)
au.plot_user_dominance(very_pop_df)