In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys, os

sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm

# Notes

This notebook only supports popularity model score generation for PageRank. For indegree, the optimal lambda parameters and the daily correlations still have to be computed!

# Choose dataset

In [None]:
# Lambda value per dataset key. The value whose key occurs in `dataset_id` is
# later passed as `lambda_` to the Markov model with leaders.
lambda_map = {
    'maidan': 0.1,
    '15o': 0.3,
    'oc': 0.3,
    'olympics': 0.1,
    'yo': 0.2,
}
# Parenthesized single-argument print: same output on Python 2, valid on Python 3.
print(lambda_map)

In [None]:
# Identifier of the dataset to process. It selects the lambda value (by
# substring match against lambda_map keys) and the input/output paths below.
dataset_id = "15o_pagerank"

In [None]:
# Select the lambda of the first lambda_map key that occurs as a substring of
# dataset_id. 0.0 doubles as the "nothing matched" sentinel, so a mapped value
# of exactly 0.0 is also reported as "not found".
LAMBDA = 0.0
for key in lambda_map:
    if key in dataset_id:
        # %-formatting reproduces the "key value" output of the former print
        # statement and is valid on both Python 2 and Python 3.
        print('%s %s' % (key, lambda_map[key]))
        LAMBDA = lambda_map[key]
        break
if LAMBDA == 0.0:
    raise RuntimeError("Lambda were not found!!!")

In [None]:
# Load the space-separated results file of the selected dataset.
dataset_stat_file = "../correlation_experiments/%s_results.csv" % (dataset_id,)
stat_df = pd.read_csv(
    dataset_stat_file,
    sep=" ",
)

#### extract number of users in data

In [None]:
# The header of the third column is expected to contain "(total=<N>)".
# This cell has to run before that column is renamed further down; afterwards
# the header no longer matches the pattern.
third_column_name = stat_df.columns[2]
print(third_column_name)

total_num_matcher = re.match(r'.*\(total=(\d+?)\)', third_column_name, re.M|re.I)
if not total_num_matcher:
    raise RuntimeError("Column name does NOT match the regex!")

#### rename a column

In [None]:
# Replace the header of the third column with a fixed, easy-to-reference name.
cols = stat_df.columns.tolist()
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows of the statistics table (after the column rename).
stat_df.head()

In [None]:
# Number of rows in the statistics table (used as the number of days below).
print(len(stat_df))

In [None]:
# Total number of users: the digits captured from "(total=<N>)" in the original
# header of the third column.
num_of_users = int(total_num_matcher.group(1))
# The number of days is taken to be the number of rows in stat_df.
num_of_days = len(stat_df)  # a former "-1" adjustment is disabled here

In [None]:
# Per-row activity fractions, passed to the Markov model below; limited to the
# first num_of_days rows.
p = list(stat_df["fraction_of_active_nodes"].iloc[:num_of_days])
p_overlap = list(
    stat_df["fraction_of_users_in_2day_intersections"].iloc[:num_of_days]
)

# Correlations in real data

In [None]:
# Correlation columns of the real data; only the first num_of_days - 1 entries
# are kept.
data_spearman = list(stat_df["spearman"].iloc[:num_of_days - 1])
data_w_spearman = list(stat_df["w_spearman"].iloc[:num_of_days - 1])

# Popularity model

In [None]:
# Show the model dimensions. %-formatting keeps the "users days" output of the
# former print statement and is valid on both Python 2 and Python 3.
print('%s %s' % (num_of_users, num_of_days))

In [None]:
# Build the popularity model for the given number of users and days.
# NOTE(review): no random seed is set in this notebook; if PopularityModel
# draws random values the results will differ between runs — confirm.
model = pm.PopularityModel(num_of_users, num_of_days)

### I. popularity of users

In [None]:
# Distribution plot of model.U (user popularity, per the section heading).
ax = sns.distplot(model.U)

### II. daily variations

In [None]:
# Distribution plot of the first column of model.alpha (daily variations, per
# the section heading).
ax = sns.distplot(model.alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

In [None]:
# Distribution plot of the first row of model.X (centrality scores without the
# Markov model, per the section heading).
ax = sns.distplot(model.X[0,:])

### IV. Introducing Markov model without leaders

In [None]:
# Centrality scores from the Markov model, called without a lambda_ argument
# (the "without leaders" variant, per the section heading).
X_act = model.get_centrality_with_markov(p, p_overlap)

### V. Introducing Markov model with leaders

In [None]:
# Same call as above, but with the dataset-specific LAMBDA passed as lambda_
# (the "with leaders" variant, per the section heading).
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=LAMBDA)

# Export centrality scores (sorted daily toplists)

   * Originally all active nodes were exported to files
   * These nodes could have zero centrality values (e.g.: indegree, beta-measure)
   * For PageRank there was no zero value as there is the probability of teleportation

In [None]:
import os

def export_daily_scores(output_folder, M, n_days=None, n_users=None):
    """Write one score file per day, listing the users with a positive score.

    For every day index ``i`` the file
    ``<output_folder>/popularity_model_scores_<i>.txt`` is (re)written with one
    ``"<user index> <score>"`` line for each user ``j`` where ``M[i, j] > 0.0``.
    Users whose score is not positive are skipped.

    Parameters
    ----------
    output_folder : str
        Target directory; it is created when it does not exist yet.
    M : two-dimensional object indexed as ``M[day, user]``
        Score matrix to export.
    n_days, n_users : int, optional
        Number of days (rows) and users (columns) to export. When omitted, the
        notebook-level ``num_of_days`` / ``num_of_users`` are used, which is
        what the function always did before these parameters existed.
    """
    if n_days is None:
        n_days = num_of_days
    if n_users is None:
        n_users = num_of_users
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for i in range(n_days):
        # The context manager closes the file even if a write fails midway.
        with open(output_folder + '/popularity_model_scores_%i.txt' % i, 'w') as f:
            for j in range(n_users):
                if M[i,j] > 0.0:
                    f.write('%i %f\n' % (j,M[i,j]))
    # Parenthesized single-argument print: same output on Python 2, valid on Python 3.
    print('Daily scores were exported to files.')

In [None]:
# NOTE(review): absolute, machine-specific output location; adjust it before
# running the notebook elsewhere.
output_prefix = "/mnt/idms/fberes/NETWORK/andreas_article/nelly_model_scores/"
output_folder = output_prefix + '/%s/centrality_scores/' % dataset_id

# The bare name `X` is never defined in this notebook (NameError on a fresh
# kernel); the score matrix without the Markov model is stored as `model.X`.
export_daily_scores(output_folder, model.X)

# Written to the same folder under the same file names, so this call
# overwrites the files produced by the previous one.
export_daily_scores(output_folder, X_act)

In [None]:
# Export the scores of the model with leaders. The files are written to the
# same folder under the same names, so they replace any earlier export.
export_daily_scores(output_folder, X_act_leaders)

### sort by scores + (normalization was for learning to rank!!!)

   * For normalization we divide the given value by the sum of scores
   * This type of normalization was used for all datasets

In [None]:
import subprocess

In [None]:
# Run the external sorting script on the exported folder; check_call raises
# CalledProcessError if the script exits with a non-zero status.
# NOTE(review): the second argument "True" presumably switches on the
# normalization described above — confirm in sort_all_output.sh.
subprocess.check_call(['../scripts/sort_all_output.sh', output_folder, "True"])