In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys, os

sys.path.insert(0,'../../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm

# Notes

This notebook only supports popularity model score generation for PageRank. The optimal lambda parameters and daily correlations for indegree must be computed!

# Choose dataset

In [None]:
# Per-dataset lambda values, keyed by dataset id; the entry for the selected
# dataset is later passed to the Markov model as `lambda_`.
lambda_map = {
    'maidan': 0.1,
    '15o': 0.3,
    'oc': 0.3,
    'olympics': 0.1,
    'yo': 0.2,
}
# Call form with a single argument prints the same under Python 2 and Python 3.
print(lambda_map)

In [None]:
# Selected dataset and the centrality measure whose statistics are loaded.
dataset_id, measure_id = 'yo', "pagerank"

# Folder from which the per-dataset statistics CSV and the total vertex count
# file are read further below.
input_prefix = "/mnt/idms/fberes/NETWORK/andreas_article/results/corr_and_stats/"

In [None]:
# Select the lambda value configured for the chosen dataset.
# A direct dictionary lookup replaces the linear scan over the keys, and a
# missing dataset is detected explicitly rather than through a 0.0 sentinel,
# so a legitimately configured value of 0.0 is not misreported as "not found".
if dataset_id not in lambda_map:
    raise RuntimeError("Lambda were not found!!!")
LAMBDA = lambda_map[dataset_id]
# A single pre-formatted argument prints the same under Python 2 and Python 3.
print('%s %s' % (dataset_id, LAMBDA))

In [None]:
# The statistics file is named "<dataset>_<measure>.csv" under the input folder.
stat_file_name = "/%s_%s.csv" % (dataset_id, measure_id)
dataset_stat_file = input_prefix + stat_file_name

# The file is space separated.
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

In [None]:
# Preview the first rows of the loaded statistics table.
stat_df.head()

#### extract dataset sizes

In [None]:
# The number of rows in the statistics table is used as the number of days.
num_of_days = len(stat_df)

# The total vertex count is read as an integer from the first line of its file;
# a count of zero is treated as invalid.
num_of_users = 0
vertex_count_path = input_prefix + "/%s_total_vertex_count.txt" % (dataset_id)
with open(vertex_count_path) as count_file:
    first_line = count_file.readline()
    num_of_users = int(first_line)
if not num_of_users:
    raise RuntimeError('Invalid total vertex count!')

In [None]:
# Show the dataset sizes (users, then days). A single pre-formatted argument
# prints the same under Python 2 and Python 3.
print('%s %s' % (num_of_users, num_of_days))

In [None]:
# Per-day fractions from the statistics table, limited to the first
# num_of_days entries; both lists are later passed to the Markov model.
p, p_overlap = [
    list(stat_df[column])[:num_of_days]
    for column in ("prev_day_frac", "overlap_frac")
]

# Correlations in real data

In [None]:
# Correlation columns of the real data; only the first num_of_days - 1
# entries of each column are kept.
corr_count = num_of_days - 1
data_pearson, data_spearman = [
    list(stat_df[corr_name])[:corr_count]
    for corr_name in ("pearson", "spearman")
]

# Popularity model

In [None]:
# Build the popularity model for the loaded number of users and days.
model = pm.PopularityModel(num_of_users, num_of_days)

### I. popularity of users

ax = sns.distplot(model.U)

### II. daily variations

ax = sns.distplot(model.alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

ax = sns.distplot(model.X[0,:])

### IV. Introducing Markov model without leaders

In [None]:
# Centrality scores from the Markov variant. No lambda_ is passed here, so the
# model's own default applies (the "without leaders" case of this section).
X_act = model.get_centrality_with_markov(p, p_overlap)

### V. Introducing Markov model with leaders

In [None]:
# Same Markov variant, now with the dataset-specific LAMBDA passed as lambda_
# (the "with leaders" case of this section).
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=LAMBDA)

# Export centrality scores (sorted daily toplists)

   * Originally all active nodes were exported to files
   * These nodes could have zero centrality values (e.g. indegree, beta-measure)
   * For PageRank there were no zero values, because the teleportation probability gives every node a positive score

In [None]:
import os

def export_daily_scores(output_folder, M, measure_type):
    """Write the positive scores of every day to one text file per day.

    For each day index ``i`` a file ``<measure_type>_scores_<i>.txt`` is
    created in ``output_folder``. Each line holds ``"<user index> <score>"``
    for a user whose score on that day is strictly positive; users with
    non-positive scores are skipped.

    The loop bounds come from the notebook-level ``num_of_days`` and
    ``num_of_users`` variables, not from the shape of ``M``.

    Parameters
    ----------
    output_folder : str
        Target directory; it is created if it does not exist yet.
    M : 2-D indexable (``M[i, j]``)
        Score of user ``j`` on day ``i``.
    measure_type : str
        Prefix used in the output file names.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for day in range(num_of_days):
        day_file = output_folder + '/%s_scores_%i.txt' % (measure_type, day)
        # The context manager closes the file even if a write fails part-way.
        with open(day_file, 'w') as out:
            for user in range(num_of_users):
                score = M[day, user]
                if score > 0.0:
                    out.write('%i %f\n' % (user, score))
    # Call form with a single argument prints the same under Python 2 and 3.
    print('Daily scores were exported to files.')

In [None]:
# Dataset-specific root for everything this notebook writes.
output_prefix = "/mnt/idms/fberes/NETWORK/andreas_article/popularity_model/%s" % (dataset_id,)
# The daily score files are written into this sub-folder.
output_folder = ''.join([output_prefix, '/centrality_scores/'])

In [None]:
# Export the scores of the Markov model with leaders, one file per day,
# using the measure id as the file-name prefix.
export_daily_scores(output_folder, X_act_leaders, measure_type=measure_id)

#### write the selected LAMBDA value to file

In [None]:
# Record the lambda used for this run in "<measure>_lambda.txt", next to the exported scores.
lambda_file_path = output_prefix + '/%s_lambda.txt' % measure_id
with open(lambda_file_path, 'w') as lambda_file:
    lambda_text = str(LAMBDA)
    lambda_file.write(lambda_text)

### sort by scores + (normalization was applied for learning to rank input!!!)

   * For normalization we divide the given value by the sum of scores
   * This type of normalization was used for all datasets

In [None]:
import subprocess

In [None]:
# Run the external sorting script on the exported score folder; check_call
# raises CalledProcessError if the script exits with a non-zero status.
# NOTE(review): the "True" argument presumably switches on the normalization
# described above — confirm against sort_all_output.sh.
subprocess.check_call(['../../scripts/sort_all_output.sh', output_folder, "True"])