In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys, os

sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm

# Choose dataset

In [None]:
dataset_id = '15o_pagerank'

In [None]:
# Statistics file of the selected dataset; it is read as a space-separated table.
dataset_stat_file = "../correlation_experiments/%s_results.csv" % dataset_id
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

#### extract number of users in data

In [None]:
# The header of the third column is expected to embed the total as "(total=<digits>)".
print stat_df.columns[2]

# Capture the digits after "total="; the match object is kept so the number can be
# read from group(1) later. Fail loudly if the header has a different layout.
total_num_matcher = re.match(r'.*\(total=(\d+?)\)', stat_df.columns[2], re.M|re.I)
if not total_num_matcher:
    raise RuntimeError("Column name does NOT match the regex!")

#### rename a column

In [None]:
# Give the third column a fixed name instead of its original header.
# NOTE: once this has run, the third header no longer contains "(total=...)", so
# re-running the regex extraction on stat_df.columns[2] raises RuntimeError.
cols = list(stat_df.columns)
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
stat_df.head()

In [None]:
print len(stat_df)

In [None]:
# Total parsed from the "(total=<digits>)" part of the original column header.
num_of_users = int(total_num_matcher.group(1))
# Every row of stat_df is counted as a day; the trailing "-1" variant is disabled.
num_of_days = len(stat_df)#-1

In [None]:
# Per-row series truncated to num_of_days entries; both lists are passed to
# model.get_centrality_with_markov further down.
p = list(stat_df["fraction_of_active_nodes"])[:num_of_days]
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"])[:num_of_days]

# Correlations in real data

In [None]:
# Correlation columns of the statistics table, keeping only the first num_of_days-1
# values (presumably one value per pair of consecutive days -- confirm against the file).
data_spearman = list(stat_df["spearman"])[:num_of_days-1]
data_w_spearman = list(stat_df["w_spearman"])[:num_of_days-1]

# Popularity model

In [None]:
print num_of_users, num_of_days

In [None]:
model = pm.PopularityModel(num_of_users, num_of_days)

### I. popularity of users

In [None]:
ax = sns.distplot(model.U)

### II. daily variations

In [None]:
ax = sns.distplot(model.alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

In [None]:
ax = sns.distplot(model.X[0,:])

### IV. Introducing Markov model without leaders

In [None]:
X_act = model.get_centrality_with_markov(p, p_overlap)

### V. Introducing Markov model with leaders

In [None]:
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.1)

# Export centrality scores (sorted daily toplists)

   * Originally all active nodes were exported to files
   * These nodes could have zero centrality values (e.g.: indegree, beta-measure)
   * For PageRank there was no zero value as there is the probability of teleportation

In [None]:
import os

def export_daily_scores(output_folder, M, n_days=None, n_users=None):
    """Write the positive entries of a score matrix to one text file per day.

    For every day index i, the file 'popularity_model_scores_<i>.txt' is created
    in `output_folder` (the folder is created when missing). Each line has the
    form '<j> <score>' for every column j with M[i, j] > 0.0; zero and negative
    entries are not written.

    Parameters
    ----------
    output_folder : str
        Destination directory.
    M : 2-D array indexable as M[i, j]
        Score matrix with days on the first axis and users on the second.
    n_days, n_users : int, optional
        Number of rows / columns to export. When omitted, the notebook-level
        globals `num_of_days` / `num_of_users` are used, which keeps existing
        two-argument calls working as before.
    """
    if n_days is None:
        n_days = num_of_days
    if n_users is None:
        n_users = num_of_users
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for i in range(n_days):
        file_path = os.path.join(output_folder, 'popularity_model_scores_%i.txt' % i)
        # The context manager closes the file even if a write fails midway.
        with open(file_path, 'w') as f:
            for j in range(n_users):
                if M[i,j] > 0.0:
                    f.write('%i %f\n' % (j,M[i,j]))
    print('Daily scores were exported to files.')

In [None]:
# Destination of the exported daily score files for the selected dataset.
output_prefix = "/mnt/idms/fberes/NETWORK/andreas_article/nelly_model_scores/"
output_folder = output_prefix + '/%s/centrality_scores/' % dataset_id

# NOTE: every export call writes the same file names into output_folder, so a later
# call overwrites the earlier one; what stays on disk is the output of the last call.

# Daily scores without the Markov model are stored on the model object as model.X.
export_daily_scores(output_folder, model.X)

# Scores from the Markov model without leaders.
export_daily_scores(output_folder, X_act)

In [None]:
# Scores from the Markov model with leaders (lambda_=0.1). This writes the same file
# names into the same output_folder, replacing whatever was exported there before.
export_daily_scores(output_folder, X_act_leaders)

### sort by scores + (normalization was for learning to rank!!!)

   * For normalization we divide the given value by the sum of scores
   * This type of normalization was used for all datasets

In [None]:
import subprocess

In [None]:
# Run the external sorting script on the export folder; check_call raises
# CalledProcessError if the script exits with a non-zero status.
# The meaning of the second argument "True" is defined by the script
# (presumably the normalization switch described above -- confirm in the script).
subprocess.check_call(['../scripts/sort_all_output.sh', output_folder, "True"])

In [None]:
# Deliberate stop: raising here halts a "Run All" so the cells below are not executed
# automatically. Continue manually once the sorted score files are in place
# (the loader below reads files ending in '.txt_s', presumably written by the script).
raise RuntimeWarning("You must wait for the sorting scripts to finish!!!")

# Export centrality scores (learning to rank)

### Reload Popularity Model scores

In [None]:
def load_score_map(score_folder,day, measure):
    """Read the score file of one day into a frame indexed by "id".

    The file '<measure>_scores_<day>.txt_s' holds space-separated (id, score)
    pairs without a header; the single value column is named 'score_<day>'.
    The centrality maps were pre-sorted in decreasing order!!!
    """
    file_name = '/%s_scores_%i.txt_s' % (measure,day)
    value_column = "score_%i" % day
    frame = pd.read_csv(score_folder + file_name, sep=" ", names=["id",value_column])
    return frame.set_index("id")

load_score_map(output_folder,0,measure="popularity_model")

In [None]:
def load_score_maps(score_folder, days, measure="popularity_model"):
    """Load the score map of every day in `days` and outer-join them by ID.

    IDs that are missing from a day get 0.0 for that day. Before each join the
    step number and the current number of rows are printed as a progress trace.
    """
    merged = load_score_map(score_folder,days[0],measure)
    for step, day in enumerate(days[1:], 1):
        print('%i %i' % (step, len(merged)))
        merged = merged.join(load_score_map(score_folder,day,measure), how='outer')
    return merged.fillna(0.0)

In [None]:
scores = load_score_maps(output_folder,range(num_of_days))

In [None]:
scores.columns

scores.head(10)

### Parameters to set:

   * Number of features (4-8) - this determines the available number of queries
   * Index of queries (in test it will restart...)
   * unseen true/false : when false, records whose features are all zero are excluded
   * top_k : only the records with the k largest labels are included for each query
   * Number of test queries: 1-3-5-7 (the remaining queries are the trains) - **Maybe I should send full query list - then Levente can split it into train and test** 

In [None]:
def generate_learning_to_rank_input(score_df, num_of_features, k, unseen=True):
    """Build learning-to-rank records from the joined daily score table.

    Every column position i >= num_of_features defines one query: the k rows
    with the largest 'score_<i>' value are selected and each of them becomes a
    record [query_id, feature_1, ..., feature_n, label]. The features are the
    num_of_features columns directly before position i, the label is column i
    and query_id = i - num_of_features + 1.

    Parameters
    ----------
    score_df : pandas.DataFrame
        Score table whose column at position i is named 'score_<i>'.
    num_of_features : int
        Number of preceding columns used as features.
    k : int
        Number of top rows (by label) kept per query.
    unseen : bool
        When False, records whose features are all zero are dropped.

    Returns
    -------
    numpy.ndarray
        Float array with num_of_features + 2 columns, queries stacked in order.
        A query whose records are all dropped contributes zero rows. If there
        is no label column at all, an empty 1-D array is returned.
    """
    num_of_columns = len(score_df.columns)
    # Newer pandas replaced DataFrame.sort with sort_values; use whichever exists.
    sort_rows = getattr(score_df, 'sort_values', None)
    if sort_rows is None:
        sort_rows = score_df.sort
    per_query_records = []
    for i in range(num_of_features, num_of_columns):
        top_rows = sort_rows('score_%i' % i, ascending=False).head(k).values
        features_and_labels = top_rows[:, i-num_of_features:i+1]
        if not unseen: # filter records where all features are zero
            # A boolean mask keeps the array 2-D even when every row is dropped.
            has_nonzero_feature = (features_and_labels[:, :-1] != 0).any(axis=1)
            features_and_labels = features_and_labels[has_nonzero_feature]
        partial_output = np.zeros((len(features_and_labels),num_of_features+2))
        partial_output[:,0] = i-num_of_features+1
        partial_output[:,1:] = features_and_labels
        per_query_records.append(partial_output)
    if not per_query_records:
        return np.array([])
    # Single concatenation instead of growing the array inside the loop.
    return np.concatenate(per_query_records, axis=0)

In [None]:
# Destination folder of the learning-to-rank input files. The folder is not created
# here (the makedirs call below is commented out), so it has to exist already.
l2r_output_folder = '/mnt/idms/fberes/NETWORK/andreas_article/learning_to_rank_inputs/'
#os.makedirs(l2r_output_folder)

In [None]:
# Write one file per combination of feature count, top-k size and unseen flag.
for feat in [4,8]:
    for k in [100,500]:
        for unseen in [True,False]:
            all_output = generate_learning_to_rank_input(scores, feat, k, unseen=unseen)
            output_df = pd.DataFrame(all_output)
            # First column holds the query id; store it as an integer.
            output_df[0] = output_df[0].astype('int')
            # ';'-separated, no header and no index column, floats with 10 decimals.
            output_df.to_csv(l2r_output_folder + "%s_scores_k%i_f%i_unseen%s.csv" % (dataset_id,k,feat,unseen), sep=";",index=False,header=False,float_format='%.10f')