In [None]:
# Stretch the notebook container to the full browser width.
# `display`/`HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import sys
from datawand.parametrization import ParamHelper
# Parameter helper built from the pipeline JSON file and the command-line arguments.
# NOTE(review): ParamHelper comes from the external `datawand` package; presumably
# values passed in sys.argv override the JSON defaults -- confirm against datawand.
ph = ParamHelper("../../pipelines/LearningToRankInput.json",sys.argv)

In [None]:
import pandas as pd
import numpy as np
import os

# Reload centrality scores

In [None]:
def load_score_map(score_folder,day, measure):
    """Read the centrality score file of a single day.

    The file ``<score_folder>/<measure>_scores_<day>.txt_s`` is parsed as a
    header-less, space-separated table.

    Parameters
    ----------
    score_folder : str
        Folder that contains the daily score files.
    day : int
        Day index; used in the file name and in the score column name.
    measure : str
        Centrality measure identifier; used as the file name prefix.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"id"`` and ``"score_<day>"``.

    Notes
    -----
    Per the original author's note the score files were pre-sorted in
    decreasing order; this function neither checks nor re-sorts them.
    """
    file_name = '%s_scores_%i.txt_s' % (measure, day)
    column_names = ["id", "score_%i" % day]
    return pd.read_csv(score_folder + '/' + file_name, sep=" ", names=column_names)

In [None]:
def load_score_maps(score_folder, days, measure):
    """Load the daily score maps and outer-join them on the node ID.

    Parameters
    ----------
    score_folder : str
        Folder that contains the daily score files.
    days : sequence of int
        Day indices to load. Must be non-empty and indexable, because
        ``days[0]`` is loaded unconditionally as the starting table.
    measure : str
        Centrality measure identifier, forwarded to ``load_score_map``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``"id"`` with one ``score_<day>`` column per day, in the
        order of ``days``. IDs missing on a given day get the score 0.0.
    """
    daily_scores = load_score_map(score_folder,days[0],measure)
    for i in range(1,len(days)):
        # Progress output: loop position and current number of joined rows.
        # Written as a single formatted string so it prints identically under
        # Python 2 and Python 3 (the bare `print a, b` statement is Python 2 only).
        print("%i %i" % (i, len(daily_scores)))
        current_scores = load_score_map(score_folder,days[i],measure)
        # Outer join keeps IDs that appear on only some of the days.
        daily_scores = pd.merge(daily_scores,current_scores,on="id",how='outer')
    # An ID absent on a day has no score there; treat it as zero.
    daily_scores = daily_scores.fillna(0.0)
    daily_scores = daily_scores.set_index("id")
    return daily_scores

In [None]:
# Read the run parameters from the parameter helper.
experiment_folder = ph.get("experiment_folder")
dataset_id = ph.get("dataset_id")
measure_prefix = ph.get("measure_id")
is_popularity_model = ph.get("is_popularity_model")
# Pick the folder holding the per-day centrality score files.
if is_popularity_model:
    input_folder = "%s/popularity_model/%s/centrality_scores/" % (experiment_folder, dataset_id)
else:
    # NOTE(review): hard-coded absolute path to one machine's data mount; consider
    # moving it into the pipeline parameters so the notebook runs elsewhere.
    input_folder = "/mnt/idms/fberes/network/DATA/temporal_centralities/centrality_output_for_datasets_normalized/%s/centrality_scores/" % dataset_id
# Day indices 0..20; an explicit list so it is the same type under Python 2 and 3.
days = list(range(21))

# Echo the effective configuration. Each print takes a single argument so the
# output is identical under Python 2 and Python 3.
print(is_popularity_model)
print("%s %s" % (dataset_id, measure_prefix))
print(input_folder)

In [None]:
# Load every day's score file for the selected measure and join them by node ID.
scores = load_score_maps(input_folder,days,measure=measure_prefix)

In [None]:
# Show the column labels of the joined score table.
scores.columns

In [None]:
# Preview the first rows of the joined score table.
scores.head()

# Export centrality scores (learning to rank)

### Parameters to set:

   * Number of features (4-8) - this determines the available number of queries
   * unseen true/false : if false, records whose features are all zeros (nodes not seen in the preceding days) are removed; if true, they are kept
   * top_k : only the k records with the largest labels are included for each query
   * the output has to be split into train and test sets (later, by hand)

In [None]:
def generate_learning_to_rank_input(score_df, num_of_features, k, unseen=True, append_ids=False):
    """Build learning-to-rank records from a table of per-day scores.

    Every column position ``i >= num_of_features`` defines one query: the
    column at position ``i`` is the label and the ``num_of_features`` columns
    immediately before it are the features. For each query only the ``k`` rows
    with the largest label are kept.

    Parameters
    ----------
    score_df : pandas.DataFrame
        One row per node (the index holds the node IDs) and one score column
        per day, in chronological column order.
    num_of_features : int
        Number of preceding columns used as features for each label column.
    k : int
        Number of top rows (by label, descending) kept per query.
    unseen : bool, optional
        If False, rows whose features are all zero are removed. If True
        (default) they are kept.
    append_ids : bool, optional
        If True, the node ID is appended as a last column. The IDs are stored
        in a float array, so they must be numeric.

    Returns
    -------
    numpy.ndarray
        2-D float array with the columns
        ``[query_id, feature_1, ..., feature_n, label]`` plus a trailing
        ``node_id`` column when ``append_ids`` is True. Query IDs start at 1.
        If there is no query, or every row was filtered out, the array has
        zero rows but still the full column count.
    """
    cols = score_df.columns
    # query id + features + label (+ node id)
    width = num_of_features + (3 if append_ids else 2)
    query_blocks = []
    for i in range(num_of_features, len(cols)):
        # Rank by the label column itself (addressed by position, so it is always
        # the same column the label is sliced from) and keep the k best rows.
        partial_df = score_df.sort_values(cols[i], ascending=False).head(k)
        # last column is the label, the other columns are the features
        features_and_labels = partial_df.values[:, i-num_of_features:i+1]
        node_ids = partial_df.index
        if not unseen:
            # Keep only rows with at least one non-zero feature. A boolean mask
            # keeps the array 2-D even when every row is dropped.
            seen_before = (features_and_labels[:, :-1] != 0).any(axis=1)
            features_and_labels = features_and_labels[seen_before]
            node_ids = node_ids[seen_before]
        partial_output = np.zeros((len(features_and_labels), width))
        partial_output[:, 0] = i-num_of_features+1
        partial_output[:, 1:num_of_features+2] = features_and_labels
        if append_ids:
            partial_output[:, -1] = node_ids
        query_blocks.append(partial_output)
    if not query_blocks:
        return np.zeros((0, width))
    # Concatenate once at the end instead of re-copying the result per query.
    return np.concatenate(query_blocks, axis=0)

In [None]:
# Choose the export folder by model type and create it if it is missing.
if is_popularity_model:
    l2r_output_folder = '%s/learning_to_rank_inputs/popularity_model/' % experiment_folder
else:
    l2r_output_folder = '%s/learning_to_rank_inputs/original_datasets/' % experiment_folder
if not os.path.exists(l2r_output_folder):
    os.makedirs(l2r_output_folder)
# Single-argument print: identical output under Python 2 and Python 3.
print(l2r_output_folder)

### Using 10 digits for floating point numbers in the output files

In [None]:
# Every (feature count, top-k, unseen) combination to export, in the same order
# as nested loops over feat -> k -> unseen.
param_grid = [(feat, k, unseen) for feat in [4] for k in [100,500] for unseen in [True,False]]
for feat, k, unseen in param_grid:
    all_output = generate_learning_to_rank_input(scores, feat, k, unseen=unseen)
    output_df = pd.DataFrame(all_output)
    # Column 0 holds the query ID; store it as an integer in the output file.
    output_df[0] = output_df[0].astype('int')
    file_name = "%s_%s_k%i_f%i_unseen%s.csv" % (dataset_id,measure_prefix,k,feat,unseen)
    # Semicolon-separated, no header or index, floats written with 10 decimals.
    output_df.to_csv(
        l2r_output_folder + file_name,
        sep=";",
        index=False,
        header=False,
        float_format='%.10f',
    )