# MostPopular Template for Sparsity Evaluation

## Imports 

In [1]:
import os, sys, joblib, json, time, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, KFold
%matplotlib inline 
from caserec.recommenders.item_recommendation.most_popular import MostPopular
from random import choice
from string import ascii_uppercase
from IPython.display import clear_output
lib_path = './../Sources'
if (lib_path not in sys.path):
    sys.path.append(lib_path) #src directory
from lpsrec.messaging.print_functions import ProgressBar
from lpsrec.messaging.telegrambot import Bot
from lpsrec.utils import partition_dataframe, write_log
import lpsrec.database as db

## Parameters
The values below are defaults; a [Papermill](https://github.com/nteract/papermill) runner script overrides them when it executes this notebook.

In [9]:
dataset_tag = 'BOOKX'  # dataset tag: used in the SQL queries and in the output folder path
model_tag = 'MostPopular'  # name of this model's sub-folder under the dataset output folder
rank_length = 30  # longest ranking; metrics are evaluated at every k in 1..rank_length
random_state = 31415  # base seed for the random train/test splits
evaluation_metrics = ['PREC', 'RECALL', 'NDCG', 'MRR', 'MAP']  # metric names handed to model.evaluate
bot_alive = False  # when True, progress messages and figures are sent through the bot
partition = 1  # 1-based index of the scenario partition processed by this run (None = no partitioning)
nodes = 10  # forwarded to partition_dataframe(nodes=...) -- presumably the number of partitions; TODO confirm
n_folds = 5  # number of random train/test splits per scenario (1 is converted to None = single split)

In [10]:
# Derived configuration: output locations, plot styling and the progress bar.
analysis_tag = '_'.join(str(x) for x in [rank_length, nodes, partition])
# A single "fold" means one plain train/test split; downstream cells test for None.
n_folds = None if n_folds == 1 else n_folds
# Use a context manager so the style file handle is closed deterministically.
with open('./style_dict.json', 'r') as style_file:
    style_dict = json.load(style_file)
dataset_output_folder = os.path.join('.', 'Outputs', dataset_tag)
variables_output_folder = os.path.join(dataset_output_folder, model_tag, 'Variables', analysis_tag)
figures_output_folder = os.path.join(dataset_output_folder, model_tag, 'Figures', analysis_tag)
progbar = ProgressBar(bar_length=20, bar_fill='#', elapsed_time=True)
# exist_ok avoids the check-then-create race when several partitions start at once.
os.makedirs(variables_output_folder, exist_ok=True)
os.makedirs(figures_output_folder, exist_ok=True)

In [11]:
# Apply the shared font and tick styling from style_dict to every figure.
plt.rc('font', **style_dict['font'])
# plt.rc('axes.titlesize', fontsize=20)
tick_fontsize = style_dict['tick']['fontsize']
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=tick_fontsize)
# Warn as soon as more than 5 figures are open at the same time.
plt.rcParams['figure.max_open_warning'] = 5

In [12]:
# The bot is always constructed; messages are only sent when bot_alive is True.
bot = Bot(user_credentials='./JFGS.json')
# A plain `if` avoids the stray '' cell output the conditional expression produced.
if bot_alive:
    bot.send_message(text="{}\nHello, John. Initiating sparsity analysis for the {} dataset on the {} model [{}/{}]".format('-'*20, dataset_tag, model_tag, partition, nodes))

''

## Connecting to Database

In [13]:
# Connection settings can be overridden through environment variables so real
# credentials never have to be written into the notebook; the fallbacks keep
# the original local-development values.
username = os.environ.get('RECSYS_DB_USER', 'postgres')
password = os.environ.get('RECSYS_DB_PASSWORD', 'admin')
dbname = os.environ.get('RECSYS_DB_NAME', 'RecSys')
hostname = os.environ.get('RECSYS_DB_HOST', 'localhost:5432')
conn = db.get_database_connection(username, password, hostname, dbname)

## Loading Dataset

In [14]:
%%time
df_ratings = db.get_dataset_from_sparsity(data_path=None, conn=conn, dataset_tag=dataset_tag) 
df_ratings[['feedback_value']] = df_ratings[['feedback_value']].apply(pd.to_numeric)
df_ratings.drop(['user', 'item', 'timestamp'], axis=1, inplace=True)
df_ratings.columns = ['feedback_value', 'user', 'item']

Wall time: 14 s


In [15]:
# Quick look at the loaded ratings: (rows, columns) and the first five rows.
print(df_ratings.shape)
df_ratings.head(5)

(1149780, 3)


Unnamed: 0,feedback_value,user,item
0,0,3909265,534021
1,0,3909270,558404
2,5,3909271,502082
3,0,3909271,512382
4,0,3909271,574144


In [16]:
# Seeded 70/30 random split of the whole dataset.
df_train, df_test = train_test_split(
    df_ratings,
    test_size=0.3,
    random_state=random_state,
)
print ("Train size: {} \nTest size: {}".format(len(df_train), len(df_test)))

Train size: 804846 
Test size: 344934


## Training Model

In [None]:
# Baseline recommender on the whole-dataset split, built with as_binary=False.
model = MostPopular(
    train_file=df_train,
    test_file=df_test,
    rank_length=rank_length,
    as_binary=False,
    verbose=True,
)

In [None]:
%%time
# Run the recommender's compute step on the split given at construction
# time; %%time reports how long the step takes.
model.compute(verbose=True)

## Evaluating Model

In [None]:
# Evaluate every metric at each cut-off k = 1..rank_length and persist the
# results so that the plotting cell can reload them from disk.
arr_k = np.arange(1, rank_length + 1)
model.evaluate(metrics=evaluation_metrics, n_ranks=arr_k)
evaluation_results_path = os.path.join(variables_output_folder, 'evaluation_results.joblib')
joblib.dump(model.evaluation_results, evaluation_results_path)

### Plotting Evaluations

In [None]:
# Reload the metrics persisted by the evaluation cell and draw one curve per
# metric over the rank cut-offs in arr_k.
evaluation_results = joblib.load(os.path.join(variables_output_folder, 'evaluation_results.joblib'))
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,10))

for evaluation_name in [m + '@' for m in evaluation_metrics]:
    # Result keys look like 'PREC@7'; legend and colour keys use the generic 'PREC@k'.
    ax.plot(arr_k, [evaluation_results[evaluation_name + str(x)] for x in arr_k], label=evaluation_name + 'k', color=style_dict['evaluations'][evaluation_name + 'k'])

ax.set_xticks(np.arange(0, np.max(arr_k)+1, 2), minor=False)
ax.set_xlabel('Rank k', fontsize = style_dict['label']['fontsize'])
# Label the y axis so the saved figure can be read on its own.
ax.set_ylabel('Metric value', fontsize = style_dict['label']['fontsize'])
ax.tick_params(axis='both', which='major', labelsize=style_dict['tick']['fontsize'])
ax.legend()
ax.grid(True)
filepath = os.path.join(figures_output_folder, 'item_rec_metrics.png')
fig.savefig(filepath, bbox_inches = 'tight')
# A plain `if` instead of a conditional expression used only for its side effect.
if bot_alive:
    bot.send_message(filePath=filepath)

## Evaluating on Sparsity Scenarios

In [None]:
# Fetch the dataset record, the table of sparsity scenarios and the complete
# (user, item) interaction list for dataset_tag.
# NOTE(review): the queries interpolate dataset_tag straight into the SQL text;
# consider bound parameters if the tag can ever come from an untrusted source.
dataset_query = "select * from datasets.dataset where version='{}'".format(dataset_tag)
dataset_info = pd.read_sql(sql=dataset_query, con=conn)
df_sparsity_cenario = pd.read_sql(sql="select * from sparsity.cenario", con=conn)
sql_str = "select id_user, id_item from sparsity.get_dataset_from_sparsity('{}', 1.0, 1.0)".format(dataset_tag)
df_whole = pd.read_sql(sql=sql_str, con=conn)
n_registers = df_whole.shape[0]
print ("Dataset {} (id_dataset = {}) with {} registers".format(dataset_info['name'][0], dataset_info['id_dataset'][0], n_registers))
print ("Number of sparsity cenarios: ", df_sparsity_cenario.shape[0])
df_sparsity_cenario.head(5)

In [None]:
# Per-user rows from sparsity.get_dataset_uss; preview the five largest 'uss' values.
sql_str = """select * from sparsity.get_dataset_uss('{}')""".format(dataset_tag)
df_users_uss = pd.read_sql(sql=sql_str, con=conn)
df_users_uss.sort_values(by='uss', ascending=False).head(5)

In [None]:
# Per-item rows from sparsity.get_dataset_iss; preview the five largest 'iss' values.
sql_str = """select * from sparsity.get_dataset_iss('{}')""".format(dataset_tag)
df_items_iss = pd.read_sql(sql=sql_str, con=conn)
df_items_iss.sort_values(by='iss', ascending=False).head(5)

In [None]:
# Distinct USS / ISS thresholds found in the scenario table, in order of appearance.
uss_limits = pd.unique(df_sparsity_cenario['uss'])
iss_limits = pd.unique(df_sparsity_cenario['iss'])

In [None]:
# First three rows of the ratings frame, as a reminder of its layout.
df_ratings.iloc[:3]

In [None]:
# Read the pre-computed scenario table and, when a partition is requested,
# keep only this run's slice of it (partition is 1-based).
overall_sparsity_path = os.path.join('.', 'Outputs', dataset_tag, 'OS', 'Variables', 'df_overall_sparsity.tsv')
df_overall_sparsity = pd.read_csv(overall_sparsity_path, sep='\t', header=0)
if partition is not None:
    scenario_partitions = partition_dataframe(df=df_overall_sparsity, nodes=nodes, sort_by='os', ascending=False)
    df_overall_sparsity = scenario_partitions[partition-1].reset_index(drop=True)
print ("Input sparsity cenario dataframe shape: ", df_overall_sparsity.shape)
df_overall_sparsity.head(5)

In [None]:
def get_evaluation_results(model, df_train, df_test, rank_length, as_binary=True, iss_limit=None, uss_limit=None):
    """Train and evaluate a fresh MostPopular recommender on one train/test split.

    Parameters
    ----------
    model : object
        Ignored. Kept only so existing positional calls keep working; a new
        MostPopular instance is always built from the given split.
    df_train, df_test : pandas.DataFrame
        Passed to MostPopular as ``train_file`` and ``test_file``.
    rank_length : int
        Passed to MostPopular; metrics are evaluated at every k in 1..rank_length.
    as_binary : bool, default True
        Forwarded to MostPopular.
    iss_limit, uss_limit : optional
        Values stored in the 'iss' and 'uss' columns of the result. When left
        as None they are read from the notebook-level ``cenario`` row, which
        is what the function always did before these parameters existed.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per entry of the recommender's
        ``evaluation_results`` plus the 'iss' and 'uss' columns.
    """
    # Fall back to the loop variable of the sweep cell for backward compatibility.
    if iss_limit is None:
        iss_limit = cenario['iss_limit']
    if uss_limit is None:
        uss_limit = cenario['uss_limit']

    # Same cut-offs as the notebook-level arr_k, derived locally so the
    # function no longer depends on that global having been defined.
    ranks = np.arange(1, rank_length+1, 1)

    # Use a distinct local name instead of rebinding the ignored `model` argument.
    recommender = MostPopular(train_file=df_train, test_file=df_test, rank_length=rank_length, as_binary=as_binary)
    recommender.compute(verbose=False)
    recommender.evaluate(metrics=evaluation_metrics, n_ranks=ranks, verbose=False)

    evaluation_results = pd.DataFrame.from_dict(data=recommender.evaluation_results, orient='index').T
    evaluation_results['iss'] = [iss_limit]
    evaluation_results['uss'] = [uss_limit]
    return evaluation_results

In [None]:
%%time
write_log(filepath=os.path.join(variables_output_folder, 'log.txt'), 
          mode="w+", 
          text='[{}]\tUSS \tISS \tIndex\tPerc\tFolder\n'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())))    

arr_k = np.arange(1, rank_length+1, 1)
arr_df_eval_metadata = np.repeat(None, 1) if n_folds == None else np.repeat(None, n_folds)
first_value = [True] if n_folds == None else np.repeat(True, n_folds)
kf = None if n_folds == None else KFold(n_splits=n_folds, random_state=random_state) 
# for uss_index, uss_limit in enumerate(uss_limits):
#     for iss_limit in iss_limits:   
for index, cenario in df_overall_sparsity.iterrows():
    clear_output()
    progbar.update_progress(index/float(df_overall_sparsity.shape[0]))   

    start_time = time.time()
    df_ratings = db.get_dataset_from_sparsity(conn=conn, dataset_tag=dataset_tag,                                               
                                              iss=cenario['iss_limit'], 
                                              uss=cenario['uss_limit'])

    df_ratings.drop(['user', 'item', 'timestamp'], axis=1, inplace=True)    
    df_ratings[['feedback_value']] = df_ratings[['feedback_value']].apply(pd.to_numeric)
    df_ratings.columns = ['feedback_value', 'user', 'item']    
    
    if n_folds == None:
        df_train, df_test = train_test_split(df_ratings, test_size=0.3, random_state=random_state)
        evaluation_results = get_evaluation_results(model, df_train, df_test, rank_length, as_binary=True)            
        elapsed_time = time.time() - start_time
        evaluation_results['elapsed_time'] = [elapsed_time]
        
        if (first_value[0]):            
            first_value[0] = False
            arr_df_eval_metadata[0] = evaluation_results.copy()    
        else:
            arr_df_eval_metadata[0] = arr_df_eval_metadata[0].append(evaluation_results)    
        write_log(filepath=os.path.join(variables_output_folder, 'log.txt'), 
                  text='[{}]\t{:.02f}\t{:.02f}\t{}/{}\t{:.02f}%\t{}/{}\n'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()), 
                                                                                 cenario['uss_limit'], 
                                                                                 cenario['iss_limit'], 
                                                                                 index+1, 
                                                                                 df_overall_sparsity.shape[0], 
                                                                                 100*(index/float(df_overall_sparsity.shape[0])), 
                                                                                 1, 
                                                                                 1)
                 )
    else:
        
        index_folder = 1
        # Tentar fazer um shuffle e voltar com essa estrategia
#         for train_index, test_index in kf.split(df_ratings):              
#             df_train, df_test = df_ratings.iloc[train_index], df_ratings.iloc[test_index]            

        for index_folder in np.arange(1, n_folds+1, 1):
            df_train, df_test = train_test_split(df_ratings, test_size=0.3, random_state=random_state+10*index_folder-1)
            
            print ("Processing folder {}/{}...\n".format(index_folder, n_folds))
            evaluation_results = get_evaluation_results(model, df_train, df_test, rank_length, as_binary=True)            
            elapsed_time = time.time() - start_time
            evaluation_results['elapsed_time'] = [elapsed_time]

            if (first_value[index_folder-1]):            
                first_value[index_folder-1] = False
    #             df_eval_metadata = evaluation_results.copy()
                arr_df_eval_metadata[index_folder-1] = evaluation_results.copy()
            else:
                arr_df_eval_metadata[index_folder-1] = arr_df_eval_metadata[index_folder-1].append(evaluation_results)    
            write_log(filepath=os.path.join(variables_output_folder, 'log.txt'), 
                      text='[{}]\t{:.02f}\t{:.02f}\t{}/{}\t{:.02f}%\t{}/{}\n'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()), 
                                                                                     cenario['uss_limit'], 
                                                                                     cenario['iss_limit'], 
                                                                                     index+1, 
                                                                                     df_overall_sparsity.shape[0], 
                                                                                     100*(index/float(df_overall_sparsity.shape[0])), 
                                                                                     index_folder, 
                                                                                     n_folds)
                     )
            index_folder += 1

for index, df_eval_metadata in enumerate(arr_df_eval_metadata):
    folder_name = '1_1' if n_folds == None else str(n_folds) + '_' + str(index+1)
    df_eval_metadata.reset_index(drop = True, inplace=True)
    df_eval_metadata.to_csv(os.path.join(variables_output_folder, 'df_eval_metadata_fold{}.tsv'.format(folder_name)), sep = '\t', header = True, index = False)
text = "Finished creating sparsity datasets [partition = {}] for\t{} in\t{}".format(partition, dataset_tag, progbar.get_elapsed_time())
bot.send_message(text=text) if bot_alive else ''       
write_log(filepath=os.path.join(variables_output_folder, 'log.txt'), text=text)
print (text)

In [None]:
# Report where the result files were written and, if enabled, notify through the bot.
print ("Variables saved @ ", variables_output_folder)
# A plain `if` instead of a conditional expression used only for its side effect.
if bot_alive:
    bot.send_message(text="End of analysis for the {} dataset\n{}".format(dataset_tag, '-'*20))

_________________

In [None]:
# df_eval_metadata = pd.read_csv(os.path.join(variables_output_folder, 'df_eval_metadata.tsv'), sep = '\t', header = 0)

In [None]:
# df_eval_metadata.head()

In [None]:
# uss_limits = np.sort(np.array(df_eval_metadata['uss'].unique()))
# iss_limits = np.sort(np.array(df_eval_metadata['iss'].unique()))
# rank_lengths = np.arange(1, rank_length+1, 1) # Setting extra rank analysis

# for rank in rank_lengths:
#     arr_rank_metrics = [m + '@' + str(rank) for m in evaluation_metrics]
#     for column in arr_rank_metrics:
#         arr_metric = np.zeros([len(uss_limits), len(iss_limits)])
#         for uss_index, uss_limit in enumerate(uss_limits):
#             for iss_index, iss_limit in enumerate(iss_limits):            
#                 arr_metric[uss_index, iss_index] = df_eval_metadata[(df_eval_metadata['uss'] == uss_limit) & (df_eval_metadata['iss'] == iss_limit)][column].reset_index(drop = True)[0]

#         joblib.dump(arr_metric, os.path.join(variables_output_folder, 'arr_' + column.lower() + '_' + model_tag + '.joblib'))        

## Plotting Results from Sparsity Scenarios

In [None]:
# cmapping = "jet"
# tick_step = 5
# figs = dict()
# for metric in evaluation_metrics:
#     figs[metric+'@'] = list()

# for rank in rank_lengths:
#     arr_rank_metrics = [m + '@' + str(rank) for m in evaluation_metrics]
#     for column in arr_rank_metrics:
#         arr_prec = joblib.load(os.path.join(variables_output_folder, 'arr_' + column.lower() + '_' + model_tag + '.joblib'))        

#         fig, ax = plt.subplots(figsize=(style_dict['figure']['width'],style_dict['figure']['height']))    
#         cax = plt.imshow(arr_prec, cmap=cmapping)
#         plt.gca().invert_yaxis()
#         cbar = plt.colorbar(cax, ticks = [x/100.0 for x in np.arange(0,1000,10)], shrink = 0.83)

#         ax.set_xticklabels(uss_limits[0:len(uss_limits):tick_step])
#         ax.set_yticklabels(iss_limits[0:len(uss_limits):tick_step])
#         ax.set_xticks(np.arange(0, len(uss_limits), tick_step))    
#         ax.set_yticks(np.arange(0, len(iss_limits), tick_step))    
#         ax.set_xlabel('Last User Specific Sparsity', fontsize = style_dict['label']['fontsize'])
#         ax.set_ylabel('Last Item Specific Sparsity', fontsize = style_dict['label']['fontsize'])    
#         ax.tick_params(axis='both', which='major', labelsize=style_dict['tick']['fontsize'])
#         cbar.set_label(column.title(), labelpad=-50,  y=1.08, rotation=0, fontsize = style_dict['label']['fontsize'])
#         cbar.ax.tick_params(labelsize = style_dict['tick']['fontsize'])
#         plt.clim(0, 1)
#         plt.xticks(rotation = 'vertical')

#         filename = '2d-' + column + '.png'       
#         fig.savefig(os.path.join(figures_output_folder, filename), bbox_inches = 'tight')
        
#         fig.set_animated(True)
#         figs[re.split("\d", column)[0]].append(fig)        
#         if rank == rank_length: # Send only the target-analysis
#             bot.send_message(filePath=os.path.join(figures_output_folder, filename)) if bot_alive else ''

In [None]:


# for metric in evaluation_metrics:    
#     filepaths = [os.path.join(figures_output_folder, '2d-' + column + '.png' ) for column in [metric + '@' + str(rank) for rank in rank_lengths]]
#     output_filepath = os.path.join(figures_output_folder, '2d-' + metric + '@k' + '.gif' )
#     create_gif(filepaths, output_filepath, duration=0.2)

In [None]:
# from IPython import display
# # display.HTML('<img src="{}">'.format(output_filepath))

In [None]:
# show_metric = lambda metric, k: figs[metric + '@'][int(k)-1]
# interact(show_metric, k=widgets.IntSlider(min=1, max=rank_length, step=1, value=10), figs=figs, metric=evaluation_metrics)

__________