## Tasks
- Define the output folder for the variables
- Create a folder for each fold


## Imports

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import random
from random import choice
from string import ascii_uppercase
from caserec.recommenders.rating_prediction.itemknn import ItemKNN
from caserec.evaluation.rating_prediction import RatingPredictionEvaluation
from caserec.utils.process_data import ReadFile
from sklearn.model_selection import train_test_split
import utils

## Parameters

In [2]:
# Thresholds handed to utils.filter_dataset as degree_user / degree_item.
degree_user_thr = 0.1
degree_item_thr = 0.1
# Filtering strategy name handed to utils.filter_dataset.
strategy = 'upper_degree_centrality'
# Neighbourhood size given to ItemKNN.
k_neighbors = 30
# Ranking metrics are evaluated at cut-offs 1..rank_length.
rank_length = 10
# Fold identifier: names the output folder and seeds train_test_split.
n_fold = 2

In [3]:
# Path of the JSON file describing this experiment's configuration.
config_file = './config_file.json'
# NOTE(review): neither `Params` nor `config_` is defined or imported anywhere
# in the visible notebook, so this line raises NameError on a fresh kernel.
# Presumably it should build a parameter object from `config_file` — TODO confirm
# the intended class/module and argument, or drop the line if `params` is unused.
params = Params(config_)
# Identifier derived from the config file by utils.create_hash; used below as a
# sub-folder name. NOTE(review): the name `hash` shadows the Python builtin.
hash = utils.create_hash(config_file)
hash

'4ce39'

In [4]:
# Folder the input CSV files are read from.
data_path = './Variables'

# Results for this run live under ./Experiments/<config hash>/fold_<n_fold>.
experiment_output_folder = os.path.join(
    '.',
    'Experiments',
    hash,
    'fold_' + str(n_fold),
)
utils.create_folder(experiment_output_folder)

In [26]:
# Load the ';'-separated interaction table (with degree columns) from data_path.
df_degrees = pd.read_csv(
    os.path.join(data_path, 'df_degrees.csv'),
    sep=';',
)
# Preview the last five rows.
df_degrees.tail(5)

Unnamed: 0,from,to,rating,timestamp,degree_user,degree_centrality_user,degree_item,degree_centrality_item
99995,user_863,item_1679,3,889289491,107,0.040777,1,0.000381
99996,user_863,item_1678,1,889289570,107,0.040777,1,0.000381
99997,user_863,item_1680,2,889289570,107,0.040777,1,0.000381
99998,user_896,item_1681,3,887160722,362,0.137957,1,0.000381
99999,user_916,item_1682,3,880845755,317,0.120808,1,0.000381


In [27]:
# Down-sample the interaction table to a random 30% of the distinct users and
# 30% of the distinct items, keeping only rows whose user AND item were drawn.
#
# Caveat: this cell rebinds `df_degrees`, so running it twice without reloading
# the CSV samples again from the already-reduced frame.
user_fraction, item_fraction = 0.3, 0.3
unique_users, unique_items = df_degrees['from'].unique(), df_degrees['to'].unique()

# Seeded, private generator so the draw is reproducible across kernel restarts.
# `n_fold` is reused as the seed to mirror `random_state=n_fold` in the
# train/test split; the module-level `random` state is deliberately left alone.
sampling_rng = random.Random(n_fold)
chosen_users = sampling_rng.sample(list(unique_users), int(user_fraction * len(unique_users)))
chosen_items = sampling_rng.sample(list(unique_items), int(item_fraction * len(unique_items)))

keep_row = df_degrees['from'].isin(chosen_users) & df_degrees['to'].isin(chosen_items)
df_degrees = df_degrees[keep_row].reset_index(drop=True)

In [28]:
# Bounded preview of the sub-sampled frame instead of dumping the whole table.
df_degrees.head(10)

Unnamed: 0,from,to,rating,timestamp,degree_user,degree_centrality_user,degree_item,degree_centrality_item
0,user_286,item_251,5,876521678,288,0.109756,46,0.017530
1,user_63,item_251,4,875747514,93,0.035442,46,0.017530
2,user_181,item_251,1,878962052,435,0.165777,46,0.017530
3,user_15,item_251,2,879455541,104,0.039634,46,0.017530
4,user_270,item_251,5,876954752,138,0.052591,46,0.017530
5,user_144,item_251,4,888103929,206,0.078506,46,0.017530
6,user_416,item_251,5,893213405,493,0.187881,46,0.017530
7,user_532,item_251,4,888636374,274,0.104421,46,0.017530
8,user_655,item_251,3,888984417,685,0.261052,46,0.017530
9,user_707,item_251,5,880059647,236,0.089939,46,0.017530


## Setting Train/Test sets

In [6]:
# Apply the degree-based filter (semantics are defined in utils.filter_dataset),
# keep only the interaction triple, and rename it to user / item / feedback_value.
df_filtered = (
    utils.filter_dataset(
        df_degrees,
        degree_user=degree_user_thr,
        degree_item=degree_item_thr,
        strategy=strategy,
    )[['from', 'to', 'rating']]
    .rename(columns={'from': 'user', 'to': 'item', 'rating': 'feedback_value'})
)
df_filtered.head(5)

Unnamed: 0,user,item,feedback_value
117,user_424,item_1193,5
152,user_524,item_1193,5
160,user_549,item_1193,5
213,user_752,item_1193,5
252,user_889,item_1193,1


In [7]:
# 70/30 random split of the filtered interactions; the fold number doubles as
# the random seed, so a given fold always yields the same split.
df_train, df_test = train_test_split(
    df_filtered,
    test_size=0.3,
    random_state=n_fold,
)
print("Train size: {} \nTest size: {}".format(len(df_train), len(df_test)))

Train size: 5924 
Test size: 2540


## Training Model

In [8]:
# Scratch file for the predictions: a 12-letter random uppercase tag keeps the
# name distinct between runs.
predictions_output_filepath = (
    './predictions_output_'
    + ''.join(choice(ascii_uppercase) for _ in range(12))
    + '.dat'
)
# Item-based KNN rating predictor over the train/test frames.
model = ItemKNN(
    train_file=df_train,
    test_file=df_test,
    k_neighbors=k_neighbors,
    output_file=predictions_output_filepath,
)

In [9]:
%%time
# Run the ItemKNN model; with verbose=True it prints the train/test statistics,
# training and prediction times, and MAE/RMSE (see the cell output).
# Presumably the predictions are also written to `predictions_output_filepath`,
# which the evaluation cell reads back — TODO confirm against the library.
model.compute(verbose=True)

[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 44 users and 222 items (5924 interactions) | sparsity:: 39.35%
test data:: 44 users and 222 items (2540 interactions) | sparsity:: 74.00%

training_time:: 0.072192 sec
prediction_time:: 0.292727 sec
Eval:: MAE: 0.701527 RMSE: 0.915924 
Wall time: 958 ms


## Evaluating Model

In [10]:
# Evaluator configured with item-recommendation (ranking) parameters:
# precision and recall at every cut-off from 1 to rank_length.
evaluator = RatingPredictionEvaluation(
    sep='\t',
    n_rank=np.arange(1, rank_length + 1),
    as_rank=True,
    metrics=['PREC', 'RECALL'],
)

In [11]:
# Read the predictions file back and score it against the model's test set.
reader = ReadFile(input_file=predictions_output_filepath)
predictions = reader.read()
eval_results = evaluator.evaluate(predictions['feedback'], model.test_set)

# Fold in the metrics the model computed itself; on a key clash the model's
# value overwrites the evaluator's.
eval_results.update(model.evaluation_results)

# The predictions file is only a scratch artefact, so delete it.
os.remove(predictions_output_filepath)

Eval:: PREC@1: 0.909091 PREC@2: 0.852273 PREC@3: 0.848485 PREC@4: 0.840909 PREC@5: 0.836364 PREC@6: 0.829545 PREC@7: 0.821429 PREC@8: 0.821023 PREC@9: 0.818182 PREC@10: 0.8 RECALL@1: 0.025572 RECALL@2: 0.049078 RECALL@3: 0.073104 RECALL@4: 0.095036 RECALL@5: 0.119335 RECALL@6: 0.139213 RECALL@7: 0.159229 RECALL@8: 0.182072 RECALL@9: 0.202203 RECALL@10: 0.217223 


In [12]:
# Display the merged metrics: the evaluator's results plus the model's own.
eval_results

{'MAE': 0.701527,
 'MAP': 0.877576,
 'MAP@1': 0.909091,
 'MAP@10': 0.877576,
 'MAP@2': 0.943182,
 'MAP@3': 0.931818,
 'MAP@4': 0.921717,
 'MAP@5': 0.909091,
 'MAP@6': 0.898939,
 'MAP@7': 0.892026,
 'MAP@8': 0.8832,
 'MAP@9': 0.878838,
 'MRR@1': 0.909091,
 'MRR@10': 0.950758,
 'MRR@2': 0.943182,
 'MRR@3': 0.950758,
 'MRR@4': 0.950758,
 'MRR@5': 0.950758,
 'MRR@6': 0.950758,
 'MRR@7': 0.950758,
 'MRR@8': 0.950758,
 'MRR@9': 0.950758,
 'NDCG@1': 0.909091,
 'NDCG@10': 0.923859,
 'NDCG@2': 0.977273,
 'NDCG@3': 0.95806,
 'NDCG@4': 0.946621,
 'NDCG@5': 0.93169,
 'NDCG@6': 0.928521,
 'NDCG@7': 0.926653,
 'NDCG@8': 0.924024,
 'NDCG@9': 0.923451,
 'PREC@1': 0.909091,
 'PREC@10': 0.8,
 'PREC@2': 0.852273,
 'PREC@3': 0.848485,
 'PREC@4': 0.840909,
 'PREC@5': 0.836364,
 'PREC@6': 0.829545,
 'PREC@7': 0.821429,
 'PREC@8': 0.821023,
 'PREC@9': 0.818182,
 'RECALL@1': 0.025572,
 'RECALL@10': 0.217223,
 'RECALL@2': 0.049078,
 'RECALL@3': 0.073104,
 'RECALL@4': 0.095036,
 'RECALL@5': 0.119335,
 'RECALL@6