## Tarefas
- Definir a pasta de saída das variáveis (`variables_output_folder`)
- Criar pastas para cada fold


## Imports

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
from random import choice
from string import ascii_uppercase
from caserec.recommenders.rating_prediction.itemknn import ItemKNN
from caserec.evaluation.rating_prediction import RatingPredictionEvaluation
from caserec.utils.process_data import ReadFile
from sklearn.model_selection import train_test_split

## Parameters

In [13]:
# Degree-centrality thresholds handed to filter_dataset for users and items.
degree_user_thr = .1
degree_item_thr = .1
# Filtering strategy name understood by filter_dataset.
strategy = 'upper_degree_centrality'
# Neighbourhood size passed to ItemKNN.
k_neighbors = 30
# Ranking metrics are evaluated at cut-offs 1..rank_length.
rank_length = 10
# NOTE(review): this value is passed as random_state to train_test_split below,
# not used as a number of folds — confirm the intended meaning/name.
n_fold = 2

In [3]:
# Folder holding the notebook's input files; list it to show what is available.
data_path = './Variables'
os.listdir(data_path)

['desktop.ini', 'df_degrees.csv', 'ML1M_edgelist.txt']

In [4]:
# Load the semicolon-separated table from data_path and preview its first rows.
df_degrees = pd.read_csv(os.path.join(data_path, 'df_degrees.csv'), sep=';')
df_degrees.head()

Unnamed: 0,from,to,rating,degree_user,degree_centrality_user,degree_item,degree_centrality_item
0,user_1,item_1193,5,53,0.005439,1725,0.177014
1,user_2,item_1193,5,129,0.013238,1725,0.177014
2,user_12,item_1193,4,23,0.00236,1725,0.177014
3,user_15,item_1193,4,201,0.020626,1725,0.177014
4,user_17,item_1193,5,211,0.021652,1725,0.177014


In [5]:
def filter_dataset(df_degrees, degree_user=None, degree_item=None, split_size=.3,
                   strategy='upper_degree_centrality', random_state=None):
    """Return the subset of ``df_degrees`` selected by ``strategy``.

    Parameters
    ----------
    df_degrees : pandas.DataFrame
        Interaction table. The centrality strategies read the
        ``degree_centrality_user`` and ``degree_centrality_item`` columns.
    degree_user, degree_item : float or None
        Thresholds applied to the user / item centrality columns.
        ``None`` disables the constraint on that axis.
    split_size : float
        Only used by ``'random'``: forwarded as ``test_size`` to
        ``train_test_split``; the returned frame is the remaining part.
    strategy : str
        ``'upper_degree_centrality'`` keeps rows strictly above the thresholds,
        ``'lower_degree_centrality'`` keeps rows at or below them, and
        ``'random'`` keeps a random sample of the rows.
    random_state : optional
        Only used by ``'random'``: forwarded to ``train_test_split`` so the
        sample can be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected rows. The centrality strategies keep the original index;
        ``'random'`` returns a frame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the three supported names.
    """
    if strategy == 'random':
        df_result, _ = train_test_split(df_degrees, test_size=split_size, random_state=random_state)
        return df_result.reset_index(drop=True)

    if strategy not in ('upper_degree_centrality', 'lower_degree_centrality'):
        # Previously an unknown name fell through and failed with UnboundLocalError.
        raise ValueError(
            "Unknown strategy {!r}; expected 'upper_degree_centrality', "
            "'lower_degree_centrality' or 'random'".format(strategy)
        )

    keep_upper = strategy == 'upper_degree_centrality'

    # Start from "keep everything" and narrow once per threshold that was given.
    # A positional boolean array avoids any index alignment between the masks.
    mask = np.ones(len(df_degrees), dtype=bool)
    thresholds = (
        ('degree_centrality_user', degree_user),
        ('degree_centrality_item', degree_item),
    )
    for column, threshold in thresholds:
        if threshold is None:
            continue
        values = df_degrees[column]
        condition = (values > threshold) if keep_upper else (values <= threshold)
        mask &= condition.values

    return df_degrees[mask]
# Preview the rows selected by the configured thresholds and strategy.
filter_dataset(df_degrees, degree_user=degree_user_thr, degree_item=degree_item_thr, strategy=strategy)

Unnamed: 0,from,to,rating,degree_user,degree_centrality_user,degree_item,degree_centrality_item
117,user_424,item_1193,5,1226,0.125808,1725,0.177014
152,user_524,item_1193,5,1016,0.104259,1725,0.177014
160,user_549,item_1193,5,1152,0.118214,1725,0.177014
213,user_752,item_1193,5,1024,0.105080,1725,0.177014
252,user_889,item_1193,1,1518,0.155772,1725,0.177014
...,...,...,...,...,...,...,...
815512,user_5367,item_1183,1,1169,0.119959,989,0.101488
815570,user_5643,item_1183,4,1047,0.107440,989,0.101488
815594,user_5795,item_1183,4,1277,0.131042,989,0.101488
815600,user_5831,item_1183,4,1220,0.125192,989,0.101488


## Setting Train/Test sets

In [6]:
# Keep only the (from, to, rating) triple of the filtered rows and rename it
# to the user / item / feedback_value column names used from here on.
df_filtered = (
    filter_dataset(
        df_degrees,
        degree_user=degree_user_thr,
        degree_item=degree_item_thr,
        strategy=strategy,
    )
    .loc[:, ['from', 'to', 'rating']]
    .rename(columns={'from': 'user', 'to': 'item', 'rating': 'feedback_value'})
)
df_filtered.head()

Unnamed: 0,user,item,feedback_value
117,user_424,item_1193,5
152,user_524,item_1193,5
160,user_549,item_1193,5
213,user_752,item_1193,5
252,user_889,item_1193,1


In [15]:
# Hold out 30% of the filtered interactions for testing; n_fold is used as the split's random_state.
df_train, df_test = train_test_split(
    df_filtered,
    test_size=0.3,
    random_state=n_fold,
)
print("Train size: {} \nTest size: {}".format(len(df_train), len(df_test)))

Train size: 5924 
Test size: 2540


## Training Model

In [8]:
# Random 12-letter suffix for the predictions file name.
random_suffix = ''.join(choice(ascii_uppercase) for _ in range(12))
predictions_output_filepath = './predictions_output_' + random_suffix + '.dat'

# ItemKNN receives the train/test frames directly and the path for its output file.
model = ItemKNN(
    train_file=df_train,
    test_file=df_test,
    k_neighbors=k_neighbors,
    output_file=predictions_output_filepath,
)

In [9]:
%%time
# Run the recommender; with verbose=True it prints dataset statistics, timings and MAE/RMSE.
model.compute(verbose=True)

[Case Recommender: Rating Prediction > ItemKNN Algorithm]

train data:: 44 users and 222 items (5924 interactions) | sparsity:: 39.35%
test data:: 44 users and 222 items (2540 interactions) | sparsity:: 74.00%

training_time:: 0.080812 sec
prediction_time:: 0.345049 sec
Eval:: MAE: 0.728431 RMSE: 0.943592 
Wall time: 996 ms


## Evaluating Model

In [10]:
# Evaluator that treats the predictions as rankings (as_rank=True) and is asked
# for PREC and RECALL at every cut-off from 1 to rank_length.
rank_cutoffs = np.arange(1, rank_length + 1)
evaluator = RatingPredictionEvaluation(
    sep='\t',
    n_rank=rank_cutoffs,
    as_rank=True,
    metrics=['PREC', 'RECALL'],
)

In [11]:
# Read back the predictions file and score it against the model's test set.
reader = ReadFile(input_file=predictions_output_filepath)
predictions = reader.read()
eval_results = evaluator.evaluate(predictions['feedback'], model.test_set)

# Fold the metrics stored on the model into the same dictionary.
eval_results.update(model.evaluation_results)

# The predictions file is not used after this point.
os.remove(predictions_output_filepath)

Eval:: PREC@1: 0.772727 PREC@2: 0.738636 PREC@3: 0.772727 PREC@4: 0.789773 PREC@5: 0.804545 PREC@6: 0.82197 PREC@7: 0.811688 PREC@8: 0.798295 PREC@9: 0.792929 PREC@10: 0.795455 RECALL@1: 0.021974 RECALL@2: 0.03981 RECALL@3: 0.063672 RECALL@4: 0.086929 RECALL@5: 0.112203 RECALL@6: 0.138491 RECALL@7: 0.159107 RECALL@8: 0.17704 RECALL@9: 0.1956 RECALL@10: 0.21785 


In [12]:
eval_results

{'MAE': 0.728431,
 'MAP': 0.835813,
 'MAP@1': 0.772727,
 'MAP@10': 0.835813,
 'MAP@2': 0.829545,
 'MAP@3': 0.852273,
 'MAP@4': 0.840278,
 'MAP@5': 0.840309,
 'MAP@6': 0.837247,
 'MAP@7': 0.838994,
 'MAP@8': 0.839062,
 'MAP@9': 0.839822,
 'MRR@1': 0.772727,
 'MRR@10': 0.86553,
 'MRR@2': 0.829545,
 'MRR@3': 0.859848,
 'MRR@4': 0.86553,
 'MRR@5': 0.86553,
 'MRR@6': 0.86553,
 'MRR@7': 0.86553,
 'MRR@8': 0.86553,
 'MRR@9': 0.86553,
 'NDCG@1': 0.772727,
 'NDCG@10': 0.885679,
 'NDCG@2': 0.886364,
 'NDCG@3': 0.910169,
 'NDCG@4': 0.889271,
 'NDCG@5': 0.885087,
 'NDCG@6': 0.881006,
 'NDCG@7': 0.882832,
 'NDCG@8': 0.883915,
 'NDCG@9': 0.885569,
 'PREC@1': 0.772727,
 'PREC@10': 0.795455,
 'PREC@2': 0.738636,
 'PREC@3': 0.772727,
 'PREC@4': 0.789773,
 'PREC@5': 0.804545,
 'PREC@6': 0.82197,
 'PREC@7': 0.811688,
 'PREC@8': 0.798295,
 'PREC@9': 0.792929,
 'RECALL@1': 0.021974,
 'RECALL@10': 0.21785,
 'RECALL@2': 0.03981,
 'RECALL@3': 0.063672,
 'RECALL@4': 0.086929,
 'RECALL@5': 0.112203,
 'RECALL@6'

In [None]:
# Persist the evaluation metrics to disk.
# NOTE(review): neither `joblib` nor `variables_output_folder` is defined in the
# cells shown in this notebook — confirm both are set before running this cell.
evaluation_results_path = os.path.join(variables_output_folder, 'evaluation_results.joblib')
joblib.dump(eval_results, evaluation_results_path)