In [17]:
import pandas as pd
import numpy as np
from collections import Counter
from data import PopularitySplitter, preprocess_events
from evaluation import HitRate_NDCG_MRR_from_CSV

DATA_DIR = 'data' 
STUDY_DIR = DATA_DIR + '/study'

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# use study for less data, for testing
events = pd.read_csv(f'{DATA_DIR}/events.csv.gz')
topics = pd.read_csv(f'{DATA_DIR}/topics_translated.csv')

In [20]:
from evaluation import *
from tqdm import tqdm

class Popularity:
    """
        @args: 
            k: recommend the topic for users based on the most popular last k interactions
            data: list of tuple (user, toic, timestamp)
            topk: top k elements that are recommended to calculate hitrate, ndcg, mrr
    """
    def __init__(self, data, k=100):
        self.k = k
        self.data = data
        self.pred = []

    def test_step(self, sample):
        """
            @args:
            sample: tuple(user, topic, timestamp)
        """
        target_timestamps = [item[2] for item in sample]
       
        for target_timestamp in tqdm(target_timestamps):
            nearest_df = sorted(list(filter(lambda x: x[2] < target_timestamp, self.data)))[:self.k]
            # Count the occurrences of each element
            topics = [item[1] for item in nearest_df]
            counter = Counter(topics)

            # Sort unique elements based on their count in descending order
            top_topics = sorted(counter.keys(), key=lambda x: counter[x], reverse=True)
            self.pred.append(top_topics)
        return self.pred
    
    def accuracy(self, pred, test_dataset, n=10):
        hit_list, ndcg_list, mrr_list = [], [], []
        for rcm_topics, item in tqdm(zip(pred, test_dataset)):
            topic = item[1]
            # Calculate HR
            hit_list.append(getHitRatio(rcm_topics, topic))
        
            # Calculate NDCG
            ndcg_list.append(getNDCG(rcm_topics, topic))

            # Calculate MRR
            mrr_list.append(getMRR(rcm_topics, topic))
        return {f'HitRate@{n}': np.array(hit_list).mean(), f'NDCG@{n}': np.array(ndcg_list).mean(), f'MRR@{n}': np.array(mrr_list).mean()}


In [29]:
def evaluate_pipeline(df, topics, math=True, german=True):
    df_preprocessed = preprocess_events(df, topics, math, german)
    data_splitter = PopularitySplitter(df_preprocessed, test_user_frac=0.05)
    train_dataset = data_splitter.get_train_dataset()
    test_dataset = data_splitter.get_test_dataset()
    val_dataset = data_splitter.get_val_dataset()
    
    model = Popularity(train_dataset)
    pred = model.test_step(test_dataset)
    model.accuracy(pred, test_dataset)
    result = pd.DataFrame({'predict': pred, 'test_data': test_dataset})
    result.to_csv('popularity-baseline_pred.csv')

    df = pd.DataFrame({'user_id':[], 'topic_id':[],'was_interaction':[],'predict_proba':[]})

    for predict, test_data in zip(result['predict'], result['test_data']):
        user = int(test_data[0])
        topic = int(test_data[1])
        
        user_prediction = pd.DataFrame({'user_id': [user]*len(predict), 'topic_id': list(map(int, predict)), 'was_interaction':[0]*len(predict), 'predict_proba': np.arange(len(predict), 0, -1)})
        
        if topic in user_prediction['topic_id']:
            user_prediction[user_prediction['topic_id'] == test_data[1]]['was_interaction'] = 1
        else:
            new_element={'user_id':test_data[0], 'topic_id':test_data[1],'was_interaction':1,'predict_proba':0}
            df.loc[df.index.max() + 1] = new_element
                
        df = pd.concat([df, user_prediction], axis=0)    
    
    heading = math*'math-'+german*'german-'
    df.to_csv(f'recommender_outputs/{heading}popularity_probas.csv')

    math_ids = list(set(topics[topics['math'] == 1]['id']))
    german_ids = list(set(topics[topics['math'] == 0]['id']))
    return HitRate_NDCG_MRR_from_CSV(f'recommender_outputs/{heading}popularity_probas.csv', n=10, math_ids=math_ids, german_ids=german_ids)


In [32]:
evaluate_pipeline(events, topics)

100%|██████████| 545/545 [01:22<00:00,  6.62it/s]
545it [00:00, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_prediction[user_prediction['topic_id'] == test_data[1]]['was_interaction'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_prediction[user_prediction['topic_id'] == test_data[1]]['was_interaction'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

{'math': {'HitRate@10': 0.31016042780748665,
  'NDCG@10': 0.14959456991651984,
  'MRR@10': 0.10262923351158647},
 'german': {'HitRate@10': 0.34441087613293053,
  'NDCG@10': 0.16095034953959275,
  'MRR@10': 0.10540809475854793}}

In [33]:
evaluate_pipeline(events, topics, math=True, german=False)

100%|██████████| 306/306 [00:13<00:00, 22.76it/s]
306it [00:00, 38386.63it/s]
  return {f'HitRate@{n}': np.array(hit_list).mean(), f'NDCG@{n}': np.array(ndcg_list).mean(), f'MRR@{n}': np.array(mrr_list).mean()}
  ret = ret.dtype.type(ret / rcount)


{'math': {'HitRate@10': 0.3660130718954248,
  'NDCG@10': 0.19445593368011846,
  'MRR@10': 0.14288696960265587},
 'german': {'HitRate@10': nan, 'NDCG@10': nan, 'MRR@10': nan}}

In [34]:
evaluate_pipeline(events, topics, math=False, german=True)

100%|██████████| 429/429 [00:43<00:00,  9.85it/s]
429it [00:00, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_prediction[user_prediction['topic_id'] == test_data[1]]['was_interaction'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_prediction[user_prediction['topic_id'] == test_data[1]]['was_interaction'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

{'math': {'HitRate@10': nan, 'NDCG@10': nan, 'MRR@10': nan}, 'german': {'HitRate@10': 0.45202020202020204, 'NDCG@10': 0.18446039823564067, 'MRR@10': 0.10544332210998879}}
