In [1]:
import os
import json
import pandas as pd
import numpy as np
import tqdm
import scipy.sparse as sp
from collections import defaultdict, Counter
from datetime import datetime
from copy import deepcopy

from pprint import pprint
tqdm.tqdm.pandas()

In [2]:
DATA_PATH = '/mnt/data_volume/site2vec/jupyter_home/temp/vladtitov/rekko_challenge'

def read_data(mode):
    with open(os.path.join(DATA_PATH, 'test_users.json'), 'r') as f:
        test_users = set(json.load(f)['users'])
    
    if mode == 'production':
        predictions = dict()
        for user_uid in test_users:
            predictions[user_uid] = []
            
    with open(os.path.join(DATA_PATH, 'catalogue.json'), 'r') as f:
        catalogue = json.load(f)    
    catalogue = {int(k): v for k, v in catalogue.items()}
    
    catalogue_to_hahn = []
    for element_uid in catalogue:
        row = {'element_uid': element_uid}
        for key in catalogue[element_uid]:
            if key in {'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5'}:
                row[key] = float(catalogue[element_uid][key])
            else:
                row[key] = catalogue[element_uid][key]
        catalogue_to_hahn.append(row)
    catalogue = pd.DataFrame(catalogue_to_hahn)
    
    transactions = pd.read_csv(
        os.path.join(DATA_PATH, 'transactions.csv'),
        dtype={
            'element_uid': np.uint16,
            'user_uid': np.uint32,
            'consumption_mode': 'category',
            'ts': np.float64,
            'watched_time': np.uint64,
            'device_type': np.uint8,
            'device_manufacturer': np.uint8
        }
    )
    transactions_with_catalogue = transactions.join(catalogue.set_index('element_uid'), how='inner', 
                                                    on='element_uid', sort='ts')
    
    validation_threshold = np.percentile(transactions_with_catalogue['ts'], 70)
    if mode == 'testing':
        transactions_with_catalogue_val = transactions_with_catalogue.loc[transactions_with_catalogue['ts'] > validation_threshold].copy()
        transactions_with_catalogue = transactions_with_catalogue.loc[transactions_with_catalogue['ts'] <= validation_threshold]
    transactions_with_catalogue['ts'] = transactions_with_catalogue['ts'].progress_apply(
        lambda x: datetime.utcfromtimestamp(int(x * 60)))
    
    ratings = pd.read_csv(
        os.path.join(DATA_PATH, 'ratings.csv'),
        dtype={
            'element_uid': np.uint16,
            'user_uid': np.uint32,
            'ts': np.float64,
            'rating': np.uint8
        }
    )
    if mode == 'testing':
        ratings = ratings.loc[ratings['ts'] <= validation_threshold]
    ratings['ts'] = ratings['ts'].progress_apply(lambda x: datetime.utcfromtimestamp(int(x * 60)))
    
    bookmarks = pd.read_csv(
        os.path.join(DATA_PATH, 'bookmarks.csv'),
        dtype={
            'element_uid': np.uint16,
            'user_uid': np.uint32,
            'ts': np.float64
        }
    )

    if mode == 'testing':
        bookmarks = bookmarks.loc[bookmarks['ts'] <= validation_threshold]
        
    user_consumed_movies = defaultdict(set)
    for user_uid, element_uid in tqdm.tqdm(transactions_with_catalogue.loc[:, ['user_uid', 'element_uid']].values):
        user_consumed_movies[user_uid].add(element_uid)
       
    val_answers = None
    if mode == 'testing':
        predictions = dict()
        val_answers = defaultdict(set)

        for user_uid, element_uid in tqdm.tqdm(transactions_with_catalogue_val.loc[:, ['user_uid', 'element_uid']].values):
            if user_uid in test_users: #and element_uid not in unavailable_movies:
                val_answers[user_uid].add(element_uid)
                predictions[user_uid] = []
                
    return test_users, predictions, transactions_with_catalogue, ratings, bookmarks, user_consumed_movies, val_answers


In [3]:
data = read_data('production')

100%|██████████| 9643012/9643012 [00:29<00:00, 324350.76it/s]
100%|██████████| 438790/438790 [00:01<00:00, 312645.07it/s]
100%|██████████| 9643012/9643012 [00:24<00:00, 390544.78it/s]


In [4]:
def evaluate_model(data, model):
    test_users, predictions, movie_score_by_user, top_movies, users_bookmarked_movies, user_consumed_movies, val_answers = data
    
    movie_score_by_user['user_uid'] = movie_score_by_user['user_uid'].astype('category')
    movie_score_by_user['element_uid'] = movie_score_by_user['element_uid'].astype('category')
    ratings_matrix = sp.coo_matrix(
        (movie_score_by_user['rating'].astype(np.float64) + 1,
            (
                movie_score_by_user['element_uid'].cat.codes.copy(),
                movie_score_by_user['user_uid'].cat.codes.copy()
            )
        )
    )

    ratings_matrix = ratings_matrix.tocsr()
    ratings_matrix_T = ratings_matrix.T.tocsr()
    
    model.fit(ratings_matrix)
    
    user_uid_to_cat = dict(zip(
        movie_score_by_user['user_uid'].cat.categories,
        range(len(movie_score_by_user['user_uid'].cat.categories))
    ))

    element_uid_to_cat = dict(zip(
        movie_score_by_user['element_uid'].cat.categories,
        range(len(movie_score_by_user['element_uid'].cat.categories))
    ))
    
    filtered_elements_cat = {k: [element_uid_to_cat.get(x, None) for x in v] for k, v in user_consumed_movies.items()}
    
    model_predictions = defaultdict(list)  
    val_answers = val_answers if val_answers is not None else test_users
    for user_uid in tqdm.tqdm(val_answers):
        # transform user_uid to model's internal user category
        try:
            user_cat = user_uid_to_cat[user_uid]
        except LookupError:
            continue
    
        # perform inference
        recs = model.recommend(
            user_cat,
            ratings_matrix_T,
            N=100,
            filter_already_liked_items=True,
            filter_items=filtered_elements_cat.get(user_uid, set())
        )
    
        # drop scores and transform model's internal elelemnt category to element_uid for every prediction
        # also convert np.uint64 to int so it could be json serialized later
        model_predictions[user_uid] = [int(movie_score_by_user['element_uid'].cat.categories[i]) for i, _ in recs]
        
    for user_uid in tqdm.tqdm(val_answers):
        for movie in model_predictions[user_uid]:
            if len(predictions[user_uid]) == 100:
                break
            if movie in users_bookmarked_movies[user_uid] and movie not in user_consumed_movies[user_uid]: #and movie not in unavailable_movies:
                predictions[user_uid].append(movie)
                user_consumed_movies[user_uid].add(movie) 
            
        for movie in users_bookmarked_movies[user_uid]:
            if len(predictions[user_uid]) == 100:
                break
            if movie not in user_consumed_movies[user_uid]: #and movie not in unavailable_movies:
                predictions[user_uid].append(movie)
                user_consumed_movies[user_uid].add(movie) 
    
        for movie in model_predictions[user_uid]:
            if len(predictions[user_uid]) == 100:
                break
            if movie not in user_consumed_movies[user_uid]: #and movie not in unavailable_movies:
                predictions[user_uid].append(movie)
                user_consumed_movies[user_uid].add(movie)
        
        for movie in top_movies:
            if len(predictions[user_uid]) == 100:
                break
            if movie not in user_consumed_movies[user_uid]: #and movie not in unavailable_movies:
                predictions[user_uid].append(movie)
                user_consumed_movies[user_uid].add(movie)
    return predictions

In [5]:
def evaluate_rekko(data, smoothing, models_list):
    test_users, predictions, transactions_with_catalogue, ratings, bookmarks, user_consumed_movies, val_answers = data
    
    max_date = max(transactions_with_catalogue['ts'])
    transactions_with_catalogue['rating'] = pd.Series(np.full(len(transactions_with_catalogue), 10.0), 
                                                      index=transactions_with_catalogue.index)   
    new_ratings = []
    for rating, duration, watched_time, date, movie_type, consumption_mode in tqdm.tqdm(
        transactions_with_catalogue.loc[:, ['rating', 'duration', 'watched_time', 
                                            'ts', 'type', 'consumption_mode']].values):
        if consumption_mode in ('P', 'R'):
            new_ratings.append(rating * smoothing**((max_date - date).days) + 0.00001)
        elif movie_type == 'movie' and (watched_time + 1) / (duration * 60 + 1) >= 0.5:
            new_ratings.append(rating * smoothing**((max_date - date).days) + 0.00001)
        elif movie_type != 'movie' and (watched_time + 1) / (duration * 60 + 1) >= 2:
            new_ratings.append(rating * smoothing**((max_date - date).days) + 0.00001)
        else:
            new_ratings.append(0.00001)
    transactions_with_catalogue['rating'] = pd.Series(new_ratings, index=transactions_with_catalogue.index)
    
    new_ratings = []
    for rating, date in tqdm.tqdm(ratings.loc[:, ['rating', 'ts']].values):
        new_ratings.append(rating * smoothing**((max_date - date).days))
    ratings['rating'] = pd.Series(new_ratings, index=ratings.index)
    
    movie_score_by_user = pd.concat([ratings.loc[:, ['element_uid', 'user_uid', 'rating']], 
                                    transactions_with_catalogue.loc[:, ['element_uid', 'user_uid', 'rating']]], 
                                    ignore_index=True).groupby(
        ['element_uid', 'user_uid'], as_index=False).sum().sort_values(by=['user_uid'])
                                     
    movies_scores = movie_score_by_user.loc[:, ['element_uid', 'rating']].groupby(
        'element_uid', as_index=False).sum().sort_values(by=['rating'], ascending=False)
    top_movies = list(movies_scores['element_uid'])[:100]
                                     
    users_bookmarked_movies = defaultdict(list)
    bookmarks_with_scores = bookmarks.join(movies_scores.set_index('element_uid'), how='inner', 
                                           on='element_uid', sort='element_uid')
    for _, row in tqdm.tqdm(bookmarks_with_scores.iterrows()):
        users_bookmarked_movies[row['user_uid']].append((row['element_uid'], row['rating']))

    for user_uid in tqdm.tqdm(users_bookmarked_movies):
        users_bookmarked_movies[user_uid] = list(map(lambda x: int(x[0]), sorted(users_bookmarked_movies[user_uid], 
                                                                                 key=lambda x: x[1], reverse=True)))
    
    print('Ready for models execution')
                                     
    result = []
    data = (test_users, predictions, movie_score_by_user, top_movies, users_bookmarked_movies, 
            user_consumed_movies, val_answers)
    model_num = 0
    for model in models_list:
        model_num += 1
        result.append(evaluate_model(deepcopy(data), model))
        print('Executed model number {}'.format(model_num))
    return result

In [None]:
from implicit.nearest_neighbours import BM25Recommender, ItemItemRecommender, CosineRecommender

result = []
#result += evaluate_rekko(deepcopy(data), 0.995, [BM25Recommender(K=200), ItemItemRecommender(K=100), 
#                                                 CosineRecommender(K=100)])
for smoothing in (0.999, 0.995, 0.99):
    result += evaluate_rekko(deepcopy(data), smoothing, 
                             [BM25Recommender(K=200), ItemItemRecommender(K=200), CosineRecommender(K=100)])
    print('Processed smoothing {}'.format(smoothing))

100%|██████████| 9643012/9643012 [04:02<00:00, 39702.96it/s]
100%|██████████| 438790/438790 [00:15<00:00, 28553.05it/s]
921945it [01:52, 8182.70it/s]
100%|██████████| 143833/143833 [00:01<00:00, 77985.22it/s]


Ready for models execution


100%|██████████| 8670/8670 [00:00<00:00, 14843.81it/s]
100%|██████████| 50000/50000 [00:45<00:00, 1088.18it/s]
100%|██████████| 50000/50000 [00:38<00:00, 1296.91it/s]


Executed model number 1


100%|██████████| 8670/8670 [00:00<00:00, 14631.97it/s]
100%|██████████| 50000/50000 [00:42<00:00, 1175.76it/s]
100%|██████████| 50000/50000 [00:38<00:00, 1300.88it/s]


Executed model number 2


In [7]:
print(len(result))

9


In [9]:
def mnap_metric(predictions, answers, pred_size):
    assert isinstance(predictions, dict) and isinstance(answers, dict), 'wrong type of input'
    assert len(predictions) == len(answers), 'predictions and answers should have the same size'
    
    metric = 0.0
    for user in predictions:
        assert len(predictions[user]) == pred_size, 'len of user predictions should be {}'.format(pred_size)
        already_answered = set()
        total_consumed = min(pred_size, len(answers[user]))
        pk_metric = 0.0
        apk_metric = 0.0
        for i in range(total_consumed):
            if predictions[user][i] not in already_answered and predictions[user][i] in answers[user]:
                pk_metric += 1
                apk_metric += pk_metric / (i + 1)
                already_answered.add(predictions[user][i])
                
        metric += apk_metric / total_consumed
    
    return metric / len(predictions)

In [39]:
for j, model in enumerate(['BM25Recommender', 'ItemItemRecommender', 'CosineRecommender']):
    index = j
    predictions = dict()
    for user_uid in result[index]:
        predictions[user_uid] = result[index][user_uid][:20]
    print('mnap_metric {} on smoothing {} and model {}'.format(mnap_metric(predictions, data[-1], 20), 
                                                                0.995, model))

mnap_metric 0.03294918833669094 on smoothing 0.995 and model BM25Recommender
mnap_metric 0.03451084029406736 on smoothing 0.995 and model ItemItemRecommender
mnap_metric 0.03364825053036425 on smoothing 0.995 and model CosineRecommender


In [14]:
predictions = dict()
for user_uid in result[0]:
    predictions[user_uid] = result[0][user_uid][:20]
len(predictions)
#mnap_metric(predictions, data[-1], 20)

50000

In [None]:
# 0.034629372973859145 - ItemItem smoothing 0.995

In [None]:
# 0.03117753361301951 - ItemItem smoothing 0.98

In [8]:
for i, smoothing in enumerate([1, 0.999, 0.99]):
    for j, model in enumerate(['BM25Recommender', 'ItemItemRecommender', 'CosineRecommender']):
        index = i * 3 + j
        predictions = dict()
        for user_uid in result[index]:
            predictions[user_uid] = result[index][user_uid][:20]
        print('mnap_metric {} on smoothing {} and model {}'.format(mnap_metric(predictions, data[-1], 20), 
                                                                   smoothing, model))

mnap_metric 0.024218662364930674 on smoothing 1 and model BM25Recommender
mnap_metric 0.02115261107624082 on smoothing 1 and model ItemItemRecommender
mnap_metric 0.024359310616094997 on smoothing 1 and model CosineRecommender
mnap_metric 0.026675494240166988 on smoothing 0.999 and model BM25Recommender
mnap_metric 0.026743781185005918 on smoothing 0.999 and model ItemItemRecommender
mnap_metric 0.027536437111490335 on smoothing 0.999 and model CosineRecommender
mnap_metric 0.028835989137372793 on smoothing 0.99 and model BM25Recommender
mnap_metric 0.03328374125714043 on smoothing 0.99 and model ItemItemRecommender
mnap_metric 0.030320677577801516 on smoothing 0.99 and model CosineRecommender


In [10]:
for i, smoothing in enumerate([1, 0.999, 0.99]):
    for j, model in enumerate(['BM25Recommender', 'ItemItemRecommender', 'CosineRecommender']):
        index = i * 3 + j
        predictions = dict()
        for user_uid in result[index]:
            predictions[user_uid] = result[index][user_uid][:20]
        print('mnap_metric {} on smoothing {} and model {}'.format(mnap_metric(predictions, data[-1], 20), 
                                                                   smoothing, model))

mnap_metric 0.02796821919267791 on smoothing 1 and model BM25Recommender
mnap_metric 0.026855798901948606 on smoothing 1 and model ItemItemRecommender
mnap_metric 0.02760982425865799 on smoothing 1 and model CosineRecommender
mnap_metric 0.03294918833669094 on smoothing 0.999 and model BM25Recommender
mnap_metric 0.03457980164216801 on smoothing 0.999 and model ItemItemRecommender
mnap_metric 0.033591242868644004 on smoothing 0.999 and model CosineRecommender
mnap_metric 0.031386265026930746 on smoothing 0.99 and model BM25Recommender
mnap_metric 0.033382716111329085 on smoothing 0.99 and model ItemItemRecommender
mnap_metric 0.030892794258313108 on smoothing 0.99 and model CosineRecommender


In [8]:
def mix_solutions(result, rates, max_score, movies_num_to_leave):
    scores = dict()
    for user_uid in result[0]:
        scores[user_uid] = Counter()
        
    for i, prediction in tqdm.tqdm(enumerate(result)):
        for user_uid in prediction:
            for pos, movie in enumerate(prediction[user_uid]):
                scores[user_uid][movie] += rates[i] * max_score * 0.99**pos
    
    final_predictions = dict()
    for user_uid in tqdm.tqdm(scores):
        # print(scores[user_uid].values())
        final_predictions[user_uid] = list(map(lambda x: x[0], 
                                              sorted(scores[user_uid].items(), 
                                                     key=lambda x: x[1], reverse=True)))[:movies_num_to_leave]
    return final_predictions

In [9]:
ratings = [
    1 * 4, 1 * 3, 1 * 4,
    6 * 3, 9 * 4, 6 * 4,
    5 * 3, 9 * 3, 4 * 3
]

In [10]:
predictions = mix_solutions(result, ratings, 100, 20)

9it [00:38,  4.22s/it]
100%|██████████| 50000/50000 [00:05<00:00, 9620.33it/s]


In [20]:
mnap_metric(predictions, data[-1], 20)

0.037033576680949795

In [None]:
#ratings = [
#    1 * 4, 4 * 4, 4 * 4,
#    3 * 3, 6 * 4, 9 * 4,
#    1 * 3, 6 * 4, 1 * 3
#] - best public

In [None]:
ratings = [
    1 * 4, 1 * 3, 1 * 4,
    6 * 3, 9 * 4, 6 * 4,
    5 * 3, 9 * 3, 4 * 3
]

In [36]:
mnap_metric(predictions, data[-1], 20)

0.03705185079081703

In [None]:
#ratings = [
#    0 * 3, 1 * 4, 1 * 4,
#    3 * 3, 9 * 4, 6 * 4,
#    3 * 3, 9 * 3, 4 * 3
#]

In [74]:
mnap_metric(predictions, data[-1], 20)

0.03586838670807896

### Записываем ответ

In [11]:
assert len(predictions) == 50000
with open('bookmarks_bm25_cosine_knearest_feated_hack_0999_0995_099_decay_best_val_mix.json', 'w') as f:
    json.dump(predictions, f)

### Неудачные попытки

In [5]:
def find_users_not_using_bookmarks(data):
    test_users, transactions_with_catalogue, bookmarks = data
    validation_threshold = np.percentile(transactions_with_catalogue['ts'], 75)
    
    transactions_with_catalogue_val = transactions_with_catalogue.loc[transactions_with_catalogue['ts'] > validation_threshold].copy()
    transactions_with_catalogue = transactions_with_catalogue.loc[transactions_with_catalogue['ts'] <= validation_threshold]
    bookmarks = bookmarks.loc[bookmarks['ts'] <= validation_threshold]
    
    users_bookmarked_movies = defaultdict(set)
    for _, row in tqdm.tqdm(bookmarks.iterrows()):
        users_bookmarked_movies[row['user_uid']].add(row['element_uid'])
        
    val_answers = defaultdict(set)
    for user_uid, element_uid in tqdm.tqdm(transactions_with_catalogue_val.loc[:, ['user_uid', 'element_uid']].values):
        if user_uid in test_users:
            val_answers[user_uid].add(element_uid)

    users_not_using_bookmarks = set()
    for user_uid in val_answers:
        if len(users_bookmarked_movies[user_uid]) > 0 and \
            len(users_bookmarked_movies[user_uid].intersection(val_answers[user_uid])) == 0:
            users_not_using_bookmarks.add(user_uid)
    return users_not_using_bookmarks