In [1]:
import os
import json
import pandas as pd
import numpy as np
import tqdm
import scipy.sparse as sp

from implicit.nearest_neighbours import TFIDFRecommender
from pprint import pprint

In [4]:
!pip install Cython

Collecting Cython
  Downloading https://files.pythonhosted.org/packages/b2/20/46a78072ecd4fda072c3791a257b03af99b64673671663067d18bc4935ec/Cython-0.29.7-cp36-cp36m-manylinux1_x86_64.whl (2.1MB)
    100% |████████████████████████████████| 2.1MB 759kB/s ta 0:00:01
Installing collected packages: Cython
Successfully installed Cython-0.29.7


In [3]:
# Enable the %%cython cell magic used by the metric cell below.
%load_ext cython

In [4]:
%%cython
# Mean average precision at cutoff k over all keys of data_true.
#
# data_true      -- dict: key -> set of relevant items.
# data_predicted -- dict: key -> ordered list of predicted items; a key that is
#                   missing here is treated as an empty prediction list.
# k              -- only the first k predictions of each list are scored.
#
# Returns the sum of per-key average precisions divided by len(data_true).
# Keys with no relevant items or no predictions contribute 0 to the sum but
# still count in the denominator.
# Raises ValueError when data_true is empty.
def average_precision(
        dict data_true,
        dict data_predicted,
        const unsigned long int k
) -> float:
    cdef:
        # C-typed counters so the inner loop runs without Python int objects
        unsigned long int n_items_predicted
        unsigned long int n_items_true
        unsigned long int n_correct_items
        unsigned long int item_idx

        double average_precision_sum
        double precision

        set items_true
        list items_predicted

    if not data_true:
        raise ValueError('data_true is empty')

    average_precision_sum = 0.0

    for key, items_true in data_true.items():
        items_predicted = data_predicted.get(key, [])

        n_items_true = len(items_true)
        # never look past the k-th prediction
        n_items_predicted = min(len(items_predicted), k)

        if n_items_true == 0 or n_items_predicted == 0:
            continue

        n_correct_items = 0
        precision = 0.0

        for item_idx in range(n_items_predicted):
            if items_predicted[item_idx] in items_true:
                n_correct_items += 1
                # precision at this rank, accumulated only at ranks that hit
                precision += <double>n_correct_items / <double>(item_idx + 1)

        # normalise by the best achievable number of hits within the cutoff
        average_precision_sum += <double>precision / <double>min(n_items_true, k)

    return average_precision_sum / <double>len(data_true)

def metric(true_data, predicted_data, k=20):
    """Score predictions with average_precision at cutoff k.

    Every collection of true items is converted to a set before scoring;
    predicted_data is passed through unchanged.
    """
    relevant_by_key = {}
    for key, items in true_data.items():
        relevant_by_key[key] = set(items)

    return average_precision(relevant_by_key, predicted_data, k=k)

In [5]:
# Directory holding the input files read below
# (catalogue.json, transactions.csv, test_users.json).
DATA_PATH = './data/'

In [6]:
def watch2(row):
    """Return a binary label (1 or 0) for one transaction row.

    Rows whose ``consumption_mode`` is not ``'S'`` are always labelled 1.
    For ``'S'`` rows the label is 1 only when ``watched_time`` divided by 20
    (``type == 'series'``) or by 30 (any other type) exceeds ``duration``;
    otherwise it is 0.
    """
    if row['consumption_mode'] != 'S':
        return 1

    duration = row['duration']
    # series use the smaller divisor, i.e. need less watched_time to count
    divisor = 20 if row['type'] == 'series' else 30

    return 1 if row['watched_time'] / divisor > duration else 0

In [7]:
# Read the catalogue as UTF-8 explicitly so parsing does not depend on the
# platform's locale-default encoding.
with open(os.path.join(DATA_PATH, 'catalogue.json'), 'r', encoding='utf-8') as f:
    catalogue = json.load(f)

# JSON object keys are always strings; convert them to int so the catalogue
# can be indexed directly with element_uid values.
catalogue = {int(k): v for k, v in catalogue.items()}

In [8]:
%%time
transactions = pd.read_csv(
    os.path.join(DATA_PATH, 'transactions.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'consumption_mode': 'category',
        'ts': np.float64,
        'watched_time': np.uint64,
        'device_type': np.uint8,
        'device_manufacturer': np.uint8
    }
)

CPU times: user 6.41 s, sys: 273 ms, total: 6.68 s
Wall time: 6.68 s


In [14]:
# test_users: set of the user ids listed under the 'users' key of test_users.json.
test_users_path = os.path.join(DATA_PATH, 'test_users.json')
with open(test_users_path, 'r') as f:
    test_users_payload = json.load(f)
test_users = set(test_users_payload['users'])

In [9]:
# Copy per-element catalogue fields onto every transaction row.
for catalogue_field in ('duration', 'type'):
    transactions[catalogue_field] = transactions['element_uid'].apply(
        lambda uid, field=catalogue_field: catalogue[uid][field]
    )

In [10]:
%%time
transactions['my_target'] = transactions[['consumption_mode', 'watched_time',  'duration', 'type']].apply(watch2, axis=1)

CPU times: user 5min 39s, sys: 864 ms, total: 5min 40s
Wall time: 5min 41s


In [16]:
from collections import defaultdict

# For every test user, collect the set of elements that appear in that
# user's transactions.
filtered_elements = defaultdict(set)

for uid, element in tqdm.tqdm(transactions[['user_uid', 'element_uid']].values):
    if uid in test_users:
        filtered_elements[uid].add(element)

100%|██████████| 9643012/9643012 [00:18<00:00, 524192.20it/s]


In [17]:
def to_cat(uid):
    """Map each category of a categorical Series to its integer position.

    The position is the index of the value in ``uid.cat.categories``.
    """
    return {category: code for code, category in enumerate(uid.cat.categories)}

In [20]:
def tfd(rat, col, n_neighbours=19, n_recs=20):
    """Fit a TF-IDF nearest-neighbour recommender and recommend for the test users.

    Parameters
    ----------
    rat : pandas.DataFrame
        Interactions with ``user_uid``, ``element_uid`` and ``col`` columns.
        The frame is left unmodified.
    col : str
        Column whose values (cast to float32, plus 1) fill the sparse matrix.
    n_neighbours : int, default 19
        ``K`` passed to ``TFIDFRecommender``.
    n_recs : int, default 20
        Number of recommendations requested per user.

    Returns
    -------
    dict
        ``user_uid -> list of element_uid`` (plain ``int``) for every user in
        the module-level ``test_users`` that also occurs in ``rat``.

    Notes
    -----
    Reads the module-level ``test_users`` and ``filtered_elements``.
    """
    # Work on local categorical Series so the caller's frame is not mutated.
    user_cats = rat['user_uid'].astype('category')
    element_cats = rat['element_uid'].astype('category')

    # Rows are elements, columns are users.
    # NOTE(review): the +1 presumably keeps zero-valued targets as explicit
    # non-zero entries -- confirm this is the intended weighting.
    fmatrix = sp.coo_matrix(
        (
            rat[col].astype(np.float32) + 1,
            (
                element_cats.cat.codes.copy(),
                user_cats.cat.codes.copy()
            )
        )
    ).tocsr()

    model_f = TFIDFRecommender(K=n_neighbours)
    model_f.fit(fmatrix)

    fmatrix_T = fmatrix.T.tocsr()

    user_uid_to_cat = to_cat(user_cats)
    element_uid_to_cat = to_cat(element_cats)
    # hoisted out of the loop: position -> original element_uid
    element_categories = element_cats.cat.categories

    result = {}

    for user_uid in tqdm.tqdm(test_users):
        # transform user_uid to model's internal user category
        try:
            user_cat = user_uid_to_cat[user_uid]
        except LookupError:
            # user has no rows in `rat`, so the model cannot score them
            continue

        # Translate this user's filtered elements to model categories; elements
        # unknown to the model are dropped instead of being passed as None.
        excluded = [
            element_uid_to_cat[x]
            for x in filtered_elements.get(user_uid, ())
            if x in element_uid_to_cat
        ]

        # perform inference
        recs = model_f.recommend(
            user_cat,
            fmatrix_T,
            N=n_recs,
            filter_already_liked_items=True,
            filter_items=excluded
        )

        # drop scores and transform model's internal element category to element_uid
        # for every prediction; also convert numpy integers to int so the result
        # can be json serialized later
        result[user_uid] = [int(element_categories[i]) for i, _ in recs]
    return result

In [21]:
# Fit on the full transactions table, using my_target as the interaction
# value, and produce recommendations for the test users.
result = tfd(transactions, 'my_target')

100%|██████████| 8296/8296 [00:02<00:00, 3846.46it/s]
100%|██████████| 50000/50000 [00:18<00:00, 2681.27it/s]


In [22]:
# Create the output directory if needed so the dump cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('submissions', exist_ok=True)
with open('submissions/watch2_tfidf19.json', 'w') as f:
    json.dump(result, f)