In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from hnmchallenge.data_reader import DataReader
from hnmchallenge.dataset import Dataset
from hnmchallenge.filtered_dataset import FilterdDataset
from hnmchallenge.models.top_pop import TopPop
from hnmchallenge.evaluation.python_evaluation import map_at_k, recall_at_k
# Star import: the only source of DEFAULT_USER_COL / DEFAULT_ITEM_COL used below.
from hnmchallenge.constant import *
from hnmchallenge.models.sgmc.sgmc import SGMC
from hnmchallenge.models.ease.ease import EASE
from hnmchallenge.models.itemknn.itemknn import ItemKNN
from hnmchallenge.submission_handler import SubmissionHandler

In [2]:
import logging

# Root logging at DEBUG so the INFO records emitted by the hnmchallenge
# modules are shown in the cell outputs; `logger` is this notebook's own logger.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [3]:
# Shared handles for the rest of the notebook: `dataset` supplies the
# train/val/test frames and the multiply-buy tables, `dr` the filtered full data.
dataset = FilterdDataset()
dr = DataReader()

In [9]:
# Filtered full data. The same call is repeated (rebinding `fd`) in the cell
# that loads `train`.
fd = dr.get_filtered_full_data()

In [14]:
# Number of rows in `fd` for each user, as a plain array (one entry per user).
size = fd.groupby(DEFAULT_USER_COL).size().values

In [16]:
# How many users have 12 or more rows in `fd`.
int((size >= 12).sum())

368885

In [18]:
# How many users have fewer than 8 rows in `fd`.
int((size < 8).sum())

634733

In [None]:
# Item ids whose "count" in the item multiply-buy table is above 100.
item_mb = dataset.get_filtered_item_multiply_buy()
item_mb_ids = item_mb.loc[item_mb["count"] > 100, DEFAULT_ITEM_COL]

In [None]:
# Display the selected item ids.
item_mb_ids

In [None]:
# Split users on the "diff" column of the user multiply-buy table:
#   diff <= CUTOFF -> normal_users, diff > CUTOFF -> users_top_pop.
users_mb = dataset.get_filtered_user_multiply_buy()
CUTOFF = 0.0
normal_users = users_mb.loc[users_mb["diff"] <= CUTOFF, DEFAULT_USER_COL]
users_top_pop = users_mb.loc[users_mb["diff"] > CUTOFF, DEFAULT_USER_COL]

In [None]:
# One of three alternative models bound to `recom` (see the ItemKNN and EASE
# cells); whichever of these cells ran last is the model used downstream.
recom = SGMC(dataset, k=128, time_weight=True)

In [4]:
# One of three alternative models bound to `recom` (see the SGMC and EASE
# cells). This is the only one of the three with a recorded execution count.
recom = ItemKNN(dataset, topk=1000, time_weight=True)

In [None]:
# One of three alternative models bound to `recom` (see the SGMC and ItemKNN
# cells). NOTE(review): on a top-to-bottom run this assignment is the last of
# the three, so it replaces the ItemKNN model.
recom = EASE(dataset, l2=1e-3, time_weight=True)

In [5]:
# `train` feeds the similarity computation and the per-user filters below.
train = dataset.get_train_df()
# NOTE(review): `train_sub` is not referenced by any other cell shown here.
train_sub = dataset.get_train_df_user_subset()
# Same call as the earlier `fd` cell; rebinds `fd`.
fd = dr.get_filtered_full_data()

In [6]:
# Distinct users of the validation and test splits.
val_df_users = dataset.get_val_df()[DEFAULT_USER_COL].unique()
test_df_users = dataset.get_test_df()[DEFAULT_USER_COL].unique()

# Keep the training rows of any user that appears in at least one of the splits.
train_user_ids = train[DEFAULT_USER_COL]
is_eval_user = train_user_ids.isin(val_df_users) | train_user_ids.isin(test_df_users)
filtered_train = train[is_eval_user]

In [8]:
# Number of distinct users in the validation split.
len(val_df_users)

37741

In [None]:
# Compute the model's similarity matrix from the full `train` frame; the
# recommendation call below is fed only the eval users' rows (`filtered_train`).
recom.compute_similarity_matrix(train)

In [8]:
# Recommendations for the val/test users; `recs` is scored by the recall and
# map_at_k cells that follow.
recs = recom.recommend_multicore(
    interactions=filtered_train,
    batch_size=10_000,
    num_cpus=20,
    remove_seen=False,
    white_list_mb_item=None,
    cutoff=200,
)

INFO:hnmchallenge.recommender_interface:[1;36mRecommending items MULTICORE[0m


  0%|          | 0/7 [00:00<?, ?it/s]

Predicting using time_weight importance...


INFO:hnmchallenge.utils.pandas_utils:[1;33munique customer_id: 9713[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;31mApplying time weight on user-item interactions[0m


Predicting using time_weight importance...


INFO:hnmchallenge.utils.pandas_utils:[1;33munique customer_id: 9713[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;31mApplying time weight on user-item interactions[0m


func:interactions_to_sparse_matrix
 took: 6.931366682052612 sec


INFO:hnmchallenge.recommender_interface:[1;36mSPARSE Item Similarity MUL...[0m


Predicting using time_weight importance...


INFO:hnmchallenge.utils.pandas_utils:[1;33munique customer_id: 9713[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;31mApplying time weight on user-item interactions[0m


func:interactions_to_sparse_matrix
 took: 5.737704038619995 sec


INFO:hnmchallenge.recommender_interface:[1;36mSPARSE Item Similarity MUL...[0m


Predicting using time_weight importance...


INFO:hnmchallenge.utils.pandas_utils:[1;33munique customer_id: 9713[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;31mApplying time weight on user-item interactions[0m


func:interactions_to_sparse_matrix
 took: 4.965228319168091 sec


INFO:hnmchallenge.recommender_interface:[1;36mSPARSE Item Similarity MUL...[0m


func:interactions_to_sparse_matrix
 took: 3.7783453464508057 sec


INFO:hnmchallenge.recommender_interface:[1;36mSPARSE Item Similarity MUL...[0m


Predicting using time_weight importance...


INFO:hnmchallenge.utils.pandas_utils:[1;33munique customer_id: 9713[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;31mApplying time weight on user-item interactions[0m


func:interactions_to_sparse_matrix
 took: 3.0527865886688232 sec


INFO:hnmchallenge.recommender_interface:[1;36mSPARSE Item Similarity MUL...[0m


Predicting using time_weight importance...


INFO:hnmchallenge.utils.pandas_utils:[1;33munique customer_id: 9713[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;31mApplying time weight on user-item interactions[0m


func:interactions_to_sparse_matrix
 took: 2.315826177597046 sec


INFO:hnmchallenge.recommender_interface:[1;36mSPARSE Item Similarity MUL...[0m


Predicting using time_weight importance...


INFO:hnmchallenge.utils.pandas_utils:[1;33munique customer_id: 9713[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;31mApplying time weight on user-item interactions[0m


func:interactions_to_sparse_matrix
 took: 1.3650367259979248 sec


INFO:hnmchallenge.recommender_interface:[1;36mSPARSE Item Similarity MUL...[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;36mSort_top_k:True[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;36mSort_top_k:True[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;36mSort_top_k:True[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;36mSort_top_k:True[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;36mSort_top_k:True[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;36mSort_top_k:True[0m
INFO:hnmchallenge.utils.sparse_matrix:[1;36mSort_top_k:True[0m


In [9]:
# recall_at_k of `recs` against each held-out split, reported in turn.
val_recall = recall_at_k(rating_true=dataset.get_val_df(), rating_pred=recs)
print("validation recall: {}".format(val_recall))

test_recall = recall_at_k(rating_true=dataset.get_test_df(), rating_pred=recs)
print("test recall: {}".format(test_recall))

validation recall: 0.09760101732329053
test recall: 0.09610887969987732


In [10]:
# map_at_k of `recs` against the validation split.
map_at_k(rating_true=dataset.get_val_df(), rating_pred=recs, adjust_user_count=False)

0.017844812863520996

In [11]:
# map_at_k of `recs` against the test split.
map_at_k(rating_true=dataset.get_test_df(), rating_pred=recs, adjust_user_count=False)

0.01697209289727907

In [None]:
# Distinct users of the validation and test splits (same lookups as the
# `filtered_train` cell, repeated so this cell does not depend on it).
val_df_users = dataset.get_val_df()[DEFAULT_USER_COL].unique()
test_df_users = dataset.get_test_df()[DEFAULT_USER_COL].unique()

# Row masks over `train`, each computed once and reused below.
# `normal_users` / `users_top_pop` come from the CUTOFF cell above.
train_user_ids = train[DEFAULT_USER_COL]
in_val = train_user_ids.isin(val_df_users)
in_test = train_user_ids.isin(test_df_users)
is_normal = train_user_ids.isin(normal_users)
is_top_pop = train_user_ids.isin(users_top_pop)

# Per-split frames, kept under their original names.
normal_users_val_train = train[in_val & is_normal]
normal_users_test_train = train[in_test & is_normal]
mb_users_val_train = train[in_val & is_top_pop]
mb_users_test_train = train[in_test & is_top_pop]

# Combined frames use the union mask instead of pd.concat of the val/test
# frames: concat repeats every training row of a user that is in both splits,
# whereas `filtered_train` (built with `|`) holds each such row once.
normal_users_train = train[(in_val | in_test) & is_normal]
mb_users_train = train[(in_val | in_test) & is_top_pop]

In [None]:
# Recommendations for the `normal_users` cluster. remove_seen=True here,
# unlike the `users_top_pop` cluster call, which passes remove_seen=False.
normal_users_recs = recom.recommend_multicore(
    interactions=normal_users_train,
    batch_size=10_000,
    num_cpus=20,
    remove_seen=True,
)

In [None]:
# map_at_k of the normal-users recommendations against the validation split.
map_at_k(rating_true=dataset.get_val_df(), rating_pred=normal_users_recs, adjust_user_count=False)

In [None]:
# map_at_k of the normal-users recommendations against the test split.
map_at_k(rating_true=dataset.get_test_df(), rating_pred=normal_users_recs, adjust_user_count=False)

In [None]:
# Recommendations for the `users_top_pop` cluster. remove_seen=False here,
# unlike the `normal_users` cluster call, which passes remove_seen=True.
mb_users_recs = recom.recommend_multicore(
    interactions=mb_users_train,
    batch_size=10_000,
    num_cpus=20,
    remove_seen=False,
)

In [None]:
# map_at_k of the top-pop-cluster recommendations against the validation split.
map_at_k(rating_true=dataset.get_val_df(), rating_pred=mb_users_recs, adjust_user_count=False)

In [None]:
# map_at_k of the top-pop-cluster recommendations against the test split.
map_at_k(rating_true=dataset.get_test_df(), rating_pred=mb_users_recs, adjust_user_count=False)

In [None]:
# Recompute the similarity matrix on the filtered full data (`fd`) before the
# recommendation calls that feed the submission files.
recom.compute_similarity_matrix(fd)  

In [None]:
# Split the full data by user cluster for the submission run.
fd_user_ids = fd[DEFAULT_USER_COL]
normal_users_fd = fd.loc[fd_user_ids.isin(normal_users)]
mb_users_fd = fd.loc[fd_user_ids.isin(users_top_pop)]

In [None]:
# Full-data recommendations for the `normal_users` cluster (remove_seen=True).
normal_users_recs_fd = recom.recommend_multicore(
    interactions=normal_users_fd,
    batch_size=10_000,
    num_cpus=20,
    remove_seen=True,
)

In [None]:
# Full-data recommendations for the `users_top_pop` cluster (remove_seen=False).
mb_users_recs_fd = recom.recommend_multicore(
    interactions=mb_users_fd,
    batch_size=10_000,
    num_cpus=20,
    remove_seen=False,
)

In [None]:
# Full-data recommendations for every user in `fd`, without the cluster split.
recs_fd = recom.recommend_multicore(
    interactions=fd,
    batch_size=10_000,
    num_cpus=20,
    remove_seen=False,
)

In [None]:
# Handler used by the two submission cells below.
sh = SubmissionHandler()

In [None]:
# Submission built from the single recommendation set computed on all of `fd`.
sh.create_submission_filtered_data([recs_fd], sub_name="noob_submission")

In [None]:
# Submission built from the two per-cluster recommendation sets.
# NOTE(review): sub_name mentions "ease" — confirm `recom` was the EASE model
# when these recommendations were produced.
sh.create_submission_filtered_data([normal_users_recs_fd, mb_users_recs_fd], sub_name="clusters_ease_new_time")