In [20]:
import warnings
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.utils.data as data
from torch import nn
from tqdm.notebook import tqdm

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/dtype warnings from the libraries used below —
# consider narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

In [22]:
# Load the raw interaction log; later cells read its user_id, item_id and timestamp columns.
full_data = pd.read_csv('hse_train.csv')

In [23]:
# One row per user holding that user's (item_id, timestamp) pairs, oldest first.
full_grouped = (
    full_data
    .groupby('user_id')
    .apply(lambda user_rows: sorted(zip(user_rows.item_id, user_rows.timestamp),
                                    key=lambda pair: pair[1]))
    .reset_index()
    .rename(columns={0: 'all_interactions'})
)

# Chronological split point: the first 80% of a history (but at least one
# interaction) goes to the train part, everything after it to the test part.
full_grouped['ind'] = pd.Series(
    [max(1, int(len(history) * 0.8)) for history in full_grouped['all_interactions']],
    index=full_grouped.index,
)
full_grouped['train_part'] = pd.Series(
    [history[:cut] for history, cut in zip(full_grouped['all_interactions'], full_grouped['ind'])],
    index=full_grouped.index,
)
full_grouped['test_part'] = pd.Series(
    [history[cut:] for history, cut in zip(full_grouped['all_interactions'], full_grouped['ind'])],
    index=full_grouped.index,
)

In [24]:
# Inspect the per-user split (the rich display shows only the first and last rows of the frame).
full_grouped

Unnamed: 0,user_id,all_interactions,ind,train_part,test_part
0,0,"[(144433, 1511541894), (90536, 1511788479)]",1,"[(144433, 1511541894)]","[(90536, 1511788479)]"
1,1,"[(153245, 1512052112)]",1,"[(153245, 1512052112)]",[]
2,2,"[(131868, 1511700038), (28437, 1511708559), (2...",6,"[(131868, 1511700038), (28437, 1511708559), (2...","[(121515, 1511934587), (130903, 1512018500)]"
3,3,"[(147375, 1511923865), (165134, 1511931348), (...",3,"[(147375, 1511923865), (165134, 1511931348), (...","[(67481, 1512190253)]"
4,4,"[(11870, 1511556452), (11870, 1511571520), (76...",3,"[(11870, 1511556452), (11870, 1511571520), (76...","[(5508, 1512136599)]"
...,...,...,...,...,...
701976,701976,"[(167996, 1511929328), (89700, 1511935865), (1...",4,"[(167996, 1511929328), (89700, 1511935865), (1...","[(167969, 1512095838)]"
701977,701977,"[(35174, 1511614898), (35174, 1511672074), (10...",2,"[(35174, 1511614898), (35174, 1511672074)]","[(104654, 1511698498)]"
701978,701978,"[(65677, 1511841281)]",1,"[(65677, 1511841281)]",[]
701979,701979,"[(62480, 1511597454), (107712, 1511597572), (5...",37,"[(62480, 1511597454), (107712, 1511597572), (5...","[(7085, 1511995823), (137063, 1511996194), (15..."


In [25]:
# Same frame, with the split columns exposed under the names the recommenders read.
joined = full_grouped.rename(
    columns={
        'train_part': 'train_interactions',
        'test_part': 'test_interactions',
    }
)

In [26]:
class TopPopular:
    """Non-personalised baseline that recommends the globally most frequent items.

    Attributes:
        trained: ``True`` once :meth:`fit` has been called.
        recommenations: item ids ordered from most to least frequent, set by
            :meth:`fit`. The (misspelled) attribute name is kept unchanged
            because code outside this class reads it.
    """

    def __init__(self):
        self.trained = False

    def fit(self, df, col='train_interactions'):
        """Rank items by how often they occur in ``df[col]``.

        Args:
            df: DataFrame with one row per user.
            col: name of the column whose cells are iterables of
                ``(item, timestamp)`` pairs.
        """
        # Iterate the column directly: DataFrame.iterrows() would build a
        # throwaway Series for every row just to read a single cell.
        counts = Counter(
            item
            for interactions in df[col]
            for item, _ in interactions
        )

        # most_common() sorts by count, descending, and keeps first-seen order
        # among items with equal counts.
        self.recommenations = [item for item, _ in counts.most_common()]
        self.trained = True

    def predict(self, df, topn=100):
        """Return the ``topn`` most popular items once per row of ``df``.

        Args:
            df: DataFrame to produce recommendations for; only its length is used.
            topn: maximum number of items per recommendation list.

        Returns:
            A list with ``len(df)`` independent lists of item ids.

        Raises:
            AssertionError: if :meth:`fit` has not been called.
        """
        assert self.trained
        top_items = self.recommenations[:topn]
        # Give every row its own copy; ``[top_items] * len(df)`` would make all
        # rows share one list object, so mutating one row would change them all.
        return [list(top_items) for _ in range(len(df))]

In [27]:
class TopPopularWeighted(TopPopular):
    """Popularity baseline that only counts recent interactions.

    Only interactions whose timestamp lies within ``min_window`` days of the
    newest timestamp in the fitted data contribute to an item's count.
    """

    def __init__(self, min_window=3):
        """
        Args:
            min_window: length of the recency window in days.
        """
        super().__init__()
        # Stored in seconds (days * 24 * 60 * 60) so it can be subtracted from
        # the interaction timestamps directly; assumes those are in seconds.
        self.min_window = min_window*24*60*60

    def fit(self, df, col='train_interactions'):
        """Rank items by their number of interactions inside the recency window.

        Args:
            df: DataFrame with one row per user.
            col: name of the column whose cells are iterables of
                ``(item, timestamp)`` pairs. Cells may be empty.
        """
        # Newest timestamp over all rows. Rows with no interactions are simply
        # skipped instead of making max() fail on an empty sequence.
        max_time = max(
            (timestamp for interactions in df[col] for _, timestamp in interactions),
            default=None,
        )

        if max_time is None:
            # Nothing to count: no row has any interaction.
            self.recommenations = []
            self.trained = True
            return

        cutoff = max_time - self.min_window
        counts = Counter(
            item
            for interactions in df[col]
            for item, timestamp in interactions
            if timestamp >= cutoff
        )

        # most_common() sorts by count, descending, and keeps first-seen order
        # among items with equal counts.
        self.recommenations = [item for item, _ in counts.most_common()]
        self.trained = True

In [28]:
class ModifiedTopPopular(TopPopularWeighted):
    """Recency-weighted popularity with a per-user cap on already-seen items.

    Ranking is inherited from the parent class; ``predict`` walks that ranking
    for every user and lets at most ``n_max_left`` items the user has already
    interacted with into the recommendation list.
    """

    def __init__(self, min_window = 4):
        """
        Args:
            min_window: length of the recency window in days.
        """
        # Delegate to the parent so the state it sets up (``trained`` flag,
        # window converted to seconds) is defined in one place only.
        super().__init__(min_window=min_window)

    def predict(self, df, topn=100, n_max_left = 2, col='train_interactions'):
        """Build one recommendation list per row of ``df``.

        Args:
            df: DataFrame with one row per user.
            topn: maximum number of items per user; ``topn <= 0`` yields empty lists.
            n_max_left: how many already-seen items may stay in a user's list.
            col: name of the column holding the user's known interactions as
                iterables of ``(item, timestamp)`` pairs.

        Returns:
            A list with one list of item ids per row of ``df``, in popularity order.

        Raises:
            AssertionError: if the model has not been fitted.
        """
        assert self.trained

        all_recs = []

        for interactions in df[col]:
            seen_items = {pair[0] for pair in interactions}

            user_recs = []
            seen_kept = 0
            for candidate in self.recommenations:
                # Check the size before appending so that topn == 0 stops
                # immediately instead of never matching an equality test.
                if len(user_recs) >= topn:
                    break

                if candidate not in seen_items:
                    user_recs.append(candidate)
                elif seen_kept < n_max_left:
                    user_recs.append(candidate)
                    seen_kept += 1

            all_recs.append(user_recs)

        return all_recs

In [29]:
# Popularity fallback: rank items by interactions from the last 4 days of the data.
toppop_w = TopPopularWeighted(min_window = 4)
# Fit on each user's complete history ('all_interactions'), not only the train part.
toppop_w.fit(joined,col = 'all_interactions')
# Store the 40 most popular items for every user (the same ranking for each row).
joined['final_preds'] = toppop_w.predict(joined, topn = 40)

In [30]:
# Work on a copy so the raw frame keeps its original ids.
final_train = full_data.copy()

# Contiguous 0-based indices, assigned in order of first appearance in the data.
final_item2id = {item: idx for idx, item in enumerate(final_train.item_id.unique())}
final_user2id = {user: idx for idx, user in enumerate(final_train.user_id.unique())}

# Reverse lookups: inner index -> original id.
final_id2item = {idx: item for item, idx in final_item2id.items()}
final_id2user = {idx: user for user, idx in final_user2id.items()}

# Replace the original ids by their inner indices.
final_train['item_id'] = final_train['item_id'].map(final_item2id)
final_train['user_id'] = final_train['user_id'].map(final_user2id)

In [31]:
# Inner index of every user; the column name 'innew_user_id' is what the later merge reads.
joined['innew_user_id'] = pd.Series(
    [final_user2id[raw_user] for raw_user in joined['user_id']],
    index=joined.index,
)
# Popular-item fallback lists translated from original item ids to inner indices.
joined['inner_preds'] = pd.Series(
    [[final_item2id[raw_item] for raw_item in popular] for popular in joined['final_preds']],
    index=joined.index,
)

In [32]:
from scipy import sparse as sps

# User x item matrix in COO format with a 1.0 entry per interaction row of final_train.
final_interactions = sps.coo_matrix(
    (
        np.ones(len(final_train)),
        (final_train['user_id'], final_train['item_id']),
    ),
    shape=(len(final_user2id), len(final_item2id)),
)
final_interactions


<701981x180599 sparse matrix of type '<class 'numpy.float64'>'
	with 4842338 stored elements in COOrdinate format>

In [33]:
!pip install implicit



In [34]:
import implicit

# Nearest-neighbour recommender based on cosine similarity.
# NOTE(review): K is presumably the number of neighbours kept per item — confirm in the implicit docs.
model = implicit.nearest_neighbours.CosineRecommender(K = 1000)
# Fit on the interaction matrix (rows are users, columns are items), converted from COO to CSR.
model.fit(final_interactions.tocsr())

  0%|          | 0/180599 [00:00<?, ?it/s]

In [35]:
# Inner user indices 0..n_users-1, i.e. every row of the interaction matrix in order.
user_idxs = list(range(len(final_user2id)))
# Ask for 20 items per user in one call, without filtering out items the user already interacted with.
# Later cells use recommended[0] as the per-user lists of inner item ids.
recommended = model.recommend(user_idxs, final_interactions.tocsr(), N=20, filter_already_liked_items=False)

In [36]:
def mix_recommends(recs,popular):
    """Fill the ``-1`` placeholders in ``recs`` with items taken from ``popular``.

    Placeholders are replaced left to right by the items of ``popular`` that do
    not already occur in ``recs``, in their original order. Real recommendations
    keep their positions relative to each other.

    Args:
        recs: sequence of item ids in which ``-1`` marks an empty slot.
        popular: fallback item ids, best first.

    Returns:
        A new list without ``-1`` entries. If ``popular`` does not provide
        enough unused items, the placeholders that cannot be filled are dropped,
        so the result may be shorter than ``recs``.
    """
    already_recommended = set(recs)
    fillers = iter([item for item in popular if item not in already_recommended])
    exhausted = object()  # sentinel: any item id (including 0) stays a valid filler

    mixed = []
    for rec in recs:
        if rec != -1:
            mixed.append(rec)
            continue
        # Pull fillers one by one. Assigning a too-short filler list through a
        # NumPy index either fails or repeats a single filler in every slot.
        filler = next(fillers, exhausted)
        if filler is not exhausted:
            mixed.append(filler)
    return mixed


# One row per inner user index with the model's recommended inner item ids.
df = pd.DataFrame({'user_id': user_idxs})
df['item_id'] = recommended[0].tolist()

# Attach each user's popular-item fallback list, matching on the inner user index.
df = df.merge(
    joined[['innew_user_id', 'inner_preds']].rename(columns={'innew_user_id': 'user_id'}),
    on='user_id',
    how='inner',
)

# Replace the -1 placeholders in every recommendation list by popular items.
df['item_id'] = [
    mix_recommends(model_recs, popular_recs)
    for model_recs, popular_recs in zip(df['item_id'], df['inner_preds'])
]

In [37]:
# One row per (user, recommended item), translated back to the original ids;
# the item column is exposed as 'items'.
df = (
    df.explode('item_id')
    .assign(
        item_id=lambda frame: frame['item_id'].map(lambda inner_item: final_id2item[inner_item]),
        user_id=lambda frame: frame['user_id'].map(lambda inner_user: final_id2user[inner_user]),
    )
    .rename(columns={'item_id': 'items'})
)

In [38]:
# Write the submission file: one (user_id, items) row per recommendation, without the index column.
df[['user_id','items']].to_csv('sub.csv',index = False)