In [26]:
# -*- coding: utf-8 -*-

from tqdm import tqdm

from utils.process_json import load_json
from utils.process_json import write_json

from utils.preprocessing import to_dataframe
from utils.preprocessing import get_item_index_dictionary
from utils.preprocessing import to_sparse_matrix
from utils.preprocessing import transform_idf

from utils.modeling import calculate_cosine_similarity
from utils.modeling import rate_per_playlist


class ItemCF:
    """Item-based collaborative filtering over playlist tags and songs.

    Typical use is ``preprocess(train, test)`` -> ``train()`` -> ``predict()``.
    The heavy lifting is delegated to helpers from ``utils.preprocessing`` and
    ``utils.modeling``; this class only wires them together and keeps state.

    Args:
        idf: When true, ``preprocess`` fits IDF transformers on the training
            matrices and ``predict`` passes their ``idf_`` weights to
            ``rate_per_playlist``. When false, uniform weights are used.
    """

    def __init__(self, idf=True):
        # options
        self.idf = idf

        # number of data
        self.n_train = 0
        self.n_test = 0

        # data dictionary (item -> integer index)
        self.tag2idx = dict()
        self.song2idx = dict()
        self.playlist2idx = dict()

        # data (sparse matrices produced by to_sparse_matrix)
        self.pt_train = None
        self.ps_train = None
        self.pt_test = None
        self.ps_test = None

        # IDF transformers; stay None when idf=False
        self.transformer_tag = None
        self.transformer_song = None

        # item-by-item similarity
        self.tt_similarity = None
        self.ss_similarity = None

        # per-item argsort of the similarity rows (ascending similarity)
        self.neighbors_tag = None
        self.neighbors_song = None

        # one rating per test playlist, filled by predict()
        self.ratings_tag = list()
        self.ratings_song = list()

    def preprocess(self, train, test):
        """Build index dictionaries and sparse matrices from raw data.

        Args:
            train: Raw training data accepted by ``to_dataframe``.
            test: Raw test data accepted by ``to_dataframe``.
        """
        train = to_dataframe(train)
        test = to_dataframe(test)

        self.n_train = len(train)
        self.n_test = len(test)

        self.tag2idx = get_item_index_dictionary(train, test, 'tags')
        self.song2idx = get_item_index_dictionary(train, test, 'songs')
        self.playlist2idx = get_item_index_dictionary(train, test, 'id')

        self.pt_train = to_sparse_matrix(train, self.playlist2idx, self.tag2idx, 'tags')
        if self.idf:
            self.transformer_tag, _ = transform_idf(self.pt_train, False)
        # NOTE(review): `correction=self.n_train` presumably shifts test
        # playlist indices so that test rows start at 0 -- confirm in
        # to_sparse_matrix.
        self.pt_test = to_sparse_matrix(test, self.playlist2idx, self.tag2idx, 'tags', correction=self.n_train)

        self.ps_train = to_sparse_matrix(train, self.playlist2idx, self.song2idx, 'songs')
        if self.idf:
            self.transformer_song, _ = transform_idf(self.ps_train, False)
        self.ps_test = to_sparse_matrix(test, self.playlist2idx, self.song2idx, 'songs', correction=self.n_train)

    def train(self, compute_neighbors=True):
        """Compute tag-tag and song-song similarity from the training data.

        Args:
            compute_neighbors: When true (the default), also store the
                row-wise argsort of each similarity matrix in
                ``neighbors_tag`` / ``neighbors_song``. This converts each
                similarity matrix to a dense array, which needs
                ``n_items ** 2`` cells of memory; pass ``False`` to skip it
                when the item vocabulary is large.
        """
        self.tt_similarity = calculate_cosine_similarity(self.pt_train.T)
        self.ss_similarity = calculate_cosine_similarity(self.ps_train.T)

        if not compute_neighbors:
            self.neighbors_tag = None
            self.neighbors_song = None
            return

        self.neighbors_tag = self.tt_similarity.toarray().argsort(axis=-1)
        # Song neighbours must come from the song similarity matrix (the
        # previous code reused the tag matrix here).
        self.neighbors_song = self.ss_similarity.toarray().argsort(axis=-1)

    def predict(self):
        """Rate every tag and song for each test playlist.

        Returns:
            A ``(ratings_tag, ratings_song)`` tuple of lists with one entry per
            test playlist, each being whatever ``rate_per_playlist`` returns.
            The same lists are stored on ``self.ratings_tag`` and
            ``self.ratings_song``.
        """
        import numpy as np

        assert self.pt_test.shape[0] == self.ps_test.shape[0]

        n_tags = len(self.tag2idx)
        n_songs = len(self.song2idx)

        # Without fitted IDF transformers (idf=False) fall back to uniform
        # weights instead of failing on `None.idf_`.
        # NOTE(review): assumes rate_per_playlist applies these as per-item
        # multiplicative weights -- confirm against utils.modeling.
        if self.transformer_tag is not None:
            tag_weights = self.transformer_tag.idf_
        else:
            tag_weights = np.ones(n_tags)
        if self.transformer_song is not None:
            song_weights = self.transformer_song.idf_
        else:
            song_weights = np.ones(n_songs)

        # Start from empty lists so a repeated (or previously interrupted)
        # call does not leave stale or duplicated ratings behind.
        self.ratings_tag = list()
        self.ratings_song = list()

        for pid in tqdm(range(self.n_test)):
            self.ratings_tag.append(rate_per_playlist(
                self.pt_test[pid, :].nonzero()[1],
                self.tt_similarity,
                tag_weights,
                n_tags
            ))

            self.ratings_song.append(rate_per_playlist(
                self.ps_test[pid, :].nonzero()[1],
                self.ss_similarity,
                song_weights,
                n_songs
            ))

        return self.ratings_tag, self.ratings_song
  

In [27]:
# -*- coding: utf-8 -*-
import fire
import numpy as np
from tqdm import tqdm

from utils.process_json import load_json
from utils.process_json import write_json

from methods.item_cf import ItemCF

train_fname = 'arena_data/orig/train.json'
test_fname = 'arena_data/questions/val.json'

# Number of tags / songs recommended per playlist.
N_TAGS = 10
N_SONGS = 100


def _flatten_rating(rating):
    """Return ``rating`` as a 1-D ndarray.

    Accepts a 1-D array, a ``(1, n)`` array/``np.matrix`` or a scipy sparse
    row, so the ranking code below does not depend on which of these shapes
    ``rate_per_playlist`` happened to return.
    """
    if hasattr(rating, 'toarray'):
        rating = rating.toarray()
    return np.asarray(rating).ravel()


def _pick_items(rating, seen, popular, top_k):
    """Choose ``top_k`` distinct item indices for one playlist.

    Args:
        rating: Per-item scores for the playlist (any shape accepted by
            ``_flatten_rating``).
        seen: Item indices already in the playlist; never recommended.
        popular: Item indices in fallback order, used to fill up the result
            when fewer than ``top_k`` items have a non-zero score.
        top_k: Number of items to return.

    Returns:
        A list of at most ``top_k`` plain ``int`` indices without duplicates:
        first the best-scored unseen items (descending score, stopping at the
        first zero score), then fallback items.
    """
    scores = _flatten_rating(rating)
    excluded = {int(item) for item in seen}
    picked = []

    for item in np.argsort(scores)[::-1]:
        # Scores are assumed non-negative, so the first zero ends the
        # useful part of the ranking.
        if len(picked) == top_k or scores[item] == 0:
            break
        item = int(item)
        if item not in excluded:
            picked.append(item)
            excluded.add(item)

    for item in popular:
        if len(picked) == top_k:
            break
        item = int(item)
        if item not in excluded:
            picked.append(item)
            excluded.add(item)

    return picked


print("Loading train file...")
train = load_json(train_fname)

print("Loading test file...")
test = load_json(test_fname)

model = ItemCF()
print('Preprocessing data...')
model.preprocess(train, test)
print('Training model...')
model.train()
print('Rating data for recommendation...')
ratings_tag, ratings_song = model.predict()

idx2tag = {idx: tag for tag, idx in model.tag2idx.items()}
idx2song = {idx: song for song, idx in model.song2idx.items()}
# Test playlists are indexed after the training ones; shift them to 0-based.
idx2playlist = {
    idx - model.n_train: playlist
    for playlist, idx in model.playlist2idx.items()
    if idx >= model.n_train
}

# Fallback order: ascending IDF weight (presumably most frequent items first
# -- confirm against transform_idf). Computed once instead of per playlist.
popular_tags = model.transformer_tag.idf_.argsort()
popular_songs = model.transformer_song.idf_.argsort()

answers = []

print("Generating answers...")
for idx in tqdm(range(model.n_test)):
    playlist = idx2playlist[idx]

    rating_tag = ratings_tag[idx]
    rating_song = ratings_song[idx]

    tag_ids = _pick_items(
        rating_tag, model.pt_test[idx, :].nonzero()[1], popular_tags, N_TAGS
    )
    tags = [idx2tag[tid] for tid in tag_ids]

    song_ids = _pick_items(
        rating_song, model.ps_test[idx, :].nonzero()[1], popular_songs, N_SONGS
    )
    songs = [idx2song[sid] for sid in song_ids]

    answers.append({
        "id": playlist,
        "songs": songs,
        "tags": tags,
    })

100%|██████████| 9205/9205 [00:00<00:00, 640582.84it/s]Loading train file...
Loading test file...
Preprocessing data...

100%|██████████| 2302/2302 [00:00<00:00, 604854.21it/s]
100%|██████████| 11507/11507 [00:00<00:00, 799044.01it/s]
100%|██████████| 11507/11507 [00:00<00:00, 112340.02it/s]
100%|██████████| 9205/9205 [00:00<00:00, 45257.48it/s]
100%|██████████| 2302/2302 [00:00<00:00, 45925.95it/s]
100%|██████████| 9205/9205 [00:00<00:00, 24863.26it/s]
100%|██████████| 2302/2302 [00:00<00:00, 34593.13it/s]
Training model...
  0%|          | 0/2302 [00:00<?, ?it/s]Rating data for recommendation...



TypeError: 'csr_matrix' object cannot be interpreted as an integer

In [28]:
# Peek at (up to) the first five per-playlist tag ratings collected by predict().
model.ratings_tag[0:5]

[array([0., 0., 0., ..., 0., 0., 0.])]

In [29]:
# Column (tag) indices of the non-zero entries in row 0 of the test tag matrix.
model.pt_test[0, :].nonzero()[1]

array([], dtype=int32)

In [30]:
# Show the repr (shape, dtype, sparsity) of the tag-to-tag similarity matrix.
model.tt_similarity

<6201x6201 sparse matrix of type '<class 'numpy.float64'>'
	with 110415 stored elements in Compressed Sparse Row format>

In [31]:
# Number of non-zero entries in column 0 of the training tag matrix.
model.pt_train[:, 0].count_nonzero()

6

In [32]:
# Number of non-zero entries in column 2 of the training tag matrix.
model.pt_train[:, 2].count_nonzero()

4

In [33]:
# Show the IDF weight vector of the fitted tag transformer.
model.transformer_tag.idf_

array([8.18170058, 9.43446354, 8.51817281, ..., 7.48855339, 9.43446354,
       9.43446354])

In [34]:
# Number of entries in the tag -> index dictionary.
len(model.tag2idx)

6201

In [35]:
# (row, column) indices of the non-zero entries in row 3 of the test tag matrix.
model.pt_test[3, :].nonzero()

(array([0, 0], dtype=int32), array([1129, 4904], dtype=int32))

In [36]:
# Single entry of the test tag matrix: row 3, tag column 2103.
model.pt_test[3, 2103]

0.0

In [37]:
# Single entry of the test tag matrix: row 3, tag column 5155.
model.pt_test[3, 5155]

0.0

In [38]:
# IDF weight of tag index 2103.
model.transformer_tag.idf_[2103]

9.434463543817241

In [39]:
# Similarity row of tag 2153 scaled by that tag's IDF weight, as a dense
# 1 x n matrix; then show columns 9..19.
# NOTE(review): the other cells inspect tag index 2103, not 2153 -- confirm
# which index was intended here.
A = (model.tt_similarity[2153, :] * model.transformer_tag.idf_[2153]).todense()
A[0, 9:20]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [40]:
# Similarity row of tag 5155 scaled by that tag's IDF weight, as a dense
# 1 x n matrix; then show columns 9..19.
B = (model.tt_similarity[5155, :] * model.transformer_tag.idf_[5155]).todense()
B[0, 9:20]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [41]:
# Columns 9..19 of the sum of the two IDF-weighted similarity rows above.
(A + B)[0, 9:20]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [42]:
# Columns 9..19 of the tag rating stored for test playlist 3.
# NOTE: raises IndexError unless predict() has stored at least four ratings.
model.ratings_tag[3][0, 9:20]

IndexError: list index out of range

In [43]:
# Tag indices for test playlist 3 ordered by descending rating.
# NOTE: `ratings_tag` is only bound once `model.predict()` has returned in the
# answer-generation cell; on a kernel where that call failed this is a NameError.
ratings_tag[3].argsort()[0, ::-1]

NameError: name 'ratings_tag' is not defined

In [44]:
# Rating of tag index 0 for test playlist 3.
# NOTE: requires `ratings_tag` from a successful `model.predict()` call.
ratings_tag[3][0, 0]

NameError: name 'ratings_tag' is not defined

In [45]:
# Rating of tag index 2103 for test playlist 3.
# NOTE: requires `ratings_tag` from a successful `model.predict()` call.
ratings_tag[3][0, 2103]

NameError: name 'ratings_tag' is not defined

In [46]:
# The ten tag indices with the smallest IDF weights (used as the fallback
# recommendation when a playlist has no usable ratings).
model.transformer_tag.idf_.argsort()[:10]

array([1452, 4849, 5632, 5549, 5077, 6194, 5504, 2191, 6040,  749])

In [47]:
# Number of non-zero entries in column 3396 of the training tag matrix.
model.pt_train[:, 3396].count_nonzero()

9

In [48]:
# Type of the last tag rating handled by the answer-generation loop.
# NOTE: `rating_tag` only exists after that loop has run at least one iteration.
type(rating_tag)

NameError: name 'rating_tag' is not defined

In [49]:
# Concrete type of the tag-to-tag similarity matrix.
type(model.tt_similarity)

scipy.sparse.csr.csr_matrix

In [50]:
# Concrete type of the song-to-song similarity matrix.
type(model.ss_similarity)

scipy.sparse.csr.csr_matrix