In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell execution; edits to local source files are then
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [21]:
import numpy as np
import pandas as pd
import sys
import scipy.sparse as sp

from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# sys.path.insert(1, '..')

# from repsys.dataset import Dataset
# from repsys.model import Model
# from repsys.evaluators import ModelEvaluator
# import repsys.dtypes as dtypes

In [33]:
# Toy example: assemble a 3x3 CSR matrix from COO-style (value, (row, col))
# triplets and display the values stored after a CSR -> COO conversion.
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.arange(1, 7)
sp.csr_matrix(
    (data, (row, col)),
    shape=(3, 3),
).tocoo().data

array([1, 2, 3, 4, 5, 6])

In [24]:
class MovieLens(Dataset):
  """MovieLens 20M dataset definition.

  Declares the column types of the item and interaction tables and loads both
  tables from CSV files under ``./ml-20m``.

  NOTE(review): ``Dataset`` and ``dtypes`` are not imported by any active line
  of the notebook's import cell (the ``repsys`` imports there are commented
  out); confirm they are in scope before running on a fresh kernel.
  """

  def name(self):
    """Return the identifier string of this dataset."""
    return "movielens"

  def get_genres(self):
    """Return the ``'genres'`` entry of ``self.tags``.

    NOTE(review): ``self.tags`` is not assigned in this class; presumably the
    ``Dataset`` base class populates it -- confirm.
    """
    return self.tags['genres']

  def item_dtypes(self):
    """Return the mapping from item-table column name to its declared dtype."""
    return {
        "movieId": dtypes.ItemID(),
        "title": dtypes.Title(),
        "genres": dtypes.Tags(sep="|"),
        "year": dtypes.Number()
    }

  def interact_dtypes(self):
    """Return the mapping from interaction-table column name to its declared dtype."""
    return {
        "movieId": dtypes.ItemID(),
        "userId": dtypes.UserID(),
        "rating": dtypes.Rating(min=0.5, step=0.5),
    }

  def load_items(self):
    """Read ``movies.csv`` and add an integer ``year`` column parsed from the title.

    The year is the last four-digit number in parentheses found in the title;
    titles without such a group get ``year == 0``.

    Returns:
      pandas.DataFrame: the CSV contents plus the derived ``year`` column.
    """
    df = pd.read_csv("./ml-20m/movies.csv")
    # Raw string avoids invalid-escape warnings. The greedy ``.*`` prefix makes
    # the capture bind to the LAST "(dddd)" group, so a title such as
    # "(500) Days of Summer (2009)" yields 2009 rather than 500.
    # ``expand=False`` returns a Series instead of a one-column DataFrame.
    year = df['title'].str.extract(r'.*\((\d{4})\)', expand=False)
    df['year'] = year.fillna(0).astype(int)
    return df

  def load_interacts(self):
    """Read ``ratings.csv`` and return it as a DataFrame."""
    return pd.read_csv("./ml-20m/ratings.csv")

In [25]:
# Instantiate the dataset definition and call its fit() method.
# NOTE(review): fit() is not defined in MovieLens itself; presumably inherited
# from the Dataset base class and responsible for loading/splitting -- confirm.
dataset = MovieLens()
dataset.fit()

In [26]:
# Display the items table exposed by the fitted dataset.
dataset.items

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995
2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995
3,Grumpier Old Men (1995),"[Comedy, Romance]",1995
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995
5,Father of the Bride Part II (1995),[Comedy],1995
...,...,...,...
131158,"Manta, Manta (1991)",[Comedy],1991
131164,Vietnam in HD (2011),[War],2011
131166,WWII IN HD (2009),[(no genres listed)],2009
131176,A Second Chance (2014),[Drama],2014


In [37]:
class KNN(Model):
  """Neighbourhood-based recommender built on scikit-learn ``NearestNeighbors``.

  Neighbours are searched with the cosine metric among the rows of
  ``self.dataset.train_data``; predictions are similarity-weighted averages of
  the neighbours' training rows.

  NOTE(review): ``Model`` is not imported by any active line of the notebook's
  import cell (the ``repsys`` imports there are commented out), and
  ``self.dataset`` is not assigned in this class -- presumably it is set by
  ``Model.update_dataset``; confirm both.
  """

  def __init__(self, k=5):
    """Create the underlying neighbour index.

    Args:
      k: number of neighbours requested from ``NearestNeighbors``. The first
        returned neighbour is discarded in ``predict``, so ``k - 1`` neighbours
        contribute to each prediction.
    """
    self.model = NearestNeighbors(n_neighbors=k, metric="cosine")

  def name(self):
    """Return the identifier string of this model."""
    return "knn"

  def fit(self):
    """Fit the neighbour index on ``self.dataset.train_data``."""
    self.model.fit(self.dataset.train_data)

  def predict(self, X, **kwargs):
    """Score every item for each row of ``X``.

    Args:
      X: interaction matrix with one row per query (SciPy sparse or dense
        array-like supporting ``X > 0``).
      **kwargs: accepted for interface compatibility; unused here.

    Returns:
      numpy.ndarray: dense array with one row per row of ``X`` and one column
      per column of the training data. Entries where ``X > 0`` are set to 0.
      Rows whose neighbour similarities sum to zero are all zeros (instead of
      NaN).
    """
    # the slowest phase of the prediction
    distances, indexes = self.model.kneighbors(X)

    # Discard the first (closest) neighbour of every query.
    # NOTE(review): this assumes the closest neighbour is the query row itself,
    # which only holds when the rows of X are part of the fitted data --
    # confirm for held-out users.
    similarities = 1 - distances[:, 1:]
    neighbours = indexes[:, 1:]

    # Normalise similarities to weights that sum to 1 per query. Rows with a
    # zero total keep all-zero weights rather than dividing by zero.
    totals = similarities.sum(axis=1, keepdims=True)
    weights = np.divide(
        similarities,
        totals,
        out=np.zeros_like(similarities),
        where=totals != 0,
    )

    # Assemble a sparse (n_queries x n_train_rows) weight matrix so that a
    # single matrix product yields the weighted sum of neighbour rows for all
    # queries at once (replaces a per-query Python loop).
    train = self.dataset.train_data
    n_queries, n_kept = neighbours.shape
    query_ids = np.repeat(np.arange(n_queries), n_kept)
    weight_matrix = sp.csr_matrix(
        (weights.ravel(), (query_ids, neighbours.ravel())),
        shape=(n_queries, train.shape[0]),
    )
    pred = weight_matrix.dot(train)
    pred = pred.toarray() if sp.issparse(pred) else np.asarray(pred)

    # Zero out the items each query has already interacted with.
    seen = X > 0
    seen = seen.toarray() if sp.issparse(seen) else np.asarray(seen)
    pred[seen] = 0

    return pred

In [38]:
# Build a KNN model requesting 20 neighbours, attach the dataset to it and fit
# the neighbour index.
# NOTE(review): update_dataset() is not defined in KNN; presumably inherited
# from Model and responsible for setting model.dataset -- confirm.
model = KNN(k=20)
model.update_dataset(dataset)
model.fit()

In [39]:
# Run the fitted model on dataset.vad_data_tr and keep the returned score matrix.
# NOTE(review): the recorded output of this cell shows "phase N done" messages
# that the visible KNN.predict does not print -- the output looks stale; re-run
# after a kernel restart to confirm.
X_pred = model.predict(dataset.vad_data_tr)

phase 1 done
phase 2 done


In [97]:
# Display the prediction matrix (NumPy abbreviates large arrays in the repr).
X_pred

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.103728  , 0.21258559, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.10169589, 0.2073023 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.05775506, 0.48042479, 0.0515738 , ..., 0.        , 0.        ,
        0.        ]])