In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import sys
import pymde
import scipy.sparse as sp

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import plotly.express as px

from sklearn.cluster import DBSCAN

from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

sys.path.insert(1, '..')

from repsys.dataset import Dataset
from repsys.model import Model
from repsys.evaluators import DatasetEvaluator, ModelEvaluator
import repsys.dtypes as dtypes

In [3]:
class MovieLens(Dataset):
    """MovieLens dataset definition backed by the CSV files under ``./ml-sm``."""

    def name(self):
        """Return the identifier string of this dataset."""
        return "movielens"

    def get_genres(self):
        """Return whatever ``self.tags`` holds under the ``'genres'`` key."""
        return self.tags.get('genres')

    def item_cols(self):
        """Map each column of the item table to its repsys dtype."""
        return {
            "movieId": dtypes.ItemID(),
            "title": dtypes.Title(),
            "genres": dtypes.Tag(sep="|"),
            "year": dtypes.Number(data_type=int),
        }

    def interaction_cols(self):
        """Map each column of the interaction table to its repsys dtype."""
        return {
            "movieId": dtypes.ItemID(),
            "userId": dtypes.UserID(),
            "rating": dtypes.Interaction(),
        }

    def load_items(self):
        """Read ``./ml-sm/movies.csv`` and add a ``year`` column parsed from ``title``.

        Returns:
            The movies DataFrame with an extra ``year`` column holding the
            extracted four-digit string, or NaN where the title contains no
            parenthesised four-digit number.
        """
        df = pd.read_csv("./ml-sm/movies.csv")
        # Raw string: "\(" and "\d" are invalid escapes in a normal literal.
        # The greedy ".*" makes the capture bind to the LAST "(dddd)" group,
        # so a title such as "(500) Days of Summer (2009)" yields 2009
        # instead of the leading 500.
        df["year"] = df["title"].str.extract(r".*\((\d{4})\)", expand=False)
        # NOTE(review): values stay as strings/NaN here although item_cols()
        # declares the column as an int Number -- presumably the framework
        # performs the cast; confirm before relying on it.
        return df

    def load_interactions(self):
        """Read the ratings table from ``./ml-sm/ratings.csv``."""
        return pd.read_csv("./ml-sm/ratings.csv")

In [4]:
# Instantiate the MovieLens dataset and call split(), which MovieLens does not
# define itself (it comes from the Dataset base class; presumably it builds
# the train/validation partitions used below -- confirm in repsys).
dataset = MovieLens()
dataset.split()

In [27]:
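# Optional checkpointing of the split dataset (left disabled); uncomment one
# of the lines below to save the current split or restore an earlier one.
# dataset.save('./.repsys_checkpoints/latest.zip')
# dataset.load('./.repsys_checkpoints/dataset-split-1645457256.zip')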

In [None]:
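# Disabled scatter plot. NOTE: `data` is not defined in any cell shown in this
# notebook; it must be a 2-D array (columns 0 and 1 are plotted as x and y)
# before this cell is re-enabled.
# fig = px.scatter(x=data[:, 0], y=data[:, 1], width=700, height=500)
# fig.update_traces(marker=dict(size=2))
# fig.update_layout(yaxis=dict(scaleanchor="x", scaleratio=1))
# fig.show()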

In [5]:
class KNN(Model):
    """Nearest-neighbour recommender over the rows of the training matrix.

    Predictions for a query row are the similarity-weighted sum of the
    training rows of its neighbours (cosine metric).
    """

    def __init__(self, k=5):
        """Create the underlying neighbour index.

        Args:
            k: passed to ``NearestNeighbors`` as ``n_neighbors``. ``predict``
                discards the closest neighbour of every row, so ``k - 1``
                neighbours actually contribute to a prediction.
        """
        self.model = NearestNeighbors(n_neighbors=k, metric="cosine")

    def name(self):
        """Return the identifier string of this model."""
        return "knn"

    def fit(self):
        """Fit the neighbour index on the dataset's training data."""
        self.model.fit(self.dataset.get_train_data())

    def predict(self, X, **kwargs):
        """Score every item for every row of ``X``.

        Args:
            X: query matrix; it must support ``(X > 0).toarray()`` (i.e. a
                scipy sparse matrix) with the same columns as the train data.
            **kwargs: accepted for interface compatibility and ignored.

        Returns:
            A dense array with one row per row of ``X``; entries where ``X``
            is positive are set to 0.
        """
        # the slowest phase of the prediction
        distances, indexes = self.model.kneighbors(X)

        # Discard the closest neighbour of each row and turn the remaining
        # cosine distances into similarities.
        similarities = 1 - distances[:, 1:]
        neighbour_idx = indexes[:, 1:]

        # Normalise the similarities of each row so they sum to 1. Rows whose
        # similarities sum to 0 (e.g. an all-zero query) keep zero weights
        # instead of becoming NaN through a division by zero.
        totals = similarities.sum(axis=1, keepdims=True)
        weights = np.divide(
            similarities,
            totals,
            out=np.zeros_like(similarities),
            where=totals != 0,
        )

        # Fetch the training matrix once rather than once per query row.
        train = self.dataset.get_train_data()

        def weighted_sum(row_weights, row_idx):
            # Scale each neighbour's training row by its weight and add them
            # up; flatten so the result is a plain 1-D vector.
            scaled = sp.diags(row_weights).dot(train[row_idx])
            return np.asarray(scaled.sum(axis=0)).ravel()

        vf = np.vectorize(weighted_sum, signature='(n),(n)->(m)')

        pred = vf(weights, neighbour_idx)

        # Zero out the entries that are already positive in the input.
        pred[(X > 0).toarray()] = 0

        return pred

In [6]:
# Build the KNN model with k=20, attach the dataset (fit() reads
# self.dataset, so this must happen first), then fit the neighbour index.
model = KNN(k=20)
model.update_dataset(dataset)
model.fit()

In [8]:
# Create a model evaluator and attach the same dataset object the model uses.
evaluator = ModelEvaluator()
evaluator.update_dataset(dataset)

In [9]:
# Evaluate the fitted KNN model on the split named 'validation'.
evaluator.evaluate(model, split='validation')



In [10]:
# Save the evaluator to 'foo.zip' (relative to the working directory).
# NOTE(review): 'foo.zip' looks like a placeholder name; presumably this
# stores the evaluation results -- confirm against repsys and rename.
evaluator.save('foo.zip')

In [15]:
# Print the evaluator's summary; the recorded output lists, per model,
# descriptive statistics for recall@5, recall@20 and recall@50.
evaluator.print()

Model knn:
        recall@5  recall@20  recall@50
count  46.000000  46.000000  46.000000
mean    0.294203   0.283369   0.385433
std     0.230196   0.234040   0.291111
min     0.000000   0.000000   0.000000
25%     0.200000   0.114583   0.175435
50%     0.225000   0.211111   0.303846
75%     0.400000   0.448611   0.528846
max     1.000000   1.000000   1.000000
