## Imports

In [1]:
import sys
sys.path.append("..") # fix for relative imports

In [2]:
from models.mf import MatrixFactorization
from models.knn_popular import KNNpopularity
from models.ease import EASE

In [3]:
import os
from collections import defaultdict
import gc

import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from utils.metrics import similarity, serendipity, distance, novelty, unexpectedness, relevance
from utils.helpers import get_control_items

from joblib import load, dump
from tqdm import tqdm

## Data loading

In [4]:
DATA_PATH = '../data/movielens/1m/clean/'

In [5]:
# Read the ratings and movie-metadata CSV files from DATA_PATH.
ratings = pd.read_csv(os.path.join(DATA_PATH, 'ratings.csv'))
movies = pd.read_csv(os.path.join(DATA_PATH, 'movies.csv'))

In [6]:
movies

Unnamed: 0,movieId,title,date,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18
0,1,Toy Story,788918400.0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,788918400.0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,788918400.0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,788918400.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,788918400.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,946684800.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream,946684800.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland,946684800.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House,946684800.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Read the train/test splits from DATA_PATH.
# NOTE(review): train_data is read here WITHOUT index_col, while a later cell re-reads both
# files with index_col='userId' and rebinds train_data/test_data. The two versions have
# different layouts, so results depend on which cell ran last.
train_data = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'))
test_data = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), index_col='userId')

In [10]:
# Load the pre-computed object that get_model() passes to the model constructors.
# joblib.load unpickles the file, so only load pickles from a trusted source.
item_global_rating = load(os.path.join(DATA_PATH, 'item_sum_dif_rating.pickle'))

In [52]:
# Names of the 18 genre feature columns of `movies` (feature1 ... feature18).
genres_cols = [f'feature{i + 1}' for i in range(18)]


def get_movies_by_profile(profile):
    """Return the rows of the global `movies` frame for the items rated in `profile`.

    Parameters
    ----------
    profile : pandas.DataFrame or pandas.Series (or anything `pd.Series` accepts)
        Ratings keyed by movie id. For a DataFrame the movie ids are the column
        labels, and a movie is kept only if it is non-zero and non-NaN in every row.
        For a Series the movie ids are the index labels.

    Returns
    -------
    pandas.DataFrame
        The subset of `movies` whose `movieId` is rated in the profile.

    Notes
    -----
    Labels are converted to numbers before matching, because frames read from CSV
    carry string column labels ('1', '2', ...) that never compare equal to the
    integer `movieId` values. Labels that are not numeric are ignored.
    """
    if isinstance(profile, pd.DataFrame):
        by_movie = profile.T
        # Zero entries become NaN under the mask, then dropna removes those movies.
        labels = by_movie[by_movie != 0].dropna(axis=0).index
    else:
        ratings_vector = pd.Series(profile)
        labels = ratings_vector[ratings_vector != 0].dropna().index

    movie_ids = pd.to_numeric(pd.Series(list(labels), dtype=object), errors='coerce').dropna()
    return movies[movies['movieId'].isin(movie_ids)]


def get_average_genre(movies):
    """Return the column-wise mean of the genre columns of `movies`.

    The genre columns are selected BEFORE averaging so that non-numeric columns
    such as `title` never reach `mean` (which raises on strings in pandas >= 2.0).

    Returns a Series indexed by `genres_cols`.
    """
    return movies[genres_cols].mean(axis=0)


def get_movies_by_ids(ids):
    """Return the rows of the global `movies` frame whose `movieId` is in `ids`."""
    return movies[movies['movieId'].isin(ids)]

In [63]:
# Re-read the splits indexed by userId, then derive the evaluation inputs.
train_data = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), index_col='userId')
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), index_col='userId')

# get_control_items returns a pair. With user_profiles only the first element is kept;
# with user_ids both are kept and `ratings` itself is rebound to the first element.
test_data, _ = get_control_items(ratings, user_profiles=test_df)
ratings, control_items = get_control_items(ratings, user_ids=test_df.index.values)

In [69]:
# Take the first row of test_data as a sample user profile.
profile = test_data.iloc[0]

In [73]:
# Labels of the non-zero entries of the profile, cast to int (CSV column labels are strings).
profile[profile != 0].index.astype(int)

Int64Index([   1,    2,    8,   11,   13,   17,   19,   24,   26,   27,
            ...
            3673, 3686, 3691, 3699, 3701, 3705, 3712, 3713, 3723, 3742],
           dtype='int64', length=651)

In [75]:
# Genre rows (one per movie) for the movies with a non-zero entry in the profile.
movies[movies['movieId'].isin(profile[profile != 0].index.astype(int))][genres_cols].values

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]], dtype=int64)

### MF

In [None]:
def get_model(alg, n_neighbors=None, n_components=None):
    """Build, preprocess and fit a recommender on the notebook-global `train_data`.

    Parameters
    ----------
    alg : str
        'knn' for the KNN-popularity model, 'mf' for matrix factorization.
    n_neighbors : int, optional
        Value passed as 'K' to the KNN model. When omitted, the notebook-global `k`
        (set by the grid-search loop) is used, as before.
    n_components : int, optional
        Value passed as 'n_components' to the MF model. When omitted, the
        notebook-global `comps` is used, as before.

    Returns
    -------
    The fitted model instance.

    Raises
    ------
    ValueError
        If `alg` is neither 'knn' nor 'mf' (previously None was returned silently).
    """
    # The classes are referenced by the names this notebook imports
    # (models.knn_popular.KNNpopularity, models.mf.MatrixFactorization). The former
    # `UserBased*` names are not defined in any visible cell.
    if alg == 'knn':
        n_neighbors = k if n_neighbors is None else n_neighbors
        model = KNNpopularity('knn', '../config/config.ini', train_data, 0, item_global_rating)
        model.preprocess()
        model.fit({'K': n_neighbors, 'random_state': 42})
    elif alg == 'mf':
        n_components = comps if n_components is None else n_components
        model = MatrixFactorization('mf', '../config/config.ini', train_data, item_global_rating)
        model.preprocess()
        model.fit({'n_components': n_components, 'random_state': 42, 'corr': 1})
    else:
        raise ValueError("unknown algorithm {!r}; expected 'knn' or 'mf'".format(alg))

    return model

In [None]:
# Grid search over the number of MF components.
# `recommendations` maps n_components -> {val_data row label: top-10 prediction}.
# It must be a dict: it is filled by key below, which raises IndexError on a list.
recommendations = {}
grid = [50]#, 20, 50, 100]

# NOTE(review): `val_data` is not defined in any cell visible in this notebook.
for comps in grid:
    # get_model reads the global `comps` set by this loop.
    model = get_model('mf')
    recommendations_ = {}

    for i, row in tqdm(val_data.iterrows(), position=0, total=len(val_data)):
        # Use the freshly fitted `model` (the former `mf` name was never defined).
        # .iloc makes the positional column access explicit.
        pred = model.predictItemByUser(row.iloc[1], row.iloc[0], 10)
        recommendations_[i] = pred

    recommendations[comps] = recommendations_
    gc.collect()

In [None]:
# Persist the MF predictions; the file name embeds the grid values joined by '_'.
dump(
    recommendations,
    './values/mf_{}comp_val_predictions.pickle'.format('_'.join(map(str, grid))),
    protocol=2,
)

### KNN

In [None]:
# Grid search over the KNN neighbourhood size.
# `recommendations` maps K -> {val_data row label: top-10 prediction}.
# It must be a dict: it is filled by key below, which raises IndexError on a list.
recommendations = {}
grid = [20, 30, 50]#, 20, 50, 100]

# NOTE(review): `val_data` is not defined in any cell visible in this notebook.
for k in grid:
    # get_model reads the global `k` set by this loop.
    model = get_model('knn')
    recommendations_ = {}

    for i, row in tqdm(val_data.iterrows(), position=0, total=len(val_data)):
        # .iloc makes the positional column access explicit.
        pred = model.predictItemByUser(row.iloc[1], row.iloc[0], 10)
        recommendations_[i] = pred

    recommendations[k] = recommendations_
    gc.collect()

In [None]:
# Persist the KNN predictions; the file name embeds the grid values joined by '_'.
dump(
    recommendations,
    './values/knn_{}k_0beta_val_predictions.pickle'.format('_'.join(map(str, grid))),
    protocol=2,
)

### EASE

In [None]:
from utils.helpers import *

In [None]:
# Point the notebook at the chosen MovieLens variant and reload every table from disk.
DATASET = '1m'
DATA_PATH = f'../data/movielens/{DATASET}/clean/'

ratings = pd.read_csv(os.path.join(DATA_PATH, 'ratings.csv'))
movies = pd.read_csv(os.path.join(DATA_PATH, 'movies.csv'))

train_data = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), index_col='userId')
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), index_col='userId')


In [None]:
# Reload the raw tables so this cell starts from unmodified `ratings`.
ratings = pd.read_csv(os.path.join(DATA_PATH, 'ratings.csv'))
movies = pd.read_csv(os.path.join(DATA_PATH, 'movies.csv'))

train_data = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), index_col='userId')
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), index_col='userId')

# First call keeps only the first element of the returned pair;
# second call rebinds `ratings` and keeps `control_items`.
test_data, _ = get_control_items(ratings, user_profiles=test_df)
ratings, control_items = get_control_items(ratings, user_ids=test_df.index.values)

# One entry per test user, keyed by the row label of test_data.
# NOTE(review): get_movies_by_profile is called with two arguments here, while the
# version defined earlier in this notebook takes one. Presumably the star import from
# utils.helpers supplies a two-argument version; confirm.
user_embeddings = dict()
for user_id, user_profile in test_data.iterrows():
    user_embeddings[user_id] = get_movies_by_profile(movies, user_profile)

In [None]:
user_embeddings[5530][0]

In [None]:

# NOTE(review): the previous cell already ends with this exact statement, so running this
# cell feeds the already-rebound `ratings` back into get_control_items. Confirm that a
# second application is intended; otherwise the result depends on how often it is run.
ratings, control_items = get_control_items(ratings, user_ids=test_df.index.values)

In [None]:
# Fit EASE on the current `ratings` frame, passing implicit=False.
ease = EASE()
ease.fit(ratings, implicit=False)

In [None]:
# Predict for the users that are keys of control_items, over all unique movieIds in `ratings`.
# The trailing 10 is presumably the number of items per user (confirm against EASE.predict).
# Later cells treat `pred` as a frame with 'userId' and 'movieId' columns.
pred = ease.predict(ratings, list(control_items.keys()), np.unique(ratings['movieId']), 10)

In [None]:
pred

In [None]:
# NOTE(review): this rebinds `user_embeddings`, which an earlier cell built as a dict keyed
# by user id, to whatever get_user_profiles returns for the current `ratings`.
user_embeddings = get_user_profiles(ratings)

In [None]:
user_embeddings

In [None]:
pred[pred['userId'] == 5530]

In [None]:
# Collect the predicted movie ids per user: userId -> list of movieId values.
recommendations = defaultdict(list)

for user_id, df in pred.groupby('userId'):
    recommendations[user_id] += list(df['movieId'].values)

In [None]:
recommendations[5530]

### Profile and predictions similarity

In [None]:
recommendations[3071]

In [None]:
recs = recommendations

In [None]:
# Similarity between the average genre vector of each user's recommendations
# and the average genre vector of that user's own profile.
similarities = []

# NOTE(review): `val_data` is not defined in any cell visible in this notebook.
for i, row in tqdm(val_data.iterrows(), position=0, total=len(val_data)):
    try:
        pred_features = get_average_genre(get_movies_by_ids(recs[i]))
        # .iloc makes the positional column access explicit.
        user_features = get_average_genre(get_movies_by_profile(row.iloc[1]))

        similarities.append(similarity(pred_features, user_features))
    except Exception as exc:
        # Still best-effort (the failing row is skipped), but the reason is reported and
        # KeyboardInterrupt is no longer swallowed, so the loop can be stopped.
        print(i, repr(exc))

In [None]:
np.mean(similarities)

In [None]:
# Average genre vector for the first entry of val_data's 'user_movies_profile_dataframe' column.
# NOTE(review): `val_data` is not defined in any cell visible in this notebook.
get_average_genre(get_movies_by_profile(val_data['user_movies_profile_dataframe'].iloc[0]))

In [None]:
# NOTE(review): `d` is not defined in any cell visible in this notebook; presumably a
# per-user recommendations mapping such as `recs` was meant. Confirm before re-running.
get_average_genre(get_movies_by_ids(d[5530]))

### Serendipity implementation checking

In [None]:
# Load a pickled object that the following cells filter by `.index`.
# joblib.load unpickles the file, so only load pickles from a trusted source.
embeddings = load('../t_film_profile_sem_0_and_com_001.pickle')

In [None]:
embeddings[embeddings.index == 586]

In [None]:
embeddings[embeddings.index == 2986]

In [None]:
get_movies_by_ids([2985])[genres_cols].values[0, :]

In [None]:
get_movies_by_ids([519])[genres_cols].values

In [None]:
get_movies_by_ids([1])[genres_cols].values

In [None]:
get_movies_by_ids([3114])[genres_cols].values

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from utils.metrics import distance

# Distance between the genre vectors of movie 1 and movie 3114
# (Toy Story and Toy Story 2 according to the note further down).
distance(get_movies_by_ids([1])[genres_cols].values[0, :], get_movies_by_ids([3114])[genres_cols].values[0, :])

In [None]:
movies[movies['title'].str.contains('Cind')]

In [None]:
movies[movies['movieId'] == 2571]

Toy Story - id 1  
Toy Story 2 - id 3114  
The Lion King - id 364

In [None]:
# Synthetic user: a single all-zero row with the same columns as train_data.
user = pd.DataFrame(np.zeros((1, len(train_data.columns))), columns=train_data.columns)

In [None]:
# set ratings for cartoons
# (ids 1, 3114 and 364 are Toy Story, Toy Story 2 and The Lion King per the note above;
#  1022 is not identified there)

user[1] = 5.0
user[3114] = 4.0
user[364] = 5.0
user[1022] = 5.0

# NOTE(review): `user` is rebound from the one-row ratings frame to the array of genre rows
# of the rated movies, so this cell cannot be re-run without re-running the previous one.
user = get_movies_by_profile(user)[genres_cols].values

In [None]:
user

In [None]:
# Movie-id lists for the manual serendipity check below.
# NOTE(review): `recommendations` was a per-user mapping in earlier cells; it is rebound to
# a plain list of movie ids here.
primitive_recommendation = [2081, 595, 588] # cartoons only
recommendations = [2571, 2628, 1196, 586]

In [None]:
# Hand-computed reference value: two per-item vectors (which sum to ~1 in the first slot),
# each scaled by 0.3 and averaged, plus a constant 1.0 * 0.4.
# Presumably mirrors the weighted terms of serendipity() called below; confirm against utils.metrics.
np.mean(np.array([0.3195861825602283, 1.0, 1.0, 1.0]) * 0.3) + np.mean(0.3 * np.array([0.6804138174397716, 0.0, 0.0, 0.0])) + 1.0 * 0.4

In [None]:
# Hand-computed reference value: mean of two literal per-item vectors plus a constant 0.4
# (the first entries of the two vectors add up to 0.3).
# Presumably mirrors terms printed by serendipity(..., verbose=True); confirm.
np.mean([0.11628826929126164, 0.3, 0.3, 0.3]) + np.mean([0.18371173070873834, 0.0, 0.0, 0.0]) + 0.4

In [None]:
# Same literal vectors as the previous check (rounded), summed element-wise without
# averaging, which gives one value per item.
np.array([0.11628827, 0.3, 0.3, 0.3]) + np.array([0.18371173, 0., 0., 0.]) + 0.4

In [None]:
# Run serendipity() on real movies: genre rows of the recommended ids, the recommended ids,
# the "primitive" ids, and `user` (the genre rows of the synthetic user's rated movies).
serendipity(get_movies_by_ids(recommendations)[genres_cols].values,
            recommendations,
            primitive_recommendation,
            user, 
#             keepdims=True,
            verbose=True)

In [None]:
get_movies_by_ids([2571])[genres_cols].values[0, :]

In [None]:
# relevance() of movie 1022's genre vector against `user`; 1022 is one of the rated movies.
relevance(get_movies_by_ids([1022])[genres_cols].values[0, :], user)

In [None]:
from scipy.spatial.distance import cosine

In [None]:
# Genre vector of movie 2571 (the first id in the `recommendations` list above).
tmp = get_movies_by_ids([2571])[genres_cols].values[0, :]

# For each genre row in `user`, print scipy's cosine distance next to the project's
# similarity() so the two can be compared by eye.
for item in user:
    print(item, tmp)
    print(cosine(item, tmp))
    print(similarity(tmp, item))

In [None]:
# Toy profile: three items with 4-dimensional feature vectors, small enough to check by hand.
user_profile = [
    [1, 1, 0, 0],
    [0, 1, 1, 0],
    [1, 0, 0, 0]
]

In [None]:
# Toy feature vectors for the metric checks below.
# NOTE(review): `recommendations` is rebound again, from the movie-id list used in the
# real-data check to a list of feature vectors. The first vector contains a -1.
recommendations = [
    [-1, 1, 0, 1],
    [0, 1, 1, 1],
]

primitive_recommendations = [
    [0, 0, 0, 1],
]

In [None]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [None]:
%%timeit
novelty(recommendations, user_profile)

In [None]:
%%time
np.mean(cosine_distances(recommendations, user_profile), axis=1)

In [None]:
%%timeit

# Pure-Python baseline being timed: for each recommended vector, the mean of similarity()
# to every profile vector (sum divided by the profile length).
for i in recommendations:
    sim = [similarity(i, j) for j in user_profile]
    a = 1 / len(user_profile) * np.sum(sim)

In [None]:
# Hand check: m + (0.3 - m) with m = mean of two literal values, so the result is 0.3
# up to float rounding.
np.mean([0.3169102,  0.17752551]) + (0.3 - np.mean([0.3169102,  0.17752551]))

In [None]:
# Hand-computed reference value: mean of two literal pairs plus a constant 0.4.
# Presumably compared against the serendipity() call below; confirm.
np.mean([0.3169102,  0.17752551]) + np.mean([-0.0169102, 0.12247449]) + 0.4

In [None]:
# serendipity() on the toy vectors; [1,2] and [3,4] take the argument positions where the
# recommended and primitive id lists were passed in the real-data call.
serendipity(recommendations, [1,2], [3,4], user_profile, verbose=True)

In [4]:
# Toy inputs for the unexpectedness() check: one id array and two id lists.
a = np.array([1, 2])
b = [[0, 1], [2, 3]]

In [5]:
# For each element of `a`, the fraction of lists in `b` that contain it.
# The recorded output matches the recorded output of unexpectedness(a, b) in the next cell.
np.array([np.isin(a, bb) for bb in b]).mean(axis=0)

array([0.5, 0.5])

In [6]:
# Project implementation on the same inputs; its recorded output equals the manual
# np.isin computation in the previous cell.
unexpectedness(a, b)

array([0.5, 0.5])

In [7]:
np.mean(0.0)

0.0