## Imports

In [1]:
import sys
sys.path.append("..") # fix for relative imports

In [2]:
from algorithms.mf import UserBasedMatrixFactorization
from algorithms.knn_popular import UserBasedKNNpopularity

from algorithms.ease import EASE

In [3]:
import os
from collections import defaultdict
import gc

import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from utils.metrics import similarity, serendipity, distance, novelty, unexpectedness, relevance

from joblib import load, dump
from tqdm import tqdm

## Data loading

In [4]:
DATA_PATH = '../data/movielens/1m/clean/'

In [5]:
# Read the cleaned ratings and movie metadata tables from DATA_PATH.
ratings, movies = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('ratings.csv', 'movies.csv')
)

In [6]:
# Load the pre-built train / validation / test objects, in that order.
# NOTE: these are pickles, so only files produced by this project should be placed in DATA_PATH.
train_data, val_data, test_data = (
    load(os.path.join(DATA_PATH, pickle_name))
    for pickle_name in (
        'train_data.pickle',
        'val_data_prepare_for_recom.pickle',
        'test_data_prepare_for_recom.pickle',
    )
)

In [7]:
# Pre-computed per-item rating object; it is handed to the KNN and MF model constructors in get_model.
# NOTE(review): exact contents of 'item_sum_dif_rating' are not visible here — confirm against the script that wrote it.
item_global_rating = load(os.path.join(DATA_PATH, 'item_sum_dif_rating.pickle'))

In [8]:
# Names of the 18 genre columns of `movies`: 'feature1' .. 'feature18'.
genres_cols = [f'feature{i + 1}' for i in range(18)]

def get_movies_by_profile(profile):
    """Return the rows of the global `movies` table for the items rated in `profile`.

    `profile` is a DataFrame whose column labels are movie ids. A movie is kept
    only when its value is non-zero (and not NaN) in every row of `profile`.
    """
    by_movie = profile.T
    # Zeros become NaN under the mask, so dropna() leaves only the rated movies.
    rated = by_movie[by_movie != 0].dropna(axis=0)
    return movies[movies['movieId'].isin(rated.index)]

def get_average_genre(movies):
    """Return the column-wise mean of the genre columns of `movies`.

    The result is a Series indexed by `genres_cols`.
    """
    # Select the genre columns *before* averaging. Calling mean() on the whole
    # frame also tries to average non-numeric columns (e.g. the title), which
    # raises TypeError on pandas >= 2.0.
    return movies[genres_cols].mean(axis=0)

def get_movies_by_ids(ids):
    """Return the rows of the global `movies` table whose 'movieId' is in `ids`."""
    return movies[movies['movieId'].isin(ids)]

### MF

In [None]:
def get_model(alg, n_neighbors=None, n_components=None):
    """Build, preprocess and fit the recommender selected by `alg`.

    Parameters
    ----------
    alg : str
        'knn' builds a UserBasedKNNpopularity model, 'mf' builds a
        UserBasedMatrixFactorization model.
    n_neighbors : int, optional
        Value passed to the KNN model as 'K'. When omitted, the notebook-level
        variable `k` is used (the original behaviour).
    n_components : int, optional
        Value passed to the MF model as 'n_components'. When omitted, the
        notebook-level variable `comps` is used (the original behaviour).

    Returns
    -------
    The fitted model object.

    Raises
    ------
    ValueError
        If `alg` is neither 'knn' nor 'mf'. Previously None was returned,
        which only failed later at the first prediction call.
    """
    if alg == 'knn':
        if n_neighbors is None:
            n_neighbors = k  # fall back to the grid-loop variable
        # NOTE(review): the positional 0 is presumably the "beta" weight (the dump file name says '0beta') — confirm.
        model = UserBasedKNNpopularity('knn', '../config/config.ini', train_data, 0, item_global_rating)
        model.preprocess()
        model.fit({'K': n_neighbors, 'random_state': 42})
    elif alg == 'mf':
        if n_components is None:
            n_components = comps  # fall back to the grid-loop variable
        model = UserBasedMatrixFactorization('mf', '../config/config.ini', train_data, item_global_rating)
        model.preprocess()
        model.fit({'n_components': n_components, 'random_state': 42, 'corr': 1})
    else:
        raise ValueError("Unknown algorithm {!r}; expected 'knn' or 'mf'".format(alg))

    return model

In [None]:
# Top-10 MF recommendations for every validation row, keyed by the number of components.
# Must be a dict: it is indexed by the grid value below (a list raised IndexError here).
recommendations = {}
grid = [50]  # other values tried: 20, 50, 100

for comps in grid:
    model = get_model('mf')  # get_model picks up the current `comps` from the notebook namespace
    recommendations_ = {}

    for i, row in tqdm(val_data.iterrows(), position=0, total=len(val_data)):
        # Use the freshly fitted `model`; the previous `mf` name was never defined.
        pred = model.predictItemByUser(row[1], row[0], 10)
        recommendations_[i] = pred

    recommendations[comps] = recommendations_
    gc.collect()

In [None]:
# Persist the MF validation predictions; the grid values are embedded in the file name.
dump(
    recommendations,
    './values/mf_{}comp_val_predictions.pickle'.format('_'.join(map(str, grid))),
    protocol=2,
)

### KNN

In [None]:
# Top-10 KNN recommendations for every validation row, keyed by the neighbourhood size K.
# Must be a dict: it is indexed by the grid value below (a list raised IndexError here).
recommendations = {}
grid = [20, 30, 50]  # other values tried: 20, 50, 100

for k in grid:
    model = get_model('knn')  # get_model picks up the current `k` from the notebook namespace
    recommendations_ = {}

    for i, row in tqdm(val_data.iterrows(), position=0, total=len(val_data)):
        pred = model.predictItemByUser(row[1], row[0], 10)
        recommendations_[i] = pred

    recommendations[k] = recommendations_
    gc.collect()

In [None]:
# Persist the KNN validation predictions; the grid values are embedded in the file name.
dump(
    recommendations,
    './values/knn_{}k_0beta_val_predictions.pickle'.format('_'.join(map(str, grid))),
    protocol=2,
)

### EASE

In [None]:
train_df, rest_df = train_test_split(ratings, test_size=0.2, shuffle=True, random_state=42)

In [None]:
ease = EASE()
ease.fit(train_df, implicit=False)

In [None]:
pred = ease.predict(train_df, val_data.index.values, np.unique(train_df['movieId']), 10)

In [None]:
pred[pred['userId'] == 3071]

In [None]:
# Regroup the EASE prediction frame into {userId: [movieId, ...]}.
recommendations = defaultdict(list)

for user_id, movie_ids in pred.groupby('userId')['movieId']:
    recommendations[user_id].extend(movie_ids.values)

### Profile and predictions similarity

In [None]:
recommendations[3071]

In [None]:
recs = recommendations

In [None]:
# Similarity between the average genre vector of the recommended movies and
# the average genre vector of the movies in the user's profile, per validation row.
similarities = []

for i, row in tqdm(val_data.iterrows(), position=0, total=len(val_data)):
    try:
        pred_features = get_average_genre(get_movies_by_ids(recs[i]))
        user_features = get_average_genre(get_movies_by_profile(row[1]))

        similarities.append(similarity(pred_features, user_features))
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt still
        # stops the loop, and report why the row was skipped instead of only its index.
        print(i, repr(exc))

In [None]:
np.mean(similarities)

In [None]:
get_average_genre(get_movies_by_profile(val_data['user_movies_profile_dataframe'].iloc[0]))

In [None]:
# Average genre vector of the movies recommended to user 5530.
# The original cell read from `d`, which is not defined anywhere in this notebook
# (leftover kernel state); `recs` is the recommendation mapping used by this section.
get_average_genre(get_movies_by_ids(recs[5530]))

### Serendipity implementation checking

In [None]:
embeddings = load('../t_film_profile_sem_0_and_com_001.pickle')

In [None]:
embeddings[embeddings.index == 586]

In [None]:
embeddings[embeddings.index == 2986]

In [None]:
get_movies_by_ids([2985])[genres_cols].values[0, :]

In [None]:
get_movies_by_ids([519])[genres_cols].values

In [None]:
get_movies_by_ids([1])[genres_cols].values

In [None]:
get_movies_by_ids([3114])[genres_cols].values

In [None]:
# NOTE(review): `cosine_similarity` is not used in the visible part of the notebook, and
# `distance` is already imported from utils.metrics in the top import cell.
from sklearn.metrics.pairwise import cosine_similarity
from utils.metrics import distance

# Distance between the genre vectors of movie 1 (Toy Story) and movie 3114 (Toy Story 2).
distance(get_movies_by_ids([1])[genres_cols].values[0, :], get_movies_by_ids([3114])[genres_cols].values[0, :])

In [None]:
movies[movies['title'].str.contains('Cind')]

In [None]:
movies[movies['movieId'] == 2571]

Toy Story - id 1  
Toy Story 2 - id 3114  
The Lion King - id 364

In [9]:
user = pd.DataFrame([np.zeros(len(train_data.columns))], columns=train_data.columns)

In [10]:
# Build the synthetic "cartoons only" user from scratch under its own name.
# The original cell mutated `user` and then rebound it from a DataFrame to an ndarray,
# so running the cell a second time silently overwrote array rows.
cartoon_profile = pd.DataFrame([np.zeros(len(train_data.columns))], columns=train_data.columns)

# set ratings for cartoons
cartoon_profile[1] = 5.0     # Toy Story
cartoon_profile[3114] = 4.0  # Toy Story 2
cartoon_profile[364] = 5.0   # The Lion King
cartoon_profile[1022] = 5.0  # NOTE(review): presumably the title found by the 'Cind' search above — confirm

# `user` ends up as the genre rows (one per rated movie) expected by the metric calls below.
user = get_movies_by_profile(cartoon_profile)[genres_cols].values

In [11]:
user

array([[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [12]:
primitive_recommendation = [2081, 595, 588] # cartoons only
recommendations = [2571, 2628, 1196, 586]

In [13]:
np.mean(np.array([0.3195861825602283, 1.0, 1.0, 1.0]) * 0.3) + np.mean(0.3 * np.array([0.6804138174397716, 0.0, 0.0, 0.0])) + 1.0 * 0.4

0.7

In [14]:
np.mean([0.11628826929126164, 0.3, 0.3, 0.3]) + np.mean([0.18371173070873834, 0.0, 0.0, 0.0]) + 0.4

0.7

In [15]:
np.array([0.11628827, 0.3, 0.3, 0.3]) + np.array([0.18371173, 0., 0., 0.]) + 0.4

array([0.7, 0.7, 0.7, 0.7])

In [21]:
serendipity(get_movies_by_ids(recommendations)[genres_cols].values,
            recommendations,
            primitive_recommendation,
            user, 
#             keepdims=True,
            verbose=True)

Novelty: [0.11628826929126164, 0.3, 0.3, 0.3], relevance: [0.18371173070873834, 0.0, 0.0, 0.0], unexp: 1.0


0.7

In [17]:
get_movies_by_ids([2571])[genres_cols].values[0, :]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0])

In [18]:
relevance(get_movies_by_ids([1022])[genres_cols].values[0, :], user)

0.8333333333333333

In [19]:
from scipy.spatial.distance import cosine

In [20]:
# Genre vector of movie 2571, one of the non-cartoon recommendations.
tmp = get_movies_by_ids([2571])[genres_cols].values[0, :]

# For each genre row of the synthetic user, print the two vectors, scipy's cosine
# distance, and the project's `similarity` value, to compare them side by side.
for item in user:
    print(item, tmp)
    print(cosine(item, tmp))
    print(similarity(tmp, item))

[0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0] [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
1.0
0.0
[0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0] [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
1.0
0.0
[0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0] [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
1.0
0.0
[0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0] [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
1.0
0.0
