In [1]:
import sys
sys.path.append("..") # fix for relative imports

In [2]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix

import numpy as np
import pandas as pd
from joblib import load

from models.knn_popular import KNNpopularity
import models.knn_popular_optimized as knn_opt

In [3]:
# Directory (relative to the notebook's working directory) holding the cleaned
# data files read by the following cells: train_data.csv, test_data.csv and
# item_sum_dif_rating.pickle.
DATA_PATH = '../data/movielens/1m/clean/'

In [4]:
# Training profiles, indexed by userId.
train_data = pd.read_csv(
    os.path.join(DATA_PATH, 'train_data.csv'),
    index_col='userId',
)
# Only the first test row is read (nrows=1), so the cells below work on a single user.
test_data = pd.read_csv(
    os.path.join(DATA_PATH, 'test_data.csv'),
    index_col='userId',
    nrows=1,
)

In [5]:
# Per-item rating statistics; later cells read its 'movieId' and 'sum_rating' columns.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load files from a trusted source.
item_ratings = load(os.path.join(DATA_PATH, 'item_sum_dif_rating.pickle'))

In [6]:
# Original KNN-popularity recommender, built from the project's config.ini,
# the training profiles and the per-item rating statistics.
# NOTE(review): the hyper-parameter key here is 'K', while the optimized model
# in the next cell is fitted with 'k' -- confirm each class reads the key it is given.
knn = KNNpopularity('knn', '../config/config.ini', train_data, item_ratings)
knn.preprocess()
knn.fit({'K': 5})

In [7]:
# Optimized implementation, given the same training profiles and item statistics
# (it takes no config file and has no separate preprocess step in this notebook).
# NOTE(review): beta is presumably the exponent applied to item popularity -- confirm against the model.
knn2 = knn_opt.KNNpopularity('knn', train_data, item_ratings)
knn2.fit({'k': 5, 'beta': 0.0})

In [8]:
# Ten recommendations per test user from the original model.
preds = [knn.predict(user_profile, 10) for _, user_profile in test_data.iterrows()]

In [9]:
# Ten recommendations per test user from the optimized model.
new_preds = [knn2.predict(user_profile, 10) for _, user_profile in test_data.iterrows()]

In [10]:
# Stack the per-user results of the original model into a single array (one row per test user).
preds = np.array(preds)

In [11]:
# Stack the per-user results of the optimized model into a single array (one row per test user).
new_preds = np.array(new_preds)

In [12]:
# Recommendations of the original model; in the recorded output the ids appear in ascending order.
preds

array([[  47,  527, 1148, 2100, 2324, 2329, 2762, 2795, 2908, 3897]])

In [13]:
# Recommendations of the optimized model; the recorded output holds the same ten ids as `preds`, in a different order.
new_preds

array([[3897, 2908, 2762, 2329, 2324,  527, 2100, 2795,   47, 1148]],
      dtype=int64)

In [None]:
# First test profile (the only one, since test_data was read with nrows=1),
# used below to replay both algorithms step by step.
user = test_data.iloc[0]

In [None]:
# Single-user walkthrough (originally sketched as: for user in test_data.iloc[:10]).
# Sparse version of the profile, used as the k-NN query.
user_csr_profile = csr_matrix(user)
# Labels of the items for which this user has a non-zero rating.
user_rated_items = user[user != 0].index
# Positions of the non-zero entries in the sparse profile; only `cols` is read by later cells.
rows, cols = user_csr_profile.nonzero()

# Nearest neighbours of the user according to the original model.
distances, indices = knn.model_knn.kneighbors(user_csr_profile)

In [None]:
# Same query against the optimized model's neighbour index.
distances2, indices2 = knn2.model_knn.kneighbors(user_csr_profile)

In [None]:
# %%timeit
# Neighbours found by the original model, as plain Python lists
# (they are fed into a DataFrame in the legacy walkthrough further down).
my_neighbors = indices.squeeze().tolist()
my_neighbors_distance = distances.squeeze().tolist()

# Neighbours found by the optimized model. These are used for positional lookups
# into knn2.train_data, so they must come from knn2's own query results
# (`indices2` / `distances2`) rather than from the original model's `indices` /
# `distances`, which are only guaranteed to match the original model's row order.
my_neighbors2 = indices2[0]
my_neighbors_distance2 = distances2[0]

In [None]:
# Rating profiles of the selected neighbours, looked up by row position.
neighbours_profiles = knn2.train_data.iloc[my_neighbors2]
# Mask out the zero entries (they become NaN) so that the row-wise mean
# only averages over items each neighbour actually rated.
rated_only = neighbours_profiles[neighbours_profiles != 0]
average_ratings = np.mean(rated_only, axis=1)

In [None]:
# For every neighbour: the items rated above that neighbour's own average, and
# the matching scores (rating surplus weighted by 1 - distance).
popular_items = []
popular_items_score = []

n_neighbours = len(neighbours_profiles)
for pos in range(n_neighbours):
    neighbour_mean = average_ratings.iloc[pos]
    neighbour_row = neighbours_profiles.iloc[pos]
    liked = neighbour_row[neighbour_row > neighbour_mean]
    popular_items.append(liked.index)
    popular_items_score.append(
        (liked.values - neighbour_mean) * (1 - my_neighbors_distance2[pos])
    )

In [None]:
# Accumulator with one zero-initialised slot per item of the user's profile.
a = pd.Series(np.zeros(user.shape[0]), index=user.index)
# Add each neighbour's weighted scores onto the items that neighbour liked.
for liked_items, liked_scores in zip(popular_items, popular_items_score):
    a.loc[liked_items] += liked_scores

In [None]:
# Normalise the accumulated scores by each item's global rating sum.
# The division is positional (.values drops the index), so it assumes the profile's
# items are in the same order as item_ratings sorted by movieId -- TODO confirm.
final_score = a / item_ratings.sort_values(by='movieId')['sum_rating'].values

In [None]:
# Keep only items the user has not rated yet, then discard undefined (NaN)
# and zero scores so that only real candidates remain.
already_rated = final_score.index.isin(user_rated_items)
final_score = final_score.loc[~already_rated].dropna()
final_score = final_score[final_score != 0]

In [None]:
# Positions within final_score (not index labels) of the ten highest scores, best first.
argsort = np.argsort(final_score.values)[::-1][:10]

In [None]:
# `argsort` holds positions, so select with .iloc. Plain [] is label-based: on an
# integer index it would look the positions up as labels, and on any other index it
# relies on a positional fallback that pandas has deprecated.
final_score.iloc[argsort]

In [None]:
# Ids of the ten best-scoring items, in rank order.
# - .iloc, because `argsort` holds positions rather than index labels.
# - builtin int, because the `np.int` alias was removed in NumPy 1.24
#   (it was only ever an alias for the builtin, so the result is unchanged).
pred1 = final_score.iloc[argsort].index.astype(int)

In [None]:
def f_avg_rating(row):
    """Mean of the non-zero ratings of one neighbour.

    `row['my_neighbors_index']` is the row position of the neighbour in
    `knn.user_movie_mat_sparse`. Returns the sum of that row divided by its
    number of non-zero entries.
    """
    neighbour_ratings = knn.user_movie_mat_sparse.getrow(row['my_neighbors_index'])
    total = neighbour_ratings.sum()
    rated_count = neighbour_ratings.count_nonzero()
    return total / rated_count

# collect the neighbour's above-average (popular) items
def f_popular_item(row):
    """Return the items one neighbour rated above that neighbour's own average.

    Parameters
    ----------
    row : mapping
        Must provide 'my_neighbors_index' (row position of the neighbour in
        `knn.user_movie_mat_sparse`), 'my_neighbors_distance' and 'avg_rating'.

    Returns
    -------
    pandas.DataFrame
        One record per above-average item, with columns 'movieIndex' (column
        position in the sparse matrix), 'rating', 'my_neighbors_distance' and
        'avg_rating'. Empty (but with these columns) when nothing qualifies.
    """
    neighbour_ratings = knn.user_movie_mat_sparse.getrow(row['my_neighbors_index'])
    # The column positions must come from the neighbour's own row. Binding them to
    # a local name also keeps this function independent of the notebook-level
    # `cols`, which describes the query user's rated items instead.
    # Assumes the row stores no explicit zeros, so positions line up with `.data`.
    _, neighbour_cols = neighbour_ratings.nonzero()
    above_average = [
        [col, rating, row['my_neighbors_distance'], row['avg_rating']]
        for col, rating in zip(neighbour_cols, neighbour_ratings.data)
        if rating > row['avg_rating']
    ]
    return pd.DataFrame(
        above_average,
        columns=['movieIndex', 'rating', 'my_neighbors_distance', 'avg_rating'])

def f_rating_diff(row):
    """Difference between a rating and the rater's average: row['rating'] - row['avg_rating']."""
    return row['rating'] - row['avg_rating']

# weighted score computation
def f_score(row):
    """Weight the rating surplus by the neighbour's closeness.

    Returns row['rating_diff'] * (1 - row['my_neighbors_distance']).
    """
    closeness = 1 - row['my_neighbors_distance']
    return row['rating_diff'] * closeness

def f_beta_score(row):
    """Divide an item's accumulated score by its global popularity raised to beta.

    The popularity is the 'sum_rating' of the first row of
    `knn.item_global_rating` whose 'index' equals row['index']; the result is
    row['upper_score'] / popularity ** knn.beta. Raises IndexError when no
    row matches.
    """
    global_ratings = knn.item_global_rating
    matching = global_ratings[global_ratings['index'] == row['index']]
    item_glob_popularity = matching[['sum_rating']].values[0][0]
    return row['upper_score'] / item_glob_popularity ** knn.beta

In [None]:
# Step-by-step replay of the original algorithm for the walkthrough user.
# Start from my neighbours (row positions returned by the original model) ...
k_users_orig_items = pd.DataFrame(my_neighbors, columns=['my_neighbors_index'])
# ... add their cosine distance to the user ...
k_users_orig_items['my_neighbors_distance'] = my_neighbors_distance
# ... and compute each neighbour's average rating.
k_users_orig_items['avg_rating'] = k_users_orig_items.apply(f_avg_rating, axis=1)

# keep only each neighbour's above-average items (one small DataFrame per neighbour)
k_users_orig_items['popular_item'] = k_users_orig_items.apply(f_popular_item, axis=1)
# flatten the per-neighbour frames into one standalone DataFrame
frame = k_users_orig_items['popular_item'].values
recomItems = pd.concat(frame)
# rating minus the neighbour's average
recomItems['rating_diff'] = recomItems.apply(f_rating_diff, axis=1)
# weighted score of each (neighbour, item) pair
recomItems['upper_score'] = recomItems.apply(f_score, axis=1)
# sum per item; this sums every column, but only 'upper_score' is read afterwards
recomItems = recomItems.groupby(['movieIndex']).sum()
# bring the item index back as a regular column (f_beta_score reads row['index'])
recomItems['index'] = recomItems.index
# final score: accumulated score divided by the item's popularity ** beta
recomItems['final_score'] = recomItems.apply(f_beta_score, axis=1)

# drop items my user has already seen (`cols` = positions of the user's non-zero ratings)
recomItems = recomItems.loc[~recomItems.index.isin(cols)]
# drop rows containing NaN
recomItems = recomItems.dropna()
# sort everything, best score first
recomItems = recomItems.sort_values(by=['final_score'], ascending=False)

In [None]:
# Full candidate table of the original algorithm, sorted by final_score (best first).
recomItems

In [None]:
# cut down to the requested number of recommendations
# recomItems = recomItems[:10]
# NOTE: my recommendations are indices into the sparse matrix; they still need to be converted to movieId
tmp = list(recomItems[:10].index)

# the commented-out block below was for "onto divers" (presumably ontology-based diversity -- TODO confirm)
#####
# _itemIdArr = []
# for i in tmp:
#     _itemId = self.train_data_by_row_number[self.train_data_by_row_number.index.isin([i])]['index'].values
#     _itemIdArr.append(_itemId[0])
#
# recomItems['movieId'] = _itemIdArr
# self.recomItemsCache = recomItems
######

# NOTE(review): the boolean isin() mask returns rows in the frame's own order, so the
# ranking by final_score held in `tmp` is not preserved in the result.
recoms_movie = knn.train_data_by_row_number[knn.train_data_by_row_number.index.isin(tmp)]

# values of the 'index' column (presumably the movieId) as an integer array
return_recoms = np.array(recoms_movie['index'].values).astype(int)

In [None]:
# Sparse-matrix item indices of the ten best candidates from the original algorithm, in rank order.
tmp

In [None]:
# Do both walkthroughs produce the same scores?
# - Both vectors are sorted first: recomItems is ordered by final_score while
#   final_score is still in item order, so an element-wise comparison of the raw
#   vectors would pair up unrelated items.
# - The tolerance is expressed through `atol` (4 decimal places). Rounding with a
#   negative `decimals` rounds to the nearest 10,000 and would turn every score
#   into 0, making the check pass trivially.
# - The shape test makes a differing number of candidates report False instead of
#   raising a broadcasting error.
legacy_scores = np.sort(recomItems['final_score'].values.astype(float))
replica_scores = np.sort(final_score.values.astype(float))
legacy_scores.shape == replica_scores.shape and np.allclose(legacy_scores, replica_scores, atol=1e-4)

In [None]:
# Ten recommendations obtained by replaying the original algorithm step by step.
return_recoms

In [None]:
# Ten recommendations obtained from the vectorised re-implementation of the scoring, in rank order.
pred1