In [1]:
import sys
# Add the parent directory to the module search path; presumably this is what lets the
# `models.*` imports in the next cell resolve (assumes the notebook runs from its own folder).
sys.path.append("..") # fix for relative imports

In [2]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix

import numpy as np
import pandas as pd
from joblib import load

from models.knn_popular import KNNpopularity
from models.knn_popular_optimized import KNNpopularity2

In [3]:
# Directory (relative to the notebook) holding the input files read by the cells below:
# train_data.csv, test_data.csv and item_sum_dif_rating.pickle.
DATA_PATH = '../data/movielens/1m/clean/'

In [4]:
# Read the train and test tables from DATA_PATH, using the 'userId' column as the row index.
train_data, test_data = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name), index_col='userId')
    for csv_name in ('train_data.csv', 'test_data.csv')
)

In [5]:
# Load the per-item rating table with joblib; later cells read its 'index', 'movieId'
# and 'sum_rating' columns.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files from a trusted source.
item_ratings = load(os.path.join(DATA_PATH, 'item_sum_dif_rating.pickle'))

In [6]:
# First model under comparison. Only the first 100 rows of train_data are passed in
# (train_data[:100]), together with the item_ratings table and the shared config file.
# preprocess() must run before fit(); K is presumably the neighbour count (kneighbors
# returns five neighbours further down) - confirm against the model class.
knn = KNNpopularity('knn', '../config/config.ini', train_data[:100], item_ratings)
knn.preprocess()
knn.fit({'K': 5})

In [7]:
# Second model under comparison (class imported from models.knn_popular_optimized),
# constructed, preprocessed and fitted with exactly the same arguments as `knn` so that
# the two can be compared on identical inputs.
knn2 = KNNpopularity2('knn2', '../config/config.ini', train_data[:100], item_ratings)
knn2.preprocess()
knn2.fit({'K': 5})

In [8]:
%%timeit
preds = []
for user in test_data.iloc[:10]:
    preds.append(knn.predict(user, 10))

595 ms ± 18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
new_preds = []
for user in test_data.iloc[:10]:
    new_preds.append(knn2.predict(user, 10))

615 ms ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Take the first row of test_data as the single user profile examined step by step below.
user = test_data.iloc[0]

In [680]:
#for user in test_data.iloc[:10]:
# Wrap the user's row in a sparse matrix; nonzero() then returns the (row, column)
# positions of its non-zero entries. Note that `user_cols` holds integer column
# positions, not the column labels of `user`.
user_crc_profile = csr_matrix(user)
rows, user_cols = user_crc_profile.nonzero()

# Ask the neighbour model held by `knn` for the nearest training rows; both results
# are 2-D with one row per query (only a single query row is passed here).
distances, indices = knn.model_knn.kneighbors(user_crc_profile)

In [681]:
# %%timeit
# Only one query row was submitted, so keep the first row of each kneighbors result:
# the neighbours' positions and their distances to the user.
my_neighbors, my_neighbors_distance = indices[0], distances[0]

In [682]:
# Show the neighbour positions next to their distances.
my_neighbors, my_neighbors_distance

(array([71, 92, 72, 10, 36], dtype=int64),
 array([0.54066119, 0.54485882, 0.5531729 , 0.55548728, 0.56885666]))

In [683]:
# Show the neighbour positions on their own.
my_neighbors

array([71, 92, 72, 10, 36], dtype=int64)

In [684]:
# Show columns 20..39 (by position) of the neighbours' profiles.
# NOTE(review): neighbours_profiles is only assigned in the cell that follows (In [685]),
# so this cell fails on a fresh top-to-bottom run.
neighbours_profiles.iloc[:, 20:40]

Unnamed: 0_level_0,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
839,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0,3.0,0.0
2777,0.0,4.0,3.0,4.0,3.0,3.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,4.0,0.0,0.0,0.0,3.0,4.0,0.0
1088,0.0,3.0,0.0,0.0,5.0,0.0,3.0,4.0,0.0,0.0,3.0,4.0,0.0,4.0,0.0,4.0,0.0,0.0,3.0,0.0
5689,2.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,3.0,0.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0
3067,5.0,4.0,0.0,0.0,5.0,0.0,3.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0


In [685]:
# Pull the neighbours' rows (by position) out of the training table held by knn2.
neighbours_profiles = knn2.train_data.iloc[my_neighbors]
# Mean per neighbour over non-zero entries only: zeros are masked to NaN, which mean() skips.
average_ratings = neighbours_profiles.where(neighbours_profiles != 0).mean(axis=1)
average_ratings

userId
839     3.988372
2777    3.498286
1088    3.337585
5689    3.627049
3067    3.753027
dtype: float64

In [687]:
# Spot check: the neighbours' entries in the column labelled '21'.
neighbours_profiles['21']

userId
839     0.0
2777    0.0
1088    0.0
5689    2.0
3067    5.0
Name: 21, dtype: float64

In [722]:
# For every neighbour, keep the items rated above that neighbour's own mean rating,
# plus the margin over the mean (rating - mean) for each kept item.
popular_items, popular_items_score = [], []

for (_, profile), neighbour_mean in zip(neighbours_profiles.iterrows(), average_ratings):
    above_mean = profile[profile > neighbour_mean]
    popular_items.append(above_mean.index)
    # The (1 - my_neighbors_distance[i]) weighting is currently disabled here.
    popular_items_score.append(above_mean.values - neighbour_mean)

In [689]:
# Print the liked-item index of every neighbour whose list contains the label '21'.
for liked_labels in popular_items:
    if '21' in liked_labels:
        print(liked_labels)

Index(['1', '3', '6', '16', '17', '21', '22', '25', '29', '32',
       ...
       '3844', '3848', '3863', '3870', '3893', '3897', '3911', '3925', '3948',
       '3952'],
      dtype='object', length=505)


In [723]:
# Start from an all-zero score per item (labelled like `user`) and add every
# neighbour's above-mean margins onto the items that neighbour liked.
a = pd.Series(0.0, index=user.index)
for liked_labels, margins in zip(popular_items, popular_items_score):
    a.loc[liked_labels] += margins

In [724]:
# Spot check: accumulated score for the item labelled '1'.
a.loc['1']

1.921016238580572

In [692]:
# Normalise the accumulated neighbour score by each item's 'sum_rating'.
# NOTE(review): `.values` drops the index, so the division is purely positional - it
# assumes item_ratings has one row per column of `user`, in the same order; confirm.
final_score = a / item_ratings['sum_rating'].values

In [693]:
# Drop the items the user has already rated, then drop undefined (NaN) scores.
# `user_cols` holds integer column positions (from csr_matrix.nonzero()), whereas
# final_score is indexed by the column labels of `user` (strings such as '21').
# Comparing the two directly never matches, so the positions are translated to labels first.
seen_items = user.index[user_cols]
# Chained (non in-place) form: avoids mutating a slice and keeps the cell re-runnable.
final_score = final_score.loc[~final_score.index.isin(seen_items)].dropna()

In [694]:
# Spot check: final score of the item labelled '21' (empty if it was filtered out).
final_score[final_score.index == '21']

21    5.501609
dtype: float64

In [695]:
# Positions of the ten highest scores, best first (ascending argsort, reversed, truncated).
argsort = np.flip(np.argsort(final_score.to_numpy()))[:10]

In [696]:
# Show the ten best-scoring items. `argsort` holds positions, so select with .iloc:
# plain [] with integers on a label-indexed Series is deprecated positional fallback.
final_score.iloc[argsort]

2889    12.218959
21       5.501609
3763     2.091018
3950     2.004862
610      1.526020
1563     1.372176
3020     0.951409
862      0.763873
3244     0.651679
1061     0.554968
dtype: float64

In [697]:
# Top-10 item labels converted to integers. `argsort` holds positions, hence .iloc;
# the builtin `int` replaces the `np.int` alias, which newer NumPy releases no longer provide.
pred1 = final_score.iloc[argsort].index.astype(int)

In [713]:
def f_avg_rating(row):
    """Return the mean of the non-zero entries of one training profile of ``knn``.

    Parameters
    ----------
    row : mapping
        Must provide 'my_neighbors_index', the positional row number of the
        neighbour in ``knn.train_data``. ``DataFrame.apply(axis=1)`` may hand
        the value over as a float when the frame also holds float columns, so
        it is cast to ``int`` before indexing (as ``f_popular_item`` does).

    Returns
    -------
    float
        Sum of the row divided by its number of non-zero entries, or NaN when
        the row has no non-zero entry.
    """
    # Read the row straight from the dense array instead of converting the whole
    # training table to a sparse matrix on every call just to extract one row.
    profile = np.asarray(knn.train_data.values[int(row['my_neighbors_index'])])
    rated_count = np.count_nonzero(profile)
    if rated_count == 0:
        # Same result as the 0/0 division would give, without the runtime warning.
        return np.nan
    return profile.sum() / rated_count

def f_popular_item(row):
    """List the items one neighbour rated above that neighbour's average.

    ``row`` must provide 'my_neighbors_index' (positional row number in
    ``knn2.train_data``), 'my_neighbors_distance' and 'avg_rating'.

    Returns a DataFrame with the columns 'movieIndex', 'rating',
    'my_neighbors_distance' and 'avg_rating': one record per non-zero entry of
    the neighbour's profile that is strictly greater than 'avg_rating'.
    """
    neighbour_distance = row['my_neighbors_distance']
    neighbour_mean = row['avg_rating']

    profile = knn2.train_data.iloc[int(row['my_neighbors_index'])]
    rated = profile[profile != 0]

    records = [
        [label, rating, neighbour_distance, neighbour_mean]
        for label, rating in rated.items()
        if rating > neighbour_mean
    ]
    return pd.DataFrame(
        records,
        columns=['movieIndex', 'rating', 'my_neighbors_distance', 'avg_rating'])

def f_rating_diff(row):
    """Return how far a rating lies above the rater's average: 'rating' - 'avg_rating'."""
    return row['rating'] - row['avg_rating']

# weighted score computation
def f_score(row):
    """Weight 'rating_diff' by the neighbour's closeness, (1 - 'my_neighbors_distance')."""
    closeness = 1 - row['my_neighbors_distance']
    return closeness * row['rating_diff']

def f_beta_score(row, beta=1.0):
    """Divide an item's 'upper_score' by its global popularity raised to ``beta``.

    Parameters
    ----------
    row : mapping
        Must provide 'index' (matched against ``item_ratings['index']``) and
        'upper_score'.
    beta : float, optional
        Exponent applied to the item's 'sum_rating' before dividing. The
        default of 1.0 reproduces the previously hard-coded behaviour, so the
        function can still be passed directly to ``DataFrame.apply``.

    Returns
    -------
    float
        ``row['upper_score'] / sum_rating ** beta``.

    Raises
    ------
    IndexError
        If no row of ``item_ratings`` has the requested 'index'.
    """
    # Single .loc lookup instead of chained indexing; the first match is used.
    item_glob_popularity = item_ratings.loc[
        item_ratings['index'] == row['index'], 'sum_rating'
    ].values[0]
    return row['upper_score'] / item_glob_popularity ** beta

In [720]:
# Step-by-step re-creation of the scoring with row-wise DataFrame.apply calls, using the
# helper functions defined in the previous cell.
# One row per neighbour: its position, its distance to the user and its average rating.
k_users_orig_items = pd.DataFrame(my_neighbors, columns=['my_neighbors_index'])
k_users_orig_items['my_neighbors_distance'] = my_neighbors_distance
k_users_orig_items['avg_rating'] = k_users_orig_items.apply(f_avg_rating, axis=1)
# Each cell of 'popular_item' holds a whole DataFrame (the neighbour's above-average items).
k_users_orig_items['popular_item'] = k_users_orig_items.apply(f_popular_item, axis=1)
# move the items into a separate dataframe (one row per neighbour/item pair)
frame = k_users_orig_items['popular_item'].values
recomItems = pd.concat(frame)

recomItems['rating_diff'] = recomItems.apply(f_rating_diff, axis=1)
# compute the score
recomItems['upper_score'] = recomItems.apply(f_score, axis=1)
# sum the score and the rating per item; note that every numeric column is summed,
# including 'my_neighbors_distance' and 'avg_rating'
recomItems = recomItems.groupby(['movieIndex']).agg('sum')
# put the index back as a column
# recomItems['index'] = recomItems.index
# compute the final beta score
# recomItems['final_score'] = recomItems.apply(f_beta_score, axis=1)

In [721]:
# Show the per-item aggregate built in the previous cell.
recomItems

Unnamed: 0_level_0,rating,my_neighbors_distance,avg_rating,rating_diff,upper_score
movieIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,13.0,1.662691,11.078984,1.921016,0.867146
10,12.0,1.649321,10.953006,1.046994,0.467107
1009,4.0,0.568857,3.753027,0.246973,0.106481
101,4.0,0.540661,3.988372,0.011628,0.005341
1012,5.0,0.555487,3.627049,1.372951,0.610294
...,...,...,...,...,...
986,4.0,0.568857,3.753027,0.246973,0.106481
991,8.0,1.109518,7.741399,0.258601,0.111822
994,9.0,1.093834,7.325957,1.674043,0.760665
998,5.0,0.568857,3.753027,1.246973,0.537624


In [540]:
# drop the items my user has already seen
# NOTE(review): `cals_user` is not assigned in any visible cell of this notebook - it
# must come from earlier kernel state; confirm where it is defined.
recomItems = recomItems.loc[~recomItems.index.isin(cals_user)]
# drop NaN
recomItems = recomItems.dropna()
# sort everything
# NOTE(review): needs a 'final_score' column; the line that computes it is commented out
# in the visible version of cell In [720].
recomItems = recomItems.sort_values(by=['final_score'], ascending=False)
# cut down the number of recommendations
# recomItems = recomItems[:10]
# !!!! my recommendations are indices in the sparse matrix, now they need to be converted to movieId
tmp = list(recomItems[:10].index)

# this is needed for "onto divers" (presumably an ontology-diversity step - confirm)
#####
# _itemIdArr = []
# for i in tmp:
#     _itemId = self.train_data_by_row_number[self.train_data_by_row_number.index.isin([i])]['index'].values
#     _itemIdArr.append(_itemId[0])
#
# recomItems['movieId'] = _itemIdArr
# self.recomItemsCache = recomItems
######

# Boolean-mask selection keeps the rows in the order of train_data_by_row_number,
# not in the score order of `tmp`.
recoms_movie = knn.train_data_by_row_number[knn.train_data_by_row_number.index.isin(tmp)]

return_recoms = np.array(recoms_movie['index'].values).astype(int)

In [543]:
# Spot check: the aggregated row with index label 3020.
recomItems.loc[3020]

rating                                     9.0
my_neighbors_distance                  1.12203
avg_rating                            7.090612
rating_diff                           1.909388
upper_score                           0.833609
index                                     3020
final_score              [0.28451278451564554]
Name: 3020, dtype: object

In [525]:
# Show the ten best-scoring items. `argsort` holds positions, so select with .iloc:
# plain [] with integers on a label-indexed Series is deprecated positional fallback.
final_score.iloc[argsort]

2889    5.464623
21      2.371982
3763    0.934775
3950    0.892697
610     0.700960
1563    0.591605
3020    0.425494
862     0.329339
3244    0.284513
3599    0.246871
dtype: float64

In [526]:
# Show recomItems after it has been filtered and sorted by 'final_score'.
recomItems

Unnamed: 0_level_0,rating,my_neighbors_distance,avg_rating,rating_diff,upper_score,index,final_score
movieIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2682,8.0,1.113715,7.251312,0.748688,0.334832,2682,[5.464622535192243]
20,5.0,0.568857,3.753027,1.246973,0.537624,20,[2.371982303259214]
3520,8.0,1.093834,7.325957,0.674043,0.301326,3520,[0.934774933908628]
3703,9.0,1.10866,6.964634,2.035366,0.906279,3703,[0.8926967288493933]
595,5.0,0.540661,3.988372,1.011628,0.46468,595,[0.7009603037378633]
1436,5.0,0.568857,3.753027,1.246973,0.537624,1436,[0.5916047595827558]
806,5.0,0.568857,3.753027,1.246973,0.537624,806,[0.3293386308480652]
3020,9.0,1.12203,7.090612,1.909388,0.833609,3020,[0.28451278451564554]
3359,4.0,0.553173,3.337585,0.662415,0.295985,3359,[0.24687129489063112]
993,13.0,1.666888,10.588897,2.411103,1.06196,993,[0.24443351240426314]


In [527]:
# movieId values of the item_ratings rows whose 'index' is among the top-10 entries in `tmp`
# (returned in item_ratings row order).
item_ratings.loc[item_ratings['index'].isin(tmp), 'movieId'].values

array([  21,  610,  862, 1061, 1563, 2889, 3244, 3599, 3763, 3950],
      dtype=int64)

In [528]:
# Take a copy rather than an alias: pred2 is sorted in place further down, and an alias
# would silently reorder return_recoms as well.
pred2 = return_recoms.copy()

In [529]:
# Compare the two top-10 id arrays - pred1 (derived from final_score) and pred2 (derived
# from recomItems). np.array_equal is element-wise, so it is sensitive to order as well
# as to content.
np.array_equal(pred1.values, pred2)

False

In [530]:
# Show pred1 in ascending order (returns a sorted copy; pred1 itself is unchanged).
pred1.sort_values()

Int64Index([21, 610, 862, 1563, 2889, 3020, 3244, 3599, 3763, 3950], dtype='int64')

In [531]:
# Sort pred2 in place; ndarray.sort() returns None, which is why this cell shows no output.
pred2.sort()

In [532]:
# Show the sorted pred2 for a side-by-side comparison with the sorted pred1 above.
pred2

array([  21,  610,  862, 1061, 1563, 2889, 3244, 3599, 3763, 3950])