In [14]:
import os
import numpy as np
from joblib import load, dump
from sklearn.model_selection import ParameterGrid
import pandas as pd

In [15]:
# Load the cached primitive recommendations. Judging by the output shown for key 5530 below,
# each entry is a list of 10-element integer arrays — TODO confirm this holds for every key.
# NOTE(review): joblib.load is pickle-based, so only load cache files from a trusted source.
primitive_recommendations = load('../cache/primitive_recommendations.joblib')

In [16]:
primitive_recommendations[5530]

[array([ 589,  608, 1210, 2028,  593, 2571, 1198, 1196,  260, 2858]),
 array([ 589, 1210,  608, 2028,  593, 1198, 2571, 2858,  260, 1196]),
 array([1210,  589,  608, 2028,  593, 2571, 1198, 2858,  260, 1196]),
 array([ 589,  608, 1210, 2028, 2571,  593, 1198, 1196,  260, 2858]),
 array([1210, 2028,  589,  593, 1198,  608, 2571, 1196,  260, 2858]),
 array([ 608,  589, 1210,  593, 1198, 2028, 2571, 1196,  260, 2858]),
 array([ 260,    1, 1197, 1196, 2028,  356,  593,  318, 1265, 2858]),
 array([1270,  527,  356, 1265, 2028,  296,  608,  318,  593, 2858]),
 array([2028, 1265, 2762,  260,  296, 1196,  608,  318,  593, 2858]),
 array([2762,    1, 2997,  296,  593,  356, 2396,  318, 1265, 2858]),
 array([   1, 2762,  608,  296, 2396,  593,  356, 1265,  318, 2858]),
 array([1270,  593,  608,  296, 2997, 1197,  318, 1265, 2396, 2858]),
 array([1307, 2918, 1197,  356, 2396,  296,    1, 1265,  318, 2858]),
 array([2396, 2762,    1, 2997,  318,  608,  296, 1265,  593, 2858]),
 array([   1, 2997, 

In [3]:
np.unique(primitive_recommendations[5530])

array([   1,   16,   32,   39,   47,   50,   58,  166,  253,  260,  296,
        300,  318,  356,  357,  364,  368,  412,  457,  480,  508,  520,
        527,  551,  586,  587,  588,  589,  590,  593,  597,  608,  610,
        661,  750,  778,  838,  858,  904,  909,  911,  912,  916,  923,
        926, 1012, 1034, 1035, 1036, 1049, 1101, 1120, 1136, 1148, 1162,
       1172, 1188, 1193, 1196, 1197, 1198, 1199, 1200, 1206, 1210, 1219,
       1220, 1221, 1222, 1230, 1233, 1235, 1236, 1238, 1240, 1246, 1248,
       1250, 1251, 1252, 1254, 1255, 1265, 1270, 1291, 1307, 1340, 1342,
       1345, 1347, 1356, 1370, 1372, 1374, 1394, 1401, 1405, 1441, 1527,
       1580, 1584, 1617, 1704, 1732, 1784, 1885, 1921, 1923, 1968, 1974,
       1982, 2000, 2001, 2028, 2100, 2133, 2150, 2193, 2291, 2311, 2324,
       2329, 2352, 2357, 2371, 2391, 2395, 2396, 2502, 2571, 2583, 2622,
       2657, 2677, 2746, 2762, 2795, 2858, 2871, 2872, 2899, 2908, 2916,
       2918, 2938, 2968, 2973, 2976, 2985, 2987, 29

In [4]:
# Search space: primitive name -> parameter name -> list of candidate values.
primitive_grid = dict(
    AutoRec=dict(
        hidden_layer_size=[8, 16, 64, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048],
        random_state=[11, 42, 77],
    ),
    KNNpopularity=dict(
        k=[5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 300, 500, 700, 1000],
    ),
    MatrixFactorization=dict(
        n_components=[5, 10, 20, 50, 100, 200, 500, 1000, 2000],
        random_state=[42],
    ),
    KNN=dict(
        k=[5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 300, 500, 700, 1000],
    ),
)

In [10]:
# Print every hyper-parameter combination, one primitive's search space after another.
for param_space in primitive_grid.values():
    for combination in ParameterGrid(param_space):
        print(combination)

{'hidden_layer_size': 8, 'random_state': 11}
{'hidden_layer_size': 8, 'random_state': 42}
{'hidden_layer_size': 8, 'random_state': 77}
{'hidden_layer_size': 16, 'random_state': 11}
{'hidden_layer_size': 16, 'random_state': 42}
{'hidden_layer_size': 16, 'random_state': 77}
{'hidden_layer_size': 64, 'random_state': 11}
{'hidden_layer_size': 64, 'random_state': 42}
{'hidden_layer_size': 64, 'random_state': 77}
{'hidden_layer_size': 128, 'random_state': 11}
{'hidden_layer_size': 128, 'random_state': 42}
{'hidden_layer_size': 128, 'random_state': 77}
{'hidden_layer_size': 192, 'random_state': 11}
{'hidden_layer_size': 192, 'random_state': 42}
{'hidden_layer_size': 192, 'random_state': 77}
{'hidden_layer_size': 256, 'random_state': 11}
{'hidden_layer_size': 256, 'random_state': 42}
{'hidden_layer_size': 256, 'random_state': 77}
{'hidden_layer_size': 384, 'random_state': 11}
{'hidden_layer_size': 384, 'random_state': 42}
{'hidden_layer_size': 384, 'random_state': 77}
{'hidden_layer_size': 512

In [9]:
# Sanity check: np.unique on a 2-D array returns the sorted distinct values as a flat 1-D array.
a = np.array([(1, 2), (2, 3)])
np.unique(a)

array([1, 2, 3])

In [11]:
flat_pr = {}

In [12]:
# Collapse each user's recommendation lists into one sorted array of distinct ids.
flat_pr.update(
    (user_id, np.unique(rec_lists))
    for user_id, rec_lists in primitive_recommendations.items()
)

In [15]:
flat_pr

{5530: array([   1,   39,   47,  318,  356,  527,  586,  778, 1136, 1162, 1193,
        1196, 1198, 1210, 1219, 1221, 1265, 1291, 1394, 2150, 2324, 2329,
        2352, 2357, 2396, 2622, 2762, 2858, 2871, 2918, 2997, 3100, 3897],
       dtype=int64),
 711: array([  50,  111,  231,  260,  296,  527,  593,  608,  858,  908,  912,
         913, 1136, 1196, 1198, 1219, 1244, 1270, 1290, 1610, 1617, 2028,
        2406, 2423, 2571, 2858, 3098, 3614], dtype=int64),
 4924: array([ 246,  260,  296,  318,  527,  593,  608,  750,  858,  913, 1094,
        1193, 1198, 1208, 1213, 1214, 1219, 1221, 1222, 1617, 2019, 2028,
        2700, 2762, 2858, 2918, 2921, 2997], dtype=int64),
 2154: array([  50,  260,  318,  527,  589,  593,  595,  608,  858,  904,  912,
        1136, 1196, 1197, 1213, 1221, 1261, 1263, 1617, 1912, 2028, 2067,
        2324, 2622, 2858, 2989, 3793, 3911], dtype=int64),
 1273: array([  50,  296,  608,  784,  858,  903,  908,  912,  913,  923,  953,
        1079, 1136, 1196, 1197, 

In [14]:
# Persist the flattened per-user arrays with joblib; the call returns the list of written paths.
dump(flat_pr, '../cache/flat_pr.joblib')

['../cache/flat_pr.joblib']

In [2]:
# Data directory, given relative to the notebook's working directory.
PATH = '../data/movielens/1m/'
# Sub-directory that holds test_data.csv.
CLEAN_PATH = os.path.join(PATH, 'clean')

In [3]:
# Read the test split: rows are indexed by userId, one column per id (values shown below are ratings, 0.0 where empty).
test_csv_path = os.path.join(CLEAN_PATH, 'test_data.csv')
test_data = pd.read_csv(test_csv_path, index_col='userId')

In [4]:
test_data

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5530,3.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2154,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1273,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1216,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
2081,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
5674,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Dummy baseline: every row of test_data gets the same ten ids, 1 through 10.
dummy_top10 = list(range(1, 11))
recommendations = np.array([dummy_top10 for _ in range(len(test_data))])

In [6]:
recommendations

array([[ 1,  2,  3, ...,  8,  9, 10],
       [ 1,  2,  3, ...,  8,  9, 10],
       [ 1,  2,  3, ...,  8,  9, 10],
       ...,
       [ 1,  2,  3, ...,  8,  9, 10],
       [ 1,  2,  3, ...,  8,  9, 10],
       [ 1,  2,  3, ...,  8,  9, 10]])

In [7]:
a = test_data.iloc[0]

In [31]:
a

1       3.0
2       2.0
3       0.0
4       0.0
5       0.0
       ... 
3948    0.0
3949    0.0
3950    0.0
3951    0.0
3952    0.0
Name: 5530, Length: 3706, dtype: float64

In [38]:
# Keep only the entries of `a` whose (integer-cast) label appears in the first row of recommendations.
label_ids = a.index.astype(int)
b = a.loc[label_ids.isin(recommendations[0])]

In [42]:
b[b != 0].shape[0]

3

In [51]:
(recommendations == 2).sum()

1208

In [66]:
# Number of rows in test_data with a non-zero value in column `movie_id`.
# NOTE(review): `movie_id` is not assigned in any cell visible above this one, so this cell
# relies on leftover kernel state and would raise NameError on a fresh Restart & Run All.
# Pin an explicit column label here — TODO confirm which id produced the output below.
(test_data[movie_id] != 0).sum()

79

In [5]:
# Random baseline: draw 10 ids per row of test_data from the column labels.
# A seeded Generator replaces the unseeded global np.random state, so the baseline and every
# number derived from it is reproducible across kernel restarts.
# NOTE(review): sampling is with replacement (the default), so a single row may contain the
# same id more than once — confirm that is acceptable for this baseline.
rng = np.random.default_rng(42)
recommendations = rng.choice(
    test_data.columns.astype(int).to_numpy(),
    size=(test_data.shape[0], 10),
)

In [6]:
recommendations

array([[ 946, 2108, 3147, ..., 3847, 2582, 3155],
       [1667, 2026,  671, ..., 2591,  697, 2920],
       [1604, 1529, 2859, ..., 2807, 1464, 2384],
       ...,
       [1933,  304,  172, ...,   70, 2922, 2143],
       [3305,  988, 3757, ..., 2947, 3573, 3054],
       [2058, 1916,  718, ..., 2294,  121,  140]], dtype=int64)

In [21]:
(recommendations == int(3))

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [7]:
# Per-movie novelty score: 1 - (times the movie was recommended) / (rows with a 0 entry for it).
# Both quantities are raw counts. They are computed in one vectorized pass instead of rescanning
# `recommendations` and rebuilding a full boolean index mask for every column.
movie_ids = test_data.columns.astype(int)

# How often each id occurs anywhere in the recommendation matrix; ids never recommended get 0.
rec_ids, rec_counts = np.unique(recommendations, return_counts=True)
recommended_count = (
    pd.Series(rec_counts, index=rec_ids)
    .reindex(movie_ids, fill_value=0)
    .to_numpy()
)

# Rows whose entry for the movie is exactly 0; the epsilon keeps the division finite when
# that count is 0.
# NOTE(review): in that case the score becomes a very large negative number, which will
# dominate any later min-max scaling — confirm whether such columns can occur.
not_interacted_count = (test_data == 0).sum(axis=0).to_numpy() + 1e-10

movies_nov = pd.Series(
    1 - recommended_count / not_interacted_count,
    index=test_data.columns,
    dtype=np.float32,
)

In [8]:
# Min-max scale the novelty scores so the smallest becomes 0 and the largest becomes 1.
nov_min, nov_max = movies_nov.min(), movies_nov.max()
movies_nov = (movies_nov - nov_min) / (nov_max - nov_min)

In [11]:
recommendations[0]

array([ 946, 2108, 3147,  657, 1518,  314, 3362, 3847, 2582, 3155],
      dtype=int64)

In [15]:
movies_nov

1       0.722540
2       0.897618
3       1.000000
4       0.353426
5       0.428790
          ...   
3948    0.792102
3949    0.338806
3950    0.909097
3951    0.728665
3952    0.807577
Length: 3706, dtype: float32

In [19]:
# Mean novelty over the movies that appear in the first row of recommendations.
in_first_row = movies_nov.index.astype(int).isin(recommendations[0])
movies_nov.loc[in_first_row].mean()

0.6136652

In [16]:
movies_nov.index

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '3943', '3944', '3945', '3946', '3947', '3948', '3949', '3950', '3951',
       '3952'],
      dtype='object', length=3706)