In [3]:
import os

import pandas as pd

In [4]:
dirpath = '../data/ml-latest-small'
ratings_csv = pd.read_csv(os.path.join(dirpath, 'ratings.csv'))
ratings_csv.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
from scipy import sparse

In [6]:
users = ratings_csv.userId.unique()
movies = ratings_csv.movieId.unique()

print(f'# of user: {len(users)}\t# of movie: {len(movies)}')

# of user: 610	# of movie: 9724


In [7]:
user2idx = {user: idx for idx, user in enumerate(users)}
movie2idx = {movie: idx for idx, movie in enumerate(movies)}

print(f'# of user: {len(user2idx)}\t# of movie: {len(movie2idx)}')

# of user: 610	# of movie: 9724


In [8]:
rows = list()
cols = list()
data = list()
for user, movie, rating in zip(ratings_csv['userId'], ratings_csv['movieId'], ratings_csv['rating']):
    user_idx = user2idx[user]
    movie_idx = movie2idx[movie]
    
    rows.append(user_idx)
    cols.append(movie_idx)
    if rating >= 3.0:
        data.append(1.0)
    else:
        data.append(-1.0)

rating_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(len(user2idx), len(movie2idx)))
rating_matrix

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

In [9]:
from collections import defaultdict

feedback = defaultdict(int)
for rdx, cdx in zip(rating_matrix.nonzero()[0], rating_matrix.nonzero()[1]):
    if rating_matrix[rdx, cdx] == 1.0:
        feedback['positive'] += 1
    elif rating_matrix[rdx, cdx] == -1.0:
        feedback['negative'] += 1

print(f'Positive Feedback: {feedback["positive"]}\tNegative Feedback: {feedback["negative"]}')

Positive Feedback: 81763	Negative Feedback: 19073


In [10]:
from tqdm import tqdm
import numpy as np

N_user = rating_matrix.shape[0]
N_movie = rating_matrix.shape[1]

np.random.seed(2020)

rows_tr = list()
cols_tr = list()
data_tr = list()

rows_val = list()
cols_val = list()
data_val = list()

for rdx, cdx in tqdm(zip(rating_matrix.nonzero()[0], rating_matrix.nonzero()[1])):
    rated_movie = len(rating_matrix[rdx, :].nonzero()[1])
    rated_user = len(rating_matrix[:, cdx].nonzero()[0])

    threshold = (rated_movie / N_movie) * (rated_user / N_user) + 0.8
    random_number = np.random.rand()
    if random_number <= threshold:
        rows_tr.append(rdx)
        cols_tr.append(cdx)
        data_tr.append(rating_matrix[rdx, cdx])
    else:
        rows_val.append(rdx)
        cols_val.append(cdx)
        data_val.append(rating_matrix[rdx, cdx])

rating_matrix_tr = sparse.csr_matrix((data_tr, (rows_tr, cols_tr)), shape=(len(user2idx), len(movie2idx)))
rating_matrix_val = sparse.csr_matrix((data_val, (rows_val, cols_val)), shape=(len(user2idx), len(movie2idx)))


100836it [00:53, 1898.68it/s]


In [11]:
rating_matrix_tr

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 81216 stored elements in Compressed Sparse Row format>

In [12]:
rating_matrix_val

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 19620 stored elements in Compressed Sparse Row format>

In [13]:
# from itertools import product

# for rdx, cdx in zip(*rating_matrix_tr.nonzero()):
#     print(f'({rdx}, {cdx})\t{rating_matrix_tr[rdx, cdx]}')

In [14]:
from implicit.als import AlternatingLeastSquares

model = AlternatingLeastSquares(factors=20, calculate_training_loss=True, iterations=50, num_threads=4)
model.fit(rating_matrix_tr.T)

100%|██████████| 50/50 [00:06<00:00,  7.23it/s, loss=0.0093]


In [15]:
model.recommend(0, rating_matrix_tr, 10)

[(478, 0.66567826),
 (1134, 0.5918534),
 (69, 0.58202124),
 (401, 0.57713497),
 (1028, 0.56855),
 (101, 0.5352546),
 (1030, 0.5342044),
 (1179, 0.5205779),
 (34, 0.51859057),
 (292, 0.5121572)]

In [16]:
counter = 0
for cdx, _ in model.recommend(0, rating_matrix_tr, 100):
    counter += int(rating_matrix_val[0, cdx] == 1)
print(counter)

19


In [17]:
from tensorflow import keras
import tensorflow as tf

In [18]:
X = rating_matrix_tr.toarray()
X.shape

(610, 9724)

In [54]:
from tensorflow.keras.layers import BatchNormalization, Dense, LeakyReLU
import tensorflow.keras.backend as K


class AutoEncoder:
    def __init__(self, input_dim=9724, encoding_dim=20):
        self.autoencoder = self.build_autoencoder(input_dim, encoding_dim)

    def build_autoencoder(self, input_dim, encoding_dim):
        model = keras.models.Sequential()

        model.add(Dense(256, input_dim=input_dim))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dense(128))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dense(encoding_dim, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dense(256, activation='relu'))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dense(input_dim, activation='tanh'))

        model.summary()

        rating = keras.Input(shape=(input_dim))
        rating_hat = model(rating)

        return keras.Model(rating, rating_hat)
  
    def loss_function(self, y_true, y_pred):
        N = K.sum(tf.cast((y_true == 1.0) | (y_pred == -1.0), tf.float32))

        mask = y_pred != 0
        mask = tf.cast(mask, tf.float32)
        y_pred = (y_pred + 1) / 2
        y_true = (y_true + 1) / 2

        total_cost = mask * (-y_true * K.log(y_pred) - (1 - y_true) * K.log(1 - y_pred))
        total_cost = K.sum(total_cost)

        return total_cost / N

    def train(self, rating, epochs=50, batch_size=100):
        optimizer = keras.optimizers.Adam(lr=0.00001)
        self.autoencoder.compile(optimizer='adam', loss=self.loss_function)
        self.autoencoder.fit(rating, rating, epochs=epochs, batch_size=batch_size, shuffle=True)

In [55]:
autoencoder = AutoEncoder(input_dim=X.shape[1], encoding_dim=20)
autoencoder.train(X)


Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_154 (Dense)            (None, 256)               2489600   
_________________________________________________________________
batch_normalization_4 (Batch (None, 256)               1024      
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_155 (Dense)            (None, 128)               32896     
_________________________________________________________________
batch_normalization_5 (Batch (None, 128)               512       
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dense_156 (Dense)            (None, 20)              

In [None]:
arr = np.array([1, 0, -1, 1, 0])
sum((arr==1) | (arr==-1))
np.array(arr == 0) * np.array([1, 2, 3, 4, 5])

In [61]:
np.argsort(autoencoder.autoencoder.predict(X[0:1]))[:100]

array([[1582, 1218, 1796, ...,   20,   84,   26]])

In [76]:
counter = 0
for cdx in np.argsort(autoencoder.autoencoder.predict(X[1:2]))[:100][0]:
    counter += int(rating_matrix_val[1, cdx] == 1)
print(counter)

6


In [78]:
counter = 0
for cdx, _ in model.recommend(1, rating_matrix_tr, 100, filter_already_liked_items=True):
    counter += int(rating_matrix_val[1, cdx] == 1)
print(counter)

2


In [94]:
from itertools import combinations

X = list()
Y = list()
for rdx in tqdm(range(rating_matrix_tr.shape[0])):
    positives = list()
    negatives = list()
    for cdx in rating_matrix_tr[rdx, :].nonzero()[1]:
        if rating_matrix_tr[rdx, cdx] == 1:
            positives.append(cdx)
        elif rating_matrix_tr[rdx, cdx] == -1:
            negatives.append(cdx)
    
    for i, j in combinations(positives, 2):
        X.append((i, j))
        Y.extend([1])
    
    for items in combinations(negatives, 2):
        X.append(items)
        Y.extend([0])

100%|██████████| 610/610 [00:08<00:00, 74.91it/s]


In [95]:
len(X)

[(2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 9),
 (2, 10),
 (2, 11),
 (2, 12),
 (2, 13),
 (2, 14),
 (2, 15),
 (2, 16),
 (2, 17),
 (2, 18),
 (2, 19),
 (2, 21),
 (2, 22),
 (2, 24),
 (2, 25),
 (2, 26),
 (2, 27),
 (2, 28),
 (2, 29),
 (2, 30),
 (2, 32),
 (2, 33),
 (2, 35),
 (2, 36),
 (2, 37),
 (2, 38),
 (2, 40),
 (2, 42),
 (2, 44),
 (2, 45),
 (2, 46),
 (2, 47),
 (2, 48),
 (2, 49),
 (2, 50),
 (2, 51),
 (2, 52),
 (2, 53),
 (2, 54),
 (2, 55),
 (2, 56),
 (2, 59),
 (2, 60),
 (2, 61),
 (2, 62),
 (2, 63),
 (2, 64),
 (2, 65),
 (2, 68),
 (2, 70),
 (2, 71),
 (2, 72),
 (2, 73),
 (2, 74),
 (2, 75),
 (2, 78),
 (2, 79),
 (2, 80),
 (2, 81),
 (2, 83),
 (2, 85),
 (2, 88),
 (2, 89),
 (2, 91),
 (2, 92),
 (2, 93),
 (2, 95),
 (2, 97),
 (2, 98),
 (2, 99),
 (2, 100),
 (2, 102),
 (2, 103),
 (2, 104),
 (2, 107),
 (2, 109),
 (2, 110),
 (2, 112),
 (2, 113),
 (2, 114),
 (2, 115),
 (2, 117),
 (2, 118),
 (2, 119),
 (2, 120),
 (2, 121),
 (2, 122),
 (2, 123),
 (2, 124),
 (2, 126),
 (2, 127),
 (2, 128),
 (2, 129),
 (2

In [None]:
len(Y)