In [1]:
import pandas as pd
from collections import defaultdict, deque
import itertools
import numpy as np
import dask.dataframe as dd
from tqdm import tqdm

In [2]:
MOVIES_DATASET = "./data/movies.csv"
RATINGS_DATASET = "./data/ratings.csv"
NUMBER_OF_USER_WATCHED_MIN = 0
NUMBER_OF_MOVIES_WATCHED_MIN = 10
TRAIN_TEST_SPLIT = 0.9
SEQ_LEN = NUMBER_OF_MOVIES_WATCHED_MIN

# Preprocessing Movies Dataset

In [3]:
# Mapping movieId to indexes for later use in one-hot encoding  
movie_mapper = defaultdict(tuple)
def map_movie_to_idx():
    movies_df = pd.read_csv(MOVIES_DATASET)
    
    ratings_df = pd.read_csv(RATINGS_DATASET)
    movie_ids = ratings_df['movieId']
    movie_dict = defaultdict(int)
    for movie in movie_ids:
        movie_dict[movie] += 1
    
    movies_df.dropna(inplace=True)
    counter = 0
    
    # Fastest way to iterate a df
    iter_dict = movies_df.to_dict('records')
    for row in tqdm(iter_dict):
        if (row['movieId'] not in movie_dict) or (movie_dict[row['movieId']] < NUMBER_OF_USER_WATCHED_MIN): 
            continue
        movie_mapper[row['movieId']] = (counter, row['title'])
        counter += 1
    print("Number of movies {}".format(counter))

map_movie_to_idx()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9742/9742 [00:00<00:00, 2054137.82it/s]

Number of movies 9724





# Preprocessing Ratings Dataset

In [8]:
def one_hot_encode_movie(movieId):
    num_movies = len(movie_mapper)
    encoded_movie = np.zeros(num_movies, dtype=int) 
#     encoded_movie = [0] * num_movies
    encoded_movie[movie_mapper[movieId]] = 1
    return encoded_movie

def one_hot_encode_user_seq(user_seq):
    encoded = []
    for movie in user_seq:
        encoded.append(one_hot_encode_movie(movie))
    return np.array(encoded, dtype=object)
#     return encoded

def sequentialize(ratings_df):
    ratings_df = ratings_df.dropna()
    ratings_df = ratings_df.sort_values(by=['userId', 'timestamp'])
    
    ratings_df = ratings_df.dropna()
    userIds = ratings_df['userId'].unique()
    
    X = []
    Y = []
    for user in userIds:
        user_seq = list(ratings_df[(ratings_df['userId'] == user)]['movieId'])
        if len(user_seq) >= NUMBER_OF_MOVIES_WATCHED_MIN:
            sequence = deque()
            print(len(user_seq))
            for movieId in user_seq:
#                 if movieId not in movie_mapper: continue
                movie = movie_mapper[movieId][1]
                sequence.append(movie.strip())
                if len(sequence) == SEQ_LEN:
                    X.append(list(itertools.islice(sequence, 0, SEQ_LEN - 1)))
                    Y.append(sequence[-1])
#             res = one_hot_encode_user_seq(user_seq)
#             X.append(res[:-1])
#             Y.append(res[-1])
    return np.array(X), np.array(Y)

def get_datasets():
    ratings_df = pd.read_csv(RATINGS_DATASET)
    ratings_df = ratings_df.dropna()
#     ratings_df = ratings_df[:10000]
    
    sz = len(ratings_df)
    train_df = ratings_df[:int(TRAIN_TEST_SPLIT * sz)]
    test_df = ratings_df[int(TRAIN_TEST_SPLIT * sz):]
    
    X_train, Y_train = sequentialize(train_df)
    X_test, Y_test = sequentialize(test_df)
    
    return X_train, Y_train, X_test, Y_test

X_train, Y_train, X_test, Y_test = get_datasets()

232
29
39
216
44
314
152
47
46
140
64
32
31
48
135
98
105
502
703
242
443
119
121
110
26
21
135
570
81
34
50
102
156
86
23
60
21
78
100
103
217
440
114
48
399
42
140
33
21
310
359
130
20
33
25
46
476
112
107
22
39
366
271
517
34
345
36
1260
46
62
35
45
210
177
69
119
29
61
64
167
26
227
118
293
34
70
21
56
518
54
575
24
97
56
168
78
36
92
53
148
61
56
377
273
722
33
34
76
127
51
646
65
150
31
112
87
165
22
215
22
58
292
56
50
360
38
22
33
140
28
69
347
35
35
279
111
141
22
194
608
168
38
71
128
23
32
20
48
58
26
59
63
179
34
46
398
21
26
97
437
39
38
23
36
65
190
173
94
269
50
82
26
25
67
24
36
904
77
69
24
118
977
57
134
47
226
258
48
20
66
85
22
35
20
187
32
35
230
363
334
110
403
45
83
27
25
20
26
35
138
89
248
84
22
98
164
613
26
528
207
331
250
75
54
75
507
94
25
65
139
24
862
150
202
65
30
50
44
279
128
76
35
36
93
21
204
150
51
1046
27
23
38
47
127
44
174
20
25
29
152
53
57
210
56
163
180
57
129
29
40
43
31
55
1346
403
41
28
20
176
196
21
237
35
88
36
106
152
1055
24
267
31
446


In [5]:
# X_train =  X_train.reshape(-1, X_train.shape[0])
for i in range(0, 2):
    print(X_train[i], Y_train[i])

["She's the One (1996)"
 'Star Wars: Episode VI - Return of the Jedi (1983)' 'Bambi (1942)'
 'Star Wars: Episode I - The Phantom Menace (1999)'
 '13th Warrior, The (1999)' 'Gladiator (2000)' 'Road Trip (2000)'
 'Shaft (2000)' 'Bottle Rocket (1996)'] Dazed and Confused (1993)
['Shawshank Redemption, The (1994)' 'Inception (2010)'
 'The Jinx: The Life and Deaths of Robert Durst (2015)'
 'Ex Machina (2015)' 'Django Unchained (2012)' 'Whiplash (2014)'
 'Gladiator (2000)' 'Dark Knight Rises, The (2012)'
 'Shutter Island (2010)'] Girl with the Dragon Tattoo, The (2011)


In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM # CuDNNLSTM

tf.config.list_physical_devices('GPU')

[]

In [8]:
model = Sequential()

# IF you are running with a GPU, try out the CuDNNLSTM layer type instead (don't pass an activation, tanh is required)
model.add(LSTM(128, input_shape=(None, 590), activation='relu', return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(10, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)


model.fit(X_train,
          Y_train,
          epochs=1,
          validation_data=(X_test, Y_test))

2022-03-27 09:35:16.600503: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-27 09:35:16.600574: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-27 09:35:16.600631: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (kali): /proc/driver/nvidia/version does not exist
2022-03-27 09:35:16.602248: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super(Adam, self).__init__(name, **kwargs)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).