In [1]:
# !pip install elephas

In [2]:
import os
import subprocess
import time
import findspark
# findspark locates the Spark installation and puts pyspark on sys.path,
# so this call runs before the `from pyspark import ...` further down.
findspark.init()

### Experiment setup

In [21]:
# Release-year window (inclusive on both ends) of the movies kept for the experiment.
start_year, end_year = 1950, 1960

# Value passed as `num_workers` to the distributed trainer and written into the Spark conf.
num_workers = 8

# Suffix appended to every artifact file name: '_<last two digits of end_year>_<num_workers>'.
year_suffix = str(end_year)[-2:]
setup = f'_{year_suffix}_{num_workers}'

In [4]:
from pyspark import SparkContext, SparkConf

# Spark configuration for this experiment. The master is 'local[*]', so Spark runs
# on this machine only.
# NOTE(review): the spark.executor.* settings presumably have no effect with a local
# master -- confirm before relying on them for the scaling experiments.
conf = (
    SparkConf()
    .setAppName('AE_Rec_Sys')
    .setMaster('local[*]')
    .set("spark.driver.memory", "9g")
    .set("spark.executor.memory", "2g")
    .set("spark.driver.maxResultSize", "1g")
    .set("spark.executor.instances", str(num_workers))
)

# getOrCreate keeps this cell re-runnable: SparkContext(conf=conf) raises when a
# context is already active in the kernel. If a context already exists it is returned
# as-is and `conf` is NOT re-applied, so restart the kernel after changing settings.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Bare expression: the notebook renders the SparkContext's repr as this cell's output.
sc

### ratings data loading and processing

In [6]:
# Build the ratings RDD keyed by movie: (movieId, (userId, rating)).
# The CSV header row is dropped by exact match; ids stay strings, the rating becomes a float
# and the timestamp column is discarded.
ratings = (
    sc.textFile('ml-25m/ratings.csv')
    .filter(lambda row: row != 'userId,movieId,rating,timestamp')
    .map(lambda row: row.split(','))
    .map(lambda fields: (fields[1], (fields[0], float(fields[2]))))
)

### movies data loading and processing

In [7]:
# GET (movie_id, (title, year)) movies RDD
# remove 'movieId,title,genres' header line and all films without a specified year
# movie title may contain ','
# movie titles containing ',' are enclosed in double quotes

import re
# Used twice below: to keep only the CSV lines that contain a year token, and to
# extract that token from the movie title.
pattern = r'\(\d{4}\)' # '(yyyy)' year format

def remove_enclosing_double_quotes(movie_string):
    """Strip one leading and/or one trailing double quote from ``movie_string``.

    The two ends are handled independently, so a string quoted on only one side
    still loses that quote. Strings without enclosing quotes are returned unchanged.

    Args:
        movie_string: raw title field, possibly wrapped in double quotes.

    Returns:
        The string without its enclosing quote characters. Empty input (or a
        lone '"') yields '' instead of raising IndexError.
    """
    # startswith/endswith are safe on empty strings, unlike indexing with [0] / [-1].
    if movie_string.startswith('"'):
        movie_string = movie_string[1:]
    if movie_string.endswith('"'):
        movie_string = movie_string[:-1]
    return movie_string

def _parse_movie_line(line):
    """Turn one movies.csv line into ``(movie_id, (title, year))``.

    The title may itself contain commas, so it is rebuilt from every field between
    the first one (movieId) and the last one (genres). The year is the last
    '(yyyy)' token found in the title; that token is removed from the title.

    Returns:
        ``(movie_id, (title, year))`` with ``year`` as int, or ``None`` when the
        title holds no '(yyyy)' token.
    """
    fields = line.split(',')
    # Surrounding whitespace is stripped so a trailing blank after the year does not
    # end up inside the title (slicing a fixed 7 characters off the end would
    # truncate the title in that case).
    raw_title = remove_enclosing_double_quotes(','.join(fields[1:-1])).strip()
    year_tokens = list(re.finditer(pattern, raw_title))
    if not year_tokens:
        return None
    last_token = year_tokens[-1]
    title = (raw_title[:last_token.start()] + raw_title[last_token.end():]).strip()
    year = int(last_token.group()[1:-1])  # drop the surrounding parentheses
    return fields[0], (title, year)


# (movie_id, (title, year)) for the movies released between start_year and end_year
# (both inclusive). Lines without a '(yyyy)' token -- including the CSV header -- are
# dropped by the first filter.
movies = (
    sc.textFile('ml-25m/movies.csv')
    .filter(lambda line: re.search(pattern, line))
    .map(_parse_movie_line)
    .filter(lambda movie: movie is not None and start_year <= movie[1][1] <= end_year)
)

### data integration

In [8]:
# Inner join on movieId keeps only the ratings of movies inside the selected year window.
# Joined rows look like (movieId, ((userId, rating), (title, year))); they are flattened
# to (userId, movieId, rating).
ratings_with_movie = ratings.join(movies)
ratings = ratings_with_movie.map(
    lambda joined: (joined[1][0][0], joined[0], joined[1][0][1])
)

# Mark the joined RDD for caching; the last expression echoes the RDD as cell output.
ratings.persist()

PythonRDD[11] at RDD at PythonRDD.scala:53

### statistical analysis of the subset

In [9]:
# {'movie_id': rating_array_index}
# distinct() gives no ordering guarantee, so the ids are sorted before being numbered:
# the same data subset then always yields the same movie -> column mapping, which keeps
# a model saved by one run usable with the mapping rebuilt by another.
# (ids are strings here, so the order is lexicographic.)
movies_ids = ratings.map(lambda x: x[1]).distinct()
movies_id_index_pairs = {
    film: i for i, film in enumerate(sorted(movies_ids.collect()))
}

# number of movies
number_of_movies = len(movies_id_index_pairs)

# max rating (useful for normalization); computed on the cluster, no distinct/collect needed
max_rating = ratings.map(lambda x: x[2]).max()

# number of users
users_ids = ratings.map(lambda x: x[0]).distinct()
number_of_users = users_ids.count()

# number of ratings
number_of_ratings = ratings.count()
numer_of_ratings = number_of_ratings  # original (misspelled) name kept as an alias

# None of the helper RDDs above is persisted, so there is nothing to unpersist.

# subset stats:
print('number of movies', number_of_movies)
print('number of users', number_of_users)
print('number of ratings', number_of_ratings)

number of movies 2821
number of users 77048
number of ratings 565530


### ratings arrays generation

In [10]:
import numpy as np 

def full_ratings_array(ratings, num_films, films_idx):
    """Expand a user's sparse ratings into a dense vector with one slot per movie.

    Args:
        ratings: iterable of (movie_id, rating) entries.
        num_films: length of the returned vector.
        films_idx: mapping movie_id -> position in the vector.

    Returns:
        A float numpy array of length ``num_films``; positions without a rating
        stay 0. If a movie appears more than once, the last entry wins.
    """
    dense = np.zeros(num_films)
    for entry in ratings:
        film_id, score = entry[0], entry[1]
        slot = films_idx[film_id]
        dense[slot] = score
    return dense

from pyspark import StorageLevel

# One dense, normalized rating vector per user:
#   (userId, (movieId, rating)) -> group by user -> dense vector -> divide by max_rating.
# The result is persisted because several later actions reuse it (count, randomSplit,
# takeSample). Without persistence every action recomputes the groupByKey shuffle, and
# randomSplit would sample from recomputed partitions whose row order is not guaranteed
# to be the same each time. MEMORY_AND_DISK spills partitions that do not fit in memory
# instead of recomputing them.
aggregated_user_ratings = (
    ratings.map(lambda x: (x[0], (x[1], x[2])))
    .groupByKey()
    .map(lambda x: full_ratings_array(x[1], number_of_movies, movies_id_index_pairs))
    .map(lambda rating: rating / max_rating)
    .persist(StorageLevel.MEMORY_AND_DISK)
)

In [11]:
# Action: runs the aggregation pipeline and shows how many per-user rating vectors it produced.
aggregated_user_ratings.count()

77048

### autoencoder

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import Adam

def create_autoencoder(samples_dim):
    """Build and compile the fully connected autoencoder.

    Layout: samples_dim -> 150 -> 25 -> 150 -> samples_dim. The three inner Dense
    layers are followed by ReLU, the output layer by a sigmoid. The model is
    compiled with mean squared error and Adam (learning rate 0.001).

    Args:
        samples_dim: size of the input vector (and of the reconstruction).

    Returns:
        The compiled Keras Sequential model.
    """
    layer_stack = [
        # encoder
        Dense(150, input_dim=samples_dim),
        Activation('relu'),
        Dense(25),
        Activation('relu'),
        # decoder
        Dense(150),
        Activation('relu'),
        Dense(samples_dim),
        Activation('sigmoid'),
    ]

    autoencoder_net = Sequential()
    for layer in layer_stack:
        autoencoder_net.add(layer)

    autoencoder_net.compile(
        loss='mean_squared_error',
        optimizer=Adam(learning_rate=0.001),
    )
    return autoencoder_net

In [13]:
# Build the autoencoder with one input/output unit per movie of the subset,
# then print its layer-by-layer summary.
autoencoder = create_autoencoder(number_of_movies)
autoencoder.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 150)               423300    
                                                                 
 activation (Activation)     (None, 150)               0         
                                                                 
 dense_1 (Dense)             (None, 25)                3775      
                                                                 
 activation_1 (Activation)   (None, 25)                0         
                                                                 
 dense_2 (Dense)             (None, 150)               3900      
                                                                 
 activation_2 (Activation)   (None, 150)               0         
                                                                 
 dense_3 (Dense)             (None, 2821)              4

In [14]:
# # Self-made epoch by epoch model training just to plot loss curves and show training behaviour. 
# # Elephas fit method does not provide losses history

# from evaluable_training import evaluable_distributed_training
# import numpy as np

# train_losses, val_losses = evaluable_distributed_training(sc, aggregated_user_ratings, autoencoder, num_workers, 100)
# np.save(f'train_losses{setup}.npy', np.array(train_losses))
# np.save(f'val_losses{setup}.npy', np.array(val_losses))

### model training

In [15]:
# 80/20 train/test split with a fixed seed.
train_rdd, test_rdd = aggregated_user_ratings.randomSplit([0.8, 0.2], seed=42)

# CHECK IF SAMPLES ARE WELL DISTRIBUTED OVER PARTITIONS
# Elements are counted while streaming through the partition iterator, so the rating
# vectors of a partition are never all held in a list just to measure its size.
partition_sizes = train_rdd.mapPartitions(lambda partition: [sum(1 for _ in partition)])
print(partition_sizes.collect())

[2733, 2642, 2707, 2723, 2692, 2697, 2621, 2653, 2703, 2750, 2662, 2755, 2650, 2601, 2588, 2722, 2624, 2710, 2678, 2571, 2611, 2577, 2821]


In [16]:
from elephas.spark_model import SparkModel

# (input, target) elephas rdd required format; for an autoencoder the target is the input itself.
# The pairs get their own name so that train_rdd keeps holding plain rating arrays
# (it is reused as such by the evaluation cell) and does not have to be mapped back.
train_pairs_rdd = train_rdd.map(lambda ratings_array: (ratings_array, ratings_array))

epochs = 50
spark_ae_model = SparkModel(autoencoder, frequency='epoch', mode='synchronous', num_workers=num_workers)
spark_ae_model.fit(train_pairs_rdd, epochs=epochs, batch_size=64, verbose=1, validation_split=0.1)

# The output folder is not created anywhere else in this notebook.
os.makedirs('trained AEs', exist_ok=True)
spark_ae_model.save(f'trained AEs/AE_model{setup}.keras')

>>> Fit model
>>> Synchronous training complete.


### model evaluation

In [18]:
from evaluable_training import calculate_avg_mse_loss
import pandas as pd

def _avg_reconstruction_mse(samples_rdd):
    """Reconstruct ``samples_rdd`` with the trained model and score the result.

    The predictions come back to the driver as a local collection, are turned into
    an RDD again and handed to ``calculate_avg_mse_loss`` together with the inputs.
    NOTE(review): presumably that helper returns the average MSE between the two
    RDDs (judging by its name) -- it lives in evaluable_training, confirm there.

    Keeping the predictions local to this function lets them be garbage collected
    as soon as the score is computed instead of staying alive as notebook globals.
    """
    reconstructions = spark_ae_model.predict(samples_rdd)
    return calculate_avg_mse_loss(samples_rdd, sc.parallelize(reconstructions))


train_mse = _avg_reconstruction_mse(train_rdd)
test_mse = _avg_reconstruction_mse(test_rdd)

print('model\'s performances on train set:')
print('\tmse:  ', train_mse)
print('\trmse: ', np.sqrt(train_mse))
print()
print('model\'s performances on test set:')
print('\tmse:  ', test_mse)
print('\trmse: ', np.sqrt(test_mse))

model's performances on train set:
	mse:   0.0012489393410993225
	rmse:  0.03534033589397988

model's performances on test set:
	mse:   0.001259961455509893
	rmse:  0.03549593576044859


In [19]:
# One-row table with the train/test reconstruction errors of this experiment setup.
model_eval = {
    'train_mse': [train_mse],
    'train_rmse': [np.sqrt(train_mse)],
    'test_mse': [test_mse],
    'test_rmse': [np.sqrt(test_mse)],
}
df = pd.DataFrame(model_eval)

# folder_path / folders / step are not defined by any cell of this notebook, so on a
# fresh kernel the original path expression raises NameError. They are still honoured
# when present (same path as before); otherwise the CSV goes to ./model_evals.
try:
    dest_dir = f'./{folder_path}/{folders[step]}'
except NameError:
    dest_dir = './model_evals'

os.makedirs(dest_dir, exist_ok=True)  # to_csv does not create missing folders
dest = f'{dest_dir}/model_eval{setup}.csv'
df.to_csv(dest, index=False)

### example of recommendation

In [24]:
from tensorflow.keras.models import load_model
from IPython.display import display, HTML

import html

AE = load_model(f'trained AEs/AE_model{setup}.keras')
user_ratings = aggregated_user_ratings.takeSample(False, 1)[0]  # sampling a random user's ratings

# The model expects a batch dimension: (number_of_movies,) -> (1, number_of_movies).
user_ratings = np.expand_dims(user_ratings, axis=0)
rec = AE.predict(user_ratings, verbose=0)

# Only movies the user has not rated (value 0 in the rating vector) are candidates.
unrated_mask = user_ratings[0] == 0
if not unrated_mask.any():
    raise ValueError('The sampled user has rated every movie of the subset: nothing left to recommend.')

# Rated movies are pushed to -inf so argmax picks the best predicted score among the
# unrated ones (first index on ties, like max() over the filtered list did).
candidate_scores = np.where(unrated_mask, rec[0], -np.inf)
recom_movie_index = int(np.argmax(candidate_scores))

# Explicit inverse mapping (array index -> movie id) instead of relying on dict key order.
index_to_movie_id = {index: film_id for film_id, index in movies_id_index_pairs.items()}
movie_id = index_to_movie_id[recom_movie_index]

movie_data = movies.filter(lambda movie: movie[0]==movie_id).collect()[0]
suggested_movie = f'{movie_data[1][0]} ({movie_data[1][1]})'

# Titles may contain characters such as '&' or '<': escape them before embedding in HTML.
display(HTML(f'<br><font size="3">Suggested movie: </font><br><h1>{html.escape(suggested_movie)}</h1>'))

In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()