In [1]:
# Load the pycodestyle_magic extension and switch on flake8 checking for the notebook's cells,
# with the maximum line length set to 120 characters.
%load_ext pycodestyle_magic
%flake8_on --max_line_length 120

In [2]:
# Initialise findspark before any pyspark import: per the findspark docs, init() locates the Spark
# installation and makes `pyspark` importable. Keep this cell ahead of the pyspark imports below.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, substring

In [4]:
# Local-mode Spark session for this notebook; getOrCreate() reuses an already running session if there is one.
# Driver and executor memory are both requested at 8g.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Movie Recommendations')
    .config('spark.driver.memory', '8g')
    .config('spark.executor.memory', '8g')
    .getOrCreate()
)

In [5]:
# Read the two comma-separated source files from data/ml-20m. Both have a header row, and the
# column types are inferred from the data rather than declared.
movies = (spark.read
          .format('csv')
          .options(sep=',', inferSchema='true', header='true')
          .load('data/ml-20m/movies.csv'))
ratings = (spark.read
           .format('csv')
           .options(sep=',', inferSchema='true', header='true')
           .load('data/ml-20m/ratings.csv'))

# Data prep

In [6]:
# Take a 500k-row subset of the ratings. No ordering is applied (sorting the full distributed table is
# really slow), so `limit` returns an arbitrary set of rows. The subset is referenced three times below
# (two aggregations plus the join base) and is re-evaluated by every downstream action, so it is cached
# to keep all of those evaluations on the same materialised rows instead of re-running the unordered limit.
ratings_subset = ratings.limit(500000).cache()

# Users and movies that have exactly one rating in the subset. Both lists are computed from the same
# unfiltered subset, i.e. before either of them has been removed.
user_filter = (ratings_subset
               .groupBy('userId')
               .agg(count('userId').alias('count'))
               .filter(col('count') == 1)
               .select('userId'))
movie_filter = (ratings_subset
                .groupBy('movieId')
                .agg(count('movieId').alias('count'))
                .filter(col('count') == 1)
                .select('movieId'))

# Movies with a valid genre, plus a string 'year' column taken from the title.
# substring(title, -5, 4) reads 4 characters starting 5 from the end, i.e. the digits of a trailing "(YYYY)".
# NOTE(review): assumes every title ends in "(YYYY)" with no trailing whitespace - TODO confirm against the data.
movies_genre = movies.filter(col('genres') != '(no genres listed)')
movies_genre = movies_genre.withColumn('year', substring(col('title'), -5, 4))
genre_filter = movies_genre.select('movieId')

# Final ratings set:
#   left_anti -> drop rows of users / movies with only one rating
#   left_semi -> keep only rows whose movie has a valid genre
ratings_500k = (ratings_subset
                .join(user_filter, ['userId'], how='left_anti')
                .join(movie_filter, ['movieId'], how='left_anti')
                .join(genre_filter, ['movieId'], how='left_semi'))

# Train/test split

In [7]:
# Random 80% / 20% train/test split with a fixed seed.
train, test = ratings_500k.randomSplit([0.8, 0.2], seed=12345)

# Restrict both parts to users and movies they have in common (an intersection, not a union).
# First keep only the train rows whose user and movie both occur in the test part as split above ...
train = (train
         .join(test.select('userId'), on=['userId'], how='left_semi')
         .join(test.select('movieId'), on=['movieId'], how='left_semi'))

# ... then keep only the test rows whose user and movie both occur in that filtered train part.
# This makes every user and movie in `test` also present in `train`; the reverse is not re-checked.
test = (test
        .join(train.select('userId'), on=['userId'], how='left_semi')
        .join(train.select('movieId'), on=['movieId'], how='left_semi'))

# Write data

In [8]:
# Persist the train/test ratings and the genre-filtered movie table as parquet,
# overwriting whatever an earlier run left at the same paths.
train.write.mode('overwrite').parquet('data/sample_data_train.parquet')
test.write.mode('overwrite').parquet('data/sample_data_test.parquet')
movies_genre.write.mode('overwrite').parquet('data/movie_genre.parquet')