### Set Up Notebook

#### Import Required Modules

In [0]:
import os
import json

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

#### Get or Create SparkSession

In [0]:
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("error")
spark

#### Set Workflow Constants

In [0]:
RAW_PATH = "dbfs:/FileStore/data/raw"
CLN_PATH = "dbfs:/FileStore/data/clean"

In [0]:
MIN_RELEASE_DATE = '1970-01-01'
MIN_POPULARITY = 5.0

### Import and Clean Raw Data

#### Movie Metadata

In [0]:
metadata_raw = spark.read.csv(os.path.join(RAW_PATH, "movies_metadata.csv"), header=True)
metadata_raw.show(1, truncate=False, vertical=True)
metadata_raw.printSchema()

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 adult                 | False                                                                                                                                                                                                                                                                                                           
 belongs_to_collection | {'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}                                                                                                                                                           
 budget   

In [0]:
metadata = metadata_raw \
    .select(
        f.col('adult').cast('BOOLEAN').alias('adult'),
        f.col('budget').cast('INTEGER').alias('budget'),
        f.transform(f.from_json('genres', "ARRAY<STRUCT<id:INTEGER,name:STRING>>"), lambda x: x['name']).alias('genres'),
        f.col('id').cast('STRING').alias('tmdb_id'),
        f.col('overview'),
        f.col('popularity').cast('DOUBLE').alias('popularity'),
        f.col('release_date').cast('DATE').alias('release_date'),
        f.col('revenue').cast('INTEGER').alias('revenue'),
        f.col('runtime').cast('DOUBLE').alias('runtime'),
        f.transform(f.from_json('spoken_languages', "ARRAY<STRUCT<iso_639_1:STRING,name:STRING>>"), lambda x: x['name']).alias('spoken_languages'),
        f.col('status'),
        f.col('title'),
        f.col('vote_average').cast('DOUBLE').alias('vote_average'),
        f.col('vote_count').cast('INTEGER').alias('vote_count')
    ) \
    .withColumn('budget', f.when(f.col('budget') == 0, f.lit(None)).otherwise(f.col('budget'))) \
    .withColumn('revenue', f.when(f.col('revenue') == 0, f.lit(None)).otherwise(f.col('revenue'))) \
    .withColumn('id_rank', f.row_number().over(Window.partitionBy('tmdb_id').orderBy(f.rand(seed=1492)))) \
    .filter(f.col('id_rank') == 1) \
    .filter(f.col('status') == 'Released') \
    .filter(f.col('adult') == False) \
    .filter(f.array_contains('spoken_languages', 'English') == True) \
    .filter(f.col('release_date') >= MIN_RELEASE_DATE) \
    .filter(f.col('popularity') >= MIN_POPULARITY) \
    .select('tmdb_id', 'title', 'release_date', 'runtime', 'genres', 'overview', 'budget', 'revenue', 'popularity', 'vote_average', 'vote_count') \
    .sort('tmdb_id')

metadata.printSchema()
metadata.show(1, truncate=False, vertical=True)
metadata.agg(f.countDistinct('tmdb_id').alias('tmdb_id'), f.count('*').alias('records')).show()

root
 |-- tmdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- runtime: double (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overview: string (nullable = true)
 |-- budget: integer (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 tmdb_id      | 10003                                                                                                                   

#### Movie Keywords

In [0]:
keywords_raw = spark.read.csv(os.path.join(RAW_PATH, "keywords.csv"), header=True)
keywords_raw.show(1, truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id       | 862                                                                                                                                                                                                                                                                                                                        
 keywords | [{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}] 
only showing top

In [0]:
keywords = keywords_raw \
    .select(
        f.col('id').cast('STRING').alias('tmdb_id'),
        f.transform(f.from_json('keywords', 'ARRAY<STRUCT<id:INTEGER,name:STRING>>'), lambda x: x['name']).alias('keywords')
    ) \
    .withColumn('id_rank', f.row_number().over(Window.partitionBy('tmdb_id').orderBy(f.rand(seed=1492)))) \
    .filter(f.col('id_rank') == 1) \
    .select('tmdb_id', 'keywords') \
    .sort('tmdb_id')

keywords.printSchema()
keywords.show(1, truncate=False, vertical=True)
keywords.agg(f.countDistinct('tmdb_id').alias('tmdb_id'), f.count('*').alias('records')).show()

root
 |-- tmdb_id: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 tmdb_id  | 100                                                                                                                                                                                       
 keywords | [ambush, alcohol, shotgun, tea, joint, machismo, cocktail, rifle, marijuana, cockney accent, pot smoking, hatchet, antique, cardsharp, anger, carjacking, piano, strip show, high stakes] 
only showing top 1 row

+-------+-------+
|tmdb_id|records|
+-------+-------+
|  45432|  45432|
+-------+-------+



#### Movie ID Crosswalk

In [0]:
links_path = "dbfs:/FileStore/data/raw/links.csv"
links = spark \
    .read.csv(os.path.join(RAW_PATH, "links.csv"), header=True) \
    .withColumnRenamed('movieId', 'movie_id') \
    .withColumnRenamed('imdbId', 'imdb_id') \
    .withColumnRenamed('tmdbId', 'tmdb_id')

links.printSchema()
links.show(5, truncate=False)
links.agg(f.countDistinct('tmdb_id').alias('tmdb_id'), f.countDistinct('movie_id').alias('movie_id'), f.count('*').alias('records')).show()

root
 |-- movie_id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- tmdb_id: string (nullable = true)

+--------+-------+-------+
|movie_id|imdb_id|tmdb_id|
+--------+-------+-------+
|1       |0114709|862    |
|2       |0113497|8844   |
|3       |0113228|15602  |
|4       |0114885|31357  |
|5       |0113041|11862  |
+--------+-------+-------+
only showing top 5 rows

+-------+--------+-------+
|tmdb_id|movie_id|records|
+-------+--------+-------+
|  45594|   45843|  45843|
+-------+--------+-------+



#### Movie Ratings

In [0]:
ratings_raw = spark.read.csv(os.path.join(RAW_PATH, "ratings.csv"), header=True)
ratings_raw.show(5, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |110    |1.0   |1425941529|
|1     |147    |4.5   |1425942435|
|1     |858    |5.0   |1425941523|
|1     |1221   |5.0   |1425941546|
|1     |1246   |5.0   |1425941556|
+------+-------+------+----------+
only showing top 5 rows



In [0]:
ratings = ratings_raw \
    .select(
        f.col('userId').alias('user_id'),
        f.col('movieId').alias('movie_id'),
        f.col('rating').cast('DOUBLE').alias('rating'),
        f.from_unixtime('timestamp').alias('timestamp')
    )

ratings.printSchema()
ratings.show(5, truncate=False)
ratings.agg(f.countDistinct('user_id').alias('total_users'), f.countDistinct('movie_id').alias('total_movies'), f.count('*').alias('total_ratings')).show()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)

+-------+--------+------+-------------------+
|user_id|movie_id|rating|timestamp          |
+-------+--------+------+-------------------+
|1      |110     |1.0   |2015-03-09 22:52:09|
|1      |147     |4.5   |2015-03-09 23:07:15|
|1      |858     |5.0   |2015-03-09 22:52:03|
|1      |1221    |5.0   |2015-03-09 22:52:26|
|1      |1246    |5.0   |2015-03-09 22:52:36|
+-------+--------+------+-------------------+
only showing top 5 rows

+-----------+------------+-------------+
|total_users|total_movies|total_ratings|
+-----------+------------+-------------+
|     270896|       45115|     26024289|
+-----------+------------+-------------+



### Save Clean Data Entities

#### Clean Ratings Data

In [0]:
ratings = ratings.join(links, on="movie_id", how="inner").select('user_id', 'tmdb_id', 'rating', 'timestamp')
ratings.printSchema()
ratings.show(5)
ratings.count()

root
 |-- user_id: string (nullable = true)
 |-- tmdb_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)

+-------+-------+------+-------------------+
|user_id|tmdb_id|rating|          timestamp|
+-------+-------+------+-------------------+
|      1|    197|   1.0|2015-03-09 22:52:09|
|      1|  10474|   4.5|2015-03-09 23:07:15|
|      1|    238|   5.0|2015-03-09 22:52:03|
|      1|    240|   5.0|2015-03-09 22:52:26|
|      1|    207|   5.0|2015-03-09 22:52:36|
+-------+-------+------+-------------------+
only showing top 5 rows

Out[20]: 26024289

In [0]:
ratings.repartition(10).write.parquet(os.path.join(CLN_PATH, "ratings"), mode="overwrite")

#### Clean Movies Data

In [0]:
movies = metadata.join(keywords, on='tmdb_id', how='inner')
movies.show(10)
movies.printSchema()
movies.agg(f.countDistinct('tmdb_id').alias('tmdb_id'), f.count('*').alias('records')).show()
movies.agg(*[f.sum(f.col(col).isNull().cast('int')).alias(col) for col in movies.columns]).show()

+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|tmdb_id|               title|release_date|runtime|              genres|            overview|   budget|  revenue|popularity|vote_average|vote_count|            keywords|
+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|  10010|      Brother Bear 2|  2006-08-17|   73.0|[Adventure, Anima...|Kenai finds his c...|     null|     null| 10.861154|         6.3|       318|[grizzly bear, hu...|
|  10012|              Cursed|  2005-02-25|   97.0|    [Horror, Comedy]|A werewolf loose ...| 35000000| 19294901|  8.949722|         5.1|       168|[brother sister r...|
| 100402|Captain America: ...|  2014-03-20|  136.0|[Action, Adventur...|After the catacly...|170000000|714766572| 18.717704|         7.6|      5881|[w

In [0]:
movies.coalesce(1).write.parquet(os.path.join(CLN_PATH, "movies"), mode="overwrite")