In [344]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, StructType, IntegerType, StructField
from pyspark.sql.functions import from_json, col, explode, desc, split


In [345]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Dataset") \
    .getOrCreate()

In [346]:
path = "../ratings.csv"
df = spark.read.csv(path, header=True)
df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [347]:
path2 = "../movies.csv"
df2 = spark.read.csv(path2, header=True)
df2.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [348]:
df = df.join(df2, df.movieId == df2.movieId, 'inner')
df.show(truncate=False)

+------+-------+------+----------+-------+------------------------------------------------------------------+-------------------------------+
|userId|movieId|rating|timestamp |movieId|title                                                             |genres                         |
+------+-------+------+----------+-------+------------------------------------------------------------------+-------------------------------+
|1     |296    |5.0   |1147880044|296    |Pulp Fiction (1994)                                               |Comedy|Crime|Drama|Thriller    |
|1     |306    |3.5   |1147868817|306    |Three Colors: Red (Trois couleurs: Rouge) (1994)                  |Drama                          |
|1     |307    |5.0   |1147868828|307    |Three Colors: Blue (Trois couleurs: Bleu) (1993)                  |Drama                          |
|1     |665    |5.0   |1147878820|665    |Underground (1995)                                                |Comedy|Drama|War               |
|1    

In [349]:
df = df.withColumn("genres", split(df.genres, r"\|"))

In [350]:
df = df.withColumn("genres", explode(df.genres).alias("genre"))
df.show()

+------+-------+------+----------+-------+--------------------+--------+
|userId|movieId|rating| timestamp|movieId|               title|  genres|
+------+-------+------+----------+-------+--------------------+--------+
|     1|    296|   5.0|1147880044|    296| Pulp Fiction (1994)|  Comedy|
|     1|    296|   5.0|1147880044|    296| Pulp Fiction (1994)|   Crime|
|     1|    296|   5.0|1147880044|    296| Pulp Fiction (1994)|   Drama|
|     1|    296|   5.0|1147880044|    296| Pulp Fiction (1994)|Thriller|
|     1|    306|   3.5|1147868817|    306|Three Colors: Red...|   Drama|
|     1|    307|   5.0|1147868828|    307|Three Colors: Blu...|   Drama|
|     1|    665|   5.0|1147878820|    665|  Underground (1995)|  Comedy|
|     1|    665|   5.0|1147878820|    665|  Underground (1995)|   Drama|
|     1|    665|   5.0|1147878820|    665|  Underground (1995)|     War|
|     1|    899|   3.5|1147868510|    899|Singin' in the Ra...|  Comedy|
|     1|    899|   3.5|1147868510|    899|Singin' i