# EDA

In [1]:
import numpy as np
import pandas as pd

import pyspark as ps

In [2]:
# Configure a SparkSession for this notebook: master "local[4]" and
# application name "mov_rec". getOrCreate() hands back an already-running
# session if one exists instead of starting a second one.
session_builder = ps.sql.SparkSession.builder.master("local[4]").appName("mov_rec")
spark = session_builder.getOrCreate()

# Keep a handle on the underlying SparkContext as well.
sc = spark.sparkContext

In [3]:
# Load the ratings CSV into a Spark DataFrame (note: despite the `_rdd`
# suffix, spark.read produces a DataFrame, not an RDD).
# The built-in CSV reader is used directly; the first line of the file is
# treated as the header and column types are inferred from the data.
rat_rdd = spark.read.csv(
    'data/ml-latest-small/ratings.csv',
    header=True,
    inferSchema=True,
)
        

In [4]:
# Preview the first five rows of the ratings DataFrame.
rat_rdd.show(n=5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [5]:
# Print the column names, types, and nullability of the ratings DataFrame.
rat_rdd.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [6]:
# Load the movies CSV into a Spark DataFrame (note: despite the `_rdd`
# suffix, spark.read produces a DataFrame, not an RDD).
# The built-in CSV reader is used directly; the first line of the file is
# treated as the header and column types are inferred from the data.
mov_rdd = spark.read.csv(
    'data/ml-latest-small/movies.csv',
    header=True,
    inferSchema=True,
)
        

In [7]:
# Load the tags CSV into a Spark DataFrame (note: despite the `_rdd`
# suffix, spark.read produces a DataFrame, not an RDD).
# The built-in CSV reader is used directly; the first line of the file is
# treated as the header and column types are inferred from the data.
tags_rdd = spark.read.csv(
    'data/ml-latest-small/tags.csv',
    header=True,
    inferSchema=True,
)
        

In [8]:
# Preview the first five rows of the movies DataFrame.
mov_rdd.show(n=5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Preview the first five rows of the ratings DataFrame.
rat_rdd.show(n=5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [10]:
# Register each DataFrame as a temporary view so it can be referenced by name
# from spark.sql(...) queries. Each view name matches its Python variable name.
for view_name, frame in (
    ("mov_rdd", mov_rdd),
    ("rat_rdd", rat_rdd),
    ("tags_rdd", tags_rdd),
):
    frame.createOrReplaceTempView(view_name)

In [26]:
# Inner join of the movies view with the ratings view on movieId, keeping the
# movie's title/genres next to each individual rating row.
mov_rat_query = '''
SELECT movies.title, movies.genres, ratings.rating, ratings.userId, ratings.movieId, ratings.timestamp
  FROM mov_rdd AS movies
  JOIN rat_rdd AS ratings
    ON movies.movieId = ratings.movieId
'''
mov_rat_join = spark.sql(mov_rat_query)

In [27]:
# Make the joined movies/ratings DataFrame queryable from Spark SQL under the
# same name as its Python variable.
mov_rat_join.createOrReplaceTempView(name="mov_rat_join")

In [28]:
# Preview the first five rows of the joined movies/ratings DataFrame.
mov_rat_join.show(n=5)

+--------------------+--------------------+------+------+-------+---------+
|               title|              genres|rating|userId|movieId|timestamp|
+--------------------+--------------------+------+------+-------+---------+
|    Toy Story (1995)|Adventure|Animati...|   4.0|     1|      1|964982703|
|Grumpier Old Men ...|      Comedy|Romance|   4.0|     1|      3|964981247|
|         Heat (1995)|Action|Crime|Thri...|   4.0|     1|      6|964982224|
|Seven (a.k.a. Se7...|    Mystery|Thriller|   5.0|     1|     47|964983815|
|Usual Suspects, T...|Crime|Mystery|Thr...|   5.0|     1|     50|964982931|
+--------------------+--------------------+------+------+-------+---------+
only showing top 5 rows



In [None]:
# Count how many rating rows exist for movieId 1 (Toy Story (1995) in the
# movies preview). Uses the standard SQL `=` comparison and drops the table
# alias that the query never referenced.
toy_story_1 = spark.sql(
'''
SELECT COUNT(*)
  FROM rat_rdd
 WHERE movieId = 1
'''
)

# Display the single-row count.
toy_story_1.show()

In [None]:
# Attach tag rows to the joined movies/ratings rows, matching on movieId.
#
# Both inputs carry a movieId column (and may share other column names), so a
# plain `SELECT *` over the join would produce duplicate column names that
# cannot be referenced unambiguously afterwards. To avoid that, every tag-side
# column whose name also exists in mov_rat_join is renamed with a "tag_"
# prefix; all other columns keep their names. Column order is unchanged:
# all mov_rat_join columns first, then all tag columns.
#
# NOTE(review): the join key is movieId only, so each rating row is paired
# with every tag row for the same movie — confirm this fan-out is intended.
shared_cols = set(mov_rat_join.columns) & set(tags_rdd.columns)
tags_prefixed = tags_rdd.select(
    [
        tags_rdd[c].alias("tag_" + c) if c in shared_cols else tags_rdd[c]
        for c in tags_rdd.columns
    ]
)
mov_all = mov_rat_join.join(
    tags_prefixed,
    on=mov_rat_join["movieId"] == tags_prefixed["tag_movieId"],
    how="inner",
)