# Spark Ratings Test Notebook

In [23]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import udf, col, month, year, dayofyear
from pyspark.sql.types import TimestampType
from datetime import datetime

### Create Spark Session

In [24]:
# Start the notebook's Spark session (an already-running one is reused by
# getOrCreate) and keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("rating_analysis")
    .getOrCreate()
)
sc = spark.sparkContext

### Read Data

In [25]:
# Load the raw ratings CSV with an explicit schema instead of inferSchema=True:
# schema inference requires an additional pass over the input just to guess
# column types. The types declared here are the ones inference reported for
# this file (userId/movieId/timestamp: integer, rating: double).
df = spark.read.csv(
    "../data/raw/ratings.csv",
    sep=",",
    header=True,
    schema="`userId` INT, `movieId` INT, `rating` DOUBLE, `timestamp` INT",
)
# Row count of the raw data, shown as the cell output.
df.count()

26024289

In [26]:
# Print the column names, types and nullability of the freshly loaded frame.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



### ID Analysis

#### userId -> user_id

In [27]:
# Normalise the user key to snake_case.
df = df.withColumnRenamed(existing="userId", new="user_id")

#### movieId -> movie_id

In [28]:
# Normalise the movie key to snake_case.
df = df.withColumnRenamed(existing="movieId", new="movie_id")

### Check Null Values

In [29]:
# Number of rows whose user_id is missing (shown as the cell output).
df.where(df["user_id"].isNull()).count()

0

In [30]:
# Number of rows whose movie_id is missing (shown as the cell output).
df.where(df["movie_id"].isNull()).count()

0

#### timestamp - Cast to Timestamp & Extract Date Features

In [31]:
def convert_timestamp(column):
    """Convert a column of Unix epoch seconds into a Spark timestamp column.

    Uses Spark's built-in integer-to-timestamp cast rather than a Python UDF
    around ``datetime.fromtimestamp``. The cast runs inside the JVM (no
    per-row Python round trip), yields null for null input instead of raising
    ``TypeError``, and does not depend on the Python worker's local timezone.

    Args:
        column: Column name (``str``) or ``Column`` expression containing
            integer seconds since the Unix epoch.

    Returns:
        A ``Column`` of ``TimestampType``.
    """
    epoch_seconds = col(column) if isinstance(column, str) else column
    return epoch_seconds.cast(TimestampType())


# Materialise the rating instant as a proper timestamp column.
df = df.withColumn("rating_time", convert_timestamp("timestamp"))

In [32]:
# Derive calendar features from rating_time, one new column per extractor.
# NOTE(review): rating_day is computed with dayofyear (1-366), not the day of
# the month, even though it is later used beneath rating_month -- confirm
# this is intended.
date_part_extractors = {
    "rating_year": year,
    "rating_month": month,
    "rating_day": dayofyear,
}
for feature_name, extract_part in date_part_extractors.items():
    df = df.withColumn(feature_name, extract_part("rating_time"))

In [33]:
# Print the schema again to confirm the renamed keys and the derived
# rating_time / rating_year / rating_month / rating_day columns.
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- movie_id: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- rating_time: timestamp (nullable = true)
 |-- rating_year: integer (nullable = true)
 |-- rating_month: integer (nullable = true)
 |-- rating_day: integer (nullable = true)



In [35]:
# Persist the processed ratings as parquet, partitioned by the derived date
# columns. The save mode is set explicitly: with the default mode the write
# raises if the target directory already exists, so the notebook could not be
# re-run end to end. "overwrite" replaces the previous output of this notebook.
(
    df.write
    .mode("overwrite")
    .partitionBy("rating_year", "rating_month", "rating_day")
    .parquet("../data/processed/ratings/")
)