# 1. Data Loading

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder.appName("YourAppName").getOrCreate()

In [3]:
def load_dfs():
    global movies, users, ratings
    movies = spark.read.csv("./Data/movieLens/movies.dat", sep="::", encoding="latin1")
    movies = movies.toDF("movie_id", "movie_name", "genre").cache()
    movies.createOrReplaceTempView("movies_info")
    
    users = spark.read.csv("./Data/movieLens/users.dat", sep="::", encoding="latin1")
    users = users.toDF("user_id", "gender", "age", "occupation", "zipcode").cache()
    users.createOrReplaceTempView("users_info")
    
    ratings = spark.read.csv("./Data/movieLens/ratings.dat", sep="::", encoding="latin1")
    ratings = ratings.toDF("user_id", "movie_id", "rating", "time_stamp").cache()
    ratings.createOrReplaceTempView("ratings_info")

In [4]:
load_dfs()

In [5]:
movies.show(5)

+--------+--------------------+--------------------+
|movie_id|          movie_name|               genre|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
+--------+--------------------+--------------------+
only showing top 5 rows



In [6]:
users.show(5)

+-------+------+---+----------+-------+
|user_id|gender|age|occupation|zipcode|
+-------+------+---+----------+-------+
|      1|     F|  1|        10|  48067|
|      2|     M| 56|        16|  70072|
|      3|     M| 25|        15|  55117|
|      4|     M| 45|         7|  02460|
|      5|     M| 25|        20|  55455|
+-------+------+---+----------+-------+
only showing top 5 rows



In [7]:
ratings.show(5)

+-------+--------+------+----------+
|user_id|movie_id|rating|time_stamp|
+-------+--------+------+----------+
|      1|    1193|     5| 978300760|
|      1|     661|     3| 978302109|
|      1|     914|     3| 978301968|
|      1|    3408|     4| 978300275|
|      1|    2355|     5| 978824291|
+-------+--------+------+----------+
only showing top 5 rows



# 2. Data Cleaning

## 2.1. Checking Null Values

In [8]:
def inspect_null(df):
    for col in df.columns:
        empty = df.filter(df[col].isNull()).count()
        print(f"For columns {col}:\t{empty} null records")

In [9]:
inspect_null(movies)

For columns movie_id:	0 null records
For columns movie_name:	0 null records
For columns genre:	0 null records


In [10]:
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns zipcode:	0 null records


In [11]:
inspect_null(ratings)

For columns user_id:	0 null records
For columns movie_id:	0 null records
For columns rating:	0 null records
For columns time_stamp:	0 null records


## 2.2. Data Types

### 2.2.1. User dataset

In [12]:
users.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zipcode: string (nullable = true)



**Gender**

In [13]:
users = users.withColumn("gender", pyspark.sql.functions.when(users["gender"] == 'M', 1).otherwise(0))

**Mapping Age to Age Category**

In [14]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

label_mapping = {
    1: 1,
    18: 2,
    25: 3,
    35: 4,
    45: 5,
    50: 6,
    56: 7
}

age_udf = udf(lambda record: label_mapping[int(record)], IntegerType())
users = users.withColumn("age", age_udf(users["age"]))

**Mapping Zipcode to Region**

In [15]:
users = users.withColumn("casted_zipcode", users["zipcode"].cast(IntegerType()))
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns zipcode:	0 null records
For columns casted_zipcode:	66 null records


In [16]:
users.filter(users["casted_zipcode"].isNull()).show(10)

+-------+------+---+----------+----------+--------------+
|user_id|gender|age|occupation|   zipcode|casted_zipcode|
+-------+------+---+----------+----------+--------------+
|    161|     1|  5|        16|98107-2117|          NULL|
|    233|     0|  5|        20|37919-4204|          NULL|
|    293|     1|  7|         1|55337-4056|          NULL|
|    458|     1|  6|        16|55405-2546|          NULL|
|    506|     1|  3|        16|55103-1006|          NULL|
|    567|     1|  4|        20|52570-9634|          NULL|
|    868|     1|  6|        17|01702-7224|          NULL|
|    913|     1|  3|         0|20744-6223|          NULL|
|    939|     0|  3|        20|20110-5616|          NULL|
|    946|     1|  4|         7|48103-8929|          NULL|
+-------+------+---+----------+----------+--------------+
only showing top 10 rows



In [17]:
def to_region(record):
    record = str(record)
    return int(record[0])

zipcode_udf = udf(lambda record: to_region(record), IntegerType())
users = users.withColumn("region", zipcode_udf(users["zipcode"]))
users = users.drop("zipcode")

In [18]:
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns casted_zipcode:	66 null records
For columns region:	0 null records


**Asserting that All the data are in integer type with no nulls**

In [19]:
for col in users.columns:
    users = users.withColumn(col, users[col].cast(IntegerType()))
    
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns casted_zipcode:	66 null records
For columns region:	0 null records


### 2.2.2. For Movies Dataset

In [20]:
movies.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- genre: string (nullable = true)



**Transforming movies_id to integer**

In [21]:
movies = movies.withColumn("movie_id", movies["movie_id"].cast(IntegerType()))

**Parsing movie_name to year and name**

In [22]:
import re

def extract_date(record):
    pattern  = r'\((\d{4})\)'
    if re.findall(pattern, record.strip()[-6:]):
        return int(record.strip()[-5:-1])
    return None

def extract_name(record):
    pattern  = r'\((\d{4})\)'
    if re.findall(pattern, record.strip()[-6:]):
        return record.strip()[:-6].strip()
    return record

# Define UDFs for extract_date and extract_name functions
extract_date_udf = udf(lambda record: extract_date(record), IntegerType())
extract_name_udf = udf(lambda record: extract_name(record), StringType())

movies = movies.withColumn("year", extract_date_udf(movies["movie_name"]))
movies = movies.withColumn("name", extract_name_udf(movies["movie_name"]))

**Parsing the genre into a serie of genres**

In [23]:
movies = movies.withColumn("parsed_genre", pyspark.sql.functions.explode(pyspark.sql.functions.split(movies["genre"], "\\s*\\|\\s*")))
movies = movies.withColumn("value", (movies["parsed_genre"]==movies["parsed_genre"]).cast(IntegerType()))

### 2.2.3. For Ratings Dataset

In [24]:
ratings.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- time_stamp: string (nullable = true)



**Casting All the attributes to int type**

In [25]:
for col in ratings.columns:
    ratings = ratings.withColumn(col, ratings[col].cast(IntegerType()))

ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- movie_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- time_stamp: integer (nullable = true)



# 3. Saving The cleaned datasets 

In [26]:
ratings.toPandas().to_csv("./Data/cleaned_data_1/ratings.csv", header=True, columns = ratings.columns, index=False)
movies.toPandas().to_csv("./Data/cleaned_data_1/movies.csv", header=True, columns = movies.columns, index=False)
users.toPandas().to_csv("./Data/cleaned_data_1/users.csv", header=True, columns = users.columns, index=False)

# 4. Feature Engineering

In [27]:
movies.createOrReplaceTempView("movies_info")
users.createOrReplaceTempView("users_info")
ratings.createOrReplaceTempView("ratings_info")

## 4.1. Movies Dataset

**Features used for the movie:**
1. year
2. genres
3. watch_count
4. popularity among its genre
5. avarage rating
6. rating ratio per genre

**watch count**

In [28]:
popularity = spark.sql("SELECT movie_id, COUNT(DISTINCT(user_id)) AS watches FROM ratings_info GROUP BY movie_id")
popularity.createOrReplaceTempView("popularity_info")

In [29]:
popularity.show(5)

+--------+-------+
|movie_id|watches|
+--------+-------+
|    1580|   2538|
|     471|    599|
|    3175|   1728|
|    1959|    626|
|    3794|    121|
+--------+-------+
only showing top 5 rows



**Popularity among its genre**

In [30]:
query = """
    SELECT parsed_genre AS genre, COUNT(user_id) AS genre_count
    FROM   ratings_info LEFT JOIN movies_info ON movies_info.movie_id = ratings_info.movie_id  
    GROUP BY parsed_genre
"""

watches_per_genre = spark.sql(query)
watches_per_genre.createOrReplaceTempView("watches_per_genre_info")

In [31]:
watches_per_genre.show(5)

+---------+-----------+
|    genre|genre_count|
+---------+-----------+
|    Crime|      79541|
|  Romance|     147523|
| Thriller|     189680|
|Adventure|     133953|
|    Drama|     354529|
+---------+-----------+
only showing top 5 rows



In [32]:
query = """
    SELECT *, watches/genre_count AS popularity_per_genre
    FROM    (SELECT movies_info.movie_id AS movie_id, year, parsed_genre, watches
             FROM   movies_info INNER JOIN popularity_info ON movies_info.movie_id = popularity_info.movie_id
            ) A INNER JOIN 
            watches_per_genre_info ON A.parsed_genre = watches_per_genre_info.genre
"""

df = spark.sql(query)
df = df.drop("parsed_genre")
df.createOrReplaceTempView("df_info")
df.show(5)

+--------+----+-------+---------+-----------+--------------------+
|movie_id|year|watches|    genre|genre_count|popularity_per_genre|
+--------+----+-------+---------+-----------+--------------------+
|    1580|1997|   2538|   Sci-Fi|     157294|  0.0161353897796483|
|    1580|1997|   2538|   Comedy|     356580|0.007117617364967188|
|    1580|1997|   2538|Adventure|     133953| 0.01894694407740028|
|    1580|1997|   2538|   Action|     257457|0.009857956862699403|
|     471|1994|    599|  Romance|     147523|0.004060383804559289|
+--------+----+-------+---------+-----------+--------------------+
only showing top 5 rows



**Avarage Rating**

In [33]:
query = """
    SELECT movie_id, AVG(rating) AS avg_rating 
    FROM ratings_info 
    GROUP BY movie_id
"""

avg = spark.sql(query)
avg.createOrReplaceTempView("avg_info")
avg.show(5)

+--------+------------------+
|movie_id|        avg_rating|
+--------+------------------+
|    1580| 3.739952718676123|
|    2366|3.6560846560846563|
|    1088|3.3114992721979624|
|    1959|3.6533546325878596|
|    3175| 3.771412037037037|
+--------+------------------+
only showing top 5 rows



In [34]:
query = """
    SELECT A.movie_id, genre, year, watches, genre_count, popularity_per_genre, avg_rating
    FROM   df_info AS A LEFT JOIN avg_info ON A.movie_id = avg_info.movie_id
"""

df = spark.sql(query)
df.createOrReplaceTempView("df_info")
df.show(5)

+--------+---------+----+-------+-----------+--------------------+-----------------+
|movie_id|    genre|year|watches|genre_count|popularity_per_genre|       avg_rating|
+--------+---------+----+-------+-----------+--------------------+-----------------+
|    1580|   Sci-Fi|1997|   2538|     157294|  0.0161353897796483|3.739952718676123|
|    1580|   Comedy|1997|   2538|     356580|0.007117617364967188|3.739952718676123|
|    1580|Adventure|1997|   2538|     133953| 0.01894694407740028|3.739952718676123|
|    1580|   Action|1997|   2538|     257457|0.009857956862699403|3.739952718676123|
|     471|  Romance|1994|    599|     147523|0.004060383804559289|3.631051752921536|
+--------+---------+----+-------+-----------+--------------------+-----------------+
only showing top 5 rows



**rating ratio to genre rating**

In [35]:
query = """
    SELECT A.parsed_genre AS genre, MEAN(B.rating) AS mean_genre_rating
    FROM   movies_info AS A JOIN ratings_info B ON A.movie_id = B.movie_id
    GROUP BY A.parsed_genre
"""

avg = spark.sql(query)
avg.createOrReplaceTempView("avg_info")
avg.show(5)

+---------+------------------+
|    genre| mean_genre_rating|
+---------+------------------+
|    Crime| 3.708678543141273|
|  Romance| 3.607464598740535|
| Thriller|3.5704660480809784|
|Adventure| 3.477256948332624|
|    Drama| 3.766332232342065|
+---------+------------------+
only showing top 5 rows



In [36]:
query = """
    SELECT A.movie_id, A.genre, A.year, watches, popularity_per_genre, avg_rating, avg_rating/mean_genre_rating AS rating_per_genre
    FROM   df_info AS A LEFT JOIN avg_info B ON A.genre = B.genre
"""

df = spark.sql(query)
df.createOrReplaceTempView("df_info")
df.show(5)

+--------+---------+----+-------+--------------------+-----------------+------------------+
|movie_id|    genre|year|watches|popularity_per_genre|       avg_rating|  rating_per_genre|
+--------+---------+----+-------+--------------------+-----------------+------------------+
|    1580|   Sci-Fi|1997|   2538|  0.0161353897796483|3.739952718676123|1.0788777579469762|
|    1580|   Comedy|1997|   2538|0.007117617364967188|3.739952718676123|1.0618534293265696|
|    1580|Adventure|1997|   2538| 0.01894694407740028|3.739952718676123|1.0755468388579865|
|    1580|   Action|1997|   2538|0.009857956862699403|3.739952718676123|1.0712559739797276|
|     471|  Romance|1994|    599|0.004060383804559289|3.631051752921536|1.0065384298405133|
+--------+---------+----+-------+--------------------+-----------------+------------------+
only showing top 5 rows



In [37]:
df.toPandas().to_csv("./Data/cleaned_data/unpivoted_movies_features.csv", header=True, columns=df.columns, index=False)

### 4.1.2. Pivoting the Movies Table

In [38]:
excluded = ["movie_id", "year", "watches", "avg_rating"]

sub1 = df[["movie_id", "genre", "year", "watches", "avg_rating", "popularity_per_genre"]]
sub1 = sub1.groupBy(["movie_id", "year", "watches", "avg_rating"]).pivot("genre").sum("popularity_per_genre")

columns = {col: 0 for col in sub1.columns if not(col in excluded)}
sub1 = sub1.fillna(columns)

In [39]:
sub1.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- watches: long (nullable = false)
 |-- avg_rating: double (nullable = true)
 |-- Action: double (nullable = false)
 |-- Adventure: double (nullable = false)
 |-- Animation: double (nullable = false)
 |-- Children's: double (nullable = false)
 |-- Comedy: double (nullable = false)
 |-- Crime: double (nullable = false)
 |-- Documentary: double (nullable = false)
 |-- Drama: double (nullable = false)
 |-- Fantasy: double (nullable = false)
 |-- Film-Noir: double (nullable = false)
 |-- Horror: double (nullable = false)
 |-- Musical: double (nullable = false)
 |-- Mystery: double (nullable = false)
 |-- Romance: double (nullable = false)
 |-- Sci-Fi: double (nullable = false)
 |-- Thriller: double (nullable = false)
 |-- War: double (nullable = false)
 |-- Western: double (nullable = false)



In [40]:
excluded = ["movie_id", "year", "watches", "avg_rating"]
for col in sub1.columns:
    if not(col in excluded):
        sub1 = sub1.withColumnRenamed(col, "popularity_per_"+col)

sub1.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- watches: long (nullable = false)
 |-- avg_rating: double (nullable = true)
 |-- popularity_per_Action: double (nullable = false)
 |-- popularity_per_Adventure: double (nullable = false)
 |-- popularity_per_Animation: double (nullable = false)
 |-- popularity_per_Children's: double (nullable = false)
 |-- popularity_per_Comedy: double (nullable = false)
 |-- popularity_per_Crime: double (nullable = false)
 |-- popularity_per_Documentary: double (nullable = false)
 |-- popularity_per_Drama: double (nullable = false)
 |-- popularity_per_Fantasy: double (nullable = false)
 |-- popularity_per_Film-Noir: double (nullable = false)
 |-- popularity_per_Horror: double (nullable = false)
 |-- popularity_per_Musical: double (nullable = false)
 |-- popularity_per_Mystery: double (nullable = false)
 |-- popularity_per_Romance: double (nullable = false)
 |-- popularity_per_Sci-Fi: double (nullable = false)
 |-- po

In [41]:
excluded = ["movie_id", "year"]

sub2 = df[["movie_id", "genre", "year", "rating_per_genre"]]
sub2 = sub2.groupBy(["movie_id", "year"]).pivot("genre").sum("rating_per_genre")

columns = {col: 0 for col in sub2.columns if not(col in excluded)}
sub2 = sub2.fillna(columns)

In [42]:
sub2.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- Action: double (nullable = false)
 |-- Adventure: double (nullable = false)
 |-- Animation: double (nullable = false)
 |-- Children's: double (nullable = false)
 |-- Comedy: double (nullable = false)
 |-- Crime: double (nullable = false)
 |-- Documentary: double (nullable = false)
 |-- Drama: double (nullable = false)
 |-- Fantasy: double (nullable = false)
 |-- Film-Noir: double (nullable = false)
 |-- Horror: double (nullable = false)
 |-- Musical: double (nullable = false)
 |-- Mystery: double (nullable = false)
 |-- Romance: double (nullable = false)
 |-- Sci-Fi: double (nullable = false)
 |-- Thriller: double (nullable = false)
 |-- War: double (nullable = false)
 |-- Western: double (nullable = false)



In [43]:
excluded = ["movie_id", "year"]
for col in sub2.columns:
    if not(col in excluded):
        sub2 = sub2.withColumnRenamed(col, "rating_per_"+col)

sub2.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- rating_per_Action: double (nullable = false)
 |-- rating_per_Adventure: double (nullable = false)
 |-- rating_per_Animation: double (nullable = false)
 |-- rating_per_Children's: double (nullable = false)
 |-- rating_per_Comedy: double (nullable = false)
 |-- rating_per_Crime: double (nullable = false)
 |-- rating_per_Documentary: double (nullable = false)
 |-- rating_per_Drama: double (nullable = false)
 |-- rating_per_Fantasy: double (nullable = false)
 |-- rating_per_Film-Noir: double (nullable = false)
 |-- rating_per_Horror: double (nullable = false)
 |-- rating_per_Musical: double (nullable = false)
 |-- rating_per_Mystery: double (nullable = false)
 |-- rating_per_Romance: double (nullable = false)
 |-- rating_per_Sci-Fi: double (nullable = false)
 |-- rating_per_Thriller: double (nullable = false)
 |-- rating_per_War: double (nullable = false)
 |-- rating_per_Western: double (nullable = fals

In [44]:
sub1.createOrReplaceTempView("sub1_info")
sub2.createOrReplaceTempView("sub2_info")

query = """
        SELECT * 
        FROM sub1_info INNER JOIN sub2_info
        USING (movie_id, year)
"""

sub1 = spark.sql(query)
sub1.createOrReplaceTempView("sub1_info")
sub1.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- watches: long (nullable = false)
 |-- avg_rating: double (nullable = true)
 |-- popularity_per_Action: double (nullable = false)
 |-- popularity_per_Adventure: double (nullable = false)
 |-- popularity_per_Animation: double (nullable = false)
 |-- popularity_per_Children's: double (nullable = false)
 |-- popularity_per_Comedy: double (nullable = false)
 |-- popularity_per_Crime: double (nullable = false)
 |-- popularity_per_Documentary: double (nullable = false)
 |-- popularity_per_Drama: double (nullable = false)
 |-- popularity_per_Fantasy: double (nullable = false)
 |-- popularity_per_Film-Noir: double (nullable = false)
 |-- popularity_per_Horror: double (nullable = false)
 |-- popularity_per_Musical: double (nullable = false)
 |-- popularity_per_Mystery: double (nullable = false)
 |-- popularity_per_Romance: double (nullable = false)
 |-- popularity_per_Sci-Fi: double (nullable = false)
 |-- po

In [45]:
inspect_null(sub1)

For columns movie_id:	0 null records
For columns year:	0 null records
For columns watches:	0 null records
For columns avg_rating:	0 null records
For columns popularity_per_Action:	0 null records
For columns popularity_per_Adventure:	0 null records
For columns popularity_per_Animation:	0 null records
For columns popularity_per_Children's:	0 null records
For columns popularity_per_Comedy:	0 null records
For columns popularity_per_Crime:	0 null records
For columns popularity_per_Documentary:	0 null records
For columns popularity_per_Drama:	0 null records
For columns popularity_per_Fantasy:	0 null records
For columns popularity_per_Film-Noir:	0 null records
For columns popularity_per_Horror:	0 null records
For columns popularity_per_Musical:	0 null records
For columns popularity_per_Mystery:	0 null records
For columns popularity_per_Romance:	0 null records
For columns popularity_per_Sci-Fi:	0 null records
For columns popularity_per_Thriller:	0 null records
For columns popularity_per_War:	0

In [46]:
sub1.toPandas().to_csv("./Data/cleaned_data_1/pivoted_movies_features.csv", header=True, columns=sub1.columns, index=False)

## 4.2. Users Dataset

**Features used for the user:**

1. gender
2. age class
3. Occupation class
4. Region
5. Avarage ratings
6. number of watched movies
7. avarage rating per genre
8. the mode year of the movies watched
9. Median year of the movies watched

**For missing category avarage rating & Popularity avarage rating impute with avarage rating of all users**

In [47]:
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = false)
 |-- age: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- casted_zipcode: integer (nullable = true)
 |-- region: integer (nullable = true)



**Avarage ratings & number of watched movies**

In [48]:
query = """
    SELECT  A.user_id, A.gender, A.age, A.occupation, A.region, B.avg_rating, B.watched_movies
    FROM    users_info A
            INNER JOIN 
            (SELECT   user_id, MEAN(rating) AS avg_rating, COUNT(movie_id) AS watched_movies
            FROM     ratings_info
            GROUP BY user_id) B
            ON B.user_id = A.user_id
"""

users = spark.sql(query)
users.createOrReplaceTempView("users_info")
users.show(5)

+-------+------+---+----------+------+------------------+--------------+
|user_id|gender|age|occupation|region|        avg_rating|watched_movies|
+-------+------+---+----------+------+------------------+--------------+
|    148|     1|  6|        17|     5| 3.733974358974359|           624|
|    463|     1|  3|         7|     5|               3.0|           123|
|    471|     1|  4|         7|     0|3.6285714285714286|           105|
|    496|     1|  2|         4|     5| 4.294117647058823|           119|
|    833|     1|  4|         7|     4|4.0476190476190474|            21|
+-------+------+---+----------+------+------------------+--------------+
only showing top 5 rows



In [49]:
movies.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- parsed_genre: string (nullable = false)
 |-- value: integer (nullable = false)



**Avarage ratings per genre**

In [50]:
spark.sql("SELECT * FROM movies_info").show(5)

+--------+----------------+--------------------+----+---------+------------+-----+
|movie_id|      movie_name|               genre|year|     name|parsed_genre|value|
+--------+----------------+--------------------+----+---------+------------+-----+
|       1|Toy Story (1995)|Animation|Childre...|1995|Toy Story|   Animation|    1|
|       1|Toy Story (1995)|Animation|Childre...|1995|Toy Story|  Children's|    1|
|       1|Toy Story (1995)|Animation|Childre...|1995|Toy Story|      Comedy|    1|
|       2|  Jumanji (1995)|Adventure|Childre...|1995|  Jumanji|   Adventure|    1|
|       2|  Jumanji (1995)|Adventure|Childre...|1995|  Jumanji|  Children's|    1|
+--------+----------------+--------------------+----+---------+------------+-----+
only showing top 5 rows



In [51]:
query = """SELECT  parsed_genre, user_id,  MEAN(rating) avg_rating_per_genre
            FROM    ratings_info A INNER JOIN movies_info B USING (movie_id)
            GROUP BY parsed_genre, user_id
"""

avg_per_genre = spark.sql(query)
avg_per_genre.show(5)

+------------+-------+--------------------+
|parsed_genre|user_id|avg_rating_per_genre|
+------------+-------+--------------------+
|      Sci-Fi|      9|   3.888888888888889|
|      Sci-Fi|     28|                 3.6|
|     Western|     28|                 3.0|
|     Fantasy|     37|                 2.0|
|   Adventure|     42|  3.8059701492537314|
+------------+-------+--------------------+
only showing top 5 rows



In [52]:
excluded = ["user_id"]
avg_per_genre = avg_per_genre.groupBy("user_id").pivot("parsed_genre").sum("avg_rating_per_genre")
columns = {col: 0 for col in avg_per_genre.columns if not(col in excluded)}
avg_per_genre = avg_per_genre.fillna(columns)

for col in avg_per_genre.columns:
    if not(col in excluded):
        avg_per_genre = avg_per_genre.withColumnRenamed(col, "avg_rating_for_"+col)

        
avg_per_genre.createOrReplaceTempView("avg_info")
avg_per_genre.show(5)

+-------+---------------------+------------------------+------------------------+-------------------------+---------------------+--------------------+--------------------------+--------------------+----------------------+------------------------+---------------------+----------------------+----------------------+----------------------+---------------------+-----------------------+------------------+----------------------+
|user_id|avg_rating_for_Action|avg_rating_for_Adventure|avg_rating_for_Animation|avg_rating_for_Children's|avg_rating_for_Comedy|avg_rating_for_Crime|avg_rating_for_Documentary|avg_rating_for_Drama|avg_rating_for_Fantasy|avg_rating_for_Film-Noir|avg_rating_for_Horror|avg_rating_for_Musical|avg_rating_for_Mystery|avg_rating_for_Romance|avg_rating_for_Sci-Fi|avg_rating_for_Thriller|avg_rating_for_War|avg_rating_for_Western|
+-------+---------------------+------------------------+------------------------+-------------------------+---------------------+-------------------

In [53]:
query = """
        SELECT  *
        FROM    users_info INNER JOIN avg_info USING (user_id)
"""

users = spark.sql(query)
users.createOrReplaceTempView("users_info")
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = false)
 |-- age: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- watched_movies: long (nullable = false)
 |-- avg_rating_for_Action: double (nullable = false)
 |-- avg_rating_for_Adventure: double (nullable = false)
 |-- avg_rating_for_Animation: double (nullable = false)
 |-- avg_rating_for_Children's: double (nullable = false)
 |-- avg_rating_for_Comedy: double (nullable = false)
 |-- avg_rating_for_Crime: double (nullable = false)
 |-- avg_rating_for_Documentary: double (nullable = false)
 |-- avg_rating_for_Drama: double (nullable = false)
 |-- avg_rating_for_Fantasy: double (nullable = false)
 |-- avg_rating_for_Film-Noir: double (nullable = false)
 |-- avg_rating_for_Horror: double (nullable = false)
 |-- avg_rating_for_Musical: double (nullable = false)
 |-- avg_rating_for_Mystery: double (nullabl

In [54]:
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = false)
 |-- age: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- watched_movies: long (nullable = false)
 |-- avg_rating_for_Action: double (nullable = false)
 |-- avg_rating_for_Adventure: double (nullable = false)
 |-- avg_rating_for_Animation: double (nullable = false)
 |-- avg_rating_for_Children's: double (nullable = false)
 |-- avg_rating_for_Comedy: double (nullable = false)
 |-- avg_rating_for_Crime: double (nullable = false)
 |-- avg_rating_for_Documentary: double (nullable = false)
 |-- avg_rating_for_Drama: double (nullable = false)
 |-- avg_rating_for_Fantasy: double (nullable = false)
 |-- avg_rating_for_Film-Noir: double (nullable = false)
 |-- avg_rating_for_Horror: double (nullable = false)
 |-- avg_rating_for_Musical: double (nullable = false)
 |-- avg_rating_for_Mystery: double (nullabl

In [55]:
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = false)
 |-- age: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- watched_movies: long (nullable = false)
 |-- avg_rating_for_Action: double (nullable = false)
 |-- avg_rating_for_Adventure: double (nullable = false)
 |-- avg_rating_for_Animation: double (nullable = false)
 |-- avg_rating_for_Children's: double (nullable = false)
 |-- avg_rating_for_Comedy: double (nullable = false)
 |-- avg_rating_for_Crime: double (nullable = false)
 |-- avg_rating_for_Documentary: double (nullable = false)
 |-- avg_rating_for_Drama: double (nullable = false)
 |-- avg_rating_for_Fantasy: double (nullable = false)
 |-- avg_rating_for_Film-Noir: double (nullable = false)
 |-- avg_rating_for_Horror: double (nullable = false)
 |-- avg_rating_for_Musical: double (nullable = false)
 |-- avg_rating_for_Mystery: double (nullabl

In [56]:
query = """
        SELECT  user_id, MEAN(year) AS year_preferred
        FROM    (SELECT * FROM ratings_info WHERE rating>3) A 
                INNER JOIN 
                (SELECT movie_id, MEAN(year) year FROM movies_info GROUP BY movie_id) B
                USING (movie_id)
        GROUP BY user_id
"""

year = spark.sql(query)
year.show(5)

+-------+------------------+
|user_id|    year_preferred|
+-------+------------------+
|    148|1985.5959079283887|
|    463|1981.9772727272727|
|    471|1977.6949152542372|
|    496|1989.5643564356435|
|    833|         1992.9375|
+-------+------------------+
only showing top 5 rows



In [57]:
year[year["year_preferred"].isNull()].show()

+-------+--------------+
|user_id|year_preferred|
+-------+--------------+
+-------+--------------+



In [58]:
users.createOrReplaceTempView("users_info")
year.createOrReplaceTempView("year_info")

In [59]:
query = """
        SELECT    *
        FROM users_info INNER JOIN year_info USING (user_id)
"""

users = spark.sql(query)
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- gender: integer (nullable = false)
 |-- age: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- watched_movies: long (nullable = false)
 |-- avg_rating_for_Action: double (nullable = false)
 |-- avg_rating_for_Adventure: double (nullable = false)
 |-- avg_rating_for_Animation: double (nullable = false)
 |-- avg_rating_for_Children's: double (nullable = false)
 |-- avg_rating_for_Comedy: double (nullable = false)
 |-- avg_rating_for_Crime: double (nullable = false)
 |-- avg_rating_for_Documentary: double (nullable = false)
 |-- avg_rating_for_Drama: double (nullable = false)
 |-- avg_rating_for_Fantasy: double (nullable = false)
 |-- avg_rating_for_Film-Noir: double (nullable = false)
 |-- avg_rating_for_Horror: double (nullable = false)
 |-- avg_rating_for_Musical: double (nullable = false)
 |-- avg_rating_for_Mystery: double (nullabl

In [60]:
users.select("region").distinct().show(25)

+------+
|region|
+------+
|     1|
|     6|
|     3|
|     5|
|     9|
|     4|
|     8|
|     7|
|     2|
|     0|
+------+



In [61]:
users.toPandas().to_csv("./Data/cleaned_data_1/pivoted_users_features.csv", header=True, columns=users.columns, index=False)
users.createOrReplaceTempView("users_info")
sub1.createOrReplaceTempView("movies_info")
print(len(sub1.columns))
print(len(users.columns))

40
26


# 5. Joining Features and creating unified dataset

In [62]:
query = """
        SELECT *
        FROM (ratings_info INNER JOIN users_info USING (user_id)) INNER JOIN sub1_info USING (movie_id)
"""

result = spark.sql(query)

In [63]:
len(result.columns)

68

In [64]:
result.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- time_stamp: integer (nullable = true)
 |-- gender: integer (nullable = false)
 |-- age: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- region: integer (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- watched_movies: long (nullable = false)
 |-- avg_rating_for_Action: double (nullable = false)
 |-- avg_rating_for_Adventure: double (nullable = false)
 |-- avg_rating_for_Animation: double (nullable = false)
 |-- avg_rating_for_Children's: double (nullable = false)
 |-- avg_rating_for_Comedy: double (nullable = false)
 |-- avg_rating_for_Crime: double (nullable = false)
 |-- avg_rating_for_Documentary: double (nullable = false)
 |-- avg_rating_for_Drama: double (nullable = false)
 |-- avg_rating_for_Fantasy: double (nullable = false)
 |-- avg_rating_for_Film-Noir: double (nullable = false)
 |-- avg_rating_for_Horror: do

In [65]:
result.toPandas().to_csv("./Data/cleaned_data_1/unified_rating_features.csv", header=True, columns=result.columns, index=False)