# 1. Data Loading

In [None]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz
!tar xf spark-3.4.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install -q pyspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql import SparkSession
import pyspark.ml as ml

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [None]:
spark = SparkSession.builder.appName("YourAppName").getOrCreate()

In [None]:
def load_dfs():
    global movies, users, ratings
    movies = spark.read.csv("/content/drive/MyDrive/movieLens/movies.dat", sep="::", encoding="latin1")
    movies = movies.toDF("movie_id", "movie_name", "genre").cache()
    movies.createOrReplaceTempView("movies_info")

    users = spark.read.csv("/content/drive/MyDrive/movieLens/users.dat", sep="::", encoding="latin1")
    users = users.toDF("user_id", "gender", "age", "occupation", "zipcode").cache()
    users.createOrReplaceTempView("users_info")

    ratings = spark.read.csv("/content/drive/MyDrive/movieLens/ratings.dat", sep="::", encoding="latin1")
    ratings = ratings.toDF("user_id", "movie_id", "rating", "time_stamp").cache()
    ratings.createOrReplaceTempView("ratings_info")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
load_dfs()

In [None]:
movies.show(5)

+--------+--------------------+--------------------+
|movie_id|          movie_name|               genre|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
+--------+--------------------+--------------------+
only showing top 5 rows



In [None]:
users.show(5)

+-------+------+---+----------+-------+
|user_id|gender|age|occupation|zipcode|
+-------+------+---+----------+-------+
|      1|     F|  1|        10|  48067|
|      2|     M| 56|        16|  70072|
|      3|     M| 25|        15|  55117|
|      4|     M| 45|         7|  02460|
|      5|     M| 25|        20|  55455|
+-------+------+---+----------+-------+
only showing top 5 rows



In [None]:
ratings.show(5)

+-------+--------+------+----------+
|user_id|movie_id|rating|time_stamp|
+-------+--------+------+----------+
|      1|    1193|     5| 978300760|
|      1|     661|     3| 978302109|
|      1|     914|     3| 978301968|
|      1|    3408|     4| 978300275|
|      1|    2355|     5| 978824291|
+-------+--------+------+----------+
only showing top 5 rows



# 2. Checking Null Values

In [None]:
def inspect_null(df):
    for col in df.columns:
        empty = df.filter(df[col].isNull()).count()
        print(f"For columns {col}:\t{empty} null records")

In [None]:
inspect_null(movies)

For columns movie_id:	0 null records
For columns movie_name:	0 null records
For columns genre:	0 null records


In [None]:
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns zipcode:	0 null records


In [None]:
inspect_null(ratings)

For columns user_id:	0 null records
For columns movie_id:	0 null records
For columns rating:	0 null records
For columns time_stamp:	0 null records


# 3. Encoding String Data Types

## 3.1. For Users Dataset

In [None]:
users.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zipcode: string (nullable = true)



**Gender**

In [None]:
users = users.withColumn("gender", pyspark.sql.functions.when(users["gender"] == 'M', 1).otherwise(0))

**Mapping Age to Age Category**

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

label_mapping = {
    1: 1,
    18: 2,
    25: 3,
    35: 4,
    45: 5,
    50: 6,
    56: 7
}

age_udf = udf(lambda record: label_mapping[int(record)], IntegerType())
users = users.withColumn("age", age_udf(users["age"]))

In [None]:
users.show(5)

+-------+------+---+----------+-------+
|user_id|gender|age|occupation|zipcode|
+-------+------+---+----------+-------+
|      1|     0|  1|        10|  48067|
|      2|     1|  7|        16|  70072|
|      3|     1|  3|        15|  55117|
|      4|     1|  5|         7|  02460|
|      5|     1|  3|        20|  55455|
+-------+------+---+----------+-------+
only showing top 5 rows



**Mapping Zipcode to Region & Imputing the missing values by 0**

In [None]:
users = users.withColumn("casted_zipcode", users["zipcode"].cast(IntegerType()))

In [None]:
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns zipcode:	0 null records
For columns casted_zipcode:	66 null records


In [None]:
users.filter(users["casted_zipcode"].isNull()).show(100)
# Show the first 5 rows of the result

+-------+------+---+----------+----------+--------------+
|user_id|gender|age|occupation|   zipcode|casted_zipcode|
+-------+------+---+----------+----------+--------------+
|    161|     1|  5|        16|98107-2117|          NULL|
|    233|     0|  5|        20|37919-4204|          NULL|
|    293|     1|  7|         1|55337-4056|          NULL|
|    458|     1|  6|        16|55405-2546|          NULL|
|    506|     1|  3|        16|55103-1006|          NULL|
|    567|     1|  4|        20|52570-9634|          NULL|
|    868|     1|  6|        17|01702-7224|          NULL|
|    913|     1|  3|         0|20744-6223|          NULL|
|    939|     0|  3|        20|20110-5616|          NULL|
|    946|     1|  4|         7|48103-8929|          NULL|
|   1046|     1|  7|        18|53404-1230|          NULL|
|   1081|     1|  2|         4|68144-2410|          NULL|
|   1139|     1|  3|         1|93420-2852|          NULL|
|   1201|     1|  2|         4|84112-2004|          NULL|
|   1463|     

In [None]:
def to_integer(record):
    try:
        record = int(record)
    except:
        record = int(record[:5])
    return record

zipcode_udf = udf(lambda record: to_integer(record), IntegerType())
users = users.withColumn("casted_zipcode", zipcode_udf(users["zipcode"]))

In [None]:
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns zipcode:	0 null records
For columns casted_zipcode:	0 null records


In [None]:
zipcode_udf = udf(lambda record: record // 10000, IntegerType())
users = users.withColumn("region", zipcode_udf(users["casted_zipcode"]))

**Asserting that All the data are in integer type with no nulls**

In [None]:
inspect_null(users)

For columns user_id:	0 null records
For columns gender:	0 null records
For columns age:	0 null records
For columns occupation:	0 null records
For columns zipcode:	0 null records
For columns casted_zipcode:	0 null records
For columns region:	0 null records


In [None]:
for col in users.columns:
    users = users.withColumn(col, users[col].cast(IntegerType()))

In [None]:
users.show(5)

+-------+------+---+----------+-------+--------------+------+
|user_id|gender|age|occupation|zipcode|casted_zipcode|region|
+-------+------+---+----------+-------+--------------+------+
|      1|     0|  1|        10|  48067|         48067|     4|
|      2|     1|  7|        16|  70072|         70072|     7|
|      3|     1|  3|        15|  55117|         55117|     5|
|      4|     1|  5|         7|   2460|          2460|     0|
|      5|     1|  3|        20|  55455|         55455|     5|
+-------+------+---+----------+-------+--------------+------+
only showing top 5 rows



**Applying One Hot Encoding on the Region & Occupation variables**

In [None]:
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCol="occupation", dropLast=False, outputCol="encoded_occupation")
users = encoder.fit(users).transform(users)

encoder = OneHotEncoder(inputCol="region", dropLast=False, outputCol="encoded_region")
users = encoder.fit(users).transform(users)

**Presenting the dataset after processing**

In [None]:
users.show(5)

+-------+------+---+----------+-------+--------------+------+------------------+-----------------+
|user_id|gender|age|occupation|zipcode|casted_zipcode|region|encoded_occupation|   encoded_region|
+-------+------+---+----------+-------+--------------+------+------------------+-----------------+
|      1|     0|  1|        10|  48067|         48067|     4|   (21,[10],[1.0])|(19313,[4],[1.0])|
|      2|     1|  7|        16|  70072|         70072|     7|   (21,[16],[1.0])|(19313,[7],[1.0])|
|      3|     1|  3|        15|  55117|         55117|     5|   (21,[15],[1.0])|(19313,[5],[1.0])|
|      4|     1|  5|         7|   2460|          2460|     0|    (21,[7],[1.0])|(19313,[0],[1.0])|
|      5|     1|  3|        20|  55455|         55455|     5|   (21,[20],[1.0])|(19313,[5],[1.0])|
+-------+------+---+----------+-------+--------------+------+------------------+-----------------+
only showing top 5 rows



## 3.2. For Movies Dataset

In [None]:
movies.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- genre: string (nullable = true)



In [None]:
movies.show(5)

+--------+--------------------+--------------------+
|movie_id|          movie_name|               genre|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
+--------+--------------------+--------------------+
only showing top 5 rows



**Transforming movies_id to integer**

In [86]:
movies = movies.withColumn("movie_id", movies["movie_id"].cast(IntegerType()))

**Parsing movie_name to year and name**

In [87]:
import re

def extract_date(record):
    pattern  = r'\((\d{4})\)'
    if re.findall(pattern, record.strip()[-6:]):
        return int(record.strip()[-5:-1])
    return None

def extract_name(record):
    pattern  = r'\((\d{4})\)'
    if re.findall(pattern, record.strip()[-6:]):
        return record.strip()[:-6].strip()
    return record

In [88]:
# Define UDFs for extract_date and extract_name functions
extract_date_udf = udf(lambda record: extract_date(record), IntegerType())
extract_name_udf = udf(lambda record: extract_name(record), StringType())

movies = movies.withColumn("year", extract_date_udf(movies["movie_name"]))
movies = movies.withColumn("name", extract_name_udf(movies["movie_name"]))

In [89]:
movies.show(5)

+--------+--------------------+--------------------+----+-----------------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|movie_id|          movie_name|                name|year|            genre|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+--------+--------------------+--------------------+----+-----------------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|    1387|         Jaws (1975)|                Jaws|1975|    Action|Horror|     1|        0|        0|         0|     0|    0|          0|    0|      0|        0|     1|      0|      0|      0|     0|       0|  0|      0|
|    2453|Boy Who Could Fly...|Boy Who Could Fly...|1986|    Drama|Fantasy|     0|        0|        0|         0

**Parsing the genre into a serie of genres**

In [90]:
movies = movies.withColumn("parsed_genre", pyspark.sql.functions.explode(pyspark.sql.functions.split(movies["genre"], "\\s*\\|\\s*")))
parsed_movies = movies.withColumn("value", (movies["parsed_genre"]==movies["parsed_genre"]).cast(IntegerType()))

In [91]:
parsed_movies.show(5)

+--------+--------------------+--------------------+----+-------------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------------+-----+
|movie_id|          movie_name|                name|year|        genre|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|parsed_genre|value|
+--------+--------------------+--------------------+----+-------------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------------+-----+
|    1387|         Jaws (1975)|                Jaws|1975|Action|Horror|     1|        0|        0|         0|     0|    0|          0|    0|      0|        0|     1|      0|      0|      0|     0|       0|  0|      0|      Action|    1|
|    1387|         Jaws (1975)|                Jaws|

In [92]:
excluded = ["movie_id", "movie_name", "name", "year", "genre"]
parsed_movies = parsed_movies.groupBy(excluded).pivot("parsed_genre").sum("value")

columns = {col: 0 for col in parsed_movies.columns if not(col in excluded)}
parsed_movies = parsed_movies.fillna(columns)

In [93]:
parsed_movies.show(5)

+--------+--------------------+--------------------+----+-----------------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|movie_id|          movie_name|                name|year|            genre|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+--------+--------------------+--------------------+----+-----------------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|    1387|         Jaws (1975)|                Jaws|1975|    Action|Horror|     1|        0|        0|         0|     0|    0|          0|    0|      0|        0|     1|      0|      0|      0|     0|       0|  0|      0|
|    2453|Boy Who Could Fly...|Boy Who Could Fly...|1986|    Drama|Fantasy|     0|        0|        0|         0

**Inspecting the schema and the null values**

In [None]:
parsed_movies.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- Action: long (nullable = false)
 |-- Adventure: long (nullable = false)
 |-- Animation: long (nullable = false)
 |-- Children's: long (nullable = false)
 |-- Comedy: long (nullable = false)
 |-- Crime: long (nullable = false)
 |-- Documentary: long (nullable = false)
 |-- Drama: long (nullable = false)
 |-- Fantasy: long (nullable = false)
 |-- Film-Noir: long (nullable = false)
 |-- Horror: long (nullable = false)
 |-- Musical: long (nullable = false)
 |-- Mystery: long (nullable = false)
 |-- Romance: long (nullable = false)
 |-- Sci-Fi: long (nullable = false)
 |-- Thriller: long (nullable = false)
 |-- War: long (nullable = false)
 |-- Western: long (nullable = false)



In [None]:
inspect_null(parsed_movies)

For columns movie_id:	0 null records
For columns movie_name:	0 null records
For columns name:	0 null records
For columns year:	0 null records
For columns genre:	0 null records
For columns Action:	0 null records
For columns Adventure:	0 null records
For columns Animation:	0 null records
For columns Children's:	0 null records
For columns Comedy:	0 null records
For columns Crime:	0 null records
For columns Documentary:	0 null records
For columns Drama:	0 null records
For columns Fantasy:	0 null records
For columns Film-Noir:	0 null records
For columns Horror:	0 null records
For columns Musical:	0 null records
For columns Mystery:	0 null records
For columns Romance:	0 null records
For columns Sci-Fi:	0 null records
For columns Thriller:	0 null records
For columns War:	0 null records
For columns Western:	0 null records


## 3.3. For Ratings Dataset

In [None]:
ratings.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- time_stamp: string (nullable = true)



**Casting All the attributes to int type**

In [None]:
for col in ratings.columns:
    ratings = ratings.withColumn(col, ratings[col].cast(IntegerType()))

In [None]:
ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- movie_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- time_stamp: integer (nullable = true)



In [None]:
ratings.show(5)

+-------+--------+------+----------+
|user_id|movie_id|rating|time_stamp|
+-------+--------+------+----------+
|      1|    1193|     5| 978300760|
|      1|     661|     3| 978302109|
|      1|     914|     3| 978301968|
|      1|    3408|     4| 978300275|
|      1|    2355|     5| 978824291|
+-------+--------+------+----------+
only showing top 5 rows



In [None]:
ratings.toPandas().to_csv("/content/drive/MyDrive/movieLens/cleaned_data/ratings.csv", header=True)
parsed_movies.toPandas().to_csv("/content/drive/MyDrive/movieLens/cleaned_data/movies.csv", header=True)
users.toPandas().to_csv("/content/drive/MyDrive/movieLens/cleaned_data/users.csv", header=True)

In [None]:
original_movies = movies
movies = parsed_movies

In [None]:
original_movies.toPandas().to_csv("/content/drive/MyDrive/movieLens/cleaned_data/original_movies.csv", header=True)

# 4. Feature Engineering

In [None]:
movies.createOrReplaceTempView("movies_info")
users.createOrReplaceTempView("users_info")
ratings.createOrReplaceTempView("ratings_info")
original_movies.createOrReplaceTempView("original_movies_info")

In [None]:
original_movies.show(5)

+--------+----------------+--------------------+----+---------+------------+
|movie_id|      movie_name|               genre|year|     name|parsed_genre|
+--------+----------------+--------------------+----+---------+------------+
|       1|Toy Story (1995)|Animation|Childre...|1995|Toy Story|   Animation|
|       1|Toy Story (1995)|Animation|Childre...|1995|Toy Story|  Children's|
|       1|Toy Story (1995)|Animation|Childre...|1995|Toy Story|      Comedy|
|       2|  Jumanji (1995)|Adventure|Childre...|1995|  Jumanji|   Adventure|
|       2|  Jumanji (1995)|Adventure|Childre...|1995|  Jumanji|  Children's|
+--------+----------------+--------------------+----+---------+------------+
only showing top 5 rows



## 4.2. Movies Features

In [None]:
movies.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- Action: long (nullable = false)
 |-- Adventure: long (nullable = false)
 |-- Animation: long (nullable = false)
 |-- Children's: long (nullable = false)
 |-- Comedy: long (nullable = false)
 |-- Crime: long (nullable = false)
 |-- Documentary: long (nullable = false)
 |-- Drama: long (nullable = false)
 |-- Fantasy: long (nullable = false)
 |-- Film-Noir: long (nullable = false)
 |-- Horror: long (nullable = false)
 |-- Musical: long (nullable = false)
 |-- Mystery: long (nullable = false)
 |-- Romance: long (nullable = false)
 |-- Sci-Fi: long (nullable = false)
 |-- Thriller: long (nullable = false)
 |-- War: long (nullable = false)
 |-- Western: long (nullable = false)



**Features used for the movie:**
1. year
2. genres
3. popularity (no of times watched)
4. popularity among its genre
5. avarage rating
6. rating ratio per genre

**Popularity**

In [None]:
popularity = spark.sql("SELECT movie_id, COUNT(DISTINCT(user_id)) AS watches FROM ratings_info GROUP BY movie_id")
popularity.createOrReplaceTempView("popularity_info")

In [None]:
popularity.show(5)

+--------+-------+
|movie_id|watches|
+--------+-------+
|    1580|   2538|
|     471|    599|
|    3175|   1728|
|    1959|    626|
|    3794|    121|
+--------+-------+
only showing top 5 rows



**Popularity among its genre**

In [None]:
query = """
    SELECT parsed_genre AS genre, COUNT(user_id) AS genre_watched
    FROM   ratings_info LEFT JOIN original_movies_info ON original_movies_info.movie_id = ratings_info.movie_id
    GROUP BY parsed_genre
"""

watches_per_genre = spark.sql(query)
watches_per_genre.createOrReplaceTempView("watches_per_genre_info")

In [None]:
watches_per_genre.show()

+-----------+-------------+
|      genre|genre_watched|
+-----------+-------------+
|      Crime|        79541|
|    Romance|       147523|
|   Thriller|       189680|
|  Adventure|       133953|
|      Drama|       354529|
| Children's|        72186|
|        War|        68527|
|Documentary|         7910|
|    Fantasy|        36301|
|    Mystery|        40178|
|    Musical|        41533|
|  Animation|        43293|
|  Film-Noir|        18261|
|     Horror|        76386|
|    Western|        20683|
|     Comedy|       356580|
|     Action|       257457|
|     Sci-Fi|       157294|
+-----------+-------------+



In [None]:
query = """
    SELECT *, watches/genre_watched AS popularity_per_genre
    FROM    (SELECT original_movies_info.movie_id AS movie_id, parsed_genre, watches
             FROM   original_movies_info INNER JOIN popularity_info ON original_movies_info.movie_id = popularity_info.movie_id
            ) A INNER JOIN
            watches_per_genre_info ON A.parsed_genre = watches_per_genre_info.genre
"""

df = spark.sql(query)
df.createOrReplaceTempView("df_info")
df.show()

+--------+------------+-------+---------+-------------+--------------------+
|movie_id|parsed_genre|watches|    genre|genre_watched|popularity_per_genre|
+--------+------------+-------+---------+-------------+--------------------+
|    1580|      Sci-Fi|   2538|   Sci-Fi|       157294|  0.0161353897796483|
|    1580|      Comedy|   2538|   Comedy|       356580|0.007117617364967188|
|    1580|   Adventure|   2538|Adventure|       133953| 0.01894694407740028|
|    1580|      Action|   2538|   Action|       257457|0.009857956862699403|
|     471|     Romance|    599|  Romance|       147523|0.004060383804559289|
|     471|      Comedy|    599|   Comedy|       356580|0.001679847439564754|
|    3175|      Sci-Fi|   1728|   Sci-Fi|       157294|0.010985797296781823|
|    3175|      Comedy|   1728|   Comedy|       356580|0.004846037354871277|
|    3175|   Adventure|   1728|Adventure|       133953| 0.01290004703142147|
|    1959|     Romance|    626|  Romance|       147523|  0.0042434061129451|

**Avarage Rating**

In [None]:
query = """
    SELECT movie_id, AVG(rating) AS avg_rating
    FROM ratings_info
    GROUP BY movie_id
"""

avg = spark.sql(query)
avg.createOrReplaceTempView("avg_info")

In [None]:
avg.show()

+--------+------------------+
|movie_id|        avg_rating|
+--------+------------------+
|    1580| 3.739952718676123|
|    2366|3.6560846560846563|
|    1088|3.3114992721979624|
|    1959|3.6533546325878596|
|    3175| 3.771412037037037|
|    1645|3.4358353510895885|
|     496|3.2162162162162162|
|    2142|2.8308457711442787|
|    1591|2.6210526315789475|
|    2122|2.4463519313304722|
|     833|2.1794871794871793|
|     463|  2.74468085106383|
|     471| 3.631051752921536|
|    1342| 2.904580152671756|
|     148| 2.782608695652174|
|    3918| 2.802395209580838|
|    3794|  3.28099173553719|
|    1238|               4.0|
|    2866|3.6884422110552766|
|    3749|3.1363636363636362|
+--------+------------------+
only showing top 20 rows



In [None]:
query = """
    SELECT A.movie_id, parsed_genre, watches, genre, genre_watched, popularity_per_genre, avg_rating
    FROM   df_info AS A LEFT JOIN avg_info ON A.movie_id = avg_info.movie_id
"""

df = spark.sql(query)
df.createOrReplaceTempView("df_info")
df.show()

+--------+------------+-------+---------+-------------+--------------------+------------------+
|movie_id|parsed_genre|watches|    genre|genre_watched|popularity_per_genre|        avg_rating|
+--------+------------+-------+---------+-------------+--------------------+------------------+
|    1580|      Sci-Fi|   2538|   Sci-Fi|       157294|  0.0161353897796483| 3.739952718676123|
|    1580|      Comedy|   2538|   Comedy|       356580|0.007117617364967188| 3.739952718676123|
|    1580|   Adventure|   2538|Adventure|       133953| 0.01894694407740028| 3.739952718676123|
|    1580|      Action|   2538|   Action|       257457|0.009857956862699403| 3.739952718676123|
|     471|     Romance|    599|  Romance|       147523|0.004060383804559289| 3.631051752921536|
|     471|      Comedy|    599|   Comedy|       356580|0.001679847439564754| 3.631051752921536|
|    3175|      Sci-Fi|   1728|   Sci-Fi|       157294|0.010985797296781823| 3.771412037037037|
|    3175|      Comedy|   1728|   Comedy

**rating ratio to genre rating**

In [None]:
query = """
    SELECT A.parsed_genre AS genre, MEAN(B.rating) AS mean_genre_rating
    FROM   original_movies_info AS A JOIN ratings_info B ON A.movie_id = B.movie_id
    GROUP BY A.parsed_genre
"""

avg = spark.sql(query)
avg.createOrReplaceTempView("avg_info")
avg.show()

+-----------+------------------+
|      genre| mean_genre_rating|
+-----------+------------------+
|      Crime| 3.708678543141273|
|    Romance| 3.607464598740535|
|   Thriller|3.5704660480809784|
|  Adventure| 3.477256948332624|
|      Drama| 3.766332232342065|
| Children's| 3.422034743579087|
|        War| 3.893326717935996|
|Documentary| 3.933122629582807|
|    Fantasy| 3.447370595851354|
|    Mystery|3.6681019463387923|
|    Musical|3.6655189849035708|
|  Animation| 3.684868223500335|
|  Film-Noir| 4.075187558184108|
|     Horror| 3.215013222318226|
|    Western|3.6377701493980563|
|     Comedy| 3.522098827752538|
|     Action|3.4911849357368414|
|     Sci-Fi| 3.466521291339784|
+-----------+------------------+



In [None]:
query = """
    SELECT *, avg_rating/mean_genre_rating AS rating_per_genre
    FROM   df_info AS A LEFT JOIN avg_info B ON A.parsed_genre = B.genre
"""

df = spark.sql(query)
df.show(5)

+--------+------------+-------+---------+-------------+--------------------+-----------------+---------+------------------+------------------+
|movie_id|parsed_genre|watches|    genre|genre_watched|popularity_per_genre|       avg_rating|    genre| mean_genre_rating|  rating_per_genre|
+--------+------------+-------+---------+-------------+--------------------+-----------------+---------+------------------+------------------+
|    1580|      Sci-Fi|   2538|   Sci-Fi|       157294|  0.0161353897796483|3.739952718676123|   Sci-Fi| 3.466521291339784|1.0788777579469762|
|    1580|      Comedy|   2538|   Comedy|       356580|0.007117617364967188|3.739952718676123|   Comedy| 3.522098827752538|1.0618534293265696|
|    1580|   Adventure|   2538|Adventure|       133953| 0.01894694407740028|3.739952718676123|Adventure| 3.477256948332624|1.0755468388579865|
|    1580|      Action|   2538|   Action|       257457|0.009857956862699403|3.739952718676123|   Action|3.4911849357368414|1.0712559739797276|

In [None]:
df.columns

['movie_id',
 'parsed_genre',
 'watches',
 'genre',
 'genre_watched',
 'popularity_per_genre',
 'avg_rating',
 'genre',
 'mean_genre_rating',
 'rating_per_genre']

In [None]:
cleaned_df = df.drop("genre", "genre_watched", "mean_genre_rating")
cleaned_df.columns

['movie_id',
 'parsed_genre',
 'watches',
 'popularity_per_genre',
 'avg_rating',
 'rating_per_genre']

In [None]:
cleaned_df.createOrReplaceTempView("cleaned_df_info")
cleaned_df.show(5)

+--------+------------+-------+--------------------+-----------------+------------------+
|movie_id|parsed_genre|watches|popularity_per_genre|       avg_rating|  rating_per_genre|
+--------+------------+-------+--------------------+-----------------+------------------+
|    1580|      Sci-Fi|   2538|  0.0161353897796483|3.739952718676123|1.0788777579469762|
|    1580|      Comedy|   2538|0.007117617364967188|3.739952718676123|1.0618534293265696|
|    1580|   Adventure|   2538| 0.01894694407740028|3.739952718676123|1.0755468388579865|
|    1580|      Action|   2538|0.009857956862699403|3.739952718676123|1.0712559739797276|
|     471|     Romance|    599|0.004060383804559289|3.631051752921536|1.0065384298405133|
+--------+------------+-------+--------------------+-----------------+------------------+
only showing top 5 rows



In [None]:
query = """
    SELECT A.movie_id, A.parsed_genre, A.watches, A.popularity_per_genre, A.avg_rating, A.rating_per_genre, B.year
    FROM cleaned_df_info A INNER JOIN original_movies_info B ON A.movie_id = B.movie_id AND A.parsed_genre = B.parsed_genre
"""

cleaned_df = spark.sql(query)
cleaned_df.show()

+--------+------------+-------+--------------------+------------------+------------------+----+
|movie_id|parsed_genre|watches|popularity_per_genre|        avg_rating|  rating_per_genre|year|
+--------+------------+-------+--------------------+------------------+------------------+----+
|    1580|      Sci-Fi|   2538|  0.0161353897796483| 3.739952718676123|1.0788777579469762|1997|
|    1580|      Comedy|   2538|0.007117617364967188| 3.739952718676123|1.0618534293265696|1997|
|    1580|   Adventure|   2538| 0.01894694407740028| 3.739952718676123|1.0755468388579865|1997|
|    1580|      Action|   2538|0.009857956862699403| 3.739952718676123|1.0712559739797276|1997|
|     471|     Romance|    599|0.004060383804559289| 3.631051752921536|1.0065384298405133|1994|
|     471|      Comedy|    599|0.001679847439564754| 3.631051752921536| 1.030934090863805|1994|
|    3175|      Sci-Fi|   1728|0.010985797296781823| 3.771412037037037| 1.087952941889884|1999|
|    3175|      Comedy|   1728|0.0048460

In [70]:
cleaned_df.toPandas().to_csv("/content/drive/MyDrive/movieLens/cleaned_data/movies_features.csv", header=True)

## 4.2. User Features

users.printSchema()

**Features used for the user:**
1. gender
2. age category
3. Occupation
4. Region
5. Avarage ratings
6. number of watched movies
7. avarage rating per genre
8. avarage rating per popularity

**For missing category avarage rating & Popularity avarage rating impute with avarage rating of all users**

In [71]:
users.show(5)

+-------+------+---+----------+-------+--------------+------+------------------+-----------------+
|user_id|gender|age|occupation|zipcode|casted_zipcode|region|encoded_occupation|   encoded_region|
+-------+------+---+----------+-------+--------------+------+------------------+-----------------+
|      1|     0|  1|        10|  48067|         48067|     4|   (21,[10],[1.0])|(19313,[4],[1.0])|
|      2|     1|  7|        16|  70072|         70072|     7|   (21,[16],[1.0])|(19313,[7],[1.0])|
|      3|     1|  3|        15|  55117|         55117|     5|   (21,[15],[1.0])|(19313,[5],[1.0])|
|      4|     1|  5|         7|   2460|          2460|     0|    (21,[7],[1.0])|(19313,[0],[1.0])|
|      5|     1|  3|        20|  55455|         55455|     5|   (21,[20],[1.0])|(19313,[5],[1.0])|
+-------+------+---+----------+-------+--------------+------+------------------+-----------------+
only showing top 5 rows



In [72]:
users = users.drop("zipcode")

**Avarage ratings & number of watched movies**

In [104]:
query = """
    SELECT  A.user_id, A.gender, A.age, A.occupation, A.zipcode, A.casted_zipcode, A.region, B.mean_rating, B.watched_movies
    FROM    users_info A
            INNER JOIN
            (SELECT   user_id, MEAN(rating) AS mean_rating, COUNT(movie_id) AS watched_movies
            FROM     ratings_info
            GROUP BY user_id) B
            ON B.user_id = A.user_id
"""

users = spark.sql(query)
users.createOrReplaceTempView("users_info")
users.show(5)

+-------+------+---+----------+-------+--------------+------+------------------+--------------+
|user_id|gender|age|occupation|zipcode|casted_zipcode|region|       mean_rating|watched_movies|
+-------+------+---+----------+-------+--------------+------+------------------+--------------+
|    148|     1|  6|        17|  57747|         57747|     5| 3.733974358974359|           624|
|    463|     1|  3|         7|  55105|         55105|     5|               3.0|           123|
|    471|     1|  4|         7|   8904|          8904|     0|3.6285714285714286|           105|
|    496|     1|  2|         4|  55455|         55455|     5| 4.294117647058823|           119|
|    833|     1|  4|         7|  46825|         46825|     4|4.0476190476190474|            21|
+-------+------+---+----------+-------+--------------+------+------------------+--------------+
only showing top 5 rows



# **Pivoting for all genre related variables**

In [75]:
cleaned_df.createOrReplaceTempView("cleaned_df_info")
cleaned_df.show(5)


+--------+------------+-------+--------------------+-----------------+------------------+----+
|movie_id|parsed_genre|watches|popularity_per_genre|       avg_rating|  rating_per_genre|year|
+--------+------------+-------+--------------------+-----------------+------------------+----+
|    1580|      Sci-Fi|   2538|  0.0161353897796483|3.739952718676123|1.0788777579469762|1997|
|    1580|      Comedy|   2538|0.007117617364967188|3.739952718676123|1.0618534293265696|1997|
|    1580|   Adventure|   2538| 0.01894694407740028|3.739952718676123|1.0755468388579865|1997|
|    1580|      Action|   2538|0.009857956862699403|3.739952718676123|1.0712559739797276|1997|
|     471|     Romance|    599|0.004060383804559289|3.631051752921536|1.0065384298405133|1994|
+--------+------------+-------+--------------------+-----------------+------------------+----+
only showing top 5 rows



In [107]:
genre_mov = parsed_movies.drop("movie_name","name","year","genre")
genre_mov.createOrReplaceTempView("movie_mapped_genre")
genre_mov.show(10)

+--------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|movie_id|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+--------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|    1387|     1|        0|        0|         0|     0|    0|          0|    0|      0|        0|     1|      0|      0|      0|     0|       0|  0|      0|
|    2453|     0|        0|        0|         0|     0|    0|          0|    1|      1|        0|     0|      0|      0|      0|     0|       0|  0|      0|
|    2961|     0|        0|        0|         0|     1|    0|          0|    1|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|
|    2857|     0|        0|        1|         0|     0|   

In [82]:
excluded = ["movie_id"]
pop_gen_mov =cleaned_df.groupBy(excluded).pivot("parsed_genre").sum("popularity_per_genre")

columns = {col: 0 for col in pop_gen_mov.columns if not(col in excluded)}
pop_gen_mov = pop_gen_mov.fillna(columns)

In [83]:
pop_gen_mov.show(10)

+--------+--------------------+-------------------+---------+----------+--------------------+-----+-----------+--------------------+-------+---------+--------------------+-------+-------+--------------------+--------------------+--------------------+---+-------+
|movie_id|              Action|          Adventure|Animation|Children's|              Comedy|Crime|Documentary|               Drama|Fantasy|Film-Noir|              Horror|Musical|Mystery|             Romance|              Sci-Fi|            Thriller|War|Western|
+--------+--------------------+-------------------+---------+----------+--------------------+-----+-----------+--------------------+-------+---------+--------------------+-------+-------+--------------------+--------------------+--------------------+---+-------+
|    1580|0.009857956862699403|0.01894694407740028|      0.0|       0.0|0.007117617364967188|  0.0|        0.0|                 0.0|    0.0|      0.0|                 0.0|    0.0|    0.0|                 0.0|  0

In [84]:
excluded = ["movie_id"]
rat_gen_mov =cleaned_df.groupBy(excluded).pivot("parsed_genre").sum("rating_per_genre")

columns = {col: 0 for col in pop_gen_mov.columns if not(col in excluded)}
rat_gen_mov = rat_gen_mov.fillna(columns)
rat_gen_mov.show(10)

+--------+------------------+------------------+---------+----------+------------------+------------------+-----------+------------------+-------+---------+------------------+------------------+------------------+------------------+------------------+------------------+---+-------+
|movie_id|            Action|         Adventure|Animation|Children's|            Comedy|             Crime|Documentary|             Drama|Fantasy|Film-Noir|            Horror|           Musical|           Mystery|           Romance|            Sci-Fi|          Thriller|War|Western|
+--------+------------------+------------------+---------+----------+------------------+------------------+-----------+------------------+-------+---------+------------------+------------------+------------------+------------------+------------------+------------------+---+-------+
|     463|               0.0|               0.0|      0.0|       0.0|               0.0|0.7400697631612657|        0.0|0.7287410355079247|    0.0|     

In [101]:

joinedDF = rat_gen_mov.join(pop_gen_mov, "movie_id").join(genre_mov, "movie_id")

joinedDF.show(10)

+--------+------------------+------------------+---------+----------+------------------+-----+-----------+------------------+-------+---------+------------------+-------+-------+------------------+------------------+-----------------+---+-------+--------------------+-------------------+---------+----------+--------------------+-----+-----------+--------------------+-------+---------+--------------------+-------+-------+--------------------+--------------------+--------------------+---+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|movie_id|            Action|         Adventure|Animation|Children's|            Comedy|Crime|Documentary|             Drama|Fantasy|Film-Noir|            Horror|Musical|Mystery|           Romance|            Sci-Fi|         Thriller|War|Western|              Action|          Adventure|Animation|Children's|              Comedy|Crime|Documentary|        

#User Features For popularity and genre interest

In [115]:
from pyspark.sql.functions import sum as spark_sum
total_watches = popularity.select(spark_sum("watches")).collect()[0][0]
# Calculate the ratio of watches for each movie
popularity_ratio = popularity.withColumn("popularity_ratio", popularity["watches"] / total_watches)
# Show the DataFrame with the ratio column
popularity_ratio.show()

+--------+-------+--------------------+
|movie_id|watches|    popularity_ratio|
+--------+-------+--------------------+
|    1580|   2538|0.002537469668839...|
|     471|    599|5.988748351594517E-4|
|    3175|   1728|0.001727638923464...|
|    1959|    626|6.258691933385922E-4|
|    3794|    121|1.209747162842965...|
|    1342|    262|2.619452534420306...|
|    2866|    199|1.989584176907026...|
|     148|     23|2.299519400445307E-5|
|     833|     78|7.798370140640606E-5|
|    2122|    233|2.329513131755463E-4|
|    1088|    687|6.868564470025765E-4|
|    2659|     46|4.599038800890614E-5|
|    1645|    826|8.258274020729667E-4|
|    2142|    201|2.009579997780463...|
|    3918|    167|1.669651042932027...|
|    1238|    351|3.509266563288272...|
|    1591|    475|4.749007457441394...|
|     463|     47|4.699017905257801E-5|
|    2366|    756|7.558420290159357E-4|
|     496|     37|3.699226861585928...|
+--------+-------+--------------------+
only showing top 20 rows



In [128]:
query1 = """
    SELECT  A.user_id, A.rating, A.movie_id, B.*
    FROM    ratings_info A
            INNER JOIN
            movie_mapped_genre B
            ON B.movie_id = A.movie_id
"""
query2 = """
    SELECT user_id,
       sum(Action) AS Action_sum,
       sum(Adventure) AS Adventure_sum,
       sum(Animation) AS Animation_sum,
       sum([Children's]) AS Childrens_sum,
       sum(Comedy) AS Comedy_sum,
       sum(Crime) AS Crime_sum,
       sum(Documentary) AS Documentary_sum,
       sum(Drama) AS Drama_sum,
       sum(Fantasy) AS Fantasy_sum,
       sum("Film-Noir") AS Film_Noir_sum,
       sum(Horror) AS Horror_sum,
       sum(Musical) AS Musical_sum,
       sum(Mystery) AS Mystery_sum,
       sum(Romance) AS Romance_sum,
       sum("Sci-Fi") AS Sci_Fi_sum,
       sum(Thriller) AS Thriller_sum,
       sum(War) AS War_sum,
       sum(Western) AS Western_sum
  FROM user_rat
  GROUP BY user_id
"""

user_rat = spark.sql(query1)
user_rat.createOrReplaceTempView("user_rat")
user_rat = spark.sql(query2)
user_rat.show(5)

ParseException: 
[PARSE_SYNTAX_ERROR] Syntax error at or near '['.(line 6, pos 11)

== SQL ==

    SELECT user_id,
       sum(Action) AS Action_sum,
       sum(Adventure) AS Adventure_sum,
       sum(Animation) AS Animation_sum,
       sum([Children's]) AS Childrens_sum,
-----------^^^
       sum(Comedy) AS Comedy_sum,
       sum(Crime) AS Crime_sum,
       sum(Documentary) AS Documentary_sum,
       sum(Drama) AS Drama_sum,
       sum(Fantasy) AS Fantasy_sum,
       sum("Film-Noir") AS Film_Noir_sum,
       sum(Horror) AS Horror_sum,
       sum(Musical) AS Musical_sum,
       sum(Mystery) AS Mystery_sum,
       sum(Romance) AS Romance_sum,
       sum("Sci-Fi") AS Sci_Fi_sum,
       sum(Thriller) AS Thriller_sum,
       sum(War) AS War_sum,
       sum(Western) AS Western_sum
  FROM user_rat
  GROUP BY user_id
