### Notebook Set-Up

#### Import Required Modules

In [0]:
%pip install pinecone-client python-dotenv

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

from pyspark.ml.recommendation import ALS



#### Get or Create SparkSession

In [0]:
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("error")
spark

#### Load Secrets as Environment Variables

In [0]:
env_path = "/dbfs/FileStore/env/.env"
load_dotenv(env_path)


Out[3]: True

#### Set Constants

### Import and Clean the Raw Data

#### Import and Clean Movie Metadata

In [0]:
metadata_path = "dbfs:/FileStore/data/raw/movies_metadata.csv"
metadata_raw = spark.read.csv(metadata_path, header=True)
metadata_raw.printSchema()
metadata_raw.show(1, truncate=False, vertical=True)

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

In [0]:
metadata = metadata_raw \
    .select(
        f.col('id').cast('STRING').alias('id'),
        f.col('imdb_id'),
        f.col('title'),
        f.col('release_date').cast('DATE').alias('release_date'),
        f.col('popularity').cast('DOUBLE').alias('popularity'),
        f.transform(f.from_json('genres', "ARRAY<STRUCT<id:INTEGER,name:STRING>>"), lambda x: x['name']).alias('genres'),
        f.col('overview'),
        f.col('runtime').cast('DOUBLE').alias('runtime'),
        f.from_json('spoken_languages', "ARRAY<STRUCT<iso_639_1:STRING,name:STRING>>").alias('spoken_languages'),
        f.col('adult').cast('BOOLEAN').alias('adult'),
        f.col('status'),
        f.col('vote_average').cast('DOUBLE').alias('vote_average'),
        f.col('vote_count').cast('INTEGER').alias('vote_count')
    ) \
    .filter(f.col('adult') == False) \
    .filter(f.col('release_date') >= '1970-01-01') \
    .filter(f.array_contains(f.transform('spoken_languages', lambda x: x['name']), 'English')) \
    .filter(f.col('status') == 'Released') \
    .filter(f.col('popularity') >= 5.0) \
    .select('id', 'imdb_id', 'title', 'release_date', 'popularity', 'genres', 'overview', 'runtime', 'vote_average', 'vote_count')
    
metadata.printSchema()
metadata.show(1, truncate=False, vertical=True)
metadata.count()

root
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- popularity: double (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overview: string (nullable = true)
 |-- runtime: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id           | 862                                                                                                                                                                                                                 

#### Import and Clean the Ratings Data

In [0]:
ratings_path = "dbfs:/FileStore/data/raw/ratings.csv"
ratings_raw = spark.read.csv(ratings_path, header=True)
ratings_raw.printSchema()
ratings_raw.show(5, truncate=False)

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |110    |1.0   |1425941529|
|1     |147    |4.5   |1425942435|
|1     |858    |5.0   |1425941523|
|1     |1221   |5.0   |1425941546|
|1     |1246   |5.0   |1425941556|
+------+-------+------+----------+
only showing top 5 rows



In [0]:
ratings = ratings_raw \
    .select(
        f.col('userId').alias('user_id'),
        f.col('movieId').alias('movie_id'),
        f.col('rating').cast('DOUBLE').alias('rating'),
        f.from_unixtime('timestamp').alias('timestamp')
    )

ratings.printSchema()
ratings.show(5, truncate=False)
ratings.agg(
    f.countDistinct('user_id').alias('total_users'),
    f.countDistinct('movie_id').alias('total_movies'),
    f.count('*').alias('total_ratings')
).show()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)

+-------+--------+------+-------------------+
|user_id|movie_id|rating|timestamp          |
+-------+--------+------+-------------------+
|1      |110     |1.0   |2015-03-09 22:52:09|
|1      |147     |4.5   |2015-03-09 23:07:15|
|1      |858     |5.0   |2015-03-09 22:52:03|
|1      |1221    |5.0   |2015-03-09 22:52:26|
|1      |1246    |5.0   |2015-03-09 22:52:36|
+-------+--------+------+-------------------+
only showing top 5 rows

+-----------+------------+-------------+
|total_users|total_movies|total_ratings|
+-----------+------------+-------------+
|     270896|       45115|     26024289|
+-----------+------------+-------------+



#### Join with the Metadata to Select Only Movies in Both Datasets

In [0]:
ratings = ratings \
    .join(metadata, on=(ratings['movie_id'] == metadata['id']), how='inner') \
    .select('user_id', 'movie_id', 'rating', 'timestamp') \
    
ratings.agg(
    f.countDistinct('user_id').alias('total_users'),
    f.countDistinct('movie_id').alias('total_movies'),
    f.count('*').alias('total_ratings')
).show()

+-----------+------------+-------------+
|total_users|total_movies|total_ratings|
+-----------+------------+-------------+
|     250691|        1617|      4978604|
+-----------+------------+-------------+



#### Convert to an Implicit Feedback Dataset

In [0]:
MIN_RATING = 3
MIN_USERS = 10
MIN_MOVIES = 10

In [0]:
model_frame = ratings \
    .filter(f.col('rating') >= MIN_RATING) \
    .withColumn('cnt_users', f.count('user_id').over(Window.partitionBy('movie_id'))) \
    .filter(f.col('cnt_users') >= 10) \
    .withColumn('cnt_movies', f.count('movie_id').over(Window.partitionBy('user_id'))) \
    .filter(f.col('cnt_movies') >= 10) \
    .withColumn('rating', f.col('rating') - (MIN_RATING - 1)) \
    .withColumn('user_id', f.col('user_id').cast('INTEGER')) \
    .withColumn('movie_id', f.col('movie_id').cast('INTEGER')) \
    .cache()

model_frame.agg(
    f.countDistinct('user_id').alias('total_users'),
    f.countDistinct('movie_id').alias('total_movies'),
    f.count('*').alias('total_ratings')
).show()
model_frame.groupby('rating').count().sort('rating').show()

+-----------+------------+-------------+
|total_users|total_movies|total_ratings|
+-----------+------------+-------------+
|     102103|        1382|      3575530|
+-----------+------------+-------------+

+------+-------+
|rating|  count|
+------+-------+
|   1.0|1042945|
|   1.5| 396672|
|   2.0|1218635|
|   2.5| 263803|
|   3.0| 653475|
+------+-------+



### Train the ALS Recommender Model

In [0]:
estimator = ALS(rank=20, maxIter=20, regParam=0.1, implicitPrefs=True, userCol='user_id', itemCol='movie_id', ratingCol='rating', coldStartStrategy='nan')
transformer = estimator.fit(model_frame)
transformer

Out[11]: ALSModel: uid=ALS_c47afbb0c064, rank=20

### Extract The User/Movie Embeddings

In [0]:
user_embeddings = transformer.userFactors \
    .withColumn('id', f.col('id').cast('string')) \
    .withColumnRenamed('features', 'values') \
    .toPandas() \
    .to_dict(orient='records')

movie_embeddings = transformer.itemFactors \
    .withColumn('id', f.col('id').cast('string')) \
    .withColumnRenamed('features', 'values') \
    .toPandas() \
    .to_dict(orient='records')

len(user_embeddings), len(movie_embeddings)

Out[12]: (102103, 1382)

In [0]:
user_embeddings[0]

Out[13]: {'id': '20',
 'values': array([-0.05790565,  0.57013506,  0.05492533, -0.25566995,  0.39340326,
        -0.0469562 , -0.1545669 , -0.11135219, -0.2998827 , -0.04662196,
         0.21610533,  0.33735046,  0.18087918,  0.03378131, -0.282393  ,
        -0.06347315, -0.18413539, -0.02516779, -0.05733555, -0.01083682],
       dtype=float32)}

### Insert the User/Movie Embeddings into Pinecone

In [0]:
import pinecone      
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENVIRONMENT"])   

In [0]:
# pinecone.delete_index('cf-embed')
# pinecone.create_index(name='cf-embed', dimension=20, metric='cosine', pods=1, replicas=1, pod_type="p1")

[0;31m---------------------------------------------------------------------------[0m
[0;31mApiException[0m                              Traceback (most recent call last)
File [0;32m<command-73181466957936>:2[0m
[1;32m      1[0m [38;5;66;03m# pinecone.delete_index('cf-embed')[39;00m
[0;32m----> 2[0m [43mpinecone[49m[38;5;241;43m.[39;49m[43mcreate_index[49m[43m([49m[43mname[49m[38;5;241;43m=[39;49m[38;5;124;43m'[39;49m[38;5;124;43mcf-embed[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[43mdimension[49m[38;5;241;43m=[39;49m[38;5;241;43m20[39;49m[43m,[49m[43m [49m[43mmetric[49m[38;5;241;43m=[39;49m[38;5;124;43m'[39;49m[38;5;124;43mcosine[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[43mpods[49m[38;5;241;43m=[39;49m[38;5;241;43m1[39;49m[43m,[49m[43m [49m[43mreplicas[49m[38;5;241;43m=[39;49m[38;5;241;43m1[39;49m[43m,[49m[43m [49m[43mpod_type[49m[38;5;241;43m=[39;49m[38;5;124;43m"[39;49m[38;5;124;43mp1[39;49m

In [0]:
index = pinecone.Index('cf-embed')
index.describe_index_stats()

Out[16]: {'dimension': 20,
 'index_fullness': 0.0,
 'namespaces': {'movies': {'vector_count': 1382},
                'users': {'vector_count': 102103}},
 'total_vector_count': 103485}

In [0]:
index.upsert(vectors=user_embeddings, namespace="users", batch_size=100)

Upserted vectors:   0%|          | 0/102103 [00:00<?, ?it/s]



Out[17]: {'upserted_count': 102103}

In [0]:
index.upsert(vectors=movie_embeddings, namespace="movies", batch_size=100)

Upserted vectors:   0%|          | 0/1382 [00:00<?, ?it/s]

Out[18]: {'upserted_count': 1382}

In [0]:
index.describe_index_stats()

Out[19]: {'dimension': 20,
 'index_fullness': 0.0,
 'namespaces': {'movies': {'vector_count': 1382},
                'users': {'vector_count': 102103}},
 'total_vector_count': 103485}

### Test the Query Functionality

In [0]:
index.query(namespace='users', id='20', top_k=10)['matches']

Out[20]: [{'id': '20', 'score': 1.00000012, 'values': []},
 {'id': '192522', 'score': 0.909715772, 'values': []},
 {'id': '6244', 'score': 0.905727327, 'values': []},
 {'id': '41', 'score': 0.905266404, 'values': []},
 {'id': '248850', 'score': 0.904566705, 'values': []},
 {'id': '24256', 'score': 0.903814316, 'values': []},
 {'id': '173611', 'score': 0.903347969, 'values': []},
 {'id': '80342', 'score': 0.902463555, 'values': []},
 {'id': '194328', 'score': 0.902438939, 'values': []},
 {'id': '142376', 'score': 0.894477427, 'values': []}]

In [0]:
matches = index.query(namespace='movies', id='63', top_k=10)['matches']
matches

Out[21]: [{'id': '63', 'score': 1.00000012, 'values': []},
 {'id': '88', 'score': 0.946648419, 'values': []},
 {'id': '1439', 'score': 0.900474, 'values': []},
 {'id': '710', 'score': 0.884406686, 'values': []},
 {'id': '829', 'score': 0.854542, 'values': []},
 {'id': '75', 'score': 0.85163331, 'values': []},
 {'id': '840', 'score': 0.849387765, 'values': []},
 {'id': '663', 'score': 0.839607358, 'values': []},
 {'id': '619', 'score': 0.837498069, 'values': []},
 {'id': '782', 'score': 0.835884154, 'values': []}]

In [0]:
metadata.filter(f.col('id').isin([movie['id'] for movie in matches])).show()

+----+---------+--------------------+------------+----------+--------------------+--------------------+-------+------------+----------+
|  id|  imdb_id|               title|release_date|popularity|              genres|            overview|runtime|vote_average|vote_count|
+----+---------+--------------------+------------+----------+--------------------+--------------------+-------+------------+----------+
| 710|tt0113189|           GoldenEye|  1995-11-16| 14.686036|[Adventure, Actio...|James Bond must u...|  130.0|         6.6|      1194|
|  63|tt0114746|      Twelve Monkeys|  1995-12-29| 12.297305|[Science Fiction,...|In the year 2035,...|  129.0|         7.4|      2470|
|  88|tt0092890|       Dirty Dancing|  1987-08-21| 14.044122|[Drama, Music, Ro...|Expecting the usu...|  100.0|         7.1|      1371|
| 829|tt0071315|           Chinatown|  1974-06-20|  12.29227|[Crime, Drama, My...|Private eye Jake ...|  130.0|         7.9|       939|
|  75|tt0116996|       Mars Attacks!|  1996-12-1