### Notebook Set-Up

In [1]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

import pinecone

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

from pyspark.ml.recommendation import ALS

In [2]:
# Attach to the active Spark session, or start a new one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# Restrict Spark's own logging to error-level messages to keep cell output short.
spark.sparkContext.setLogLevel("error")
# Final expression: the notebook displays the session object.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/10/16 21:26:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/16 21:26:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load variables from a .env file into the process environment; the Pinecone
# cell further down reads its credentials from os.environ.
load_dotenv() 

True

### Load Input Data

In [4]:
# Read the ratings CSV (header row, column types inferred from the data) and
# render the epoch-seconds `timestamp` column as a formatted date-time string.
ratings_path = "../data/ratings_small.csv"
ratings = (
    spark.read
    .csv(ratings_path, header=True, inferSchema=True)
    .withColumn('timestamp', f.from_unixtime('timestamp'))
)

# Quick look at the result: schema, first rows, and total row count.
ratings.printSchema()
ratings.show(5, truncate=False)
ratings.count()

                                                                                

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-------+------+-------------------+
|userId|movieId|rating|timestamp          |
+------+-------+------+-------------------+
|1     |31     |2.5   |2009-12-13 18:52:24|
|1     |1029   |3.0   |2009-12-13 18:52:59|
|1     |1061   |3.0   |2009-12-13 18:53:02|
|1     |1129   |2.0   |2009-12-13 18:53:05|
|1     |1172   |4.0   |2009-12-13 18:53:25|
+------+-------+------+-------------------+
only showing top 5 rows



100004

In [5]:
# Overall size of the dataset: distinct users, distinct movies, rating rows.
(
    ratings
    .agg(
        f.countDistinct('userId').alias('total_users'),
        f.countDistinct('movieId').alias('total_movies'),
        f.count('*').alias('total_ratings'),
    )
    .show()
)

# Number of rows per rating value, lowest rating first.
ratings.groupBy('rating').count().orderBy('rating').show()

                                                                                

+-----------+------------+-------------+
|total_users|total_movies|total_ratings|
+-----------+------------+-------------+
|        671|        9066|       100004|
+-----------+------------+-------------+

+------+-----+
|rating|count|
+------+-----+
|   0.5| 1101|
|   1.0| 3326|
|   1.5| 1687|
|   2.0| 7271|
|   2.5| 4449|
|   3.0|20064|
|   3.5|10538|
|   4.0|28750|
|   4.5| 7723|
|   5.0|15095|
+------+-----+



### Convert to an Implicit Feedback Dataset

In [6]:
# Thresholds for building the implicit-feedback dataset.
# Ratings below this value are discarded as negative/neutral feedback.
MIN_RATING = 3
# Minimum number of qualifying ratings (users) a movie needs to be kept.
MIN_USERS = 10
# Minimum number of qualifying ratings (movies) a user needs to be kept.
MIN_MOVIES = 10

In [7]:
# Turn the explicit ratings into implicit-feedback strengths:
#   1. keep only ratings of at least MIN_RATING,
#   2. keep movies that still have at least MIN_USERS ratings,
#   3. keep users that still have at least MIN_MOVIES ratings,
#   4. shift the rating so that MIN_RATING maps to 1.0.
# The thresholds come from the constants cell instead of repeating the
# literal 10, so changing MIN_USERS / MIN_MOVIES actually takes effect.
# NOTE: this reassigns `ratings`, so the cell is not idempotent -- running it
# twice filters and shifts the already-shifted values. Re-run the load cell
# before executing it again.
movie_window = Window.partitionBy('movieId')
user_window = Window.partitionBy('userId')

ratings = (
    ratings
    .filter(f.col('rating') >= MIN_RATING)
    .withColumn('cnt_users', f.count('userId').over(movie_window))
    .filter(f.col('cnt_users') >= MIN_USERS)
    .withColumn('cnt_movies', f.count('movieId').over(user_window))
    .filter(f.col('cnt_movies') >= MIN_MOVIES)
    .withColumn('rating', f.col('rating') - (MIN_RATING - 1))
    .cache()
)

# Same summary as for the raw data, now on the filtered dataset.
ratings.agg(
    f.countDistinct('userId').alias('total_users'),
    f.countDistinct('movieId').alias('total_movies'),
    f.count('*').alias('total_ratings')
).show()
ratings.groupby('rating').count().sort('rating').show()

                                                                                

+-----------+------------+-------------+
|total_users|total_movies|total_ratings|
+-----------+------------+-------------+
|        660|        1862|        66021|
+-----------+------------+-------------+





+------+-----+
|rating|count|
+------+-----+
|   1.0|15218|
|   1.5| 7931|
|   2.0|23338|
|   2.5| 6367|
|   3.0|13167|
+------+-----+



                                                                                

### Train the ALS Recommender Model

In [8]:
# Pin the ALS seed explicitly so training does not depend on a
# process-dependent default and the embeddings can be reproduced after a
# kernel restart.
ALS_SEED = 42

# Implicit-feedback ALS: the shifted `rating` column is used as the
# preference strength for each (userId, movieId) pair.
estimator = ALS(
    rank=20,
    maxIter=20,
    regParam=0.1,
    implicitPrefs=True,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='nan',
    seed=ALS_SEED,
)
transformer = estimator.fit(ratings)
transformer

                                                                                

ALSModel: uid=ALS_45689656b171, rank=20

### Extract the User/Movie Embeddings

In [9]:
def _factors_to_records(factors) -> List[Dict]:
    """Convert an ALS factor DataFrame into a list of upsert-ready dicts.

    Args:
        factors: Spark DataFrame with an `id` column and a `features` column
            (as exposed by `ALSModel.userFactors` / `ALSModel.itemFactors`).

    Returns:
        One dict per row with keys `'id'` (cast to string) and `'values'`
        (the former `features` column).
    """
    return (
        factors
        .withColumn('id', f.col('id').cast('string'))
        .withColumnRenamed('features', 'values')
        .toPandas()  # collects all rows to the driver
        .to_dict(orient='records')
    )


user_embeddings = _factors_to_records(transformer.userFactors)
movie_embeddings = _factors_to_records(transformer.itemFactors)

len(user_embeddings), len(movie_embeddings)

(660, 1862)

In [10]:
# Inspect the first record to check its shape: an 'id' string plus a 'values' list.
user_embeddings[0]

{'id': '10',
 'values': [-0.06223471835255623,
  0.058902062475681305,
  0.07113000750541687,
  -0.19392582774162292,
  -0.02988211065530777,
  -0.17706599831581116,
  0.24346260726451874,
  -0.10334749519824982,
  -0.24157141149044037,
  -0.01667705550789833,
  0.020632153376936913,
  -0.21882857382297516,
  0.008991416543722153,
  0.10517088323831558,
  -0.003086844924837351,
  0.010951409116387367,
  -0.018499843776226044,
  0.0952700823545456,
  0.05312608927488327,
  -0.006292677950114012]}

### Insert the User/Movie Embeddings into Pinecone

In [11]:
# Name of the Pinecone index that stores both the user and movie embeddings
# (in separate namespaces).
PINECONE_INDEX_NAME = 'cf-embed'

# Credentials are read from the environment (populated by load_dotenv() above);
# a missing variable raises KeyError here rather than failing later.
# `pinecone` itself is imported in the notebook's import cell.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)
index = pinecone.Index(PINECONE_INDEX_NAME)
index

  from tqdm.autonotebook import tqdm


<pinecone.index.Index at 0x101549070>

In [12]:
# Write the user embeddings to the "users" namespace, 100 vectors per request.
index.upsert(
    vectors=user_embeddings,
    namespace="users",
    batch_size=100,
)

Upserted vectors:   0%|          | 0/660 [00:00<?, ?it/s]

{'upserted_count': 660}

In [13]:
# Write the movie embeddings to the "movies" namespace, 100 vectors per request.
index.upsert(
    vectors=movie_embeddings,
    namespace="movies",
    batch_size=100,
)

Upserted vectors:   0%|          | 0/1862 [00:00<?, ?it/s]

{'upserted_count': 1862}

### Test the Query Functionality

In [14]:
# Ten closest vectors to the stored vector with id '10' in the "users" namespace.
index.query(
    namespace='users',
    id='10',
    top_k=10,
)

{'matches': [{'id': '10', 'score': 1.00000012, 'values': []},
             {'id': '271', 'score': 0.844474137, 'values': []},
             {'id': '425', 'score': 0.829484522, 'values': []},
             {'id': '74', 'score': 0.797080636, 'values': []},
             {'id': '101', 'score': 0.794035, 'values': []},
             {'id': '116', 'score': 0.789813, 'values': []},
             {'id': '237', 'score': 0.782995045, 'values': []},
             {'id': '651', 'score': 0.780538201, 'values': []},
             {'id': '260', 'score': 0.777691126, 'values': []},
             {'id': '277', 'score': 0.759184182, 'values': []}],
 'namespace': 'users'}

In [15]:
# Ten closest movie vectors to the first user's embedding, i.e. a query
# across namespaces using the raw vector instead of a stored id.
index.query(
    namespace='movies',
    vector=user_embeddings[0]['values'],
    top_k=10,
)

{'matches': [{'id': '2571', 'score': 0.741118371, 'values': []},
             {'id': '1196', 'score': 0.723255754, 'values': []},
             {'id': '1198', 'score': 0.717089295, 'values': []},
             {'id': '2028', 'score': 0.673573196, 'values': []},
             {'id': '1210', 'score': 0.654001296, 'values': []},
             {'id': '260', 'score': 0.648884952, 'values': []},
             {'id': '2762', 'score': 0.638979614, 'values': []},
             {'id': '1270', 'score': 0.621263206, 'values': []},
             {'id': '318', 'score': 0.608622491, 'values': []},
             {'id': '593', 'score': 0.588954, 'values': []}],
 'namespace': 'movies'}