# ALS Implementation for movie recommendation

In [9]:
#installing pyspark

!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=faadf738883cbdac79d047d706b86f79e8524a4ddf94976af8bb736011e8ce60
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [10]:
#import all the necessary libraries and packages and use pyspark MPP Engine on Spark local cluster

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.tuning import CrossValidator
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import FloatType, IntegerType, LongType
from pyspark.ml.evaluation import RegressionEvaluator


In [16]:
# print session info about software and versions
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"PySpark version: {pyspark.__version__}")

System version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
Pandas version: 1.5.3
PySpark version: 3.5.0


In [14]:
#ingest the rating file
#use pandas funtion to import the entire dataset
# ASSUMPTION : our dataset is being created incrementally from the Spark Streaming notebook python script which cinsumes Kafka Topic

#I will perform some summary ststistics checkings and EDA steps in the subsequent cells

df = pd.read_csv('rating.csv')\
    .drop(['timestamp'], axis = 1)

df.head(3)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5


In [7]:
df.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [12]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userId,2950091.0,9985.869162,5774.073065,1.0,4924.0,9983.0,15013.0,20052.0
movieId,2950090.0,8976.159955,19609.510801,1.0,904.0,2167.0,4745.0,130834.0
rating,2950090.0,3.518045,1.052513,0.5,3.0,3.5,4.0,5.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2950091 entries, 0 to 2950090
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  float64
 2   rating   float64
dtypes: float64(2), int64(1)
memory usage: 67.5 MB


In [42]:
#define some global variables

USER = "userId"
MOVIE = "movieID"
RATING = "rating"
PRED = "prediction"
TIME = "timestamp"

In [52]:
# here I will define upfront the schema to ingest data into spark prior to instatiating a spark session

schema = StructType(
    (
        StructField(USER, IntegerType()),
        StructField(MOVIE, IntegerType()),
        StructField(RATING, FloatType()),
        StructField(TIME, LongType()),
    )
)


In [53]:
# define model hyperparameters based on literature challenges priors..

RANK = 10
MAX_ITER = 15
REG_PARAM = 0.05

In [54]:
# Define the number of items I would like to recommend to users
K = 8

In [55]:
# Now it is time to instatiante a spark session
#forcing the ports and hosts to open sparkUI
!wget -qnc https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -n -q ngrok-stable-linux-amd64.zip
get_ipython().system_raw('./ngrok http 4050 &')
!sleep 5
!curl -s http://localhost:4040/api/tunnels | grep -Po 'public_url":"(?=https)\K[^"]*'


spark = SparkSession \
    .builder \
    .config('spark.ui.port', '4050')\
    .appName("Python Spark Application for ASL Collaborative Filtering") \
    .getOrCreate()

spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")


In [57]:
spark

In [63]:
dfs = spark.read.option("header", True)\
      .schema(schema)\
      .csv('rating.csv')

dfs.show(3)

+------+-------+------+---------+
|userId|movieID|rating|timestamp|
+------+-------+------+---------+
|     1|      2|   3.5|     NULL|
|     1|     29|   3.5|     NULL|
|     1|     32|   3.5|     NULL|
+------+-------+------+---------+
only showing top 3 rows



In [64]:
dfs.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieID: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: long (nullable = true)



## Train-test split for hold-out validation

In [65]:
dfs_train, dfs_test = dfs.randomSplit(weights=[0.7,0.3], seed=100)

## Training an ALS Model

In [66]:
# Here I avois the cold staring problem by taking care that instances that appear during training
# might not be seen during test set evaluation; spark favours robust evaluation.

als_model = ALS(
    maxIter = MAX_ITER,
    rank= RANK,
    regParam= REG_PARAM,
    userCol= USER,
    itemCol= MOVIE,
    ratingCol = RATING,
    coldStartStrategy="drop"
)
als_model

ALS_3e4fefc0f93f

In [68]:
movie_model = als_model.fit(dfs_train)

## Prediction with ALS model

In [None]:
# I am now ready to make prediction by transforming my dfs test set and ropping the rating columns

dfs_prediction = movie_model.transform(dfs_test).drop(RATING)
dfs_prediction.show()

## Evaluating the ALS model

In [None]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")

#evaluating our ALS model on the loss function RMSE scoring metric using the RegressionEvaluator object
rmse = evaluator.evaluate(dfs_prediction)
print("Root-mean-square error = " + str(rmse))

## Spot Checking recommendation

In [73]:
userId = 2

single_user = dfs_test.filter(dfs_test['userId']== 10).select(['movieId','userId'])
# Let's focus on a single userId and predict its values, of course using the held out portion of the dataset
# best practice would be using LOOCV of K-fold cv with 3-5 fold in this case give 2M dimesnion of sample size

single_user.show()


reccomendations = movie_model.transform(single_user)

reccomendations.orderBy('prediction',ascending=False).show()
