In [2]:
import numpy as np
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel

# Import Data

In [3]:
# Start (or reuse) a Spark session whose master is the local machine ("local[*]").
spark = pyspark.sql.SparkSession.builder.master("local[*]").getOrCreate()

## Read datasets into Pyspark DataFrames

In [None]:
# Load the ratings DataFrame that the rest of the notebook refers to as `movie_ratings`;
# without this cell the name is undefined on a fresh kernel.
# NOTE(review): the path is an assumption, inferred from the `data/movies.csv` lookups
# further down and the userId/movieId/rating columns the ALS estimator is configured
# with — TODO confirm the real location of the ratings file.
ratings_file = 'data/ratings.csv'
movie_ratings = spark.read.csv(ratings_file, inferSchema=True, header=True)

# Print Schema
movie_ratings.printSchema()
# Keep the ratings cached: they are split, counted and shown again in later cells.
movie_ratings.persist()
movie_ratings.show(5)

### Movies.dat

In [None]:
# An empty path makes the read fail, so point at the file named in this section's heading.
# NOTE(review): the file name and its layout (comma-separated with a header row, as the
# reader options below expect) are assumptions — TODO confirm, and pass `sep=` if the
# .dat file uses another delimiter.
dat_file = 'movies.dat'
movies = spark.read.csv(dat_file, inferSchema=True, header=True)

### Movies_metadata.csv

In [None]:
# Bind the metadata to its own name: the other loader cells assign to `movies`, so
# storing this frame under `movies` as well would be silently overwritten (or overwrite).
csv_file = 'movies_metadata.csv'
movies_metadata = spark.read.csv(csv_file, inferSchema=True, header=True)

### Ratings.json

In [None]:
# Read the JSON file with the JSON reader (the CSV reader would treat each line as
# delimited text) and keep it under its own name instead of overwriting `movies`.
# NOTE(review): spark.read.json expects one JSON object per line by default; pass
# multiLine=True if the file is a single JSON document — TODO confirm the layout.
json_file = 'ratings.json'
ratings = spark.read.json(json_file)

### Requests.json

In [None]:
# Read the JSON file with the JSON reader (the CSV reader would treat each line as
# delimited text) and keep it under its own name instead of overwriting `movies`.
# NOTE(review): spark.read.json expects one JSON object per line by default; pass
# multiLine=True if the file is a single JSON document — TODO confirm the layout.
json_file = 'requests.json'
requests = spark.read.json(json_file)

### Users.dat

In [None]:
# An empty path makes the read fail, so point at the file named in this section's
# heading, and keep the result under its own name instead of overwriting `movies`.
# NOTE(review): the file name and its layout (comma-separated with a header row, as the
# reader options below expect) are assumptions — TODO confirm, and pass `sep=` if the
# .dat file uses another delimiter.
dat_file = 'users.dat'
users = spark.read.csv(dat_file, inferSchema=True, header=True)

# Fitting ALS Model

## Train:Test Split

In [None]:
# Randomly split the ratings with weights 0.7 / 0.3; the fixed seed keeps the split repeatable.
trainingdata, testdata = movie_ratings.randomSplit(weights=[0.7, 0.3], seed=100)

# Report how many rows landed in each partition.
print("Training Dataset Count: ", trainingdata.count(), sep="")
print("Test Dataset Count: ", testdata.count(), sep="")

## Convert to Matrix

In [None]:
# Configure the ALS estimator; nothing is trained until fit() is called.
als = ALS(
    rank=10,  # number of latent factors learned for each user and each movie
    maxIter=10,  # maximum number of ALS iterations to run
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    # Rows whose user or movie was not seen during fitting would otherwise get a NaN
    # prediction, which turns evaluation metrics into NaN; 'drop' removes those rows.
    coldStartStrategy='drop',
    # Fix the random initialisation of the factors so repeated runs give the same model
    # (same value as the seed used for the train/test split).
    seed=100,
)

## Fit the model

In [None]:
# Train the ALS estimator on the training partition only.
als_model = als.fit(dataset=trainingdata)

# Evaluate ALS Model

In [None]:
# Score the held-out ratings and mark the result for caching, since several later
# cells read `predictions` again.
predictions = als_model.transform(testdata).persist()

In [None]:

# Peek at a single input row for comparison with the prediction output.
movie_ratings.show(n=1)

In [None]:

# Peek at a single scored row.
predictions.show(n=1)

In [None]:
# Per-user factors learned by the fitted model; later cells read its 'id' and
# 'features' columns.
user_factors = als_model.userFactors
user_factors

In [None]:
# Per-movie factors learned by the fitted model; later cells read its 'id' and
# 'features' columns.
item_factors = als_model.itemFactors
item_factors

In [None]:
# Score the predictions against the true ratings. The metric and prediction column are
# spelled out here but are the evaluator's defaults. Rows containing nulls/NaNs are
# removed first so they cannot poison the metric.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)
evaluator.evaluate(predictions.dropna())

# Will User Like a Certain Movie?

In [None]:
# User
user_row = user_factors[user_factors['id'] == 10].first()
user_factors = np.array(user_row['features'])
user_factors

In [None]:
# Movie
movie_row = item_factors[item_factors['id'] == 296].first()
movie_factors = np.array(movie_row['features'])
movie_factors

## Dot Product

In [None]:
# Inner product of the user's and the movie's factor vectors.
user_factors.dot(movie_factors)

## User Prediction

In [None]:
user_preds = predictions[predictions['userId'] == 10]
user_preds.sort('movieId').show()
!grep 296 < data/movies.csv

# What Movies will a User Like?

In [None]:
# Ask the model for 10 recommended movies per user, then pull out the list for user 10.
recs = als_model.recommendForAllUsers(10)
recs.filter(recs['userId'] == 10).first()['recommendations']

In [None]:
top_movie = None # put a number here/movieID
!grep top_movie < data/movies.csv

# Cold Start Model

**Machine Learning in Recommendation Systems**

Only the best recommendation systems use ML. The model constantly learns and adapts to the platform’s users and the products it sells, which enables the platform to optimize and personalize the content for each individual user.

**Cold Start Problem**

Success strongly depends on the platform’s ability to adapt quickly to a new person or a new search in order to provide the best, most personalized service.

**Product vs Visitor Cold Start**

A platform can face both types, i.e. a new movie or a new visitor on the platform.  

Use content-based filtering to address this challenge: 
* First use the metadata of new products while creating recommendations
* Visitor’s actions are not used until a certain period of time, i.e. we know enough about them

**Best Strategy for Visitor Cold Start**

Use popularity based recommendations
* Regional trends, e.g. global, local
* Time based trends, e.g. time of day, time of year
* Geolocation, e.g. zipcode, region, country
* Platform, e.g. mobile, desktop

Make Clusters within these categories
* Kmeans
* Want high scores, but with tight confidence intervals

**Penalizing Some User Types**

We can even distinguish between users by how they jump between different movies.  If they jump around a lot, we can down-weight their recommendations.

**Limitations**
* ALS is limited in how it deals with NaNs, as we have to drop them.  This doesn't work so well in reality.