# EDA

In [1]:
import numpy as np
import pandas as pd

import pyspark as ps

In [2]:
spark = (ps.sql.SparkSession.builder 
        .master("local[4]") 
        .appName("mov_rec") 
        .getOrCreate()
        )
sc = spark.sparkContext

In [3]:
rat_rdd = spark.read.format('com.databricks.spark.csv').\
                            options(header='true',\
                            inferschema='true').\
                            load('data/ml-latest-small/ratings.csv', header=True)
        

In [4]:
rat_rdd.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [5]:
rat_rdd.tail(5)

[Row(userId=610, movieId=166534, rating=4.0, timestamp=1493848402),
 Row(userId=610, movieId=168248, rating=5.0, timestamp=1493850091),
 Row(userId=610, movieId=168250, rating=5.0, timestamp=1494273047),
 Row(userId=610, movieId=168252, rating=5.0, timestamp=1493846352),
 Row(userId=610, movieId=170875, rating=3.0, timestamp=1493846415)]

In [6]:
rat_rdd.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [7]:
mov_rdd = spark.read.format('com.databricks.spark.csv').\
                            options(header='true',\
                            inferschema='true').\
                            load('data/ml-latest-small/movies.csv', header=True)
        

In [8]:
# tags_rdd = spark.read.format('com.databricks.spark.csv').\
#                              options(header='true',\
#                              inferschema='true').\
#                              load('data/ml-latest-small/tags.csv', header=True)
        

In [9]:
mov_rdd.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [10]:
rat_rdd.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [11]:
mov_rdd.createOrReplaceTempView("mov_rdd")
rat_rdd.createOrReplaceTempView("rat_rdd")

In [12]:
# tags_rdd.show(5)

Leave out timestamp...

In [13]:
mov_surprise = spark.sql(
'''
SELECT ratings.userId, movies.title, ratings.rating 
  FROM mov_rdd AS movies
  JOIN rat_rdd AS ratings
    ON movies.movieId = ratings.movieId
'''
)

In [14]:
mov_surprise.createOrReplaceTempView("mov_surprise")

In [15]:
mov_surprise.show(5)

+------+--------------------+------+
|userId|               title|rating|
+------+--------------------+------+
|     1|    Toy Story (1995)|   4.0|
|     1|Grumpier Old Men ...|   4.0|
|     1|         Heat (1995)|   4.0|
|     1|Seven (a.k.a. Se7...|   5.0|
|     1|Usual Suspects, T...|   5.0|
+------+--------------------+------+
only showing top 5 rows



# Similarity Based Recommender using Surprise

Begin by reimplementing the similarity based recommender.

In [16]:
# # install surprise if not already available
# pip install surprise

In [17]:
from surprise import KNNBasic, KNNWithMeans, SVD

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate

# mov_surprise doesn't contain any zero values
mov_surprise = mov_surprise[mov_surprise['rating']!=0]

# ratings in dataset are from 0.5 to 5
reader = Reader(rating_scale=(0.5, 5))

In [18]:
mov_surprise.head(5)

[Row(userId=1, title='Toy Story (1995)', rating=4.0),
 Row(userId=1, title='Grumpier Old Men (1995)', rating=4.0),
 Row(userId=1, title='Heat (1995)', rating=4.0),
 Row(userId=1, title='Seven (a.k.a. Se7en) (1995)', rating=5.0),
 Row(userId=1, title='Usual Suspects, The (1995)', rating=5.0)]

In [19]:
# convert to pandas DataFrame
# warning, if file is too large, this will likely crash...
mov_pDF = mov_surprise.toPandas()
print(mov_pDF.head(5))

surprise_movie_ratings = Dataset.load_from_df(mov_pDF, reader)

   userId                        title  rating
0       1             Toy Story (1995)     4.0
1       1      Grumpier Old Men (1995)     4.0
2       1                  Heat (1995)     4.0
3       1  Seven (a.k.a. Se7en) (1995)     5.0
4       1   Usual Suspects, The (1995)     5.0


In [20]:
print(mov_pDF.tail(5))

        userId                           title  rating
100831     610                    Split (2017)     4.0
100832     610   John Wick: Chapter Two (2017)     5.0
100833     610                  Get Out (2017)     5.0
100834     610                    Logan (2017)     5.0
100835     610  The Fate of the Furious (2017)     3.0


In [21]:
# treat data as training set
training_movie_data = surprise_movie_ratings.build_full_trainset()

# anything not a user-item rating pair as test
# no data for this, so we can't do an eval on out of sample accuracy
testing_movie_data = training_movie_data.build_anti_testset()

# create instance of recommender algorithm object
# algo = KNNBasic()
algo = SVD(n_factors=5, reg_all=0.005)

# fit it on the training data
algo.fit(training_movie_data)

# predict on the test data
predictions = algo.test(testing_movie_data)

In [22]:
# print the first 5 predictions
predictions[:5]

[Prediction(uid=1, iid='Shawshank Redemption, The (1994)', r_ui=3.501556983616962, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid='Good Will Hunting (1997)', r_ui=3.501556983616962, est=4.896820115620676, details={'was_impossible': False}),
 Prediction(uid=1, iid='Kill Bill: Vol. 1 (2003)', r_ui=3.501556983616962, est=4.912927504678244, details={'was_impossible': False}),
 Prediction(uid=1, iid='Collateral (2004)', r_ui=3.501556983616962, est=4.584025743510868, details={'was_impossible': False}),
 Prediction(uid=1, iid='Talladega Nights: The Ballad of Ricky Bobby (2006)', r_ui=3.501556983616962, est=4.165503510861964, details={'was_impossible': False})]

In [23]:
# print last 5 predictions
predictions[-5:]

[Prediction(uid=610, iid='United States of Leland, The (2003)', r_ui=3.501556983616962, est=3.6896634533868404, details={'was_impossible': False}),
 Prediction(uid=610, iid='A Home at the End of the World (2004)', r_ui=3.501556983616962, est=3.475706018168136, details={'was_impossible': False}),
 Prediction(uid=610, iid='League of Ordinary Gentlemen, A (2004)', r_ui=3.501556983616962, est=3.722884122081124, details={'was_impossible': False}),
 Prediction(uid=610, iid='I Think I Love My Wife (2007)', r_ui=3.501556983616962, est=3.4807778365947373, details={'was_impossible': False}),
 Prediction(uid=610, iid='Man of the Year (1995)', r_ui=3.501556983616962, est=3.5622646888438263, details={'was_impossible': False})]

## Cross Validate: SVD

In [24]:
# standard format for cross validation
# algorithm used, ratings dataset, metrics, number of cross validations, verbose controls how much output
cross_validate(SVD(n_factors=5, reg_all=0.005), 
               surprise_movie_ratings, 
               measures=['RMSE', 'MAE'], 
               cv=5, 
               verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8737  0.8702  0.8666  0.8720  0.8673  0.8700  0.0027  
MAE (testset)     0.6710  0.6693  0.6672  0.6696  0.6653  0.6685  0.0020  
Fit time          3.17    3.36    3.13    2.99    2.82    3.10    0.18    
Test time         0.29    0.31    0.37    0.31    0.26    0.31    0.04    


{'test_rmse': array([0.87373469, 0.87022619, 0.86661406, 0.87196631, 0.86728692]),
 'test_mae': array([0.67099139, 0.66929668, 0.66718857, 0.66960229, 0.6653225 ]),
 'fit_time': (3.174436092376709,
  3.3636579513549805,
  3.130986213684082,
  2.9899497032165527,
  2.8170735836029053),
 'test_time': (0.28514528274536133,
  0.30999755859375,
  0.3671238422393799,
  0.31262898445129395,
  0.2559535503387451)}

## Cross Validtate: KNNBasic

In [25]:
cross_validate(KNNBasic(k=5), 
               surprise_movie_ratings, 
               measures=['RMSE', 'MAE'], 
               cv=5, 
               verbose=True)


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9535  0.9611  0.9548  0.9644  0.9592  0.9586  0.0040  
MAE (testset)     0.7260  0.7334  0.7263  0.7331  0.7335  0.7305  0.0035  
Fit time          0.20    0.23    0.28    0.18    0.17    0.21    0.04    
Test time         2.50    2.93    2.92    2.92    2.72    2.80    0.17    


{'test_rmse': array([0.95345552, 0.96109258, 0.95477973, 0.96444648, 0.95917494]),
 'test_mae': array([0.72598122, 0.73340534, 0.72630955, 0.73313877, 0.73349889]),
 'fit_time': (0.20058107376098633,
  0.22847986221313477,
  0.280029296875,
  0.18148255348205566,
  0.17344260215759277),
 'test_time': (2.50127911567688,
  2.9284098148345947,
  2.922307014465332,
  2.9216692447662354,
  2.718113899230957)}

## Cross Validate: KNNWithMeans

In [26]:
cross_validate(KNNWithMeans(k=5), 
               surprise_movie_ratings, 
               measures=['RMSE', 'MAE'], 
               cv=5, 
               verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9327  0.9203  0.9283  0.9272  0.9413  0.9299  0.0069  
MAE (testset)     0.7179  0.7098  0.7134  0.7126  0.7246  0.7157  0.0052  
Fit time          0.31    0.23    0.31    0.29    0.23    0.28    0.04    
Test time         3.49    2.62    3.32    2.38    2.28    2.82    0.50    


{'test_rmse': array([0.93271443, 0.92025573, 0.92825424, 0.92720542, 0.94126982]),
 'test_mae': array([0.71788303, 0.70975971, 0.71343231, 0.71263097, 0.72457191]),
 'fit_time': (0.31108736991882324,
  0.22804760932922363,
  0.313495397567749,
  0.2927126884460449,
  0.22972917556762695),
 'test_time': (3.4937775135040283,
  2.6174607276916504,
  3.315605878829956,
  2.381096363067627,
  2.2750136852264404)}

# Comparing to Baseline Model

In [27]:
from src import baselines 

# provide basline model our movie ratings
data = surprise_movie_ratings

print("\nGlobal Mean...")
algo_GM = baselines.GlobalMean()
cross_validate(algo_GM, data)

print("\nMeanOfMeans...")
algo_MM = baselines.MeanofMeans()
cross_validate(algo_MM, data)


Global Mean...

MeanOfMeans...


{'test_rmse': array([0.93803248, 0.92443217, 0.92454112, 0.93051858, 0.93717266]),
 'test_mae': array([0.73801064, 0.72755202, 0.72825139, 0.72937338, 0.7370952 ]),
 'fit_time': (1.042175531387329,
  0.9857630729675293,
  0.9571704864501953,
  0.9711318016052246,
  0.9588742256164551),
 'test_time': (0.8534317016601562,
  0.7816576957702637,
  0.7068028450012207,
  0.7594108581542969,
  0.7044799327850342)}

In [28]:
import statistics

print(statistics.mean([0.92830858, 0.93106667, 0.9341514 , 0.93381002, 0.92575874]))

0.930619082


# Comparison of models

In [38]:
mom_rmse = 0.930619082
svd_rmse = 0.87

percent_change = abs(((svd_rmse - mom_rmse)) / abs(mom_rmse)) * 100
print(f"The new model shows a {percent_change:.2f}% improvement over the original")

The new model shows a 6.51% improvement over the original


## Search for Movie Title

## Top N Recommendations for each user

Source:
- [Top N Recommendations](https://surprise.readthedocs.io/en/stable/FAQ.html#top-n-recommendations-py)  

In [29]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# # First train an SVD algorithm on the movielens dataset.
# data = Dataset.load_builtin('ml-100k')
# trainset = data.build_full_trainset()
# algo = SVD()
# algo.fit(trainset)

# Predict ratings for all pairs (u, i) that are NOT in the training set.
# predictions has been instantiated above already

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
    print("\n")

1 ['Shawshank Redemption, The (1994)', 'Departed, The (2006)', 'Dark Knight, The (2008)', 'Philadelphia Story, The (1940)', 'Rear Window (1954)', 'North by Northwest (1959)', 'Casablanca (1942)', 'Brazil (1985)', 'Amadeus (1984)', 'Seven Samurai (Shichinin no samurai) (1954)']


2 ['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'Lawrence of Arabia (1962)', 'Rear Window (1954)', 'Streetcar Named Desire, A (1951)', 'Philadelphia Story, The (1940)', 'Cool Hand Luke (1967)', 'Goodfellas (1990)', 'Boondock Saints, The (2000)', 'Eternal Sunshine of the Spotless Mind (2004)', 'Casablanca (1942)']


3 ['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'Shawshank Redemption, The (1994)', 'Streetcar Named Desire, A (1951)', 'Godfather, The (1972)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Lawrence of Arabia (1962)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Rear Window (1954)', 

In [30]:
for uid, iid, true_r, est, _ in predictions[:5]:
        print(uid, iid, true_r, est)

1 Shawshank Redemption, The (1994) 3.501556983616962 5
1 Good Will Hunting (1997) 3.501556983616962 4.896820115620676
1 Kill Bill: Vol. 1 (2003) 3.501556983616962 4.912927504678244
1 Collateral (2004) 3.501556983616962 4.584025743510868
1 Talladega Nights: The Ballad of Ricky Bobby (2006) 3.501556983616962 4.165503510861964


# Predict results for user & movie

In [31]:
mov_pDF['title'].unique()

array(['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)', ...,
       'Hazard (2005)', 'Blair Witch (2016)', '31 (2016)'], dtype=object)

In [32]:
def movie_title_search(str):
    
    title_lst = list()
    
    for i in mov_pDF['title'].unique():
        
        if str in i:
            title_lst.append(i)
    
    return title_lst  

print(movie_title_search('Matrix'))

['Matrix, The (1999)', 'Matrix Reloaded, The (2003)', 'Matrix Revolutions, The (2003)']
