# 07. RECOMMENDATION SYSTEM

Information about each metric you can find in the code:

https://colab.research.google.com/drive/1SBx6nolnsqvlQLhlLjUz8VQi7ylJsZ4x


* Other Links

https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea

In [0]:
import pandas as pd
import numpy as np
import random

!pip install surprise

import surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 2.7MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1678270 sha256=a2f7bf18778ae5ae3c113564c4c9732ca56f8e3db338f8a1dea506b8309ac7a2
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


## MovieLens

https://movielens.org/

Download Movie-Lens data to get rating from abou

Small: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

In [0]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

!ls ./ml-latest-small/*

--2019-12-17 16:44:30--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2019-12-17 16:44:32 (959 KB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  
./ml-latest-small/links.csv    ./ml-latest-small/README.txt
./ml-latest-small/movies.csv   ./ml-latest-small/tags.csv
./ml-latest-small/ratings.csv


In [0]:
RATING_PATH = './ml-latest-small/ratings.csv'
MOVIES_PATH = './ml-latest-small/movies.csv'
LINKS_PATH = './ml-latest-small/links.csv'
TAGS_PATH = './ml-latest-small/tags.csv'

In [0]:
!head -n 10  {RATING_PATH}

userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041


## Read data from file (DataSet)

In [0]:
from surprise import Dataset
from surprise import Reader

np.random.seed(0)
random.seed(0)

In [0]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
"""
line_format - list of columns
sep - separator for csv file
skip_lines - start from the second line
"""

'\nline_format - list of columns\nsep - separator for csv file\nskip_lines - start from the second line\n'

In [0]:
dataset = Dataset.load_from_file(RATING_PATH, reader=reader)

In [0]:
%%time
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import KNNBaseline
from collections import defaultdict

def get_popularity_ranking(dataset):
    ratings = defaultdict(int)
    rankings = defaultdict(int)
    data  = dataset.build_full_trainset()
    
    for userId, movieId, _ in  data.all_ratings():
      ratings[movieId] += 1
    rank = 1
    for movieId, _ in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
       rankings[int(data.to_raw_iid(movieId))] = rank
       rank+=1
    return ratings, rankings


class RecommendationDataSet:
    def __init__(self, dataset, test_size = .25):
      self.dataset = dataset
      self.full_dataset = dataset.build_full_trainset()

      # Train Set, Test Set to test results
      self.train_set, self.test_set = train_test_split(dataset, test_size=test_size, random_state=1)

      # https://surprise.readthedocs.io/en/stable/trainset.html#surprise.Trainset.build_anti_testset
      ## Situation when the user u is known, the item is known, but the rating is not
      # in the trainset
      self.anti_test_set        = self.full_dataset.build_anti_testset()

      ## LeaveOneOut
      ## Cross-validation iterator where each user has exactly one rating in the testset.
      leave_one_out_set = LeaveOneOut(n_splits=1, random_state=1)
      loo_train_set, loo_test_set = list(leave_one_out_set.split(dataset))[0]

      self.leave_one_out_train_set       = loo_train_set
      self.leave_one_out_test_set        = loo_test_set
      self.leave_one_out_anti_test_set   = loo_train_set.build_anti_testset()

      #Compute similarty matrix between items so we can measure diversity
      sim_options = {'name': 'cosine', 'user_based': False}
      self.similarty_algorithm = KNNBaseline(sim_options=sim_options)
      self.similarty_algorithm.fit(self.full_dataset)


      # ranking
      self.ratings, self.rankings = get_popularity_ranking(dataset)


recommendation_dataset = RecommendationDataSet(dataset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
CPU times: user 23.6 s, sys: 2 s, total: 25.6 s
Wall time: 25.6 s


# Evaluation


## Mean absolute Error:

$$\frac{\sum_{t=1}^{n}|y_i-x_i|}{n}$$




## Root Mean absolute Error:

$$\sqrt\frac{\sum_{t=1}^{n}(y_i-x_i)^2}{n}$$



$y_i$ - Predicted rating

$x_i$ - actual real rating


In [0]:
from surprise import SVD

svd = SVD(random_state=10)
svd.fit(recommendation_dataset.train_set)

print('Rest Set:', recommendation_dataset.test_set[:10])
predictions = svd.test(recommendation_dataset.test_set)
print('Predictions: ', predictions[:10])

from surprise import accuracy

accuracy.mae(predictions, verbose=True)
accuracy.rmse(predictions, verbose=True)

Rest Set: [('167', '1196', 4.5), ('605', '2291', 3.5), ('472', '3298', 4.0), ('600', '362', 2.5), ('105', '7361', 5.0), ('249', '79224', 3.5), ('153', '8464', 1.0), ('354', '1923', 4.0), ('483', '46948', 4.0), ('448', '4255', 0.5)]
Predictions:  [Prediction(uid='167', iid='1196', r_ui=4.5, est=4.150335799784241, details={'was_impossible': False}), Prediction(uid='605', iid='2291', r_ui=3.5, est=3.373228411272731, details={'was_impossible': False}), Prediction(uid='472', iid='3298', r_ui=4.0, est=3.742424718052575, details={'was_impossible': False}), Prediction(uid='600', iid='362', r_ui=2.5, est=3.487305536934906, details={'was_impossible': False}), Prediction(uid='105', iid='7361', r_ui=5.0, est=4.80859833564308, details={'was_impossible': False}), Prediction(uid='249', iid='79224', r_ui=3.5, est=3.0133924239941066, details={'was_impossible': False}), Prediction(uid='153', iid='8464', r_ui=1.0, est=2.02352618310102, details={'was_impossible': False}), Prediction(uid='354', iid='1923',

0.87790565300794

In [0]:
from  collections import defaultdict

# https://surprise.readthedocs.io/en/stable/FAQ.html#how-to-get-the-top-n-recommendations-for-each-user
def get_top_n(predictions, n=10, minimumRating=4.0):
    topN = defaultdict(list)

    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        if (estimatedRating >= minimumRating):
            topN[int(userID)].append((int(movieID), estimatedRating))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN

get_top_n(predictions)[610]

[(4226, 4.515496953427138),
 (260, 4.447636165483844),
 (7361, 4.384286812520132),
 (6502, 4.379323035207386),
 (48516, 4.367892317536974),
 (1283, 4.362001234390769),
 (142488, 4.346278837441465),
 (1240, 4.3431853118019985),
 (54503, 4.339674297306723),
 (68954, 4.331091409087657)]

### HitRate

In this measure system we've got a recommendation system that give us TOP-N recommendations, so next if the element exists in our test set it is consider as a hit. 

$\begin{align*}top\_n\_hitrate=\frac{hits}{users}\end{align*}$

In [0]:
def HitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0

    # For each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # Is it in the predicted top 10 for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == int(movieID)):
                hit = True
                break
        if (hit) :
            hits += 1

        total += 1

    # Compute overall precision
    return hits/total

## TEST SET, ANTI-TEST SET

In [0]:
np.random.seed(0)
random.seed(0)

dataframe = pd.DataFrame(
    [[10, 1, 5],
     [10, 2, 4],
     [10, 3, 3],
     [10, 5, 2],
     [20, 1, 2],
     [20, 2, 3],
     [20, 4, 2]
     ], columns=['uid','iid', 'rating'])

sample_dataset = Dataset.load_from_df(dataframe,reader=Reader(rating_scale=(1,5)))
train_set, test_set = train_test_split(sample_dataset, test_size=0.2, random_state=1)

print('1. Test Set: ', test_set)

loo  = LeaveOneOut(n_splits=1, random_state=1)
loo_train_set, loo_test_set = list(loo.split(sample_dataset))[0]
print('2. Leave One Out Test Set', loo_test_set)


loo_anti_test_set =  loo_train_set.build_anti_testset()
print('3. Leave one out: Anti Test Set', loo_anti_test_set)

loo_anti_test_prediction = svd.test(loo_anti_test_set)
loo_anti_test_topn = get_top_n(loo_anti_test_prediction, 2, 1.0)
print('4. Leave one out predictions', loo_anti_test_topn)

print('Train by SVD on train_set')
svd_sample = SVD(random_state=10)
svd_sample.fit(loo_train_set)

print('5. Hit Rate: ', HitRate(loo_anti_test_topn, loo_test_set))

1. Test Set:  [(10, 5, 2.0), (20, 2, 3.0)]
2. Leave One Out Test Set [(10, 2, 4.0), (20, 1, 2.0)]
3. Leave one out: Anti Test Set [(10, 2, 3.0), (10, 4, 3.0), (20, 1, 3.0), (20, 3, 3.0), (20, 5, 3.0)]
4. Leave one out predictions defaultdict(<class 'list'>, {10: [(2, 3.5048725984106204), (4, 3.5048725984106204)], 20: [(1, 3.5048725984106204), (3, 3.5048725984106204)]})
Train by SVD on train_set
5. Hit Rate:  1.0


## Cumulative Hit Rate

In this case we remove hit rate for movies that are not bellow some value. Because we haven't enjoy this movie it shouldn't be in the list.

In our cumulative hit rate we need remove this in calculations.

In [0]:
def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
      hits = 0
      total = 0

      # For each left-out rating
      for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
          # Only look at ability to recommend things the users actually liked...
          if (actualRating >= ratingCutoff):
              # Is it in the predicted top 10 for this user?
              hit = False
              for movieID, predictedRating in topNPredicted[int(userID)]:
                  if (int(leftOutMovieID) == movieID):
                      hit = True
                      break
              if (hit) :
                  hits += 1

              total += 1

      # Compute overall precision
      return hits/total

In [0]:
loo_test_prediction = svd_sample.test(loo_test_set)
print('Cumulative Hit Rate: ', 
      CumulativeHitRate(loo_anti_test_topn, loo_test_prediction, 4.0))

Cumulative Hit Rate:  1.0


## Rating Hit Rate (rHR)

In this situation for each score in your stars (⭐ 1-5). For each score if the movie is in the score it is consider as a hit, otherwise we add total for this score.



In [0]:
def RatingHitRate(topNPredicted, leftOutPredictions):
    hits = defaultdict(float)
    total = defaultdict(float)

    # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Is it in the predicted top N for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == movieID):
                hit = True
                break
        if (hit) :
            hits[actualRating] += 1

        total[actualRating] += 1

    # Compute overall precision
    ratings = {}
    for rating in sorted(hits.keys()):
        ratings[rating] = hits[rating] / total[rating]
    return ratings

In [0]:
print('Rating Hit Rate: ', 
      RatingHitRate(loo_anti_test_topn, loo_test_prediction))

Rating Hit Rate:  {2.0: 1.0, 4.0: 1.0}


## Average Reciprocal HitRate  (ARHR)


- This is like HitRate but also get into consideration where it is appear $rank_i$. 
- Focus on the user method

$\begin{align*}arhr=\frac{\sum_{i=1}^{n}\frac{1}{rank_i}}{users}\end{align*}$


In [0]:
def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
    summation = 0
    total = 0
    # For each left-out rating
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Is it in the predicted top N for this user?
        hitRank = 0
        rank = 0
        for movieID, predictedRating in topNPredicted[int(userID)]:
            rank = rank + 1
            if (int(leftOutMovieID) == movieID):
                hitRank = rank
                break
        if (hitRank > 0) :
            summation += 1.0 / hitRank

        total += 1

    return summation / total

In [0]:
print('Average Reciprocal HitRate (ARHR): ', 
      AverageReciprocalHitRank(loo_anti_test_topn, loo_test_prediction))


loo_anti_test_topn2 = {
 10: [(2,3.5), (4,3.5)], 
 20: [(3,3.5), (1,3.5)]
}

print('\nAverage Reciprocal HitRate (ARHR): \n\
(1/rank_user10 + 1/rank_user20)/2 = (1/1 + 1/2)/2 = \n', 
      AverageReciprocalHitRank(loo_anti_test_topn2, loo_test_prediction))

Average Reciprocal HitRate (ARHR):  1.0

Average Reciprocal HitRate (ARHR): 
(1/rank_user10 + 1/rank_user20)/2 = (1/1 + 1/2)/2 = 
 0.75


## Coverage

The % of <user, item> pairt that can be predicted, with rating higher than min_rate.

This is good to measure new items that are not yet scored.

$user\_coverage(>=2.9)=\frac{user1\_hits+user2\_hits}{num\_users}=\frac{1+0}{2}=0.5$

$user\_coverage(>=2.0)=\frac{user1\_hits+user2\_hits}{num\_users}=\frac{1+1}{2}=1.0$

In [0]:
# What percentage of users have at least one "good" recommendation
def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
    hits = 0
    for userID in topNPredicted.keys():
        hit = False
        for movieID, predictedRating in topNPredicted[userID]:
            if (predictedRating >= ratingThreshold):
                hit = True
                break
        if (hit):
            hits += 1

    return hits / numUsers

In [0]:
sample_full = sample_dataset.build_full_trainset()
svd_sample.fit(sample_full)

sample_anti_testset = sample_full.build_anti_testset()
sample_anti_predictions =  svd_sample.test(sample_anti_testset)
print("Anti Test SET: ", sample_anti_testset)
print("Anti Test Predictions: ", sample_anti_predictions)

sample_anti_topn = get_top_n(sample_anti_predictions, 2, 1.0)
print("Anti Test TopN: ", sample_anti_topn)

print('User Coverage (>=2.9): ', UserCoverage(sample_anti_topn,2,2.9) )
print('User Coverage (>=2.5): ', UserCoverage(sample_anti_topn,2,2.5) )
print('User Coverage (>=2.7): ', UserCoverage(sample_anti_topn,2,0) )
print('User Coverage (>=4.0): ', UserCoverage(sample_anti_topn,2,4.0) )

Anti Test SET:  [(10, 4, 3.0), (20, 3, 3.0), (20, 5, 3.0)]
Anti Test Predictions:  [Prediction(uid=10, iid=4, r_ui=3.0, est=2.9810168611713936, details={'was_impossible': False}), Prediction(uid=20, iid=3, r_ui=3.0, est=2.777842182291167, details={'was_impossible': False}), Prediction(uid=20, iid=5, r_ui=3.0, est=2.590335248598558, details={'was_impossible': False})]
Anti Test TopN:  defaultdict(<class 'list'>, {10: [(4, 2.9810168611713936)], 20: [(3, 2.777842182291167), (5, 2.590335248598558)]})
User Coverage (>=2.9):  0.5
User Coverage (>=2.5):  1.0
User Coverage (>=2.7):  1.0
User Coverage (>=4.0):  0.0


## Diversity

For diversity we use similarity matrix between movies. Diversity is (1-S) where S is similarity for each pair in the user recommendation.

Very High diversity very often means that algorithm return random values.
Too Low is not good because it shows very similar ones

In [0]:
import itertools

def Diversity(topNPredicted, simsAlgo, fun = int):
    n = 0
    total = 0
    simsMatrix = simsAlgo.compute_similarities()
    for userID in topNPredicted.keys():
        pairs = itertools.combinations(topNPredicted[userID], 2)
        for pair in pairs:
            movie1 = pair[0][0]
            movie2 = pair[1][0]
            innerID1 = simsAlgo.trainset.to_inner_iid(fun(movie1))
            innerID2 = simsAlgo.trainset.to_inner_iid(fun(movie2))
            similarity = simsMatrix[innerID1][innerID2]
            total += similarity
            n += 1

    S = total / n
    return (1-S)


In [0]:
sim_options = {'name': 'cosine', 'user_based': False}
similarty_algorithm = KNNBaseline(sim_options=sim_options)
similarty_algorithm.fit(sample_full)
print('Inner index', 
      {'i'+str(id):   similarty_algorithm.trainset.to_inner_iid(id) for id in range(1,6)}
    )
matrix =  similarty_algorithm.compute_similarities()
print('\nSimilarity matrix: \n',matrix)

innerID1 = similarty_algorithm.trainset.to_inner_iid(int(3))
innerID2 = similarty_algorithm.trainset.to_inner_iid(int(5))

print('\nFor user10: There is no pair. Similarity = 0.0')
print('For user20: Similarity between i3 and i5 is: ', 
      matrix[innerID1][innerID2] )
print('Diversity is (1-1/1) =  0.0' )
print('\nDiversity: ', Diversity(sample_anti_topn, similarty_algorithm))

sample_anti_topn2 = {
    10: [(2,4.0), (1,3.0)],
    20: [(3,4.0), (5,3.0)]
}
print('\n\nSample 2', sample_anti_topn2)
print('For user10: There is pair(i2,i1). Similarity = 0.965')
print('For user20: There is pair(i3,i5). Similarity = 1.0')
print('Diversity is: (1- (0.965+1.0)/2) = 0.0175')

print('Diversity: ', Diversity(sample_anti_topn2, similarty_algorithm))

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Inner index {'i1': 0, 'i2': 1, 'i3': 2, 'i4': 4, 'i5': 3}
Computing the cosine similarity matrix...
Done computing similarity matrix.

Similarity matrix: 
 [[1.         0.96561576 1.         1.         1.        ]
 [0.96561576 1.         1.         1.         1.        ]
 [1.         1.         1.         1.         0.        ]
 [1.         1.         1.         1.         0.        ]
 [1.         1.         0.         0.         1.        ]]

For user10: There is no pair. Similarity = 0.0
For user20: Similarity between i3 and i5 is:  1.0
Diversity is (1-1/1) =  0.0
Computing the cosine similarity matrix...
Done computing similarity matrix.

Diversity:  0.0


Sample 2 {10: [(2, 4.0), (1, 3.0)], 20: [(3, 4.0), (5, 3.0)]}
For user10: There is pair(i2,i1). Similarity = 0.965
For user20: There is pair(i3,i5). Similarity = 1.0
Diversity is: (1- (0.965+1.0)/2) = 0.0175
Computing the cos

## Novelty

High Novelty can show movies from long tail.

In [0]:
def Novelty(topNPredicted, rankings):
    n = 0
    total = 0
    for userID in topNPredicted.keys():
        for rating in topNPredicted[userID]:
            movieID = rating[0]
            rank = rankings[int(movieID)]
            total += rank
            n += 1
    return total / n

print('Popularity rankings: \n', get_popularity_ranking(sample_dataset))
print('Ranking: ', sample_anti_topn)
print('For Movie 4 rank is: 5, movie 3 is: 4, movie 5 is 3')
print('Novelty is: (5+4+3)/3 = 12/3 = 4.0')
print('Novelty is: ', Novelty(sample_anti_topn, get_popularity_ranking(sample_dataset)[1]))

Popularity rankings: 
 (defaultdict(<class 'int'>, {0: 2, 1: 2, 2: 1, 3: 1, 4: 1}), defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 3, 5: 4, 4: 5}))
Ranking:  defaultdict(<class 'list'>, {10: [(4, 2.9810168611713936)], 20: [(3, 2.777842182291167), (5, 2.590335248598558)]})
For Movie 4 rank is: 5, movie 3 is: 4, movie 5 is 3
Novelty is: (5+4+3)/3 = 12/3 = 4.0
Novelty is:  4.0


## EVALUATION

In [0]:
from surprise import accuracy


def get_evaluation(algorithm, 
                    recommendation_dataset, verbose = True):
  metrics = {}
  predictions = algorithm.test(recommendation_dataset.test_set)

  metrics['MAE'] = accuracy.mae(predictions, verbose=False)
  metrics['RMSE'] = accuracy.rmse(predictions, verbose=False)

  # LEAVE ONE OUT FIT/TEST
  algorithm.fit( recommendation_dataset.leave_one_out_train_set )
  leave_one_out_test_prediction = algorithm.test(recommendation_dataset.leave_one_out_test_set)

  #  LEAVE ONE OUT ANTI TEST PREDICTION/TOP-N
  leave_one_out_anti_test_prediction = algorithm.test(recommendation_dataset.leave_one_out_anti_test_set)
  leave_one_out_anti_test_topn = get_top_n(leave_one_out_anti_test_prediction, 10, 4.0)

  # See how often we recommended a movie the user actually rated
  metrics["HR"] = HitRate(leave_one_out_anti_test_topn, leave_one_out_test_prediction)   

  # See how often we recommended a movie the user actually liked
  metrics["cHR"] = CumulativeHitRate(leave_one_out_anti_test_topn, leave_one_out_test_prediction)

  # Compute ARHR
  metrics["ARHR"] = AverageReciprocalHitRank(leave_one_out_anti_test_topn, leave_one_out_test_prediction)

  # Rating HitRate
  metrics["rHR"] = RatingHitRate(leave_one_out_anti_test_topn, leave_one_out_test_prediction)



  # BASED ON FULL DATASET
  algorithm.fit(recommendation_dataset.full_dataset)
  anti_test_prediction = algorithm.test(recommendation_dataset.anti_test_set)
  anti_test_topn = get_top_n(anti_test_prediction, 10, 4.0 )

  # Coverage
  metrics["Coverage"] = UserCoverage(  anti_test_topn, 
                                     recommendation_dataset.full_dataset.n_users, 
                                     ratingThreshold=4.0)
  
  # Measure diversity of recommendations:
  metrics["Diversity"] = Diversity(anti_test_topn, 
                                   recommendation_dataset.similarty_algorithm, str)
  
  # Measure novelty (average popularity rank of recommendations):
  metrics["Novelty"] = Novelty(anti_test_topn, recommendation_dataset.rankings)
        
  if verbose:
    print('Mean Absolute Error:',  metrics['MAE'])
    print('Root Mean Square Error:',  metrics['RMSE'])

    print('Hit Rate (HR):',  metrics['HR'])
    print('Cumulative Hit Rate (cHR):',  metrics['cHR'])
    print('Average Reciprocal HitRate  (ARHR):',  metrics['ARHR'])

    print('Rating  HitRate  (rHR):',  metrics['rHR'])

    print('Coverage:',  metrics['Coverage'])
    print('Diversity:',  metrics['Diversity'])
    print('Novelty:',  metrics['Novelty'])
    
  return metrics

# CONTENT BASED ALGORITHM

## COSINE SIMILARITY

https://learning.oreilly.com/library/view/statistics-for-machine/9781788295758/eb9cd609-e44a-40a2-9c3a-f16fc4f5289a.xhtml



## Mise En Scene

In [0]:
!wget https://raw.githubusercontent.com/PacktPublishing/Building-Recommender-Systems-with-Machine-Learning-and-AI/master/ContentBased/LLVisualFeatures13K_Log.csv
#!unzip Mise-en-Scene-Dataset-v1.zip

#!mkdir miseenscene
#!mv ./"Mise-en-Scene Dataset_v1"/* ./miseenscene/
!ls ./*


MISE_EN_SCENE = './LLVisualFeatures13K_Log.csv'

--2019-12-17 13:28:05--  https://raw.githubusercontent.com/PacktPublishing/Building-Recommender-Systems-with-Machine-Learning-and-AI/master/ContentBased/LLVisualFeatures13K_Log.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1787176 (1.7M) [text/plain]
Saving to: ‘LLVisualFeatures13K_Log.csv’


2019-12-17 13:28:05 (19.8 MB/s) - ‘LLVisualFeatures13K_Log.csv’ saved [1787176/1787176]

./LLVisualFeatures13K_Log.csv  ./ml-latest-small.zip

./miseenscene:

./ml-latest-small:
links.csv  movies.csv  ratings.csv  README.txt	tags.csv

./sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md


## Data Extractor

In [0]:
import re


class DataExtractor:
  def process(self, 
              file):
    print('processss')
    movies = pd.read_csv(file, sep=',')

    #add genres
    genre_dummies = self.generate_dummies(movies, 'genres')
    genre_dummies = genre_dummies.add_prefix('genre_')
    genres = pd.concat([movies['movieId'], genre_dummies], axis=1)

    genres.set_index('movieId', inplace=True)
    self.genres_list = genres.to_dict('index')
    self.genres = {key: np.array(list(value.values())) for key, value in self.genres_list.items()}

    # extract years
    p = re.compile(r"(?:\((\d{4})\))?\s*$")
    def get_year(r):
      title = r['title']
      m = p.search(title)
      year = m.group(1)
      return year

    movies['year'] = movies.apply(get_year, 'columns')
    movies.set_index('movieId', inplace=True)
    self.movies = movies
    years = movies['year'].fillna(0).astype(int).to_dict()
    self.years = years

    # extract mise_en_scene
    mise_en_scene = pd.read_csv(MISE_EN_SCENE, sep=',')
    mise_en_scene.set_index('ML_Id', inplace=True)
    self.mise_en_scene = {key: list(value.values()) for key,value in mise_en_scene.to_dict('index').items()}

  def get_movie(self,movieID):
      return self.movies.loc[movieID]

  def get_title(self, movieID):
      return self.get_movie(movieID)['title']

  def get_genres(self, movieID):
      return self.genres[movieID]

  def get_year(self, movieID):
      y1 = None
      if movieID in self.movies.index:
        y1 = self.movies.loc[movieID]['year']
      if y1 is None:
        return 0
      return int(y1)

  def get_mise_en_scene(self, movieID):
      return self.mise_en_scene[movieID]


  def to_list(self, textdata):
      return ''.join(textdata.lower().split()).split('|')

  def generate_dummies(self, df, column):
      series = df[column]
      return  pd.get_dummies(series.apply(self.to_list).apply(pd.Series).stack()).sum(level=0)


de = DataExtractor()
de.process(MOVIES_PATH);

processss


In [0]:
movieID = 96
print('Movie: ', de.get_movie(movieID))
print('\nYear: ', de.get_year(movieID))
print('Title: ', de.get_title(movieID))
print('Genres: ', de.get_genres(movieID))
print('Mise En Scene', de.get_mise_en_scene(movieID))

Movie:  title     In the Bleak Midwinter (1995)
genres                     Comedy|Drama
year                               1995
Name: 96, dtype: object

Year:  1995
Title:  In the Bleak Midwinter (1995)
Genres:  [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
Mise En Scene [0.7826967221359541, 0.6586550538895021, 0.6483078883071041, 0.13831277317628196, 0.172365024786917, 0.19138055098497603, 0.35379310371877]


## Content Base Algorithm

In [0]:
import heapq
from surprise import PredictionImpossible
from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate
import math
from tqdm.autonotebook import tqdm

class SimilarityAlgorithm(AlgoBase):
    results = {}

    def __init__(self,de:DataExtractor ):
        AlgoBase.__init__(self)
        self.de = de

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        movie_ids = []
        for i in range(0, trainset.n_items):
            movie_ids.append(int(trainset.to_raw_iid(i)))
        movie_ids = list(set(movie_ids))

        print('trainset size: '+ str(len(movie_ids)))

        #calculate similarity
        self.similarity = self.calculate(movie_ids)
        return self

    def calculate_feature_similarity(self, feature1, feature2):
        # feature1 = np.array(list(feature1))
        # feature2 = np.array(list(feature2))
        intersection = feature1 * feature2
        return intersection.sum() / math.sqrt(feature1.sum() * feature2.sum())

    def compute_year_similarity(self, y1, y2):
        diff = abs(y2 - y1)
        sim = math.exp(-diff / 10.0)
        return sim

    def compute_mise_en_scene_similarity(self, movie1, movie2, mise):
        if not movie1 in mise.keys():
          return 1.0
        if not movie2 in mise.keys():
          return 1.0

        mes1 = mise[movie1]
        mes2 = mise[movie2]


        shotLengthDiff = math.fabs(mes1[0] - mes2[0])
        colorVarianceDiff = math.fabs(mes1[1] - mes2[1])
        motionDiff = math.fabs(mes1[3] - mes2[3])
        lightingDiff = math.fabs(mes1[5] - mes2[5])
        numShotsDiff = math.fabs(mes1[6] - mes2[6])
        return shotLengthDiff * colorVarianceDiff * motionDiff * lightingDiff * numShotsDiff


    def calculate(self, movie_ids):
        size = len(movie_ids)
        similarities = np.zeros((size, size))
        genres = self.de.genres
        years = self.de.years
        mise = self.de.mise_en_scene

        #rev_movie_ids = movie_ids.reverse()
        for id1 in tqdm(range(0, size)):
            m_id1 = movie_ids[id1]

            for id2 in range(id1, size):
                m_id2 = movie_ids[id2]
 
                genre_similarity = self.calculate_feature_similarity( genres[m_id1], genres[m_id2] )  
                year_similarity = self.compute_year_similarity(years[m_id1], years[m_id2])  
                miseenscene_similarity = self.compute_mise_en_scene_similarity(m_id1, m_id2, mise)  

                similarities[id1, id2] = genre_similarity * year_similarity * miseenscene_similarity
                similarities[id2, id1] = similarities[id1, id2]

        return similarities


    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')
        
        # Build up similarity scores between this item and everything the user rated
        neighbors = []
        for rating in self.trainset.ur[u]: #w datasesie user rating
            genreSimilarity = self.similarity[i,rating[0]]
            neighbors.append( (genreSimilarity, rating[1]) )
        
        # Extract the top-K most-similar ratings
        k_neighbors = heapq.nlargest(40, neighbors, key=lambda t: t[0])

        # Compute average sim score of K neighbors weighted by user ratings
        simTotal = weightedSum = 0
        for (simScore, rating) in k_neighbors:
            if (simScore > 0):
                simTotal += simScore
                weightedSum += simScore * rating
            
        if (simTotal == 0):
            raise PredictionImpossible('No neighbors')

        predictedRating = weightedSum / simTotal


        #add to results
        self.results[int(self.trainset.to_raw_iid(i))] = predictedRating
        return predictedRating

    def get_recomendations(self, count=10):
        res = sorted((value,key) for (key,value) in self.results.items())
        return self.results

In [0]:
%%time


algo = SimilarityAlgorithm(de)

cross_validate(algo, dataset, verbose=True)

trainset size: 8975


HBox(children=(IntProgress(value=0, max=8975), HTML(value='')))


trainset size: 8996


HBox(children=(IntProgress(value=0, max=8996), HTML(value='')))


trainset size: 8947


HBox(children=(IntProgress(value=0, max=8947), HTML(value='')))


trainset size: 8973


HBox(children=(IntProgress(value=0, max=8973), HTML(value='')))


trainset size: 8979


HBox(children=(IntProgress(value=0, max=8979), HTML(value='')))


Evaluating RMSE, MAE of algorithm SimilarityAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9932  0.9833  0.9688  0.9874  0.9757  0.9817  0.0086  
MAE (testset)     0.7673  0.7587  0.7466  0.7630  0.7539  0.7579  0.0072  
Fit time          715.63  739.89  726.09  748.31  719.81  729.95  12.32   
Test time         7.99    7.68    7.92    7.58    8.15    7.87    0.21    
CPU times: user 1h 1min 24s, sys: 17.6 s, total: 1h 1min 42s
Wall time: 1h 1min 29s


# Colaborative Filtering

## User Based

In [0]:
from surprise import KNNBasic

In [0]:
sim_options = {'name': 'cosine',
               'user_based': True
               }

model = KNNBasic(sim_options=sim_options)
model.fit(recommendation_dataset.train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f79b77ae3c8>

In [0]:
%%time
print("\n\n", get_evaluation(model, recommendation_dataset))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Mean Absolute Error: 0.754729310903918
Root Mean Square Error: 0.9786509473598148
Hit Rate (HR): 0.0
Cumulative Hit Rate (cHR): 0.0
Average Reciprocal HitRate  (ARHR): 0.0
Rating  HitRate  (rHR): {}
Coverage: 1.0
Diversity: 0.8882370462217049
Novelty: 6126.978196721311


 {'MAE': 0.754729310903918, 'RMSE': 0.9786509473598148, 'HR': 0.0, 'cHR': 0.0, 'ARHR': 0.0, 'rHR': {}, 'Coverage': 1.0, 'Diversity': 0.8882370462217049, 'Novelty': 6126.978196721311}
CPU times: user 4min 26s, sys: 3.15 s, total: 4min 29s
Wall time: 4min 29s


## Movie Base

In [0]:
sim_options = {'name': 'cosine',
               'user_based': False
               }

model_item = KNNBasic(sim_options=sim_options)
model_item.fit(recommendation_dataset.train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f79b77ae518>

In [0]:
%%time
print("\n\n", get_evaluation(model_item, recommendation_dataset))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Mean Absolute Error: 0.7610169078515616
Root Mean Square Error: 0.9787736703737636
Hit Rate (HR): 0.0
Cumulative Hit Rate (cHR): 0.0
Average Reciprocal HitRate  (ARHR): 0.0
Rating  HitRate  (rHR): {}
Coverage: 0.9885245901639345
Diversity: 0.7204060469128939
Novelty: 6933.130047106326


 {'MAE': 0.7610169078515616, 'RMSE': 0.9787736703737636, 'HR': 0.0, 'cHR': 0.0, 'ARHR': 0.0, 'rHR': {}, 'Coverage': 0.9885245901639345, 'Diversity': 0.7204060469128939, 'Novelty': 6933.130047106326}
CPU times: user 21min 32s, sys: 3.4 s, total: 21min 35s
Wall time: 21min 36s


# SVD

In [0]:
from surprise import SVD

svd = SVD(random_state=10)
svd.fit(recommendation_dataset.train_set)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f7d51d75278>

In [0]:
%%time
print("\n\n", get_evaluation(svd, recommendation_dataset))

Computing the cosine similarity matrix...
Done computing similarity matrix.
Mean Absolute Error: 0.49840804714417075
Root Mean Square Error: 0.6436712599229663
Hit Rate (HR): 0.036065573770491806
Cumulative Hit Rate (cHR): 0.036065573770491806
Average Reciprocal HitRate  (ARHR): 0.013333333333333332
Rating  HitRate  (rHR): {2.5: 0.06666666666666667, 3.0: 0.008695652173913044, 4.0: 0.044444444444444446, 4.5: 0.09433962264150944, 5.0: 0.056910569105691054}
Coverage: 0.9245901639344263
Diversity: 0.03138572161157538
Novelty: 504.3873857062885


 {'MAE': 0.49840804714417075, 'RMSE': 0.6436712599229663, 'HR': 0.036065573770491806, 'cHR': 0.036065573770491806, 'ARHR': 0.013333333333333332, 'rHR': {2.5: 0.06666666666666667, 3.0: 0.008695652173913044, 4.0: 0.044444444444444446, 4.5: 0.09433962264150944, 5.0: 0.056910569105691054}, 'Coverage': 0.9245901639344263, 'Diversity': 0.03138572161157538, 'Novelty': 504.3873857062885}
CPU times: user 2min 17s, sys: 2.88 s, total: 2min 20s
Wall time: 2mi

In [0]:
from surprise import SVDpp

svdpp = SVDpp(random_state=10)
svdpp.fit(recommendation_dataset.train_set)

In [0]:
%%time
print("\n\n", get_evaluation(svdpp, recommendation_dataset))