# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import time

In [2]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [3]:
df_books = pd.read_csv('books.csv').drop('Unnamed: 0', axis=1)
df_ratings = pd.read_csv('ratings.csv').drop('Unnamed: 0', axis=1)
df_ratings_ori = pd.read_csv('BX-Book-Ratings.csv',sep=';',encoding='iso-8859-1',error_bad_lines=False)

# Creating example users

In [4]:
alice = {'Name': 'Alice', 'Age': 30.0, 'Country': 'usa', 'Ratings': {'0140324623': 9, '0140434895': 10, '0451526562': 9, '0451521951': 10,  '0451527747': 7, 
                                                                   '0451524934': 10, '0060929871': 10, '0399501487': 8, '0618002219': 7, '0553211404': 8, 
                                                                   '0684801523': 9, '0140430075': 7, '0140430229': 6, '0486282112': 9, '0590203509': 8, 
                                                                   '0486280616': 6}}

In [5]:
bob = {'Name':'Bob', 'Age': 26.0, 'Country': 'usa', 'Ratings': {'0590353403': 0, '0439064872': 0, '0439136369': 0, '0812511816': 0, '0812517725': 0, 
                                                                '0812513711': 0, '0812513738': 0, '0812575717': 0, '0425080021': 0, '0553580272': 0, 
                                                                '0553580302': 0, '0553580337': 0, '0345339703': 0, '0345339711': 0, '0345339738': 0}}

+ Charlie is one of the users removed during cleaning

In [6]:
user_c = df_ratings_ori.loc[df_ratings_ori["User-ID"] == 140000]

In [7]:
charlie = {'Name':'Charlie', 'Age': 40.0, 'Country': 'usa', 'Ratings': {}}

In [8]:
for i in range(0, len(user_c)):
    if df_books['ISBN'].str.contains(user_c.iloc[i]['ISBN']).any():
        charlie['Ratings'][user_c.iloc[i]['ISBN']] = user_c.iloc[i]['Book-Rating']

# Showing individual user ratings

In [9]:
def user_ratings(user):
    df_userrate = pd.DataFrame.from_dict(user["Ratings"], orient='index', columns=['Rating']).sort_values(by=['Rating'], ascending=False)
    df_userrate = df_userrate.rename(index = dict(zip(df_books["ISBN"], df_books["Book-Title"])))
    print("User Name: " + user["Name"])
    print(df_userrate)

+ Alice likes classical literature and frequently rates her books

In [10]:
user_ratings(alice)

User Name: Alice
                                                    Rating
Great Expectations                                      10
The Count of Monte Cristo                               10
1984                                                    10
Brave New World                                         10
Anne of Green Gables                                     9
A Tale of Two Cities                                     9
The Great Gatsby                                         9
Frankenstein (Dover Thrift Editions)                     9
Lord of the Flies                                        8
Jane Eyre (Bantam Classics)                              8
Little Women                                             8
Alice's Adventures in Wonderland and Through th...       7
The Hobbit: or There and Back Again                      7
Robinson Crusoe                                          7
Gulliver's Travels (Penguin Classics)                    6
Adventures of Huckleberry Finn (Dover T

+ Bob likes fantasy and doesn't bother with ratings

In [11]:
user_ratings(bob)

User Name: Bob
                                                    Rating
Harry Potter and the Sorcerer's Stone (Book 1)           0
Harry Potter and the Chamber of Secrets (Book 2)         0
Harry Potter and the Prisoner of Azkaban (Book 3)        0
The Eye of the World (The Wheel of Time, Book 1)         0
The Great Hunt (The Wheel of Time, Book 2)               0
The Dragon Reborn (The Wheel of Time, Book 3)            0
The Shadow Rising (The Wheel of Time, Book 4)            0
Ender's Shadow                                           0
Dune                                                     0
House Atreides (Dune: House Trilogy, Book 1)             0
House Harkonnen (Dune: House Trilogy, Book 2)            0
House Corrino (Dune: House Trilogy, Book 3)              0
The Fellowship of the Ring (The Lord of the Rin...       0
The Two Towers (The Lord of the Rings, Part 2)           0
The Return of the King (The Lord of the Rings, ...       0


+ Charlie appears to be a middle aged housewife with a tendency towards the romance genre

In [12]:
user_ratings(charlie)

User Name: Charlie
                                                    Rating
The No. 1 Ladies' Detective Agency                      10
Where the Heart Is (Oprah's Book Club (Paperback))      10
Year of Wonders                                         10
Blindsighted                                            10
An Instance of the Fingerpost                           10
True History of the Kelly Gang                          10
Atonement : A Novel                                     10
The Lovely Bones: A Novel                               10
Grave Secrets (Temperance Brennan Novel (Hardco...      10
The Last Time They Met : A Novel                        10
Lasher                                                  10
Bel Canto: A Novel                                       9
Four To Score (A Stephanie Plum Novel)                   9
One for the Money (A Stephanie Plum Novel)               9
The Subtle Knife (His Dark Materials, Book 2)            9
Fatal Voyage                         

# Recommendation function

+ Function takes the method (defined below), user and number of recommendations as arguments

In [13]:
def recommend(method, user, rec_no):
    rating_dict = user["Ratings"]
    rec_list = method(rating_dict)
    rec_list = rec_list.drop(list(user['Ratings'].keys())).sort_values(ascending=False).head(rec_no)
    return df_books.loc[df_books["ISBN"].isin(rec_list.index)]

# Implicit correlation

In [14]:
irate_corr = pd.DataFrame.from_csv("irate_corr.csv")

In [15]:
def implicit(rating_dict):
    array_sum = irate_corr[list(rating_dict.keys())[0]].copy()
    if len(list(rating_dict.keys())) == 1:
        pass
    else:
        for i in range(1, len(list(rating_dict.keys()))):
            array_sum += irate_corr[list(rating_dict.keys())[i]]
    return array_sum

In [16]:
recommend(implicit, alice, 20)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
7,0446310786,To Kill a Mockingbird,Harper Lee,1988,Little Brown & Company
16,0440225701,The Street Lawyer,JOHN GRISHAM,1999,Dell
99,0451526341,Animal Farm,George Orwell,2004,Signet
134,0316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
174,0312278586,The Nanny Diaries: A Novel,Emma McLaughlin,2002,St. Martin's Press
323,0060914653,The Unbearable Lightness of Being,Milan Kundera,1988,Harpercollins
344,0553273604,Of Love and Shadows,Isabel Allende,1988,Bantam Books
404,0553296981,Anne Frank: The Diary of a Young Girl,ANNE FRANK,1993,Bantam
432,0345361792,A Prayer for Owen Meany,John Irving,1990,Ballantine Books
439,1400034779,The No. 1 Ladies' Detective Agency (Today Show...,Alexander McCall Smith,2003,Anchor


# Content based correlation

In [17]:
desc_coscorr_2 = pd.DataFrame.from_csv("desc_coscorr_2.csv")

In [18]:
def content(rating_dict):
    array_sum = desc_coscorr_2[list(rating_dict.keys())[0]].copy()
    if len(list(rating_dict.keys())) == 1:
        pass
    else:
        for i in range(1, len(list(rating_dict.keys()))):
            array_sum += desc_coscorr_2[list(rating_dict.keys())[i]]
    return array_sum

In [19]:
recommend(content, bob, 20)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
512,0812565959,Shadow of the Hegemon (Ender Wiggins Saga (Pap...,Orson Scott Card,2001,Tor Books
633,0345416880,Enchantment,Orson Scott Card,2000,Del Rey Books
655,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books
669,0812550706,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1994,Tor Books
673,0441294677,"God Emperor of Dune (Dune Chronicles, Book 4)",Frank Herbert,1991,ACE Charter
674,0441104029,"Children of Dune (Dune Chronicles, Book 3)",Frank Herbert,1991,Ace Books
683,0812513754,"Lord of Chaos (The Wheel of Time, Book 6)",Robert Jordan,1995,Tor Fantasy
684,0812550307,"The Fires of Heaven (The Wheel of Time, Book 5)",Robert Jordan,1994,Tor Fantasy
911,0345339681,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,Del Rey
1345,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic


# Matrix factorization

+ This is nearly the same code as before but with the following modifications:
    + Input matrix is now only a single row for the new user
    + Item matrix Q is loaded from previous results and is fixed. We only optimize for the single user
    + Code is set to run for a fixed time (set at 0.5s here) to ensure recommendations can be updated in real time

In [20]:
ratings_pivot = df_ratings.pivot(index='User-ID', columns='ISBN')["Book-Rating"]

In [21]:
def matrix(rating_dict):
    blank_matrix = pd.DataFrame(np.NaN, index=[0], columns=ratings_pivot.columns)
    class MF():

        def __init__(self, R, K, alpha, beta, runtime):
            """
            Perform matrix factorization to predict empty
            entries in a matrix.

            Arguments
            - R (ndarray)   : user-item rating matrix
            - K (int)       : number of latent dimensions
            - alpha (float) : learning rate
            - beta (float)  : regularization parameter
            """

            self.R = R
            self.num_users, self.num_items = R.shape
            self.K = K
            self.alpha = alpha
            self.beta = beta
            self.runtime = runtime

        def train(self):
            # Initialize user and item latent feature matrice
            self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
            self.Q = np.load('mf_weights.npy')

            # Initialize the biases
            self.b_u = np.zeros(self.num_users)
            self.b_i = np.zeros(self.num_items)
            self.b = np.mean(self.R[np.where(self.R != 0)])

            # Create a list of training samples
            self.samples = [
                (i, j, self.R[i, j])
                for i in range(self.num_users)
                for j in range(self.num_items)
                if self.R[i, j] > 0
            ]

            # Perform stochastic gradient descent for number of iterations
            training_process = []
            ts = time.time()
            iteration = 1
            while True:
                np.random.shuffle(self.samples)
                self.sgd()
                mse = self.mse()
                training_process.append((i, mse))

    #             print("Iteration: %d ; error = %.4f ; time taken = %.4f seconds" % (iteration, mse, time.time() - ts))
                iteration += 1

                if (time.time() - ts) > self.runtime:
    #                 print("Stopping training")
                    break
            return training_process

        def mse(self):
            """
            A function to compute the total mean square error
            """
            xs, ys = self.R.nonzero()
            predicted = self.full_matrix()
            error = 0
            for x, y in zip(xs, ys):
                error += pow(self.R[x, y] - predicted[x, y], 2)
            return np.sqrt(error)

        def sgd(self):
            """
            Perform stochastic gradient descent
            """
            for i, j, r in self.samples:
                # Computer prediction and error
                prediction = self.get_rating(i, j)
                e = (r - prediction)

                # Update biases
                self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
                self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

                # Update user and item latent feature matrices
                self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])

        def get_rating(self, i, j):
            """
            Get the predicted rating of user i and item j
            """
            prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
            return prediction

        def full_matrix(self):
            """
            Computer the full matrix using the resultant biases, P and Q
            """
            return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)
        
    for i in range(0, len(list(rating_dict.values()))):
        blank_matrix[list(rating_dict.keys())[i]].iloc[0] = list(rating_dict.values())[i]

    blank_matrix = blank_matrix.fillna(0)
    blank_matrix = blank_matrix.astype('int32')
    R = blank_matrix.values

    mf = MF(R, K=32, alpha=0.01, beta=0.001, runtime=0.5)
    mf.train()
    full_matrix = mf.full_matrix()
    return pd.DataFrame(np.transpose(mf.full_matrix()), index=ratings_pivot.columns)[0]

In [22]:
recommend(matrix, charlie, 20)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
19,042511774X,Breathing Lessons,Anne Tyler,1994,Berkley Publishing Group
312,0312282990,The Amazing Adventures of Kavalier & Clay,Michael Chabon,2001,Picador
349,0671003755,She's Come Undone (Oprah's Book Club (Paperback)),Wally Lamb,1996,Washington Square Press
443,037570504X,"Breath, Eyes, Memory",Edwidge Danticat,1998,Vintage Books USA
592,0440235464,Lost Girls,Andrew Pyper,2001,Dell Publishing Company
663,0671708635,Seven Habits Of Highly Effective People,Stephen R. Covey,1990,Free Press
766,0671695304,"FOREVER : A Novel of Good and Evil, Love and Hope",Judy Blume,1989,Pocket
792,051513287X,Face the Fire (Three Sisters Island Trilogy),Nora Roberts,2002,Jove Books
962,0425118703,Midnight,Dean R. Koontz,1995,Berkley Publishing Group
1160,0380017601,Love Story,Erich Segal,2002,HarperTorch


# Hybrid systems

## Weighted
+ Methods are given different weights
+ Here implicit correlation and content based are both given 50% after scaling

In [23]:
from sklearn import preprocessing

def weighted(rating_dict):
    arr_implicit = (implicit(rating_dict)-implicit(rating_dict).min())/(implicit(rating_dict).max()-implicit(rating_dict).min())
    arr_content = (content(rating_dict)-content(rating_dict).min())/(content(rating_dict).max()-content(rating_dict).min())
    return arr_implicit + arr_content

In [24]:
recommend(weighted, alice, 20)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
7,0446310786,To Kill a Mockingbird,Harper Lee,1988,Little Brown & Company
99,0451526341,Animal Farm,George Orwell,2004,Signet
134,0316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
174,0312278586,The Nanny Diaries: A Novel,Emma McLaughlin,2002,St. Martin's Press
318,0140620125,Wuthering Heights (Penguin Popular Classics),Emily Bronte,1994,Penguin Books Ltd
418,0446606812,Message in a Bottle,Nicholas Sparks,1999,Warner Vision
432,0345361792,A Prayer for Owen Meany,John Irving,1990,Ballantine Books
536,0618126988,The Silmarillion,J.R.R. Tolkien,2001,Houghton Mifflin
871,0380002930,Watership Down,Richard Adams,1976,Avon
893,0767907817,Bookends : A Novel,Jane Green,2003,Broadway


## Switching
+ A specific method is chosen based on set criteria
+ Here the method with highest standard deviation is used

In [25]:
from sklearn import preprocessing

def switching(rating_dict):
    arr_implicit = (implicit(rating_dict)-implicit(rating_dict).min())/(implicit(rating_dict).max()-implicit(rating_dict).min())
    arr_content = (content(rating_dict)-content(rating_dict).min())/(content(rating_dict).max()-content(rating_dict).min())
    arr_matrix = (matrix(rating_dict)-matrix(rating_dict).min())/(matrix(rating_dict).max()-matrix(rating_dict).min())
    val, idx = max((val, idx) for (idx, val) in enumerate([arr_implicit.std() , arr_content.std(), arr_matrix.std()]))
    
    return [implicit, content, matrix][idx](rating_dict)

In [26]:
recommend(switching, bob, 20)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
512,0812565959,Shadow of the Hegemon (Ender Wiggins Saga (Pap...,Orson Scott Card,2001,Tor Books
633,0345416880,Enchantment,Orson Scott Card,2000,Del Rey Books
655,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books
669,0812550706,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1994,Tor Books
673,0441294677,"God Emperor of Dune (Dune Chronicles, Book 4)",Frank Herbert,1991,ACE Charter
674,0441104029,"Children of Dune (Dune Chronicles, Book 3)",Frank Herbert,1991,Ace Books
683,0812513754,"Lord of Chaos (The Wheel of Time, Book 6)",Robert Jordan,1995,Tor Fantasy
684,0812550307,"The Fires of Heaven (The Wheel of Time, Book 5)",Robert Jordan,1994,Tor Fantasy
911,0345339681,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,Del Rey
1345,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic


## Mixed
+ Results from different methods are mixed together
+ Here the recommendations from implicit and matrix are laced in a 1-1 ratio

In [27]:
def mixed(rating_dict):
    arr_implicit = implicit(rating_dict).sort_values()
    arr_matrix = matrix(rating_dict).sort_values()
    for i in range(0, len(arr_implicit)):
        arr_implicit.iloc[i] = i
        arr_matrix.iloc[i] = i
    return pd.concat([arr_implicit, arr_matrix])

In [28]:
recommend(mixed, charlie, 20)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
19,042511774X,Breathing Lessons,Anne Tyler,1994,Berkley Publishing Group
312,0312282990,The Amazing Adventures of Kavalier & Clay,Michael Chabon,2001,Picador
349,0671003755,She's Come Undone (Oprah's Book Club (Paperback)),Wally Lamb,1996,Washington Square Press
443,037570504X,"Breath, Eyes, Memory",Edwidge Danticat,1998,Vintage Books USA
592,0440235464,Lost Girls,Andrew Pyper,2001,Dell Publishing Company
663,0671708635,Seven Habits Of Highly Effective People,Stephen R. Covey,1990,Free Press
766,0671695304,"FOREVER : A Novel of Good and Evil, Love and Hope",Judy Blume,1989,Pocket
792,051513287X,Face the Fire (Three Sisters Island Trilogy),Nora Roberts,2002,Jove Books
962,0425118703,Midnight,Dean R. Koontz,1995,Berkley Publishing Group
1160,0380017601,Love Story,Erich Segal,2002,HarperTorch


# Other hybrid methods

## Feature Combination
+ Features derived from different knowledge sources are combined together and given to a single recommendation algorithm
+ This is similar to what was done during the content-based approach (TF-IDF scores were merged with dummies for Author and Publisher before calculating cosine similarity)

## Feature Augmentation
+ One recommendation technique is used to compute a feature or set of features, which is then part of the input to the next technique

## Cascade
+ Recommenders are given strict priority, with the lower priority ones breaking ties in the scoring of the higher ones
+ The individual methods used here all produce a continuous rating, so this is not applicable

## Meta-level
+ One recommendation technique is applied and produces some sort of model, which is then the input used by the next technique