In [7]:
#imports
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import re
import time
import random
import sys
pd.set_option('display.max_colwidth', None)

from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from scipy import sparse
from matplotlib import pyplot as plt
import utils as ut

The first thing to do on this page is to resave my lookup dataframes using the function I perfected at the end of rec3 - this should significantly improve the speed of my recommender function when returning results. I'll use the same variable for the read-in dataframes (lookup) in order to overwrite it and save memory.

Movies:

In [6]:
lookup= pd.read_pickle('./pickles/movies_look.pkl')

In [7]:
movies_small = ut.make_smaller_lookup(lookup)

(4405432, 7)
72385


In [8]:
movies_small #preview

Unnamed: 0,product_title,tot_prod_reviews,avg_prod_stars
0,Firefly: The Complete Series,4959,4.859649
1,Jillian Michaels - 30 Day Shred,4958,4.547802
2,Frozen,4569,4.688116
3,Frozen [Blu-ray],4492,4.701915
4,Mean Girls (Full Screen Edition),4409,4.993876
...,...,...,...
72380,Repeat Performance,11,3.363636
72381,Repentance,11,4.636364
72382,Replicant [Blu-ray],11,4.363636
72383,Glenn Gould: Hereafter,11,4.272727


In [10]:
#movies_small.to_pickle('./pickles/movies_look_small.pkl')

Books:

In [11]:
lookup= pd.read_pickle('./pickles/books_look.pkl')

In [12]:
books_small = ut.make_smaller_lookup(lookup)

(1489354, 7)
46575


In [13]:
books_small #preview

Unnamed: 0,product_title,tot_prod_reviews,avg_prod_stars
0,Harry Potter and the Order of the Phoenix (Book 5),4857,4.401277
1,Harry Potter And The Goblet Of Fire (Book 4),3739,4.775876
2,The Da Vinci Code,2684,3.482861
3,Lies and the Lying Liars Who Tell Them,2665,3.849531
4,Unfit For Command: Swift Boat Veterans Speak Out Against John Kerry,2614,3.447590
...,...,...,...
46570,"The Girls' Book of Success: Winning Wisdom, Stars' Secrets, Tales of Triumph, and More",10,5.000000
46571,"All I Did Was Ask: Conversations with Writers, Actors Musicians, and Artists",10,4.400000
46572,Wobblies!: A Graphic History of the Industrial Workers of the World,10,4.600000
46573,One of Ours (Vintage Classics),10,3.500000


In [14]:
#books_small.to_pickle('./pickles/books_look_small.pkl')

Video Games:

In [15]:
lookup= pd.read_pickle('./pickles/videog_look.pkl')

In [16]:
vg_small = ut.make_smaller_lookup(lookup)

(1648136, 7)
15938


In [17]:
vg_small #preview

Unnamed: 0,product_title,tot_prod_reviews,avg_prod_stars
0,PlayStation 4 500GB Console [Old Model],10317,4.162256
1,Grand Theft Auto V,8656,4.545055
2,Call of Duty: Ghosts,7762,3.787426
3,Battlefield 4,4795,3.666945
4,Assassin's Creed 4,4702,4.564866
...,...,...,...
15933,Ultimate Civil War Battles: Robert E. Lee vs. Ulysses S. Grant - PC,11,1.181818
15934,Duke Nukem: Critical Mass,11,3.454545
15935,Duke Nukem,11,4.090909
15936,DualPenSports - Nintendo 3DS,11,4.000000


In [19]:
#vg_small.to_pickle('./pickles/vg_look_small.pkl')

Okay, now that those are all done, I want to edit my lookup function to accept these new dataframes. I'm also going to incorporate a few other ideas:  
- searching the query item as a lowercase over all titles (also in lowercase) rather than using title-case
- using .lower() on the "wout" item and corresponding search to capture all variations of that input
- returning a dataframe of results ONLY (not showing the item that was searched); this may be controversial but I'm not sure showing people the precise item their search was matched with is going to be helpful. Either the results are what they expect or they're not. I think the results are more consistently correct than the particular item chosen, and I don't want the printout listing the chosen item to negatively tint the otherwise impressive results

In [6]:
vg_df, vg_rec = pd.read_pickle('./pickles/vg_look_small.pkl'), pd.read_pickle('./pickles/videog_rec.pkl')
movies_df, movies_rec = pd.read_pickle('./pickles/movies_look_small.pkl'), pd.read_pickle('./pickles/movie_rec.pkl')
books_df, books_rec = pd.read_pickle('./pickles/books_look_small.pkl'), pd.read_pickle('./pickles/books_rec.pkl')

In [7]:
def rec_search(category, query, wout='no'):
    

    if category.lower() == 'video games':
        lookup, recommender = vg_df, vg_rec #pd.read_pickle('./pickles/videog_look.pkl'),pd.read_pickle('./pickles/videog_rec.pkl')
    elif category.lower() == 'movies':
        lookup, recommender = movies_df, movies_rec #pd.read_pickle('./pickles/movies_look.pkl'), pd.read_pickle('./pickles/movie_rec.pkl') 
    elif category.lower() == 'books':
        lookup, recommender = books_df, books_rec #pd.read_pickle('./pickles/books_look.pkl'), pd.read_pickle('./pickles/books_rec.pkl')
    else:
        return "Sorry, that wasn't one of the available categories"

    try:
        query=query.lower() #lowercase entry, lowercase titles (only during search, below)
        titles = list(lookup[lookup['product_title'].map(lambda x: x.lower()).str.contains(query)]['product_title'])
        q = titles[0] #this is the item to search for

        if wout.lower() == 'no':
            query_dict = dict(recommender.loc[q].sort_values())
            
            top10_prods = []
            num_prod_revs = []
            avg_prod_stars = []
            for key in list(query_dict.keys())[1:11]:
                top10_prods.append(key)
                num_prod_revs.append(round(lookup[lookup['product_title']==key]['tot_prod_reviews'].mean()))
                avg_prod_stars.append(round(lookup[lookup['product_title']==key]['avg_prod_stars'].mean(), 2))
            #print(top10_prods, num_prod_revs, avg_prod_stars)
            final_output_df = pd.DataFrame(data = {
                'Recommended Items':top10_prods,
                'Total Reviews for Product':num_prod_revs,
                'Avg Product Star Rating(1-5)':avg_prod_stars
            }, index=range(1,11))
            
            is_df = True #returning a dataframe now
            
            return final_output_df            
            
            
            
        else:
            
            wout = wout.lower() #capitlize first letters
            query_dict = dict(recommender.loc[q].sort_values())
            filtered_query = [] #make empty list
            for key, value in query_dict.items(): #index into dictionary of results
                if wout not in key.lower(): #check if avoided keyword is in results
                    filtered_query.append((key, value)) #make list of results that DON'T include "wout" keyword
            
            top10_prods = []
            num_prod_revs = []
            avg_prod_stars = []
            for item in filtered_query[1:11]:
                top10_prods.append(item[0])
                num_prod_revs.append(round(lookup[lookup['product_title']==item[0]]['tot_prod_reviews'].mean()))
                avg_prod_stars.append(round(lookup[lookup['product_title']==item[0]]['avg_prod_stars'].mean(), 2))
            
                
            final_output_df = pd.DataFrame(data = {
                'Recommended Items':top10_prods,
                'Total Reviews for Product':num_prod_revs,
                'Avg Product Star Rating(1-5)':avg_prod_stars
                }, index=range(1,11))
                
                
            is_df = True #returning a dataframe now
                
            return final_output_df
        
    except:
        return f'Sorry, "{query}" does not appear to be in the product database'

In [9]:
rec_search('video games', 'harry potter', 'lego')

Unnamed: 0,Recommended Items,Total Reviews for Product,Avg Product Star Rating(1-5)
1,Harry Potter and the Order of the Phoenix,223,3.6
2,Alien 3 - Nintendo Super NES,11,4.36
3,HORI 3DS Protector and Pouch Set (Super Mario 3D Land version),14,3.79
4,Duck Tales,24,4.62
5,Marvel Super Hero Squad: The Infinity Gauntlet,159,4.16
6,Harry Potter and the Deathly Hallows Part 1,169,3.28
7,Disney's A Christmas Carol - Nintendo DS,18,4.44
8,Cake Mania 3 NDS,29,3.93
9,Pac Pix - Nintendo DS,16,4.19
10,Romancing SaGa - PlayStation 2,18,4.06


The reduction of lookup dataframe size did speed up my web app, but unfortunately it's still pretty slow. In order to improve its performance, I'm going to try and cache some of the results of my function. In order to do THAT, I'm going to break my recommender function down into smaller components so I can store the anticipated entries more easily.

In [15]:
#enter category, returns dataframes for lookup and recommender
def choose_look_and_rec(category):
    if category.lower() == 'video games':
        lookup, recommender = pd.read_pickle('./pickles/vg_look_small.pkl'), pd.read_pickle('./pickles/videog_rec.pkl') #vg_df, vg_rec #pd.read_pickle('./pickles/videog_look.pkl'),pd.read_pickle('./pickles/videog_rec.pkl')
    elif category.lower() == 'movies':
        lookup, recommender = pd.read_pickle('./pickles/movies_look_small.pkl'), pd.read_pickle('./pickles/movie_rec.pkl') #movies_df, movies_rec #pd.read_pickle('./pickles/movies_look.pkl'), pd.read_pickle('./pickles/movie_rec.pkl') 
    elif category.lower() == 'books':
        lookup, recommender = pd.read_pickle('./pickles/books_look_small.pkl'), pd.read_pickle('./pickles/books_rec.pkl') #books_df, books_rec #pd.read_pickle('./pickles/books_look.pkl'), pd.read_pickle('./pickles/books_rec.pkl')
    else:
        return "Sorry, that wasn't one of the available categories"
    return lookup, recommender

In [16]:
def load_vg():
    return pd.read_pickle('./pickles/vg_look_small.pkl'), pd.read_pickle('./pickles/videog_rec.pkl')
def load_movies():
    return pd.read_pickle('./pickles/movies_look_small.pkl'), pd.read_pickle('./pickles/movie_rec.pkl')
def load_books():
    return pd.read_pickle('./pickles/books_look_small.pkl'), pd.read_pickle('./pickles/books_rec.pkl')
    

In [2]:
#enter query term, returns item from available product list
def query_to_item(query, lookup):
    try:
        query=query.lower() #lowercase entry, lowercase titles (only during search, below)
        titles = list(lookup[lookup['product_title'].map(lambda x: x.lower()).str.contains(query)]['product_title'])
        return titles[0] #this is the item to search for
    except:
        return f'Sorry, "{query}" does not appear to be in the product database'

In [10]:
#returns dictionary of ALL recommended items once product has been selected
def give_recs(product):
    #if wout == 'no':
    return dict(recommender.loc[product].sort_values()) #sort distances smallest to largest    

In [12]:
def filter_recs(prod_dictionary, wout=''):
    if wout == '':
        top10_prods = []
        num_prod_revs = []
        avg_prod_stars = []
        for key in list(prod_dictionary.keys())[1:11]:
            top10_prods.append(key)
            num_prod_revs.append(round(lookup[lookup['product_title']==key]['tot_prod_reviews'].mean()))
            avg_prod_stars.append(round(lookup[lookup['product_title']==key]['avg_prod_stars'].mean(), 2))
        #print(top10_prods, num_prod_revs, avg_prod_stars)
        final_output_df = pd.DataFrame(data = {
            'Recommended Items':top10_prods,
            'Total Reviews for Product':num_prod_revs,
            'Avg Product Star Rating(1-5)':avg_prod_stars
        }, index=range(1,11))

        return final_output_df            
    else:
        wout = wout.lower() #capitlize first letters
        filtered_query = [] #make empty list
        for key, value in prod_dictionary.items(): #index into dictionary of results
            if wout not in key.lower(): #check if avoided keyword is in results
                filtered_query.append((key, value)) #make list of results that DON'T include "wout" keyword

        top10_prods = []
        num_prod_revs = []
        avg_prod_stars = []
        for item in filtered_query[1:11]:
            top10_prods.append(item[0])
            num_prod_revs.append(round(lookup[lookup['product_title']==item[0]]['tot_prod_reviews'].mean()))
            avg_prod_stars.append(round(lookup[lookup['product_title']==item[0]]['avg_prod_stars'].mean(), 2))


        final_output_df = pd.DataFrame(data = {
            'Recommended Items':top10_prods,
            'Total Reviews for Product':num_prod_revs,
            'Avg Product Star Rating(1-5)':avg_prod_stars
            }, index=range(1,11))

        return final_output_df

In [17]:
#test - as if on video game lookup page
lookup, recommender = load_vg() #would be cached
search_prod = query_to_item('harry potter', lookup) #needs to search live
prod_dict = give_recs(search_prod) #would have all products cached
filter_recs(prod_dict, 'lego')

Unnamed: 0,Recommended Items,Total Reviews for Product,Avg Product Star Rating(1-5)
1,Harry Potter and the Order of the Phoenix,223,3.6
2,Alien 3 - Nintendo Super NES,11,4.36
3,HORI 3DS Protector and Pouch Set (Super Mario 3D Land version),14,3.79
4,Duck Tales,24,4.62
5,Marvel Super Hero Squad: The Infinity Gauntlet,159,4.16
6,Harry Potter and the Deathly Hallows Part 1,169,3.28
7,Disney's A Christmas Carol - Nintendo DS,18,4.44
8,Cake Mania 3 NDS,29,3.93
9,Pac Pix - Nintendo DS,16,4.19
10,Romancing SaGa - PlayStation 2,18,4.06


I want to see if I can use the give_recs() function to store the return of each product in relation to the others. If I can, I could save that and use it instead of actually loading in the full recommender (1.8 GB) to my web app.

I'll need to do this for each category (books, movies, video games)

In [20]:
ut.size_in_gb(vg_rec) #original

'0.077166065 GB'

In [21]:
ut.size_in_gb(movies_rec) #original

'1.843095412 GB'

In [22]:
ut.size_in_gb(books_rec) #original

'0.422205132 GB'

In [23]:
vg_df.head()

Unnamed: 0,product_title,tot_prod_reviews,avg_prod_stars
0,PlayStation 4 500GB Console [Old Model],10317,4.162256
1,Grand Theft Auto V,8656,4.545055
2,Call of Duty: Ghosts,7762,3.787426
3,Battlefield 4,4795,3.666945
4,Assassin's Creed 4,4702,4.564866


In [39]:
#reminder of what give_recs looks like

def give_recs(product):
    return dict(recommender.loc[product].sort_values()[:100]) #sort distances smallest to largest, max 100 returns 

In [40]:
lookup = vg_df
recommender = vg_rec
len(give_recs('Call of Duty: Ghosts'))

100

In [60]:
def make_rec_df(look, rec):
    #recommender = rec #necessary in give_recs function
    prod_list = list(look['product_title'])
    #return {item : give_recs(item) for item in prod_list}
    return pd.DataFrame(data={
        'item':[item for item in prod_list],
        'similar':[give_recs(item) for item in prod_list]
    })

In [50]:
new_vg_rec = make_rec_list(vg_df, vg_rec)

print(len(new_vg_rec))
print(ut.size_in_gb(new_vg_rec))
print(type(new_vg_rec))

15938
0.00058992 GB
<class 'dict'>


In [55]:
#compared to rec df size...
ut.size_in_gb(vg_rec)

'0.077166065 GB'

In [57]:
list(new_vg_rec['Call of Duty: Ghosts'].keys())[1:11]

['Battlefield 4',
 "Assassin's Creed 4",
 'Killzone: Shadow Fall (PlayStation 4)',
 'Need for Speed Rivals',
 'Grand Theft Auto V',
 'Call of Duty: Black Ops II',
 'DualShock 4 Wireless Controller for PlayStation 4 - Jet Black [Old Model]',
 'FIFA 14',
 'Call of Duty: Advanced Warfare',
 'Watch Dogs']

That looks good - I can still call in the closest items and it is a fraction of the size of the recommender df.  


Time to apply that function to my other (larger) dataframes and see how they turn out.

In [62]:
#reset global variable
recommender = books_rec

new_books_rec = make_rec_list(books_df, books_rec)

print(len(new_books_rec))
print(f'New size: {ut.size_in_gb(new_books_rec)}')
print(f'Original size: {ut.size_in_gb(new_books_rec)}')
print(type(new_books_rec))

46575
New size: 0.002621536 GB
Original size: 0.002621536 GB
<class 'dict'>


In [63]:
import pickle

In [66]:
#save dictionaries

In [65]:
with open('./pickles/new_vg_rec.pkl', 'wb') as f:
    pickle.dump(new_vg_rec, f)
    
with open ('./pickles/new_movies_rec.pkl', 'wb') as f:
    pickle.dump(new_movies_rec, f)
    
with open('./pickles/new_books_rec.pkl', 'wb') as f:
    pickle.dump(new_books_rec, f)

In [67]:
#test saves successful - should print 3x "True"
with open('./pickles/new_vg_rec.pkl', 'rb') as f:
    testp = pickle.load(f)
print(new_vg_rec == testp) #assert the pickled dict is the same as the current one

with open('./pickles/new_movies_rec.pkl', 'rb') as f:
    testp = pickle.load(f)
print(new_movies_rec == testp) #assert the pickled dict is the same as the current one

with open('./pickles/new_books_rec.pkl', 'rb') as f:
    testp = pickle.load(f)
print(new_books_rec == testp) #assert the pickled dict is the same as the current one

True
True
True


In [70]:
#check sizes compared to former sizes
print(f'Original Book Recommender size: {ut.size_in_gb(books_rec)}')
print(f'New Book Recommender size: {ut.size_in_gb(new_books_rec)}')
print()
print(f'Original Movie Recommender size: {ut.size_in_gb(movies_rec)}')
print(f'New Movie Recommender size: {ut.size_in_gb(new_movies_rec)}')
print()
print(f'Original Video Game Recommender size: {ut.size_in_gb(vg_rec)}')
print(f'New Video Game Recommender size: {ut.size_in_gb(new_vg_rec)}')

Original Book Recommender size: 0.42352907 GB
New Book Recommender size: 0.002621536 GB

Original Movie Recommender size: 1.845735916 GB
New Movie Recommender size: 0.002621536 GB

Original Video Game Recommender size: 0.077186335 GB
New Video Game Recommender size: 0.00058992 GB


Woof! That took a long time to run (and I messed up comparing the sizes (above) so it's not quite as pretty an output as I had hoped, but there's no doubt I'm saving a substantial amount of memory with this process. On to build a new recommender that can handle these dictionaries!