In [1]:
#imports
import pandas as pd
import numpy as np
import re
import time
import sys
pd.set_option('display.max_colwidth', None)
import pickle
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from scipy import sparse
from matplotlib import pyplot as plt
import utils as ut

In [3]:
#load in new recommender dictionaries (not dataframes)
# Unpickle the three recommender files in a fixed order, then bind each one to its own name.
_loaded_recs = []
for _pkl_path in (
    './pickles/new_vg_rec.pkl',
    './pickles/new_movies_rec.pkl',
    './pickles/new_books_rec.pkl',
):
    with open(_pkl_path, 'rb') as _pkl_file:
        _loaded_recs.append(pickle.load(_pkl_file))

vg_rec, movies_rec, books_rec = _loaded_recs

In [4]:
#load in lookup dataframes
# One unpacking assignment: each path is read with pd.read_pickle in the order listed.
vg_look, movies_look, books_look = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        './pickles/vg_look_small.pkl',
        './pickles/movies_look_small.pkl',
        './pickles/books_look_small.pkl',
    )
)

In [80]:
#test

# make_recs_new reads the module-level names `lookup` and `recommender`,
# so both are pointed at the books data before it is called.
lookup, recommender = books_look, books_rec

def _title_stats(title):
    """Return (mean review count rounded to an int, mean star rating rounded to 2 dp)
    for the rows of the global `lookup` whose 'product_title' equals `title`."""
    rows = lookup[lookup['product_title'] == title]
    return round(rows['tot_prod_reviews'].mean()), round(rows['avg_prod_stars'].mean(), 2)


def make_recs_new(query, wout=''):  #need to set lookup and recommender global variables prior to calling
    """Recommend up to 10 items for the first product whose title contains `query`.

    Reads two module-level names that must be set before calling:
    `lookup` (DataFrame with 'product_title', 'tot_prod_reviews' and
    'avg_prod_stars' columns) and `recommender` (dict mapping a product title
    to a dict keyed by recommended titles).

    Parameters
    ----------
    query : str
        Search term; matched case-insensitively as a literal substring of the title.
    wout : str, optional
        If non-empty, recommendations whose title contains this text
        (case-insensitive) are left out.

    Returns
    -------
    (str, pandas.DataFrame)
        A markdown message describing the matched item and a table of
        recommendations indexed from 1. The table has fewer than 10 rows when
        fewer recommendations are available.
    str
        A "Sorry, ..." message when nothing matches or the lookup fails.
    """
    def not_found():
        # `query` is read at call time, so this shows the lowercased term once lowering succeeded.
        return f'Sorry, "{query}" does not appear to be in the product database'

    try:
        query = query.lower() #lowercase entry, lowercase titles (only during search, below)
        all_titles = lookup['product_title']
        # regex=False: characters such as "(" or "+" in a search are matched literally
        # instead of being parsed as a regular expression. na=False: missing titles never match.
        hits = all_titles[all_titles.str.lower().str.contains(query, regex=False, na=False)]
        if hits.empty:
            return not_found()

        q = hits.iloc[0] #this is the item to search for (first match; presumably lookup is ordered by popularity - TODO confirm)
        if q not in recommender:
            return not_found()

        q_reviews, q_stars = _title_stats(q)
        # Built line by line so the two trailing spaces (markdown line break) after the title are explicit.
        message = (
            '\n'
            f'        **Most Popular Item Containing Your Search Term(s):** {q}  \n'
            f'        There are {q_reviews} total reviews for this item and it has an average star rating of {q_stars}\n'
            '        '
        )

        # The first key is skipped exactly as before (presumably the queried item itself - TODO confirm).
        # Skipping happens BEFORE filtering, so a real recommendation is no longer lost
        # when the queried item's own title contains the avoided word.
        candidates = list(recommender[q].keys())[1:]
        if wout != '':
            avoid = wout.lower() #lowercase
            candidates = [title for title in candidates if avoid not in title.lower()]

        top10_prods = candidates[:10]
        stats = [_title_stats(title) for title in top10_prods]
        final_output_df = pd.DataFrame(data = {
            'Recommended Items': top10_prods,
            'Total Reviews for Product': [reviews for reviews, _ in stats],
            'Avg Product Star Rating(1-5)': [stars for _, stars in stats]
        }, index=range(1, len(top10_prods) + 1))  # index sized to the rows actually found, not a fixed 10
        return message, final_output_df

    except Exception:
        # Last-resort fallback that keeps the original "always return something" contract
        # (e.g. a non-string query), without also trapping KeyboardInterrupt/SystemExit.
        return not_found()

In [81]:
# Demo call; uses whatever `lookup`/`recommender` currently point at.
make_recs_new(query='batman')

('\n        **Most Popular Item Containing Your Search Term(s):** Batman: The Dark Knight Returns  \n        There are 177 total reviews for this item and it has an average star rating of 4.64\n        ',
                                   Recommended Items  \
 1                                          Watchmen   
 2                                  Batman: Year One   
 3                          Batman: The Killing Joke   
 4                             Batman: Arkham Asylum   
 5   Batman: Year One (Batman (DC Comics Hardcover))   
 6                                    V for Vendetta   
 7                        Batman: The Long Halloween   
 8      Daredevil Visionaries - Frank Miller, Vol. 1   
 9                              Batman: Dark Victory   
 10                                          Marvels   
 
     Total Reviews for Product  Avg Product Star Rating(1-5)  
 1                         203                          4.68  
 2                          39                     

In [41]:
#https://docs.python.org/3/library/bz2.html
#https://medium.com/better-programming/load-fast-load-big-with-compressed-pickles-5f311584507e#:~:text=Compressed%20Pickles%20If%20you%20have%20been%20working%20in,bytes.%20It%20cuts%20loading%20time%20to%20a%20fraction.
import bz2

In [42]:
import _pickle as cPickle

In [43]:
#save compressed pickle
# bz2.open in binary write mode yields the same BZ2File writer; the pickle is streamed straight into it.
with bz2.open('./compressed/movies_rec_c.pbz2', 'wb') as compressed_out:
    cPickle.dump(movies_rec, compressed_out)

In [44]:
#test opening compressed pickle, check it is equal to the original
# The `with` block closes the compressed file once loading is done. The previous version
# left the BZ2File open and reused one name for both the file handle and the loaded data.
with bz2.BZ2File('./compressed/movies_rec_c.pbz2', 'rb') as f:
    test_movies = cPickle.load(f)

print(test_movies == movies_rec)

True


Okay - that worked! Time to save compressed versions of the other two recommender dictionaries in the same format (for good measure).

In [45]:
# Write the remaining two recommender dictionaries as bz2-compressed pickles, one file each.
for rec_dict, out_path in (
    (vg_rec, './compressed/vg_rec_c.pbz2'),
    (books_rec, './compressed/books_rec_c.pbz2'),
):
    with bz2.BZ2File(out_path, 'w') as out_file:
        cPickle.dump(rec_dict, out_file)

In [48]:
#adding duplicate pickled lookup dataframes to the "compressed" folder
for look_df, dest_path in (
    (books_look, './compressed/books_look_c.pkl'),
    (movies_look, './compressed/movies_look_c.pkl'),
    (vg_look, './compressed/vg_look_c.pkl'),
):
    look_df.to_pickle(dest_path)

Ideally, I can write functions that read in the compressed dictionaries - that way my app can cache each returned value and bring it up faster when a user is interacting with it.

In [49]:
def b_rec():
    """Load and return the object pickled in './compressed/books_rec_c.pbz2'.

    The bz2 file is opened in a `with` block so the handle is closed as soon as
    loading finishes (it was previously left open).
    NOTE: unpickling can execute arbitrary code - only load files you created yourself.
    """
    with bz2.BZ2File('./compressed/books_rec_c.pbz2') as compressed_file:
        return cPickle.load(compressed_file)

def m_rec():
    """Load and return the object pickled in './compressed/movies_rec_c.pbz2'.

    The bz2 file is opened in a `with` block so the handle is closed as soon as
    loading finishes (it was previously left open).
    NOTE: unpickling can execute arbitrary code - only load files you created yourself.
    """
    with bz2.BZ2File('./compressed/movies_rec_c.pbz2') as compressed_file:
        return cPickle.load(compressed_file)

def v_rec():
    """Load and return the object pickled in './compressed/vg_rec_c.pbz2'.

    The bz2 file is opened in a `with` block so the handle is closed as soon as
    loading finishes (it was previously left open).
    NOTE: unpickling can execute arbitrary code - only load files you created yourself.
    """
    with bz2.BZ2File('./compressed/vg_rec_c.pbz2') as compressed_file:
        return cPickle.load(compressed_file)

As it happens, Streamlit does not accept the default protocol (5) of pickled dataframes. I need to re-save them with protocol 3 in order to use them in the application.

In [51]:
# Re-save the vg lookup frame with pickle protocol 3.
vg_look.to_pickle(path='./compressed/vg_look_p3', protocol=3)

In [52]:
# Re-save the remaining two lookup frames with pickle protocol 3.
for look_df, p3_path in (
    (movies_look, './compressed/movies_look_p3'),
    (books_look, './compressed/books_look_p3'),
):
    look_df.to_pickle(p3_path, protocol=3)

The latest iteration of the recommender function (which returns a dataframe of 10 recommendations) no longer shows the item that everything else is being compared to. I'd like to re-introduce that functionality with a separate function so that people know what they're searching for.

In [78]:
def show_query_desc(query):
    """Describe the first product in the global `lookup` whose title contains `query`.

    `lookup` must be set before calling; it is read for its 'product_title',
    'tot_prod_reviews' and 'avg_prod_stars' columns.

    Parameters
    ----------
    query : str
        Search term; matched case-insensitively as a literal substring of the title.

    Returns
    -------
    str
        A multi-line description (matched title, rounded mean review count, mean star
        rating to 2 dp), or a "... not found" message when nothing matches or the
        lookup fails.
    """
    try:
        query = query.lower()
        all_titles = lookup['product_title']
        # regex=False: characters such as "(" or "+" in a search are matched literally
        # instead of being parsed as a regular expression. na=False: missing titles never match.
        hits = all_titles[all_titles.str.lower().str.contains(query, regex=False, na=False)]
        if hits.empty:
            return f'{query} not found; please enter a valid search term'

        item = hits.iloc[0]  # first match; presumably lookup is ordered by popularity - TODO confirm
        item_rows = lookup[all_titles == item]
        n_reviews = round(item_rows['tot_prod_reviews'].mean())
        avg_stars = round(item_rows['avg_prod_stars'].mean(), 2)
        return (
            '\n'
            f'        Most Popular Item Containing Your Search Term(s): {item}\n'
            f'        There are {n_reviews} total reviews for this item and it has an average star rating of {avg_stars}\n'
            '        '
        )

    except Exception:
        # Last-resort fallback that keeps the original "always return a string" contract
        # (e.g. a non-string query), without also trapping KeyboardInterrupt/SystemExit.
        return f'{query} not found; please enter a valid search term'

In [79]:
# show_query_desc reads the module-level `lookup`; point it at the vg lookup table first.
lookup = vg_look

show_query_desc(query='harry potter')

'\n        Most Popular Item Containing Your Search Term(s): LEGO Harry Potter: Years 1-4\n        There are 1012 total reviews for this item and it has an average star rating of 4.17\n        '

Looks good!