In [1]:
#imports
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import re
import time
import random
import sys
pd.set_option('display.max_colwidth', None)

from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from scipy import sparse
from matplotlib import pyplot as plt
import utils as ut

This is my first attempt at building the full recommender in an easily replicated format.

In [2]:
# Video game reviews, restricted to the columns the lookup needs, plus the
# pre-built video game recommender frame.
vg_df = pd.read_csv(
    './data/video_games.csv',
    usecols=[
        'customer_id',
        'product_id',
        'product_title',
        'star_rating',
        'review_date',
    ],
)
vg_rec = pd.read_pickle('./pickles/videog_rec.pkl')

In [3]:
# Movie/DVD reviews, restricted to the columns the lookup needs, plus the
# pre-built movie recommender frame.
movie_df = pd.read_csv(
    './data/movie_dvd.csv',
    usecols=[
        'customer_id',
        'product_id',
        'product_title',
        'star_rating',
        'review_date',
    ],
)
movie_rec = pd.read_pickle('./pickles/movie_rec.pkl')

In [4]:
# Book reviews, restricted to the columns the lookup needs, plus the
# pre-built book recommender frame.
books_df = pd.read_csv(
    './data/books.csv',
    usecols=[
        'customer_id',
        'product_id',
        'product_title',
        'star_rating',
        'review_date',
    ],
)
books_rec = pd.read_pickle('./pickles/books_rec.pkl')

With all 3 dataframes and recommender dataframes loaded in, I need to write the functions that I'll use to query my recommender. I need to account for a few things:  

- which item category (video games, movies, or books) is being queried?
- if the search term is vague or ambiguous, how do I determine which item to choose for comparison?
- how do I give the option to add "not like" terms to the search bar (but only as an option, not essential)?

In [22]:
# Peek at the first rows of the video game review data; each row appears to be
# one customer's rating of one product.
vg_df.head()

Unnamed: 0,customer_id,product_id,product_title,star_rating,review_date
0,12039526,B001CXYMFS,Thrustmaster T-Flight Hotas X Flight Stick,5,2015-08-31
1,2331478,B0029CSOD2,Hidden Mysteries: Titanic Secrets of the Fateful Voyage,1,2015-08-31
2,52495923,B00GOOSV98,GelTabz Performance Thumb Grips - PlayStation 4 and PlayStation 3,3,2015-08-31
3,14533949,B00Y074JOM,Zero Suit Samus amiibo - Japan Import (Super Smash Bros Series),4,2015-08-31
4,17521011,B008XHCLFO,Protection for your 3DS XL,5,2015-08-31


In [29]:
# Number of (non-null) star ratings recorded for a single, exactly-named title.
is_titanic = vg_df['product_title'] == 'Hidden Mysteries: Titanic Secrets of the Fateful Voyage'
vg_df.loc[is_titanic, 'star_rating'].count()

101

In [38]:
#test
# Resolve an ambiguous search term to one product: among all titles containing
# the term, pick the one with the most star ratings.
search_term = 'Force Unleashed II'

# regex=False matches the term literally; na=False treats a missing title as "no match".
matched_rows = vg_df[vg_df['product_title'].str.contains(search_term, regex=False, na=False)]

# One entry per matching review row, so the same title appears many times.
title_list = list(matched_rows['product_title'])

# Count ratings once per distinct title instead of re-filtering vg_df for every
# list entry. sort=False keeps titles in first-seen order and idxmax returns the
# first maximum, so ties still go to the title that appears first in title_list.
rating_counts = matched_rows.groupby('product_title', sort=False)['star_rating'].count()

if rating_counts.empty:
    chosen = None
    print(f'No product titles contain "{search_term}"')
else:
    chosen = rating_counts.idxmax()
    print(f"Final Selection: {chosen} with {rating_counts[chosen]} ratings")

Final Selection: Star Wars: The Force Unleashed II Platinum Edition with 959 ratings


In [39]:
# title_list holds one entry per matching review row, not one per distinct
# product, so this is the number of matching reviews.
len(title_list)

978

In [40]:
# First matching title in the frame's current row order.
title_list[0]

'Star Wars: The Force Unleashed II Platinum Edition'

Okay - so when a search is ambiguous or vague, I can have my recommender return the item that contains the search term AND has the most ratings which indicates it's probably the most popular item i.e. what the searcher intended.  

It would be nice if I could return that number of ratings and the average star rating for the searched product, and maybe the recommendations too? The fastest way to do this (so I don't have to compute them for every query) would be to add them as columns in my dataframes so I can just print them when called upon.

In [50]:
# Add two per-product summary columns to every lookup frame so a query can
# report them without recomputing:
#   tot_prod_reviews - number of (non-null) star ratings for the row's product_title
#   avg_prod_stars   - mean star rating for the row's product_title
df_list = [vg_df, books_df, movie_df]

for dataframe in df_list:
    ratings_by_title = dataframe.groupby('product_title')['star_rating']
    # transform() broadcasts each group's aggregate back onto that group's rows,
    # replacing a per-row Python dict lookup with a vectorised operation.
    dataframe['tot_prod_reviews'] = ratings_by_title.transform('count')
    dataframe['avg_prod_stars'] = ratings_by_title.transform('mean')
    print(dataframe.head(3))

   customer_id  product_id  \
0     12039526  B001CXYMFS   
1      2331478  B0029CSOD2   
2     52495923  B00GOOSV98   

                                                       product_title  \
0                         Thrustmaster T-Flight Hotas X Flight Stick   
1            Hidden Mysteries: Titanic Secrets of the Fateful Voyage   
2  GelTabz Performance Thumb Grips - PlayStation 4 and PlayStation 3   

   star_rating review_date  tot_prod_reviews  avg_prod_stars  
0            5  2015-08-31               821        4.356882  
1            1  2015-08-31               101        2.831683  
2            3  2015-08-31               621        4.246377  
   customer_id  product_id                                  product_title  \
0     12076615  0385730586     Sisterhood of the Traveling Pants (Book 1)   
1     12703090  0811828964  The Bad Girl's Guide to Getting What You Want   
2     31048862  0316769487                         The Catcher in the Rye   

   star_rating review_date  t

Okay, now we've got new columns that account for number of reviews and average star rating. If I sort the dataframes by number of reviews, then I should be able to always choose the first item in my query list to make recommendations. This ought to be much faster than checking each one against the other every time.

In [54]:
# Reorder every frame held in df_list so its most-reviewed products come first.
# The sort is done in place, so the frame objects themselves are reordered.
for frame in df_list:
    frame.sort_values('tot_prod_reviews', ascending=False, inplace=True)

That should do it! I'm going to pickle these edited lookup dataframes so I can use them later

In [73]:
# NOTE(review): all three writes below are commented out, so running this cell
# saves nothing. Presumably they were disabled after the lookup pickles were
# first written; uncomment to regenerate them.
#vg_df.to_pickle('./pickles/videog_look.pkl')
#books_df.to_pickle('./pickles/books_look.pkl')
#movie_df.to_pickle('./pickles/movies_look.pkl')

In [84]:
# Interactive MVP query: ask for a category, ask for a search term, then print
# the ten entries ranked closest to the first product whose title contains it.

# Each accepted category maps to (confirmation message, lookup frame, recommender frame).
category_options = {
    'video games': ('Okay, video game recommendations!', vg_df, vg_rec),
    'movies': ('Okay, movie recommendations!', movie_df, movie_rec),
    'books': ('Okay, book recommendations!', books_df, books_rec),
}
category_prompt = 'Would you like to search for "video games", "movies", or "books"? Please enter one option: '

#select category based on user input (case and surrounding whitespace are ignored)
category = input(category_prompt).strip().lower()
while category not in category_options:
    print('Sorry, you need to enter "video games", "movies", or "books"') #error message
    time.sleep(1) #wait 1 second, then make input available for user to try again
    category = input(category_prompt).strip().lower()

# The loop above only exits on a valid key, so this lookup cannot fail.
confirmation, lookup, recommender = category_options[category]
print(confirmation)

#provide input option for search parameters
query = input('Please enter search term; the more specific your term is, the more accurate the results will be!').strip()

#list of titles that contain given keyword (one entry per matching review row)
if query:
    # regex=False: the term is matched literally, so characters such as "(" or "+"
    # in the user's input cannot break or distort the search.
    # na=False: rows with a missing title count as non-matches.
    matches = lookup[lookup['product_title'].str.contains(query, regex=False, na=False)]
    titles = list(matches['product_title'])
else:
    # An empty term would match every row; treat it as "nothing found" instead.
    titles = []

if not titles:
    print(f'Sorry, "{query}" does not appear to be in the product database')
elif titles[0] not in recommender.index:
    # The product has reviews but no row in the recommender frame.
    print(f'Sorry, there are no recommendations available for "{titles[0]}"')
else:
    # titles[0] is the most-reviewed match only if the lookup frame has already
    # been sorted by tot_prod_reviews (descending) in an earlier cell.
    chosen_title = titles[0]
    chosen_rows = lookup[lookup['product_title'] == chosen_title]
    print(f'Recommending items similar to: {chosen_title}')
    print(f"This item has {round(chosen_rows['tot_prod_reviews'].mean())} reviews\n"
          f"    and a {round(chosen_rows['avg_prod_stars'].mean(), 2)} average star rating")
    print('Here are the 10 recommended items for you based on your search parameters: ')
    print(len(titles))
    #first matching item will be itself so start at second
    print(recommender.loc[chosen_title, :].sort_values().iloc[1:11])


Would you like to search for "video games", "movies", or "books"? Please enter one option:  video games


Okay, video game recommendations!


Please enter search term; the more specific your term is, the more accurate the results will be! Witcher


Recommending items similar to: The Witcher 3: Wild Hunt
This item has 713 reviews
    and a 4.46 average star rating
Here are the 10 recommended items for you based on your search parameters: 
1997
Lords of the Fallen: Limited Edition - PlayStation 4    0.943657
Batman: Arkham Knight                                   0.950284
The Order: 1886                                         0.957811
Dark Souls II: Scholar of the First Sin                 0.958174
The Witcher 2: Assassins Of Kings Enhanced Edition      0.959459
State of Decay- Year-One Survival Edition               0.959624
Bloodborne                                              0.959825
Dragon Age Inquisition                                  0.965065
Middle Earth: Shadow of Mordor                          0.966122
Battlefield Hardline                                    0.966467
Name: The Witcher 3: Wild Hunt, dtype: Sparse[float64, 1]


There's my MVP! A strong start.