## Content Based Model

**Name**: Diane Lu

**Contact**: dianengalu@gmail.com

**Date**: 07/31/2023

### Table of Contents 

1. [Introduction](#intro)
2. [Model Dataset](#model)
3. [Baseline Content Based Filtering](#base)

### Introduction <a class="anchor" id="intro"></a>

During the Initial Modeling stage, we create the first version of the restaurant recommendation system, which will serve as our starting point for future improvements and enhancements.

#### Importing Python Libraries 

Importing necessary libraries.

In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd 

# Import data visualization libraries
import matplotlib.pyplot as plt

# Import NLP Packages
import string
import nltk

# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Import from scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# Import SVD algorithm from Surprise library
from surprise import SVD

# Import Reader and Dataset from Surprise library
from surprise.reader import Reader
from surprise import Dataset

# Import FunkSVD algorithm from Surprise library
from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD

# Import train_test_split and GridSearchCV from Surprise library
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

# Import accuracy module from Surprise library
from surprise import accuracy

# Suppress every warning for the rest of the session to keep cell output uncluttered.
# NOTE(review): this is a blanket filter with no category or module restriction, so
# deprecation notices raised by the libraries imported above are hidden as well.
import warnings
warnings.filterwarnings("ignore")

### Model Dataset <a class="anchor" id="model"></a>

**Data Dictionary:**
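* `user_id`: unique user id
* `business_id`: unique business (restaurant) id
* `rating`: star rating
* `restaurant_name`: name of the restaurant
* `categories`: list of category labels attached to the restaurant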

In [2]:
# Location of the pickled Vancouver ratings dataset.
# NOTE(review): this is an absolute path on one machine; the notebook will not run
# elsewhere until it is pointed at a local copy of the file.
VANCOUVER_DATA_PATH = '/Users/diane/Desktop/BrainStation/Brainstation_Capstone/yelp_data/vancouver_data.pkl'

# Load the pickle into a Pandas DataFrame.
# NOTE(review): unpickling executes code stored in the file — only load trusted files.
vancouver_data = pd.read_pickle(VANCOUVER_DATA_PATH)

In [3]:
# Summarize the schema of 'vancouver_data': column names, dtypes, non-null counts
# and approximate memory usage
vancouver_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64660 entries, 1101 to 5561981
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          64660 non-null  int64  
 1   business_id      64660 non-null  int64  
 2   rating           64660 non-null  float64
 3   restaurant_name  64660 non-null  object 
 4   categories       64660 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 3.0+ MB


In [4]:
# Preview the first rows of 'vancouver_data' to sanity-check the column contents
vancouver_data.head()

Unnamed: 0,user_id,business_id,rating,restaurant_name,categories
1101,70315,1407,4.0,Meat & Bread,"[Fast Food, Bakeries, Sandwiches, Salad, Soup,..."
1105,70315,1356,3.0,Edible Canada At the Market,"[Seafood, Canadian (New), American (New), Spec..."
1109,70315,7370,4.0,The Lamplighter Public House,"[Nightlife, Gastropubs, Bars, Pubs]"
1144,70315,1143,5.0,Miku,"[Japanese, Sushi Bars]"
1151,70315,13469,4.0,Lupo,[Italian]


In [5]:
# Count the missing (null) values in each column of 'vancouver_data'.
# NOTE(review): an empty 'categories' list is not null, so it would not be counted here.
vancouver_data.isnull().sum()

user_id            0
business_id        0
rating             0
restaurant_name    0
categories         0
dtype: int64

In [6]:
# Report how many rows (one per rating) the model dataset contains
n_entries = len(vancouver_data)
print(f"The size of our model dataset is {n_entries} entries.")

The size of our model dataset is 64660 entries.


In [7]:
# Sort all columns of 'vancouver_data' by 'user_id' in ascending order.
# The result is assigned to a new name, 'sorted_data'; no columns are dropped here.
sorted_data = vancouver_data.sort_values(by='user_id')

# Display the sorted data
display(sorted_data)

Unnamed: 0,user_id,business_id,rating,restaurant_name,categories
2328038,4,6537,2.0,Breakfast Table,[Breakfast & Brunch]
2328033,4,8801,5.0,Yolks,"[Coffee & Tea, Breakfast & Brunch]"
2328050,4,11888,3.0,Fable,"[Canadian (New), American (New)]"
2328052,4,10005,5.0,Minami,"[Japanese, Sushi Bars]"
2328053,4,5393,4.0,The Flying Pig - Gastown,"[Seafood, Bistros, Canadian (New), Cajun/Creole]"
...,...,...,...,...,...
1342235,81124,338,5.0,The Sandbar Seafood Restaurant,"[Seafood, Lounges, American (New), Nightlife, ..."
1342237,81124,5624,1.0,The Flying Pig - Yaletown,"[American (Traditional), American (New), Canad..."
1342238,81124,2073,2.0,Black Rice Izakaya,"[Tapas Bars, Tapas/Small Plates, Japanese, Sus..."
1999029,81139,5313,5.0,Marutama Ramen,"[Noodles, Japanese, Ramen]"


In [8]:
# Re-number users with consecutive integers (0, 1, 2, ...) in order of first appearance
original_user_ids = sorted_data['user_id'].unique()
user_id_mapping = dict(zip(original_user_ids, range(len(original_user_ids))))

# Overwrite the raw ids in the DataFrame with their compact replacements
sorted_data['user_id'] = sorted_data['user_id'].map(user_id_mapping)

# Show the re-numbered DataFrame
display(sorted_data)

Unnamed: 0,user_id,business_id,rating,restaurant_name,categories
2328038,0,6537,2.0,Breakfast Table,[Breakfast & Brunch]
2328033,0,8801,5.0,Yolks,"[Coffee & Tea, Breakfast & Brunch]"
2328050,0,11888,3.0,Fable,"[Canadian (New), American (New)]"
2328052,0,10005,5.0,Minami,"[Japanese, Sushi Bars]"
2328053,0,5393,4.0,The Flying Pig - Gastown,"[Seafood, Bistros, Canadian (New), Cajun/Creole]"
...,...,...,...,...,...
1342235,8976,338,5.0,The Sandbar Seafood Restaurant,"[Seafood, Lounges, American (New), Nightlife, ..."
1342237,8976,5624,1.0,The Flying Pig - Yaletown,"[American (Traditional), American (New), Canad..."
1342238,8976,2073,2.0,Black Rice Izakaya,"[Tapas Bars, Tapas/Small Plates, Japanese, Sus..."
1999029,8977,5313,5.0,Marutama Ramen,"[Noodles, Japanese, Ramen]"


In [9]:
# Gather the headline statistics first, then report them together.
# NOTE(review): restaurants are counted by name, so two businesses that share a
# name count once — confirm whether 'business_id' is the intended key.
n_restaurants = sorted_data['restaurant_name'].nunique()
n_reviewers = sorted_data['user_id'].nunique()
lowest_rating = sorted_data['rating'].min()
highest_rating = sorted_data['rating'].max()

print("Number of restaurants:", n_restaurants)
print("Number of unique reviewers:", n_reviewers)
print("Range of ratings:", lowest_rating, "to", highest_rating)

Number of restaurants: 766
Number of unique reviewers: 8978
Range of ratings: 1.0 to 5.0


In [10]:
# Number of non-NaN ratings submitted by each user
user_ratings_count = sorted_data['rating'].groupby(sorted_data['user_id']).count()

# The most active user is the index label holding the largest count ...
user_with_most_ratings = user_ratings_count.idxmax()

# ... and that user's count is the value stored under that label
most_ratings_count = user_ratings_count.loc[user_with_most_ratings]

# Report both
print(f"User with the most ratings: {user_with_most_ratings}")
print(f"Number of ratings for the user: {most_ratings_count}")

User with the most ratings: 4056
Number of ratings for the user: 543


### Baseline Content Based Filtering <a class="anchor" id="base"></a>

In [11]:
# Flatten each row's list of category labels into one comma-separated string so it
# can be passed to the text vectorizer.
# The isinstance guard keeps this cell idempotent: without it, re-running the cell
# would call ', '.join on the already-joined string and insert ", " between every
# single character.
sorted_data['categories'] = sorted_data['categories'].apply(
    lambda cats: cats if isinstance(cats, str) else ', '.join(cats)
)
sorted_data

Unnamed: 0,user_id,business_id,rating,restaurant_name,categories
2328038,0,6537,2.0,Breakfast Table,Breakfast & Brunch
2328033,0,8801,5.0,Yolks,"Coffee & Tea, Breakfast & Brunch"
2328050,0,11888,3.0,Fable,"Canadian (New), American (New)"
2328052,0,10005,5.0,Minami,"Japanese, Sushi Bars"
2328053,0,5393,4.0,The Flying Pig - Gastown,"Seafood, Bistros, Canadian (New), Cajun/Creole"
...,...,...,...,...,...
1342235,8976,338,5.0,The Sandbar Seafood Restaurant,"Seafood, Lounges, American (New), Nightlife, A..."
1342237,8976,5624,1.0,The Flying Pig - Yaletown,"American (Traditional), American (New), Canadi..."
1342238,8976,2073,2.0,Black Rice Izakaya,"Tapas Bars, Tapas/Small Plates, Japanese, Sush..."
1999029,8977,5313,5.0,Marutama Ramen,"Noodles, Japanese, Ramen"


In [12]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string

# NLTK's English stop-word list; the tokenizer defined below drops any token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# already — confirm the environment provides it.
ENGLISH_STOP_WORDS = stopwords.words('english')
# Single Porter stemmer instance shared by every tokenizer call
stemmer = PorterStemmer() 

def tokenizer(sentence):
    """
    Split a piece of text into lower-cased, stemmed tokens.

    Every character in ``string.punctuation`` is deleted, the text is
    lower-cased and split on whitespace, tokens found in
    ``ENGLISH_STOP_WORDS`` are dropped, and the remaining tokens are stemmed
    with the module-level ``stemmer``.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize (in this notebook, a comma-separated category string).

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if no token survives.
    """
    # Delete all punctuation in one pass, then lower-case once; the lower-casing
    # does not need to be repeated for each punctuation mark.
    # NOTE(review): punctuation is deleted, not replaced by a space, so a label such
    # as "Cajun/Creole" becomes the single token "cajuncreole" — confirm this is intended.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = sentence.translate(punctuation_table).lower()

    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings, so no separate empty-token check is needed.
    return [
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in ENGLISH_STOP_WORDS
    ]

In [13]:
# Vectorizer settings: tokens come from the custom tokenizer, a term must reach the
# min_df threshold of 30 to be kept, and the vocabulary is capped at 5000 terms
tfidf_settings = {'tokenizer': tokenizer, 'min_df': 30, 'max_features': 5000}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and vectorize the category strings in a single step
category_text = sorted_data['categories']
tfidf_matrix = tfidf_vectorizer.fit_transform(category_text)

In [14]:
# Print the dimensions of the TF-IDF matrix produced from 'sorted_data["categories"]'
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (64660, 225)


In [15]:
# Convert the TF-IDF matrix to a dense array and label each column with its term
term_names = tfidf_vectorizer.get_feature_names_out()
features = pd.DataFrame(tfidf_matrix.toarray(), columns=term_names)

# Show the resulting feature table
display(features)

Unnamed: 0,activ,afghan,african,american,arabian,arcad,art,asian,australian,bagel,...,video,vietnames,vinyl,waffl,whiskey,wine,wineri,wing,women,yogurt
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.420788,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64655,0.0,0.0,0.0,0.206260,0.0,0.0,0.380425,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64656,0.0,0.0,0.0,0.521094,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64657,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64658,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# 'features' has one row per rating (64,660 rows), so an all-pairs comparison would
# need a 64,660 x 64,660 dense float64 matrix (~33 GB) and is not feasible in memory.
# Compare restaurants instead: keep the first row seen for each 'business_id'.
# NOTE(review): assumes every row of a given business carries the same categories,
# so the dropped rows hold duplicate vectors — confirm against the source data.
# 'features' was built from 'sorted_data' in the same row order, so a positional
# boolean mask lines the two up even though their index labels differ.
first_row_per_restaurant = ~sorted_data['business_id'].duplicated().to_numpy()
restaurant_features = features.loc[first_row_per_restaurant]

# business_id of each row/column of the similarity matrix, in matrix order
similarity_business_ids = sorted_data.loc[first_row_per_restaurant, 'business_id'].to_numpy()

# Calculate the restaurant-by-restaurant cosine similarity matrix
cosine_similarity_matrix = cosine_similarity(restaurant_features)

: 

: 

In [None]:
# Print a heading followed by the similarity matrix on the next line
print("Cosine Similarity Matrix:", cosine_similarity_matrix, sep="\n")