# Restaurant Recommendations

This project was built to gain experience with recommendation systems. The same approach could also be adapted to hotels, travel services, and car recommendations.

In [None]:
#Import needed packages
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import squarify
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the TripAdvisor Restaurant Recommendation dataset (from Kaggle) and preview its first rows.
# The CSV is read via a relative path, so it must sit in the notebook's working directory.
import pandas as pd
df = pd.read_csv("TripAdvisor_RestauarantRecommendation.csv")
df.head()

# Data cleaning and preprocessing

In [None]:
# The contact number, TripAdvisor URL, street address and menu link are not
# used anywhere in the analysis, so remove those columns from the dataframe.
df.drop(
    columns=['Contact Number', 'Trip_advisor Url', 'Street Address', 'Menu'],
    inplace=True,
)

In [None]:
# Rows with any missing value would break the string parsing and analysis
# steps, so discard them and renumber the surviving rows from zero.
df.dropna(axis='index', how='any', inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
# Both columns arrive as text. The numeric value is the leading
# whitespace-separated token, so take that token and convert it.
# str.split() with no separator already discards surrounding whitespace,
# so no extra strip step is needed.
df['Reviews'] = df['Reviews'].str.split().str[0].astype('float')

# The review count may carry thousands separators (','); remove them before
# converting to integers.
df['No of Reviews'] = (
    df['No of Reviews']
    .str.split()
    .str[0]
    .str.replace(',', '', regex=False)
    .astype('int')
)

In [None]:
# Getting a mean rating per restaurant name allows for comparison against all
# other restaurants in the data set.
# groupby/transform broadcasts each name's mean back onto its rows in a single
# pass and assigns a whole column at once, which avoids chained indexing
# (df[col][mask] = ...) that pandas does not guarantee to write back into df.
df['Mean Rating'] = df.groupby('Name')['Reviews'].transform('mean')

# Scaling the mean rating values linearly onto the 1-5 range, rounded to 2 decimals
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
df[['Mean Rating']] = scaler.fit_transform(df[['Mean Rating']]).round(2)

In [None]:
# Replace the dollar-sign price bands with readable categories; these labels
# are used later for EDA and in the recommendations. Any band other than the
# two listed here is labelled 'Expensive'.
price_labels = {'$': 'Cheap', '$$ - $$$': 'Moderate'}
df['Price_Range'] = [price_labels.get(band, 'Expensive') for band in df.Price_Range]

In [None]:
#We'll want to categorize restaurants by their city and state in our analysis, so let's extract that information from the Location column.
def _city_and_state(location):
    """Split a comma-separated location string into (city, state).

    The city is the text before the first comma. The state is the first three
    characters (stripped) of the second comma-separated part when there are
    exactly two parts, otherwise of the third part. A location without any
    comma yields an empty state instead of raising IndexError.
    """
    parts = location.split(',')
    city = parts[0].strip()
    if len(parts) < 2:
        return city, ''
    state_part = parts[1] if len(parts) == 2 else parts[2]
    return city, state_part[:3].strip()


# Split every location once and fan the result out into the two new columns.
_city_state_pairs = [_city_and_state(location) for location in df.Location]
df['City'] = [city for city, _ in _city_state_pairs]
df['State'] = [state for _, state in _city_state_pairs]

In [None]:
# Count the rows per extracted state to spot anomalies in the representation,
# such as states with very few entries or values that failed to parse.
df['State'].value_counts()

In [None]:
# Show the rows whose extracted State is an empty string.
df[df['State'] == ""]

In [None]:
# PA, OR and the rows with an empty State (the Canadian province) have far
# fewer restaurant entries than the other states, so remove them. Location is
# no longer needed now that City and State exist; renumber the rows afterwards.
excluded_states = ['PA', 'OR', '']
rows_to_drop = df[df['State'].isin(excluded_states)].index
df.drop(rows_to_drop, inplace=True)
df.drop(columns='Location', inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
## Cleaning up the comments data. We'll use this data to see how comments may be helpful in recommending a restaurant.
## Order matters: lower-case -> strip URLs -> strip punctuation -> drop stop words.

import string

## Lower casing the text
df["Comments"] = df["Comments"].str.lower()

## Removal of URLs
# This must run BEFORE punctuation is stripped: once ':', '/' and '.' have been
# removed, the pattern below can no longer recognise a URL.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
def remove_urls(text):
    """Return `text` with http(s):// and www. URLs removed."""
    return _URL_PATTERN.sub(r'', text)

df["Comments"] = df["Comments"].apply(remove_urls)

## Removal of punctuation
PUNCT_TO_REMOVE = string.punctuation
# Build the translation table once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """Return `text` with every character in PUNCT_TO_REMOVE deleted."""
    return text.translate(_PUNCT_TABLE)

df["Comments"] = df["Comments"].apply(remove_punctuation)

## Removal of stop words
# NOTE(review): apostrophes have already been stripped at this point, so any
# stop word spelled with one will not match -- confirm whether that matters.
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` without English stop words, re-joined with single spaces."""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["Comments"] = df["Comments"].apply(remove_stopwords)

# Exploratory data analysis

In [None]:
# Horizontal bar chart of the ten most frequent values of the Type column.
# Sorting ascending before plotting puts the largest bar at the top.
counts = df["Type"].value_counts().head(10)
p = counts.sort_values().plot(kind="barh", figsize=(8, 5), fontsize=18)
p.set_title("Types of Restaurants", fontsize=20)
p.set_xlabel("Number of Restaurants", fontsize=18)
p.set_ylabel("Restaurant Types", fontsize=18)

In [None]:
# Summarise the cleaned dataframe: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the dataframe's columns.
df.describe()

In [None]:
# Pairwise correlation of the numeric columns. The text columns are excluded
# explicitly because recent pandas versions raise on non-numeric data here
# instead of silently skipping it.
df.select_dtypes(include='number').corr()

In [None]:
#Total restaurants by state
# kind='bar' draws vertical bars: states run along the x-axis and the
# restaurant counts along the y-axis, so label the axes accordingly.
ax = df['State'].value_counts().plot(kind='bar', figsize=(10,5), color='b')
ax.set_xlabel('State')
ax.set_ylabel('No of Restaurants')
ax.set_title('No of Restaurants by State')

In [None]:
#Total restaurants by city (20 cities with the most entries)
# kind='barh' draws horizontal bars: counts run along the x-axis and the
# cities along the y-axis.
ax = df['City'].value_counts()[:20].plot(kind='barh', figsize=(10,8), color='m')
ax.set_xlabel('No of Restaurants')
ax.set_ylabel('City')
ax.set_title('No of Restaurants by City')

In [None]:
#Restaurant types
#The categories of restaurants are all lumped together in one comma-separated string.
#People may find it more helpful when searching for a restaurant to just use one restaurant type.

from collections import Counter

# Count how often each individual type label is listed, in a single pass over
# the Type column (labels are split on ',' and stripped of whitespace).
type_count = Counter(
    label.strip()
    for combined in df.Type
    for label in combined.split(',')
)

# Set of all unique type labels
types = set(type_count)

# Convert the counts into a DataFrame (one row per type), most common first
df_type = pd.DataFrame.from_dict(type_count, orient='index', columns=['No of Restaurants'])
df_type.sort_values(by=['No of Restaurants'], ascending=False, inplace = True)

# Create DataFrame with the top 20 types + one 'Other' row holding the total of all remaining types
df_type_top = df_type.iloc[:20]
other = pd.DataFrame(
    {'No of Restaurants': df_type['No of Restaurants'].iloc[20:].sum()},
    index=['Other'],
)
df_type_top = pd.concat([df_type_top, other], axis=0)
df_type_top.sort_values(by=['No of Restaurants'], ascending=False, inplace = True)
df_type_top


In [None]:
# Plot the top 20 types (plus the aggregated 'Other' bucket) on a treemap
fig, ax = plt.subplots(figsize=(18,6))
sns.set_style(style="darkgrid") # set seaborn plot style
# Draw onto the axes created above and keep `ax` bound to that Axes object
# rather than to the return value of a chained .set() call.
squarify.plot(sizes=df_type_top['No of Restaurants'], label=df_type_top.index, alpha=0.6, ax=ax)
ax.set_title('Top 20 Most Common Restaurant Types')
ax.axis('off')
plt.show()

In [None]:
#Ranking Type by Average Review Score
# Expand the comma-separated Type column so that every (restaurant, label)
# pair becomes its own row. Comparing whole labels this way keeps a label
# such as 'Bar' from also matching 'Barbecue', which a substring/regex test
# against the combined string would do.
type_reviews = df[['Type', 'Reviews']].copy()
type_reviews['_row'] = range(len(type_reviews))
type_reviews['Type'] = type_reviews['Type'].str.split(',')
type_reviews = type_reviews.explode('Type')
type_reviews['Type'] = type_reviews['Type'].str.strip()
# A restaurant that repeats a label still counts only once for that label.
type_reviews = type_reviews.drop_duplicates(subset=['_row', 'Type'])

# Mean review score per label, keeping only labels backed by at least 30 restaurants
type_stats = type_reviews.groupby('Type')['Reviews'].agg(['count', 'mean'])
typeAverageReview = type_stats.loc[type_stats['count'] >= 30, 'mean'].to_dict()


pd.Series(typeAverageReview).sort_values(ascending=False).plot(kind='barh', figsize=(10,12), bottom=[3])

7 of the top 10 best-rated restaurant types were foreign cuisines. On the other hand, only 3 of the lowest-rated types were foreign, with the others being generic types like Bar or Pub, or explicitly American ones such as Southwestern, Barbecue, or American.



In [None]:
# Distribution of review scores within each price band, drawn as one
# horizontal violin per band from most to least expensive.
price_order = ['Expensive', 'Moderate', 'Cheap']
fig, ax = plt.subplots(figsize=(8, 6))
ax = sns.violinplot(x='Reviews', y='Price_Range', data=df, order=price_order, ax=ax)
ax.set_title('Review Scores by Price Range')

The violin plot reinforces that 4.5 is the most common rating. It's the median rating for all 3 Price Ranges. It's interesting that cheap restaurants contain a slightly higher distribution of 5 ratings than the other Price Ranges. Perhaps this is because these restaurants are more accessible to a wider range of customers. 



# Restaurant recommendation system based on the content in the reviews

In [None]:
#EDA- Word Frequency Distribution


def get_top_words(column, top_nu_of_words, nu_of_word):
    """Return the most frequent n-grams found in a column of text.

    column: iterable of strings to analyse.
    top_nu_of_words: number of (term, count) pairs to return.
    nu_of_word: (min_n, max_n) tuple handed to CountVectorizer as ngram_range.

    Returns a list of (term, count) tuples ordered from most to least
    frequent. English stop words are ignored by the vectorizer.
    """
    vectorizer = CountVectorizer(ngram_range=nu_of_word, stop_words='english')
    term_matrix = vectorizer.fit_transform(column)
    # Column-wise sum: total occurrences of each term across all documents.
    totals = term_matrix.sum(axis=0)
    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_nu_of_words]


# The 15 most frequent two-word combinations (ngram range (2,2)) in the Type column
lst = get_top_words(df['Type'], 15, (2,2))

# Put the (term, count) pairs into a dataframe and plot them as horizontal bars
df_words = pd.DataFrame(lst, columns=['Word','Count'])
plt.figure(figsize=(7,6))
sns.barplot(data=df_words, x='Count', y='Word')
plt.title('Word Couple Frequency for Restaurant Types');

In [None]:
# Preview the first rows of the cleaned dataframe before building the recommender.
df.head()

# Content-based recommender system using TF-IDF Matrix (Term Frequency — Inverse Document Frequency Matrix)

The TF-IDF method is used to quantify words and compute weights for them. 
In other words, representing each word (or couples of words etc.) with a number in order to use mathematics in our recommender system. 
Put simply, the higher the TF*IDF score (weight), the rarer and more important the term, and vice versa.

In [None]:
# Index the dataframe by restaurant name and keep a positional lookup:
# `indices` maps row position (0..n-1) -> restaurant name.
df.set_index('Name', inplace=True)
indices = pd.Series(df.index)

# Creating tf-idf matrix over unigrams and bigrams of the cleaned comments.
# min_df=1 keeps every term that occurs in at least one document; an integer
# min_df below 1 is rejected by newer scikit-learn parameter validation.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['Comments'])

Here, the tfidf_matrix is the matrix containing each word and its TF-IDF score with regard to each document, or item in this case. 
Also, stop words are simply words that add no significant value to our system, like ‘an’, ‘is’, ‘the’, and hence are ignored by the system.


Cosine similarity is a metric used to determine how similar the documents are irrespective of their size. 

In [None]:
# Dot product between every pair of rows of the TF-IDF matrix. No `norm` is
# passed to TfidfVectorizer, so its default L2 row normalisation applies and
# the dot product equals the cosine similarity.
# The result has one row and one column per row of df.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

## Let's build the recommendation system

In [None]:
def recommend(name, cosine_similarities = cosine_similarities):
    """Return up to 10 restaurants whose review comments are most similar to `name`.

    Parameters
    ----------
    name : str
        Restaurant name exactly as it appears in the index of ``df``.
    cosine_similarities : array-like, optional
        Square similarity matrix aligned row-for-row with ``df``. Defaults to
        the module-level matrix that existed when this function was defined.

    Returns
    -------
    pandas.DataFrame
        'Type', 'Mean Rating' and 'Price_Range' of at most 10 restaurants,
        indexed by restaurant name and sorted by 'Mean Rating' (highest first).
        They are chosen from the 30 rows most similar to the query.

    Raises
    ------
    IndexError
        If `name` is not present in the dataset.
    """
    # `indices` has a default 0..n-1 index, so these labels are row positions.
    same_name_positions = indices[indices == name].index
    if len(same_name_positions) == 0:
        raise IndexError('restaurant %r not found in the dataset' % name)

    # Use the first row carrying this name as the query
    idx = same_name_positions[0]

    # Similarity of every row to the query. Rows carrying the query's own name
    # are removed so a restaurant is never recommended to itself.
    score_series = pd.Series(cosine_similarities[idx]).drop(index=same_name_positions)

    # Positions of the 30 most similar rows, highest similarity first
    # (stable sort so that ties resolve deterministically)
    top30_indexes = list(
        score_series.sort_values(ascending=False, kind='mergesort').index[:30]
    )

    # Take exactly the matched rows, by position, with the columns to display
    df_new = df.iloc[top30_indexes][['Type', 'Mean Rating', 'Price_Range']]

    # Keep only the most similar entry for any name that appears more than once
    df_new = df_new[~df_new.index.duplicated(keep='first')]

    # Sort by rating and keep only the top 10
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)

    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))

    return df_new

## Let's test the recommendation system

I picked The Clam Bar since I was thinking about going out for seafood when I did this...

In [None]:
# Look the restaurant up by name (df is indexed by 'Name') and show only the
# first entry of the result, to confirm it exists in the dataset.
df.loc['The Clam Bar'][:1]

In [None]:
# Testing the recommendation system: prints a header line and returns the
# dataframe of recommended restaurants, which the notebook displays.
recommend('The Clam Bar')

Sources:

1. https://medium.com/mlearning-ai/restaurant-recommendation-system-based-on-the-content-in-reviews-dfc3351004db
2. https://thecleverprogrammer.com/2022/07/26/restaurant-recommendation-system-using-python/
3. https://towardsdatascience.com/how-to-build-a-restaurant-recommendation-system-using-latent-factor-collaborative-filtering-ffe08dd57dca