# The Thai Yelp Mythbuster and Food Recommender

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

## Data Loading and EDA - Overview Data-Set (restaurants)

In [None]:
#Loading data-set of top restaurants with the reviews as a single string(overview yelp-data set; from BeautifulSoup)
df=pd.read_csv("./yelp_sfo_thai.csv")

In [None]:
df.head(3)

In [None]:
df.isnull().sum()

In [None]:
# Analysing distribution of Thai restaurant rating in SF
rating=df["rating"]
sns.set(style="white",palette="muted",color_codes=True)
# Pass the values as x=...: newer seaborn releases treat the first positional
# argument as `data`, so a bare positional Series is no longer counted per rating.
ax=sns.countplot(x=rating)
ax.set_title("Distribution of Rating");

In [None]:
# Analysing distribution of number of reviews per Thai restaurant in SF
# (histogram + density estimate of the `review_count` column)
reviews=df["review_count"]
sns.set(style="white",palette="muted",color_codes=True)
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# are the replacements) — confirm the pinned seaborn version before upgrading.
ax=sns.distplot(reviews)
ax.set_title("Distribution of Number of Reviews");

In [None]:
# Looking at the distribution of Thai price range in SF
# Map the "$" strings to readable level labels; unmapped/missing prices become NaN.
price_map={"$":"level_1","$$":"level_2","$$$":"level_3","$$$$":"level_4"}
price=df.price.map(price_map)
sns.set(style="white",palette="muted",color_codes=True)
# Pass the values as x=... (newer seaborn treats the first positional argument as
# `data`) and fix the bar order so the levels read cheapest -> most expensive.
ax=sns.countplot(x=price, order=sorted(price.dropna().unique()))
ax.set_title("Distribution of price");

In [None]:
df=df.rename(columns={'Name':'name'})
df.dtypes

In [None]:
# Loading the reviews data set from Beautiful Soup
df2 = pd.read_csv('./yelp_sfo_thai_reviews_again.csv')

In [None]:
df2 = pd.DataFrame(data=df2, columns=['name','reviews'])

In [None]:
df2.head(3)

In [None]:
# Merging both the data sets
df=df.merge(df2,on='name',how='left')

In [None]:
# importing sentiment analysis packages from NLTK
# Vader SentimentAnalyzer was used to obtain the polarity scores for the reviews of restaurants. 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
sid = SentimentIntensityAnalyzer()

In [None]:
# Analysing 'reviews'
df['reviews']

In [None]:
# Appending sentiment scores to the restaurants
# Each restaurant's concatenated review string is scored with VADER and the four
# scores are stored as new columns on df.
score_cols = ['neg', 'pos', 'neu', 'compound']
sid = SentimentIntensityAnalyzer()

# The left merge leaves NaN in 'reviews' for restaurants without a match in the
# reviews file; VADER cannot score a float, so those rows get NaN scores instead.
no_score = dict.fromkeys(score_cols, np.nan)
sentiments = [sid.polarity_scores(text) if isinstance(text, str) else no_score
              for text in df['reviews']]

# Assign column by column from plain arrays so the scores line up with the rows
# positionally, whatever index df carries.
scores = pd.DataFrame(sentiments, columns=score_cols)
for col in score_cols:
    df[col] = scores[col].values

In [None]:
df.tail(3)

In [None]:
# Correlate only numeric/boolean columns: the frame also holds text columns
# (name, reviews, ...), which make DataFrame.corr() raise on recent pandas.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(),annot=True)

#### The rating is positively correlated with the positive sentiment and negatively correlated with the negative sentiment, so a Thai restaurant's rating reasonably represents the perception of the restaurant as expressed in its reviews. However, the correlation between review count and rating is inconclusive.

## Data Loading and EDA - master(granular) data-set (individual reviews)

In [None]:
#Loading data-set of all the restaurants with the reviews(granular yelp-data set; from scrapy)
df = pd.read_csv('./yelp/yelp_thai_clean.csv')

In [None]:
df.shape

In [None]:
df.head(3)

In [None]:
df.columns.to_series().groupby(df.dtypes).groups

In [None]:
df.dtypes

In [None]:
# Encode the price as the number of characters in the price string ("$$" -> 2).
# .str.len() is vectorised and leaves missing prices as NaN instead of raising
# TypeError the way len(NaN) does.
df['price'] = df['price'].str.len()

In [None]:
df.head(3)

In [None]:
df.isnull().sum()

In [None]:
df.groupby(['restaurant', 'address']).agg({'text': ['count'], 'rating': ['mean', 'std']}).sort_values(('text', 'count'), ascending = False).head(10)

In [None]:
#Analyzing the scatter distribution between rating v/s price
plt.scatter(df['price'], df['rating'])
plt.xlabel('price')
plt.ylabel('rating')

In [None]:
#df.loc[df.price==3,:]

In [None]:
# Histogram of ratings(frequency)
df['rating'].plot(kind='hist')
plt.xlabel('rating')

In [None]:
# Looking at the linear relation b/w rating and price - inconclusive
# x/y/data are passed by keyword: newer seaborn no longer accepts them positionally.
sns.lmplot(x="price", y="rating", data=df)

In [None]:
# sentiment analysis on reviews for EDA
# Every individual review text is scored with VADER and the four scores are
# stored as new columns on df.
score_cols = ['neg', 'pos', 'neu', 'compound']
sid = SentimentIntensityAnalyzer()

# Rows whose text is missing (not a string) get NaN scores rather than crashing
# polarity_scores.
no_score = dict.fromkeys(score_cols, np.nan)
sentiments = [sid.polarity_scores(text) if isinstance(text, str) else no_score
              for text in df['text']]

# Assign column by column from plain arrays so the scores line up with the rows
# positionally, whatever index df carries.
scores = pd.DataFrame(sentiments, columns=score_cols)
for col in score_cols:
    df[col] = scores[col].values

In [None]:
# Looking at the linear relation b/w rating and pos_sentiment - positive correlation
# x/y/data are passed by keyword: newer seaborn no longer accepts them positionally.
sns.lmplot(x="rating", y="pos", data=df)

In [None]:
# Looking at the linear relation b/w price and pos_sentiment - inconclusive
# x/y/data are passed by keyword: newer seaborn no longer accepts them positionally.
sns.lmplot(x="price", y="pos", data=df)

In [None]:
# Looking at the linear relation b/w rating and neg_sentiment - negative correlation
# x/y/data are passed by keyword: newer seaborn no longer accepts them positionally.
sns.lmplot(x="rating", y="neg", data=df)

In [None]:
# Looking at the linear relation b/w price and neg_sentiment - inconclusive
# x/y/data are passed by keyword: newer seaborn no longer accepts them positionally.
sns.lmplot(x="price", y="neg", data=df)

In [None]:
# Box-plot of the review ratings: the bulk of the ratings lies b/w 3 and 5
sns.boxplot(data=df, x='rating')

##  Classification Modelling

#### As a baseline before using sentiment, we build a classification model to check whether a good/bad rating can be predicted from the review text, since the reviews contain all of our target (food) tokens. We vectorize the reviews using the Count Vectorizer and the TF-IDF Vectorizer, and fit a Logistic Regression and a Naive Bayes model on each.

In [None]:
# Getting value counts on the ratings
df['rating'].value_counts()

In [None]:
# Creating a binary target: rating>=4 is good (1), rating<4 is bad (0).
# A vectorised threshold implements exactly the stated rule; the previous
# `x==4 or x==5` test would label a fractional rating such as 4.5 as bad.
df['binary_rating'] = (df['rating'] >= 4).astype(int)

In [None]:
# Checking for class imbalance
df['binary_rating'].value_counts()

In [None]:
# Setting our X(feature) and y(target)
X = df['text']
y = df['binary_rating']

In [None]:
# Text Preprocessing for modelling
# import string
# from nltk.corpus import stopwords

# def text_process(text):
#     '''
#     Takes in a string of text, then performs the following:
#     1. Remove all punctuation
#     2. Remove all stopwords
#     3. Return the cleaned text as a list of words
#     '''
#     nopunc = [char for char in text if char not in string.punctuation]
#     nopunc = ''.join(nopunc)
#     return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
cvec = CountVectorizer(stop_words='english').fit(X)
tfidf = TfidfVectorizer(stop_words='english').fit(X)

In [None]:
# Number of distinct tokens learned by each vectorizer. vocabulary_ (token -> column
# index) is available on every scikit-learn version, whereas get_feature_names()
# has been removed from recent releases.
print(len(cvec.vocabulary_))
print(len(tfidf.vocabulary_))

In [None]:
X1 = cvec.transform(X)
X2 = tfidf.transform(X)

In [None]:
X1.dtype

In [None]:
# Performing a train-test split
# Both feature matrices (X1 = counts, X2 = TF-IDF) are split against the same target
# with identical settings: 30% held out, shuffled, stratified on y, random_state=42.
from sklearn.model_selection import train_test_split
X1_train,X1_test,y1_train,y1_test = train_test_split(X1,y,shuffle=True,stratify=y,random_state=42,test_size=0.3)
X2_train,X2_test,y2_train,y2_test = train_test_split(X2,y,shuffle=True,stratify=y,random_state=42,test_size=0.3)

#### Resampling Train Data Set

In [None]:
# from sklearn.utils import resample
# traindata = X_train.to_frame().merge(pd.DataFrame(y_train), how = 'left', right_index = True, left_index = True)
# train_majority = traindata[traindata['binary_rating'] == 1]
# train_minority = traindata[traindata['binary_rating'] == 0]
# train_minority_upsampled = resample(train_minority, 
#                                      replace = True, 
#                                      n_samples = train_majority.shape[0],
#                                      random_state = 42)

# train_data_upsampled = pd.concat([train_majority, train_minority_upsampled])
# X_train = train_data_upsampled.drop(columns = 'binary_rating')
# y_train = train_data_upsampled['binary_rating']
# y_train.value_counts()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

### Count Vectorizer - Classification Modelling

#### Logistic Regression

In [None]:
# Logistic regression on the count-vectorized reviews; accuracy on the held-out split
lr = LogisticRegression().fit(X1_train, y1_train)
y1_pred = lr.predict(X1_test)
print('accuracy score', accuracy_score(y1_test, y1_pred))

In [None]:
# Confusion Matrix and Classification Report
print(confusion_matrix(y1_test, y1_pred))
print('\n')
print(classification_report(y1_test, y1_pred))

#### Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes on the count-vectorized reviews; accuracy on the held-out split
nb = MultinomialNB().fit(X1_train, y1_train)
y1_pred = nb.predict(X1_test)
print('accuracy score', accuracy_score(y1_test, y1_pred))

In [None]:
# Confusion Matrix and Classification Report
print(confusion_matrix(y1_test, y1_pred))
print('\n')
print(classification_report(y1_test, y1_pred))

### TFIDF Vectorizer - Classification Modelling 

#### Logistic Regression

In [None]:
# Logistic regression on the TF-IDF-vectorized reviews; accuracy on the held-out split
lr = LogisticRegression().fit(X2_train, y2_train)
y2_pred = lr.predict(X2_test)
print('accuracy score', accuracy_score(y2_test, y2_pred))

In [None]:
# Confusion Matrix and Classification Report
print(confusion_matrix(y2_test, y2_pred))
print('\n')
print(classification_report(y2_test, y2_pred))

#### Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes on the TF-IDF-vectorized reviews; accuracy on the held-out split
nb = MultinomialNB().fit(X2_train, y2_train)
y2_pred = nb.predict(X2_test)
print('accuracy score', accuracy_score(y2_test, y2_pred))

In [None]:
# Confusion Matrix and Classification Report
print(confusion_matrix(y2_test, y2_pred))
print('\n')
print(classification_report(y2_test, y2_pred))

### Unsupervised Word2Vec Modelling on tokens in the corpus - finding dishes to predict/relate

#### The goal of word vector embedding models, or word vector models for short, is to learn dense, numerical vector representations for each term in a corpus vocabulary. If the model is successful, the vectors it learns about each term should encode some information about the meaning or concept the term represents, and the relationship between it and other terms in the vocabulary. Word vector models are also fully unsupervised — they learn all of these meanings and relationships solely by analyzing the text of the corpus, without any advance knowledge provided.

In [None]:
# Importing gensim and nltk packages
import gensim
from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models import Phrases
import nltk
from nltk import word_tokenize
from nltk.util import ngrams

In [None]:
# Setting the text corpus as a list of individual reviews
reviews_list = list(df.text)

In [None]:
# tokenizing the words in the reviews - setting uni-grams
sentences = [word_tokenize(x) for x in reviews_list]

In [None]:
len(sentences), len(reviews_list)

In [None]:
# Creating a uni-gram Word2Vec model as a baseline.
# Passing the tokenized corpus to the constructor trains the model; the former bare
# `model.train` line only referenced the method without calling it, so it is dropped.
model = gensim.models.Word2Vec(sentences)

#### Evaluating Cosine-Similarity for word-vectors

In [None]:
# Testing token similarity with other words in the corpus
# (queries go through model.wv; the model-level shortcut is gone in gensim 4)
model.wv.most_similar('chicken')

In [None]:
# Testing token similarity with other words in the corpus
# (queries go through model.wv; the model-level shortcut is gone in gensim 4)
model.wv.most_similar('soup')

In [None]:
# Testing token similarity with other words in the corpus
# (queries go through model.wv; the model-level shortcut is gone in gensim 4)
model.wv.most_similar('good')

In [None]:
# Testing token similarity with other words in the corpus
# (queries go through model.wv; the model-level shortcut is gone in gensim 4)
model.wv.most_similar('bad')

In [None]:
# Individually comparing word vectors (cosine similarity of the two tokens)
# (queries go through model.wv; the model-level shortcut is gone in gensim 4)
model.wv.similarity('chicken', 'good')

In [None]:
# Individually comparing word vectors (cosine similarity of the two tokens)
# (queries go through model.wv; the model-level shortcut is gone in gensim 4)
model.wv.similarity('chicken', 'beef')

In [None]:
# Creating bi-grams from word vectors for further analysis

In [None]:
bigramer = gensim.models.Phrases(sentences)

In [None]:
model2 = Word2Vec(bigramer[sentences])

In [None]:
# Nearest neighbours of the learned bi-gram token in the bi-gram model
# (queries go through model2.wv; the model-level shortcut is gone in gensim 4)
model2.wv.most_similar('pad_thai')

In [None]:
# Creating tri-grams from bi-grams for further analysis

In [None]:
trigram = Phrases(bigramer[sentences])

In [None]:
model3 = Word2Vec(trigram[bigramer[sentences]])

In [None]:
# Nearest neighbours of the same token in the tri-gram model
# (queries go through model3.wv; the model-level shortcut is gone in gensim 4)
model3.wv.most_similar('pad_thai')

#### We can see that cosine similarity gives more accurate results when the unsupervised model incorporates trigrams

#### Performing addition of the word vectors to give logical inferences from the unsupervised model(cosine-addition)

In [None]:
def word_algebra(add=None, subtract=None, topn=1):
    """
    Combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s).

    Parameters
    ----------
    add : list of str, optional
        Tokens whose vectors contribute positively. Defaults to no tokens.
    subtract : list of str, optional
        Tokens whose vectors contribute negatively. Defaults to no tokens.
    topn : int, optional
        Number of nearest terms to print (default 1).

    Returns
    -------
    None
        The matching terms are printed, one per line.
    """
    # None defaults avoid sharing one mutable list object between calls.
    positive = list(add) if add else []
    negative = list(subtract) if subtract else []

    # Similarity queries live on the model's KeyedVectors (model3.wv); the
    # model-level most_similar shortcut is gone in gensim 4.
    answers = model3.wv.most_similar(positive=positive, negative=negative, topn=topn)

    for term, _similarity in answers:
        print(term)

In [None]:
word_algebra(add=[u'breakfast', u'lunch'])

In [None]:
word_algebra(add=[u'chicken', u'good'])

In [None]:
word_algebra(add=[u'lunch', u'night'], subtract=[u'day'])

In [None]:
word_algebra(add=[u'noodle', u'burmese'], subtract=[u'rice'])

In [None]:
word_algebra(add=[u'pad_thai', u'thai'], subtract=[u'chicken'])

In [None]:
word_algebra(add=[u'thai', u'fine_dining'])

### LDA modelling
#### In NLP applications, documents are represented as a mixture of the individual tokens (words and phrases) they contain. There are two layers in this model — documents and tokens — and the size or dimensionality of the document vectors is the number of tokens in the corpus vocabulary.
#### Using LDA, documents are represented as a mixture of a pre-defined number of topics, and the topics are represented as a mixture of the individual tokens in the vocabulary, thereby reducing the dimensionality of the model.
#### LDA is fully unsupervised. The topics are "discovered" automatically from the data by trying to maximize the likelihood of observing the documents in your corpus, given the modeling assumptions. LDA uses a simplifying assumption known as the bag-of-words model. In the bag-of-words model, a document is represented by the counts of distinct terms that occur within it. 
#### pyLDAvis takes topic models created with gensim and prepares their data for visualization.

In [None]:
# ### The scope of LDA is reduced to a single restaurant
# dr = df[df.restaurant == 'Marnee Thai']

In [None]:
# reviews_list_ = list(dr.text)
# sentences_    = [word_tokenize(x) for x in reviews_list_]

In [None]:
# import pyLDAvis.gensim
# from gensim import corpora, models
# pyLDAvis.enable_notebook() # in order for our visual to show up

In [None]:
# dictionary = corpora.Dictionary(sentences_)

# corpus = [dictionary.doc2bow(text) for text in sentences]

In [None]:
# ldamodel = models.ldamodel.LdaModel(corpus,
#                                     id2word = dictionary, # connect each word to its "spot" in the dictionary
#                                     num_topics = 50, # hyperparameter T for number of topics
#                                     passes = 5, # similar to epochs, how many times do we iterate through the data
#                                     minimum_probability = 0.01) # only including topics that meet some probability threshold

In [None]:
# pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

### TSNE - wordvector visualization with sci-kit learn

In [None]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

In [None]:
# model = Word2Vec(sentences_, workers=4, size=100, min_count=50, window=10, sample=1e-3)

In [None]:
# X = model[model.wv.vocab]

# tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
# plt.show()

# Review Modelling to find Best Thai Dishes in SF using ratings/sentiments 

In [None]:
# reading the granular data-set with all the Thai restaurants and individual reviews
df = pd.read_csv('./yelp/yelp_thai_clean.csv')

In [None]:
# creating a master menu based on knowledge of Thai cuisine and on the dishes found on most menus
sample_menu=[
'noodle soup','chicken noodle soup','beef noodle soup','noodles','pad see ew','pad kee mao','pad thai','fried rice','salad','papaya salad','papaya','chicken satay','satay','egg rolls','chicken','beef','fried chicken','roast duck','bbq pork','pork','roasted duck','panang curry','green curry','yellow curry','tom yum','tom kha','tom ka','thai iced tea','thai iced coffee',
'imperial rolls','angel wings','wings','corn cakes','mango salad','panang beef','curry','basa fillet','tofu','pumpkin curry','coconut ice cream',
'eggplant','fried banana','sticky rice','basil','pork belly','silver noodle','crab','calamari','cashew nut','fish cake','fish cakes','peanut sauce','samosa','catfish','pineapple fried rice','puff','money bag','money bags','silver noodle','pad see you','larb','quail','prawns','fried prawn','fried prawns','shrimp','seafood','salmon','ribs','chicken noodle','beef noodle','roti','pad cha','spring rolls','rolls','fried egg','imperial roll','spring roll','egg roll','tuna tower','volcanic beef','sea bass','crab cake',
'pad kee mow','massamam','lamb','drunken noodles','mango','coconut'
]

In [None]:
# just rating and review text; .copy() makes this an independent frame so the
# column assignment below does not write into a slice of the loaded data
df = df[['rating', 'text']].copy()

# switch to lower case (vectorised; missing texts stay NaN instead of raising)
df['text'] = df['text'].str.lower()

In [None]:
# Drop duplicate dishes. sorted() gives a stable order; list(set(...)) alone yields
# a different row order for the dish tables on every kernel restart.
sample_menu = sorted(set(sample_menu))

#### Looking for the dishes of the sample menu in the reviews and scoring each dish by the average rating of the reviews that mention it, together with how often it is mentioned in the corpus - Scoring 1

In [None]:
def subset_reviews(word, df):
    """Return the rows of `df` whose `text` column contains `word`.

    `word` is matched as a literal substring (not as a regular expression), and
    rows with a missing text are treated as not containing it.
    """
    return df[df.text.str.contains(word, regex=False, na=False)]


def avg_review_of_dish(item, df):
    """Return the mean of each numeric column over the rows that mention `item`.

    The result is a Series indexed by column name (e.g. 'rating' or 'polarity');
    the text column is excluded explicitly so the mean does not fail on strings.
    The values are NaN when no row mentions `item`.
    """
    return subset_reviews(item, df).mean(numeric_only=True)


def dish_count(item, df):
    """Return the number of rows of `df` whose text mentions `item`."""
    return subset_reviews(item, df).shape[0]

In [None]:
# create the dish review dataframe: one row per menu item with the mean star
# rating of the reviews mentioning it and the number of such reviews
# (the mean is selected by its 'rating' label rather than by position 0)
dish_ratings = [avg_review_of_dish(item, df)['rating'] for item in sample_menu]
dish_counts = [dish_count(item, df) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times_reviewed': dish_counts}
dish_df = pd.DataFrame(data)

In [None]:
# dish review dataframe
dish_df[dish_df.times_reviewed > 99].sort_values(by = 'rating', ascending=False)

#### Evaluating each review by scoring the sentiment of the review for the dish rating - Scoring 2

In [None]:
# create sentiment df which is text, polarity of text
from textblob import TextBlob
# .copy() gives an independent frame, so adding columns below does not trigger
# pandas' SettingWithCopy warning on a slice of df
df_sentiment = df[['text']].copy()
df_sentiment['text'] = df_sentiment['text'].str.lower()
df_sentiment['polarity'] = df_sentiment['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# create the dish review dataframe: here 'rating' holds the mean TextBlob polarity
# of the reviews mentioning the dish (selected by label rather than by position 0)
dish_ratings = [avg_review_of_dish(item, df_sentiment)['polarity'] for item in sample_menu]
dish_counts = [dish_count(item, df_sentiment) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times_reviewed': dish_counts}
dish_df_sentiment = pd.DataFrame(data)

In [None]:
dish_df_sentiment[dish_df_sentiment.times_reviewed > 99].sort_values(by = 'rating', ascending=False)

#### Granularly evaluating each review by scoring sentence sentiment as opposed to review sentiment for dishes - Scoring 3

In [None]:
# split the reviews into sentences, one review at a time: joining all reviews into
# a single string first glued the last sentence of a review onto the first sentence
# of the next whenever a review did not end with a period. Blank fragments
# (e.g. from "...") carry no text and are dropped.
reviews_sentences = [sentence
                     for review in df.text
                     for sentence in review.split('.')
                     if sentence.strip()]

# create a dataframe with these sentences
data = {'text' : reviews_sentences}
sentences_df = pd.DataFrame(data)
sentences_df['text'] = sentences_df['text'].str.lower()

# create column which is polarity of text
from textblob import TextBlob
sentences_df['polarity'] = sentences_df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# Per-dish table from sentence-level sentiment: 'rating' is the mean polarity of the
# sentences mentioning the dish (selected by label rather than by position 0).
# NOTE(review): this rebinds sentences_df from the sentence table to the dish
# summary, so the cell cannot be re-run without re-running the previous one.
dish_ratings = [avg_review_of_dish(item, sentences_df)['polarity'] for item in sample_menu]
dish_counts = [dish_count(item, sentences_df) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times_mentioned': dish_counts}
sentences_df = pd.DataFrame(data)

In [None]:
sentences_df[sentences_df.times_mentioned >=150].sort_values(by = 'rating', ascending=False)

In [None]:
# Saving best Thai dishes in SF
sentences_df.to_csv('./best_dishes.csv')

## Linear Regression Modelling to infer if beta coefficients are consistent with top dishes

In [None]:
#Creating new menu from previous data-set
new_menu = ['fish cake',
'roast duck',
'samosa',
'coconut ice cream',
'tom ka',
'egg rolls',
'ribs',
'egg roll',
'pork belly',
'fried chicken',
'panang curry',
'larb',
'salmon',
'calamari',
'angel wings',
'fried banana',
'chicken satay',
'tom kha',
'pad kee mao',
'pineapple fried rice',
'prawns',
'seafood',
'spring rolls',
'peanut sauce',
'noodle soup',
'pumpkin curry',
'spring roll',
'crab',
'satay',
'thai iced tea',
'sticky rice',
'papaya salad',
'yellow curry',
'papaya',
'eggplant',
'wings',
'tom yum',
'roti',
'mango',
'pad see ew',
'green curry',
'rolls',
'basil',
'coconut',
'pork',
'shrimp',
'tofu',
'beef',
'fried rice',
'salad',
'noodles',
'pad thai',
'chicken',
'curry']

In [None]:
def sentence_to_vector(sentence, menu):
    """Encode `sentence` as a list of 0/1 flags, one per entry of `menu`.

    Flag i is 1 when menu[i] occurs in `sentence` and 0 otherwise.
    """
    flags = []
    for dish in menu:
        flags.append(1 if dish in sentence else 0)
    return flags

In [None]:
variable_sentences = [sentence_to_vector(sentence, new_menu) for sentence in reviews_sentences]

In [None]:
# Design matrix: one row per sentence, one 0/1 column per dish in new_menu.
# A plain ndarray is used because np.matrix is discouraged by NumPy and rejected
# by recent scikit-learn estimators.
X = np.array(variable_sentences)
X.shape

In [None]:
y = [TextBlob(x).sentiment.polarity for x in reviews_sentences]
len(y)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
lr = LinearRegression()
lr.fit(X, y)

In [None]:
# Printing out beta coefficients, one per dish, largest first.
# Stored under its own name so the reviews frame `df` is not overwritten.
coef_df = pd.DataFrame({'coef' : lr.coef_,
                        'dish' : new_menu})
coef_df.sort_values('coef', ascending=False)

### The ranking given by the beta coefficients is close to, but not fully consistent with, the top Thai dishes in San Francisco found above

In [None]:
# from sklearn.linear_model import LassoCV, Lasso
# from sklearn.model_selection import cross_val_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline

In [None]:
# def cv_r2(model):
#     r2 = np.mean(cross_val_score(model, X, y,scoring="r2",cv = 5))
#     return(r2)
# def lasso_selector(a):
#     lasso_model = make_pipeline(StandardScaler(),LassoCV(max_iter=1e7, alphas = [a],cv=5)).fit(X, y)
#     lasso_r2 = cv_r2(lasso_model).mean()
#     return(lasso_r2)
# lasso_alphas = [.0001, .0003, .0005, .0007, .0009,.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 20, 30, 50, 100]
# lasso_scores = []
# for alpha in lasso_alphas:
#     score = lasso_selector(alpha)
#     lasso_scores.append(score)  

In [None]:
# # Analyzing our Alphas
# plt.plot(lasso_alphas, lasso_scores, label='Lasso')
# lasso_score_table = pd.DataFrame(lasso_scores, lasso_alphas, columns=['R2'])
# lasso_score_table

In [None]:
# reg = Lasso(alpha = 0.0001)
# reg.fit(X,y)

In [None]:
# data = {'coef' : reg.coef_,
#        'dish' : new_menu}
# las_df = pd.DataFrame(data)
# las_df.sort_values('coef', ascending=False)

## Finding Top Dishes in Individual Restaurants

In [None]:
# recreating restaurant menu for finding top dishes in each restaurant
sample_menu_rest=[
'noodle soup','chicken noodle soup','beef noodle soup','pad see ew','pad kee mao','pad thai','fried rice','salad','papaya salad','satay','egg rolls','chicken','beef','fried chicken','roast duck','bbq pork','pork','roasted duck','panang curry','green curry','yellow curry','tom yum','tom kha','tom ka','thai iced tea','thai iced coffee',
'imperial rolls','angel wings','wings','corn cakes','mango salad','panang beef','curry','basa fillet','tofu','pumpkin curry','coconut ice cream',
'fried banana','sticky rice','pork belly','silver noodle','crab','calamari','fish cake','fish cakes','peanut sauce','samosa','catfish','pineapple fried rice','money bag','money bags','silver noodle','pad see you','larb','quail','prawns','fried prawn','fried prawns','shrimp','seafood','salmon','ribs','chicken noodle','beef noodle','roti','pad cha','spring rolls','fried egg','imperial roll','spring roll','egg roll','tuna tower','volcanic beef','sea bass','crab cake',
'pad kee mow','massamam','lamb','drunken noodles'   
]

In [None]:
# creating a restaurant function which takes in name of the restaurant and returns top 10 dishes
def restaurant_food(a, menu=None, min_mentions=9, top_n=10,
                    csv_path='./yelp/yelp_thai_clean.csv'):
    """Return the best-scoring dishes of one restaurant.

    The restaurant's reviews are split into sentences, each sentence is scored
    with TextBlob polarity, and every dish is scored by the mean polarity of the
    sentences that mention it.

    Parameters
    ----------
    a : str
        Restaurant name, matched exactly against the `restaurant` column.
    menu : list of str, optional
        Dish names to look for; defaults to the notebook's `sample_menu_rest`.
    min_mentions : int, optional
        Minimum number of sentences mentioning a dish for it to be kept (default 9).
    top_n : int, optional
        Maximum number of dishes returned (default 10).
    csv_path : str, optional
        Location of the granular reviews CSV.

    Returns
    -------
    pandas.DataFrame
        Columns `dish`, `rating` (mean sentence polarity) and `times_mentioned`,
        sorted by `rating` in descending order.
    """
    from textblob import TextBlob

    if menu is None:
        menu = sample_menu_rest
    # The menu literal lists some dishes twice; keep the first occurrence of each
    # so no dish appears in the result more than once.
    unique_menu = []
    for dish in menu:
        if dish not in unique_menu:
            unique_menu.append(dish)

    # Restrict to the requested restaurant and lower-case its review texts.
    reviews = pd.read_csv(csv_path)
    texts = reviews.loc[reviews.restaurant == a, 'text'].dropna().str.lower()

    # Split review by review so the last sentence of one review is never glued onto
    # the first sentence of the next; blank fragments are skipped.
    sentences = [sentence
                 for review in texts
                 for sentence in review.split('.')
                 if sentence.strip()]
    sentences_df = pd.DataFrame({'text': pd.Series(sentences, dtype=object)})
    sentences_df['polarity'] = sentences_df['text'].apply(
        lambda x: TextBlob(x).sentiment.polarity)

    # One summary row per dish; dish names are matched as literal substrings.
    rows = []
    for dish in unique_menu:
        mentions = sentences_df['text'].str.contains(dish, regex=False)
        rows.append({'dish': dish,
                     'rating': sentences_df.loc[mentions, 'polarity'].mean(),
                     'times_mentioned': int(mentions.sum())})
    dish_df = pd.DataFrame(rows, columns=['dish', 'rating', 'times_mentioned'])

    frequent = dish_df[dish_df.times_mentioned >= min_mentions]
    return frequent.sort_values(by='rating', ascending=False).head(top_n)

In [None]:
# finding the best dishes in an individual restaurant
restaurant_food("Osha Thai")

In [None]:
# finding the best dishes in an individual restaurant
restaurant_food("Marnee Thai")