# The Thai Yelp Mythbuster and Food Recommender - Part 3

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

## Review Modelling to find Best Thai Dishes in SF using ratings/sentiments 

In [3]:
# reading the granular data-set with all the Thai restaurants and individual reviews
df = pd.read_csv('./yelp/yelp_thai_clean.csv')

In [4]:
# creating a master menu based on knowledge of Thai cuisine and based of most menus
sample_menu=[
'noodle soup','chicken noodle soup','beef noodle soup','noodles','pad see ew','pad kee mao','pad thai','fried rice','salad','papaya salad','papaya','chicken satay','satay','egg rolls','chicken','beef','fried chicken','roast duck','bbq pork','pork','roasted duck','panang curry','green curry','yellow curry','tom yum','tom kha','tom ka','thai iced tea','thai iced coffee',
'imperial rolls','angel wings','wings','corn cakes','mango salad','panang beef','curry','basa fillet','tofu','pumpkin curry','coconut ice cream',
'eggplant','fried banana','sticky rice','basil','pork belly','silver noodle','crab','calamari','cashew nut','fish cake','fish cakes','peanut sauce','samosa','catfish','pineapple fried rice','puff','money bag','money bags','silver noodle','pad see you','larb','quail','prawns','fried prawn','fried prawns','shrimp','seafood','salmon','ribs','chicken noodle','beef noodle','roti','pad cha','spring rolls','rolls','fried egg','imperial roll','spring roll','egg roll','tuna tower','volcanic beef','sea bass','crab cake',
'pad kee mow','massamam','lamb','drunken noodles','mango','coconut'
]

In [5]:
#just rating and review text
df = df[['rating', 'text']]

#switch to lower case
df.text = df.text.apply(lambda x: x.lower())

In [6]:
sample_menu = list(set(sample_menu))

#### Looking for dishes in sample menu and scoring them based on review-rating based on the average of the frequency they occur in the corpus - Scoring 1

In [7]:
# this returns just those reviews that have the word
# in the text of the review
def subset_reviews(word, df):
    return df[df.text.str.contains(word)]

# return avg rating of revies that contain dish
def avg_review_of_dish(item, df):
    return subset_reviews(item, df).mean()

# return nuber of times dishes reviewed
def dish_count(item, df):
    return subset_reviews(item, df).shape[0]

In [8]:
# create the dish review dataframe
dish_ratings = [avg_review_of_dish(item, df)[0] for item in sample_menu]
dish_counts = [dish_count(item, df) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times_reviewed': dish_counts}
dish_df = pd.DataFrame(data)

In [9]:
# dish review dataframe
dish_df[dish_df.times_reviewed > 99].sort_values(by = 'rating', ascending=False)

Unnamed: 0,dish,rating,times_reviewed
8,coconut ice cream,4.057325,157
78,angel wings,4.025,280
37,panang curry,3.973684,266
62,samosa,3.969925,133
73,sticky rice,3.959872,623
77,roasted duck,3.947368,114
12,mango,3.944242,825
31,fried banana,3.943218,317
72,wings,3.934783,690
58,larb,3.889344,244


#### Evaluating each review by scoring sentiment on the review for dishe rating - Scoring 2

In [10]:
# create sentiment df which is text, polarity of text
from textblob import TextBlob
df_sentiment = df[['text']]
df_sentiment.text = df_sentiment.text.apply(lambda x: x.lower())
df_sentiment['polarity'] = df_sentiment.text.apply(lambda x: TextBlob(x).sentiment.polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [11]:
# create the dish review dataframe
dish_ratings = [avg_review_of_dish(item, df_sentiment)[0] for item in sample_menu]
dish_counts = [dish_count(item, df_sentiment) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times_reviewed': dish_counts}
dish_df_sentiment = pd.DataFrame(data)

In [12]:
dish_df_sentiment[dish_df_sentiment.times_reviewed > 99].sort_values(by = 'rating', ascending=False)

Unnamed: 0,dish,rating,times_reviewed
67,lamb,0.267332,155
37,panang curry,0.255705,266
12,mango,0.25241,825
8,coconut ice cream,0.248124,157
79,pumpkin curry,0.24693,612
31,fried banana,0.242966,317
77,roasted duck,0.241001,114
62,samosa,0.238047,133
78,angel wings,0.237894,280
73,sticky rice,0.237128,623


#### Granularly evaluating each review by scoring sentence sentiment as opposed to review sentiment for dishes - Scoring 3

In [13]:
# split the reviews into sentences
reviews_sentences = ''.join(list(df.text)).split('.')

# create a dataframe with these sentences
data = {'text' : reviews_sentences}
sentences_df = pd.DataFrame(data)
sentences_df.text = sentences_df.text.apply(lambda x: x.lower())

# create column which is polarity of text
from textblob import TextBlob
sentences_df['polarity'] = sentences_df.text.apply(lambda x: TextBlob(x).sentiment.polarity)

In [14]:
dish_ratings = [avg_review_of_dish(item, sentences_df)[0] for item in sample_menu]
dish_counts = [dish_count(item, sentences_df) for item in sample_menu]
data = {'dish' : sample_menu,
       'rating' : dish_ratings,
       'times_mentioned': dish_counts}
sentences_df = pd.DataFrame(data)

In [15]:
sentences_df[sentences_df.times_mentioned >=150].sort_values(by = 'rating', ascending=False)

Unnamed: 0,dish,rating,times_mentioned
8,coconut ice cream,0.284138,176
31,fried banana,0.23581,357
21,thai iced tea,0.232521,698
67,lamb,0.219962,207
79,pumpkin curry,0.20085,683
62,samosa,0.200051,158
12,mango,0.197959,1055
59,roti,0.192295,963
48,tom ka,0.182124,198
53,spring rolls,0.175485,592


In [16]:
# Saving best Thai dishes in SF
# sentences_df.to_csv('./best_dishes.csv')

## Linear Regression Modelling to infer if beta coefficients are consistent with top dishes

In [17]:
#Creating new menu from previous data-set
new_menu = ['fish cake',
'roast duck',
'samosa',
'coconut ice cream',
'tom ka',
'egg rolls',
'ribs',
'egg roll',
'pork belly',
'fried chicken',
'panang curry',
'larb',
'salmon',
'calamari',
'angel wings',
'fried banana',
'chicken satay',
'tom kha',
'pad kee mao',
'pineapple fried rice',
'prawns',
'seafood',
'spring rolls',
'peanut sauce',
'noodle soup',
'pumpkin curry',
'spring roll',
'crab',
'satay',
'thai iced tea',
'sticky rice',
'papaya salad',
'yellow curry',
'papaya',
'eggplant',
'wings',
'tom yum',
'roti',
'mango',
'pad see ew',
'green curry',
'rolls',
'basil',
'coconut',
'pork',
'shrimp',
'tofu',
'beef',
'fried rice',
'salad',
'noodles',
'pad thai',
'chicken',
'curry']

In [18]:
def sentence_to_vector(sentence, menu):
    return [1*(dish in sentence) for dish in menu]

In [19]:
variable_sentences = [sentence_to_vector(sentence, new_menu) for sentence in reviews_sentences]

In [20]:
X = np.matrix(variable_sentences)
X.shape

(145598, 54)

In [21]:
y = [TextBlob(x).sentiment.polarity for x in reviews_sentences]
len(y)

145598

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
lr = LinearRegression()
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
# Printing out beta coefficients
data = {'coef' : lr.coef_,
       'dish' : new_menu}
df = pd.DataFrame(data)
df.sort_values('coef', ascending=False)

Unnamed: 0,coef,dish
29,0.09537,thai iced tea
23,0.078316,peanut sauce
4,0.072487,tom ka
2,0.070445,samosa
37,0.068342,roti
41,0.063834,rolls
38,0.060017,mango
3,0.057275,coconut ice cream
15,0.055172,fried banana
43,0.049954,coconut


#### The beta coefficients are close to the top Thai dishes in San Francisco but inconsistent

In [25]:
# from sklearn.linear_model import LassoCV, Lasso
# from sklearn.model_selection import cross_val_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline

In [26]:
# def cv_r2(model):
#     r2 = np.mean(cross_val_score(model, X, y,scoring="r2",cv = 5))
#     return(r2)
# def lasso_selector(a):
#     lasso_model = make_pipeline(StandardScaler(),LassoCV(max_iter=1e7, alphas = [a],cv=5)).fit(X, y)
#     lasso_r2 = cv_r2(lasso_model).mean()
#     return(lasso_r2)
# lasso_alphas = [.0001, .0003, .0005, .0007, .0009,.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 20, 30, 50, 100]
# lasso_scores = []
# for alpha in lasso_alphas:
#     score = lasso_selector(alpha)
#     lasso_scores.append(score)  

In [27]:
# # Analyzing our Alphas
# plt.plot(lasso_alphas, lasso_scores, label='Lasso')
# lasso_score_table = pd.DataFrame(lasso_scores, lasso_alphas, columns=['R2'])
# lasso_score_table

In [28]:
# reg = Lasso(alpha = 0.0001)
# reg.fit(X,y)

In [29]:
# data = {'coef' : reg.coef_,
#        'dish' : new_menu}
# las_df = pd.DataFrame(data)
# las_df.sort_values('coef', ascending=False)

## Finding Top Dishes in Individual Restaurants

In [30]:
# recreating restaurant menu for finding top dishes in each restaurant
sample_menu_rest=[
'noodle soup','chicken noodle soup','beef noodle soup','pad see ew','pad kee mao','pad thai','fried rice','salad','papaya salad','satay','egg rolls','chicken','beef','fried chicken','roast duck','bbq pork','pork','roasted duck','panang curry','green curry','yellow curry','tom yum','tom kha','tom ka','thai iced tea','thai iced coffee',
'imperial rolls','angel wings','wings','corn cakes','mango salad','panang beef','curry','basa fillet','tofu','pumpkin curry','coconut ice cream',
'fried banana','sticky rice','pork belly','silver noodle','crab','calamari','fish cake','fish cakes','peanut sauce','samosa','catfish','pineapple fried rice','money bag','money bags','silver noodle','pad see you','larb','quail','prawns','fried prawn','fried prawns','shrimp','seafood','salmon','ribs','chicken noodle','beef noodle','roti','pad cha','spring rolls','fried egg','imperial roll','spring roll','egg roll','tuna tower','volcanic beef','sea bass','crab cake',
'pad kee mow','massamam','lamb','drunken noodles'   
]

In [31]:
# creating a restaurant function which takes in name of the restaurant and returns top 10 dishes
def restaurant_food(a):
    df = pd.read_csv('./yelp/yelp_thai_clean.csv')
    #restrict to a Thai restaurant
    df = df[df.restaurant == a]

    #just grab rating and review text
    df = df[['rating', 'text']]

    #switch to lower case
    df.text = df.text.apply(lambda x: x.lower())
    
    # this returns just those reviews that have the word
    # in the text of the review
    def subset_reviews(word, df):
        return df[df.text.str.contains(word)]

    # return avg rating of revies that contain dish
    def avg_review_of_dish(item, df):
        return subset_reviews(item, df).mean()

    # return nuber of times dishes reviewed
    def dish_count(item, df):
        return subset_reviews(item, df).shape[0]
    # split the reviews into sentences
    reviews_sentences = ''.join(list(df.text)).split('.')

    # create a dataframe with these sentences
    data = {'text' : reviews_sentences}
    sentences_df = pd.DataFrame(data)
    sentences_df.text = sentences_df.text.apply(lambda x: x.lower())

    # create column which is polarity of text
    from textblob import TextBlob
    sentences_df['polarity'] = sentences_df.text.apply(lambda x: TextBlob(x).sentiment.polarity)
    
    dish_ratings = [avg_review_of_dish(item, sentences_df)[0] for item in sample_menu_rest]
    dish_counts = [dish_count(item, sentences_df) for item in sample_menu_rest]
    data = {'dish' : sample_menu_rest,
           'rating' : dish_ratings,
           'times_mentioned': dish_counts}
    sentences_df = pd.DataFrame(data)
    return sentences_df[sentences_df.times_mentioned >=9].sort_values(by = 'rating', ascending=False).head(10)

In [32]:
# finding the best dishes in an individual restaurant
restaurant_food("Osha Thai")

Unnamed: 0,dish,rating,times_mentioned
72,volcanic beef,0.341867,24
71,tuna tower,0.339284,22
39,pork belly,0.297829,11
21,tom yum,0.250101,21
24,thai iced tea,0.237322,29
38,sticky rice,0.229123,15
59,seafood,0.198861,19
6,fried rice,0.190993,70
60,salmon,0.189683,21
35,pumpkin curry,0.183295,36


In [33]:
# finding the best dishes in an individual restaurant
restaurant_food("Marnee Thai")

Unnamed: 0,dish,rating,times_mentioned
17,roasted duck,0.346962,12
26,imperial rolls,0.314263,13
68,imperial roll,0.314263,13
64,roti,0.301287,24
36,coconut ice cream,0.297135,16
46,samosa,0.27799,22
69,spring roll,0.268089,9
42,calamari,0.267397,15
24,thai iced tea,0.262864,26
29,corn cakes,0.260882,17
