In [106]:
from pyspark.sql.functions import lower, regexp_replace
from pyspark.sql.functions import col
from pyspark.sql.functions import col, sum as spark_sum
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
%%pyspark
# Load every column of the restaurant review table into the Spark DataFrame `df`.
df = spark.sql("SELECT * FROM `default`.`restaurants_reviews`")

In [105]:
# Preview only five rows rather than rendering the whole table.
display(df.limit(5))

In [4]:
#checking missing values
# Build one aggregate per column: the number of rows in which that column is NULL.
null_count_exprs = []
for column_name in df.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_exprs.append(spark_sum(null_flag).alias(column_name))

missing_values_counts = df.select(null_count_exprs)
display(missing_values_counts)

In [5]:
# Convert text to lowercase
df = df.withColumn("text", lower(df["text"]))

# Remove unnecessary punctuation
# Line breaks become a space rather than nothing, otherwise the last word of one line is
# glued to the first word of the next ("great\nfood" -> "greatfood") and no longer matches
# the sentiment lexicons.
text_without_linebreaks = regexp_replace(df["text"], r'[\r\n]+', ' ')
# Spark evaluates this pattern with Java regex, where an unescaped '[' inside a character
# class opens a nested class; '[', ']', '\' and '-' are therefore escaped so that all of
# them are treated as literal characters to strip.
df = df.withColumn(
    "removed_punct_text",
    regexp_replace(text_without_linebreaks, r'[!"#$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~]', ''),
)

In [6]:
# Reading positive words from the txt files
positive_words_df = spark.read.text("abfss://files@datalakehay9qx2.dfs.core.windows.net/ML_texts/positive.txt")
# Kept as a set: the lexicon is only used for `in` membership tests, which are constant-time
# on a set but a full scan on a list (and they run once per token of every review).
positive_words = set(positive_words_df.select(col("value").alias("positive_word")).rdd.flatMap(lambda x: x).collect())

# Reading negative words from the txt files
negative_words_df = spark.read.text("abfss://files@datalakehay9qx2.dfs.core.windows.net/ML_texts/negative.txt")
negative_words = set(negative_words_df.select(col("value").alias("negative_word")).rdd.flatMap(lambda x: x).collect())

# Report the lexicon size and a short sorted sample instead of dumping every word.
print("Positive Words:", len(positive_words), sorted(positive_words)[:20])
print("Negative Words:", len(negative_words), sorted(negative_words)[:20])

In [7]:
# Preview five rows again; `df` now also carries the `removed_punct_text` column added above.
display(df.limit(5))

In [8]:
# Function getting data with a certain category and its label into a pandas dataframe.

def get_dataset(category):
    """Collect the reviews of one category into a pandas DataFrame.

    Reads the notebook-level Spark DataFrame `df`, keeps the rows whose
    `category` column equals `category`, and returns two columns:
    `text` (taken from `removed_punct_text`) and `labels`.
    """
    in_category = df.filter(col("category") == category)
    text_and_label = in_category.select(
        col("removed_punct_text").alias("text"),
        col("labels"),
    )
    return text_and_label.toPandas()

In [9]:
# Filtering words and getting only the words which are in the positive and negative words lists.

def filter_words(review):
    """Reduce a review to the tokens found in the sentiment lexicons.

    The review is split on whitespace; tokens present in the notebook-level
    `positive_words` or `negative_words` collections are kept in their original
    order and joined back together with single spaces.
    """
    kept_tokens = []
    for token in review.split():
        if token in positive_words or token in negative_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [10]:
# Cleaned text and labels of every review in the 'Korean' category, as a pandas DataFrame.
Korean_reviews = get_dataset('Korean')

In [11]:
# Getting dataset and splitting it into train and test sets. Returns train set.

def split_data(dataset, test_size, random_state=None):
    """Randomly split `dataset` and return only the training part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'text' and 'labels'; other columns are ignored.
    test_size :
        Forwarded to `train_test_split`; the returned frame is the complement
        (for example, 0.9 keeps roughly 10% of the rows).
    random_state : optional
        Forwarded to `train_test_split`. The default `None` keeps the previous
        unseeded behaviour; pass an integer to make the sample reproducible.

    Returns
    -------
    pandas.DataFrame
        The training rows with columns 'text' and 'labels'.
    """
    df_train, _ = train_test_split(
        dataset[['text', 'labels']],
        test_size=test_size,
        random_state=random_state,
    )
    # Hand back an independent copy so later column assignments on the result are not
    # writes into a slice of `dataset` (the usual source of SettingWithCopyWarning).
    return df_train.copy()

In [12]:
# Fixed seed: without it every run draws a different 50/50 split, so the word scores, the
# plot and the written conclusions below could not be reproduced.
Korean_train, Korean_test = train_test_split(Korean_reviews[['text','labels']], test_size=0.5, random_state=42)

In [13]:
# (rows, columns) of the training half.
Korean_train.shape

In [14]:
# (rows, columns) of the held-out half.
Korean_test.shape

In [15]:
# Keep only lexicon words in each training review (the test half is left unfiltered).
Korean_train['text'] = Korean_train['text'].apply(filter_words)

In [16]:
# Inspect the training frame after filtering; `text` now holds only lexicon words.
Korean_train

In [17]:
## Assigning text and labels columns to different variables.
# Plain Python lists of the review texts and their labels, one pair per split.
traintexts, trainlabels = list(Korean_train['text']), list(Korean_train['labels'])

testtexts, testlabels = list(Korean_test['text']), list(Korean_test['labels'])

In [18]:
# Shows the complete list of filtered training texts (one string per review).
traintexts

In [19]:
#Applied 'bag of words': the frequencies of various words appeared in each review as features using CountVectorizer.

# Learn the vocabulary from the filtered training texts and count each word per review.
vectorizer = CountVectorizer()
count_trainfeatures=vectorizer.fit_transform(traintexts)
# (number of training reviews, vocabulary size)
count_trainfeatures.shape

In [29]:
# Text dump of the count matrix produced by the vectorizer.
print(count_trainfeatures)

In [30]:
## Implementing SVM model to get relatively positive and negative words and get score of each word.
# Linear SVM trained on the word counts; its per-word coefficients are used as word scores below.
# max_iter is set explicitly to 10000 iterations.
svm = LinearSVC(max_iter=10000)
svm.fit(count_trainfeatures, trainlabels)

In [31]:
## Creating dataframe for score of each word in a review calculated by svm model
# First row of the coefficient matrix: one weight per vocabulary word.
# NOTE(review): this presumes a two-class problem ('negative'/'positive') - confirm the label values.
coeff = svm.coef_[0]
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version has it and fall back to the old name otherwise.
_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
Korean_words_score = pd.DataFrame({'score': coeff, 'word': _feature_names()})

In [32]:
# One row per vocabulary word with its SVM coefficient.
Korean_words_score

In [37]:
## Getting frequency of each word in all reviews in specific category
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version has it and fall back to the old name otherwise.
_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
# NOTE: from here on `Korean_reviews` is the dense word-count matrix of the training reviews,
# no longer the raw review frame loaded earlier.
Korean_reviews = pd.DataFrame(count_trainfeatures.toarray(), columns=_feature_names())
Korean_reviews['labels'] = trainlabels
# Sum the word-count columns of the positive reviews only. The label column is removed by name
# before summing, so the label strings are not summed and the result stays numeric.
Korean_frequency = Korean_reviews.loc[Korean_reviews['labels'] == 'positive'].drop(columns='labels').sum()

In [39]:
# Make the word itself the row label so later assignments align by word.
# In-place: once 'word' has become the index, running this cell again is expected to fail
# because the column no longer exists.
Korean_words_score.set_index('word', inplace=True)

In [40]:
# Plain assignment: both names refer to the same DataFrame (no copy is made), so the column
# added below is visible through Korean_words_score as well.
Korean_polarity_score = Korean_words_score
# Assigning a Series aligns on the index, i.e. by word.
Korean_polarity_score['frequency'] = Korean_frequency
Korean_polarity_score

In [41]:
# Calculating the polarity score.

# polarity = SVM score * frequency / number of rows in Korean_reviews
_n_review_rows = Korean_reviews.shape[0]
Korean_polarity_score['polarity'] = (
    Korean_polarity_score['score'] * Korean_polarity_score['frequency'] / _n_review_rows
)
Korean_polarity_score

In [42]:
## Dropping unnecessary words.
# Generic sentiment words say nothing specific about a cuisine, so they are removed.
# Rows are selected with index.isin() instead of .loc[[...]]: .loc with a list raises KeyError
# as soon as one listed word is missing from the vocabulary, which also happened whenever this
# cell was run a second time (the words were already dropped).
unuseful_positive_words = Korean_polarity_score[Korean_polarity_score.index.isin(
    ['great','amazing','love','best','awesome','excellent','good',
     'favorite','loved','perfect','gem','perfectly','wonderful',
     'happy','enjoyed','nice','well','super','like','better','decent','fine',
     'pretty','enough','excited','impressed','ready','fantastic','glad','right',
     'fabulous'])]
unuseful_negative_words = Korean_polarity_score[Korean_polarity_score.index.isin(
    ['bad','disappointed','unfortunately','disappointing','horrible',
     'lacking','terrible','sorry', 'disappoint'])]

Korean_polarity_score.drop(unuseful_positive_words.index, axis=0, inplace=True)
Korean_polarity_score.drop(unuseful_negative_words.index, axis=0, inplace=True)

In [43]:
# Cast both derived columns to float so they sort and compare numerically.
for _numeric_column in ('polarity', 'frequency'):
    Korean_polarity_score[_numeric_column] = Korean_polarity_score[_numeric_column].astype(float)

In [44]:
# Displaying top 20 words with highest polarity score.
_has_positive_polarity = Korean_polarity_score['polarity'] > 0
Top_polarity_scores = Korean_polarity_score.loc[_has_positive_polarity].sort_values(by='polarity', ascending=False)
Top_polarity_scores.head(20)

In [45]:
# Remaining vocabulary ordered from the highest to the lowest polarity.
Korean_polarity_score_sorted=Korean_polarity_score.sort_values("polarity",ascending=False)
Korean_polarity_score_sorted

In [46]:
# Getting top 10 positive and negative words.
# Split the polarity-ordered vocabulary by lexicon membership: words found in the positive
# lexicon go to pos_words, every other word goes to neg_words. Both keep the sorted order.
_ranked_words = list(Korean_polarity_score_sorted.index)
pos_words = [ranked_word for ranked_word in _ranked_words if ranked_word in positive_words]
neg_words = [ranked_word for ranked_word in _ranked_words if ranked_word not in positive_words]

In [47]:
# pos_words and neg_words are ordered from highest to lowest polarity, so the first ten
# positive-lexicon words are the highest-scoring ones and the last ten of neg_words are the
# lowest-scoring ones.
Korean_top_positive_words=pos_words[:10]
Korean_top_negative_words=neg_words[-10:]
# Polarity values of those words, positive group first, for the bar chart.
Korean_top_words = Korean_polarity_score.loc[Korean_top_positive_words+Korean_top_negative_words,'polarity']

In [48]:
import matplotlib.pyplot as plt
import seaborn as sns

In [49]:
# Plotting top ten negative and positive words.
plt.figure(figsize=(11, 6))
# One bar colour per word: red for negative polarity, blue otherwise.
colors = []
for polarity_value in Korean_top_words.values:
    colors.append('red' if polarity_value < 0 else 'blue')
sns.barplot(x=Korean_top_words.values, y=Korean_top_words.index, palette=colors)
plt.xlabel('Polarity Score', fontsize=14, labelpad=10)
plt.ylabel('Words', fontsize=14)
plt.title('TOP 10 Positive and Negative Words in Korean Restaurants', fontsize=15)
plt.tick_params(labelsize=14)
plt.xticks(rotation=15)

In [50]:
# Function to calculate polarity score of any dataset.

def get_polarity_score(dataset):
    """Score every lexicon word found in `dataset` by how strongly it marks positive reviews.

    Steps: filter each review down to lexicon words with `filter_words`, build a
    bag-of-words matrix, fit a `LinearSVC` on the labels, and combine each word's
    coefficient with how often the word occurs in reviews labelled 'positive'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs a 'text' column (review strings) and a 'labels' column. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'word', with columns:
        'score'     - first row of the SVM coefficient matrix (float),
        'frequency' - occurrences of the word in reviews labelled 'positive' (int),
        'polarity'  - score * frequency / number of reviews (float).
        Generic sentiment words ('great', 'bad', ...) are removed.

    Notes
    -----
    NOTE(review): using `coef_[0]` presumes a two-class problem
    ('negative'/'positive'); confirm that 'labels' holds no other value.
    """
    # Filter into a local list instead of assigning back to dataset.text, so the caller's
    # frame is left untouched (and no SettingWithCopyWarning is raised on a slice).
    terms_train = dataset['text'].apply(filter_words).tolist()
    class_train = list(dataset['labels'])

    ## get bag of words
    vectorizer = CountVectorizer()
    feature_train_counts = vectorizer.fit_transform(terms_train)

    ## run model
    svm = LinearSVC(max_iter=10000)
    svm.fit(feature_train_counts, class_train)

    ## create dataframe for score of each word in a review calculated by svm model
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
    # installed version has it and fall back to the old name otherwise.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    words = list(feature_names())
    cuisine_polarity_score = pd.DataFrame(
        {'score': np.asarray(svm.coef_[0], dtype=float)},
        index=pd.Index(words, name='word'),
    )

    ## get frequency of each word in all reviews in specific category
    # Sum the positive rows directly on the count matrix: no dense copy of the whole matrix,
    # no summing of label strings, and no dependence on the position of a 'labels' column.
    positive_rows = np.flatnonzero((pd.Series(class_train) == 'positive').to_numpy())
    positive_counts = feature_train_counts[positive_rows].sum(axis=0)
    cuisine_polarity_score['frequency'] = np.asarray(positive_counts).ravel().astype(int)

    ## calculate polarity score
    n_reviews = feature_train_counts.shape[0]
    cuisine_polarity_score['polarity'] = (
        cuisine_polarity_score['score'] * cuisine_polarity_score['frequency'] / n_reviews
    ).astype(float)

    ## drop unnecessary words
    unuseful_positive_words = ['great','amazing','love','best','awesome','excellent','good',
                               'favorite','loved','perfect','gem','perfectly','wonderful',
                               'happy','enjoyed','nice','well','super','like','better','decent','fine',
                               'pretty','enough','excited','impressed','ready','fantastic','glad','right',
                               'fabulous']
    # NOTE(review): the Korean cell earlier in this notebook also drops 'disappoint'; confirm
    # which of the two lists is intended.
    unuseful_negative_words = ['bad','disappointed','unfortunately','disappointing','horrible',
                               'lacking','terrible','sorry']
    unuseful_words = unuseful_positive_words + unuseful_negative_words
    # errors='ignore': a small sample often lacks some of these words, and selecting them with
    # .loc[list] raises KeyError in that case.
    return cuisine_polarity_score.drop(index=unuseful_words, errors='ignore')

In [51]:
# Plotting top 10 negative and positive words according to their polarity scores.

def plot_top_words(top_words, category):
    """Draw a horizontal bar chart of word polarity for one cuisine.

    `top_words` is a Series of polarity values indexed by word; bars for negative
    values are red, all others blue. `category` is inserted into the chart title.
    Nothing is returned; the chart is drawn on a new 11x6 matplotlib figure.
    """
    def _bar_colour(value):
        # Red marks negative polarity; everything else is blue.
        if value < 0:
            return 'red'
        return 'blue'

    plt.figure(figsize=(11, 6))
    colors = list(map(_bar_colour, top_words.values))
    sns.barplot(x=top_words.values, y=top_words.index, palette=colors)
    plt.xlabel('Polarity Score', fontsize=14, labelpad=10)
    plt.ylabel('Words', fontsize=14)
    chart_title = 'TOP 10 Positive and Negative Words in %s Restaurants ' % category
    plt.title(chart_title, fontsize=15)
    plt.tick_params(labelsize=14)
    plt.xticks(rotation=15)

In [53]:
# Getting the strongest positive or negative words according to their polarity scores.

def get_top_words(dataset, label, number=20):
    """Return the `number` most extreme rows of `dataset` on one side of zero.

    With label == 'positive' the rows with polarity > 0 are returned, highest
    first. Any other label returns the rows with polarity < 0, lowest first.
    At most `number` rows (default 20) are returned.
    """
    polarity = dataset['polarity']
    if label == 'positive':
        ranked = dataset.loc[polarity > 0].sort_values(by='polarity', ascending=False)
    else:
        ranked = dataset.loc[polarity < 0].sort_values(by='polarity')
    return ranked.iloc[:number]

## Japanese

In [54]:
# Getting dataset, calculating polarity scores, getting and plotting top 10 negative and positive words for Japanese restaurants.

# Cleaned text and labels of all 'Japanese' reviews.
Japanese_reviews = get_dataset('Japanese')
# split_data returns only the training part; with test_size=0.9 that is roughly 10% of the reviews.
Japanese_train = split_data(Japanese_reviews, 0.9)
# The printed count is the size of that sample, not of the whole category.
print('Total %d number of reviews' % Japanese_train.shape[0])
# Per-word score, positive-review frequency and polarity computed on the sample.
Japanese_polarity_score = get_polarity_score(Japanese_train)

In [55]:
# get_top_words returns up to 20 rows by default; the slice keeps the 10 highest-polarity words.
Japanese_top_positive_words=get_top_words(Japanese_polarity_score, 'positive')[:10]
Japanese_top_positive_words

In [56]:
# Up to 10 words with the lowest (most negative) polarity, most negative first.
Japanese_top_negative_words=get_top_words(Japanese_polarity_score,'negative')[:10]
Japanese_top_negative_words

In [57]:
# Polarity of the selected positive words followed by the selected negative words, then the bar chart.
Japanese_top_words = Japanese_polarity_score.loc[Japanese_top_positive_words.index.tolist()+Japanese_top_negative_words.index.tolist(),'polarity']
plot_top_words(Japanese_top_words,'Japanese')

## Thai

In [58]:
# Getting dataset, calculating polarity scores, getting and plotting top 10 negative and positive words for Thai restaurants.

# Cleaned text and labels of all 'Thai' reviews.
Thai_reviews = get_dataset('Thai')
# split_data returns only the training part; with test_size=0.9 that is roughly 10% of the reviews.
Thai_train = split_data(Thai_reviews, 0.9)
# The printed count is the size of that sample, not of the whole category.
print('Total %d number of reviews' % Thai_train.shape[0])
# Per-word score, positive-review frequency and polarity computed on the sample.
Thai_polarity_score = get_polarity_score(Thai_train)

In [59]:
# get_top_words returns up to 20 rows by default; the slice keeps the 10 highest-polarity words.
Thai_top_positive_words=get_top_words(Thai_polarity_score, 'positive')[:10]
Thai_top_positive_words

In [60]:
# Up to 10 words with the lowest (most negative) polarity, most negative first.
Thai_top_negative_words=get_top_words(Thai_polarity_score,'negative')[:10]
Thai_top_negative_words

In [61]:
# Polarity of the selected positive words followed by the selected negative words, then the bar chart.
Thai_top_words = Thai_polarity_score.loc[Thai_top_positive_words.index.tolist()+Thai_top_negative_words.index.tolist(),'polarity']
plot_top_words(Thai_top_words,'Thai')

## Chinese

In [62]:
# Getting dataset, calculating polarity scores, getting and plotting top 10 negative and positive words for Chinese restaurants.

# Cleaned text and labels of all 'Chinese' reviews.
Chinese_reviews = get_dataset('Chinese')
# split_data returns only the training part; with test_size=0.9 that is roughly 10% of the reviews.
Chinese_train = split_data(Chinese_reviews, 0.9)
# The printed count is the size of that sample, not of the whole category.
print('Total %d number of reviews' % Chinese_train.shape[0])
# Per-word score, positive-review frequency and polarity computed on the sample.
Chinese_polarity_score = get_polarity_score(Chinese_train)

In [63]:
# get_top_words returns up to 20 rows by default; the slice keeps the 10 highest-polarity words.
Chinese_top_positive_words=get_top_words(Chinese_polarity_score, 'positive')[:10]
Chinese_top_positive_words

In [64]:
# Up to 10 words with the lowest (most negative) polarity, most negative first.
Chinese_top_negative_words=get_top_words(Chinese_polarity_score,'negative')[:10]
Chinese_top_negative_words

In [65]:
# Polarity of the selected positive words followed by the selected negative words, then the bar chart.
Chinese_top_words = Chinese_polarity_score.loc[Chinese_top_positive_words.index.tolist()+Chinese_top_negative_words.index.tolist(),'polarity']
plot_top_words(Chinese_top_words,'Chinese')

## Vietnamese

In [66]:
# Getting dataset, calculating polarity scores, getting and plotting top 10 negative and positive words for Vietnamese restaurants.

# Cleaned text and labels of all 'Vietnamese' reviews.
Vietnamese_reviews = get_dataset('Vietnamese')
# split_data returns only the training part; with test_size=0.9 that is roughly 10% of the reviews.
Vietnamese_train = split_data(Vietnamese_reviews, 0.9)
# The printed count is the size of that sample, not of the whole category.
print('Total %d number of reviews' % Vietnamese_train.shape[0])
# Per-word score, positive-review frequency and polarity computed on the sample.
Vietnamese_polarity_score = get_polarity_score(Vietnamese_train)

In [67]:
# get_top_words returns up to 20 rows by default; the slice keeps the 10 highest-polarity words.
Vietnamese_top_positive_words=get_top_words(Vietnamese_polarity_score, 'positive')[:10]
Vietnamese_top_positive_words

In [68]:
# Up to 10 words with the lowest (most negative) polarity, most negative first.
Vietnamese_top_negative_words=get_top_words(Vietnamese_polarity_score,'negative')[:10]
Vietnamese_top_negative_words

In [69]:
# Polarity of the selected positive words followed by the selected negative words, then the bar chart.
Vietnamese_top_words = Vietnamese_polarity_score.loc[Vietnamese_top_positive_words.index.tolist()+Vietnamese_top_negative_words.index.tolist(),'polarity']
plot_top_words(Vietnamese_top_words,'Vietnamese')

## French

In [70]:
# Getting dataset, calculating polarity scores, getting and plotting top 10 negative and positive words for French restaurants.

# Cleaned text and labels of all 'French' reviews.
French_reviews = get_dataset('French')
# split_data returns only the training part; with test_size=0.9 that is roughly 10% of the reviews.
French_train = split_data(French_reviews, 0.9)
# The printed count is the size of that sample, not of the whole category.
print('Total %d number of reviews' % French_train.shape[0])
# Per-word score, positive-review frequency and polarity computed on the sample.
French_polarity_score = get_polarity_score(French_train)

In [71]:
# get_top_words returns up to 20 rows by default; the slice keeps the 10 highest-polarity words.
French_top_positive_words=get_top_words(French_polarity_score, 'positive')[:10]
French_top_positive_words

In [72]:
# Up to 10 words with the lowest (most negative) polarity, most negative first.
French_top_negative_words=get_top_words(French_polarity_score,'negative')[:10]
French_top_negative_words

In [73]:
# Polarity of the selected positive words followed by the selected negative words, then the bar chart.
French_top_words = French_polarity_score.loc[French_top_positive_words.index.tolist()+French_top_negative_words.index.tolist(),'polarity']
plot_top_words(French_top_words,'French')

## Italian

In [74]:
# Getting dataset, calculating polarity scores, getting and plotting top 10 negative and positive words for Italian restaurants.

# Cleaned text and labels of all 'Italian' reviews.
Italian_reviews = get_dataset('Italian')
# split_data returns only the training part; with test_size=0.9 that is roughly 10% of the reviews.
Italian_train = split_data(Italian_reviews, 0.9)
# The printed count is the size of that sample, not of the whole category.
print('Total %d number of reviews' % Italian_train.shape[0])
# Per-word score, positive-review frequency and polarity computed on the sample.
Italian_polarity_score = get_polarity_score(Italian_train)

In [75]:
# get_top_words returns up to 20 rows by default; the slice keeps the 10 highest-polarity words.
Italian_top_positive_words=get_top_words(Italian_polarity_score, 'positive')[:10]
Italian_top_positive_words

In [76]:
# Up to 10 words with the lowest (most negative) polarity, most negative first.
Italian_top_negative_words=get_top_words(Italian_polarity_score,'negative')[:10]
Italian_top_negative_words

In [77]:
# Polarity of the selected positive words followed by the selected negative words, then the bar chart.
Italian_top_words = Italian_polarity_score.loc[Italian_top_positive_words.index.tolist()+Italian_top_negative_words.index.tolist(),'polarity']
plot_top_words(Italian_top_words,'Italian')

In [99]:
# Combining all top negative and positive words to compare among different cuisine types.

# Start from a one-column frame listing the seven cuisines (row labels 0-6).
all_category = {'cuisine':['Korean','Japanese','Chinese','Thai','Vietnamese','French','Italian']}
cuisine_positive_words = pd.DataFrame(all_category)
# Assigning through .loc with integer column labels that do not exist yet adds one column per
# word position; only row 0 (Korean) receives values here.
for i,word in enumerate(Korean_top_positive_words):
    cuisine_positive_words.loc[0,i] = word

In [100]:
# Build the comparison table in one step: one row per cuisine, one column per rank ('0'..'9').
# The table is created from scratch, so this cell does not depend on a pre-built scaffold frame
# and gives the same result when it is run again.
_top_positive_by_cuisine = {
    'Korean': list(Korean_top_positive_words),
    'Japanese': Japanese_top_positive_words.index.tolist(),
    'Chinese': Chinese_top_positive_words.index.tolist(),
    'Thai': Thai_top_positive_words.index.tolist(),
    'Vietnamese': Vietnamese_top_positive_words.index.tolist(),
    'French': French_top_positive_words.index.tolist(),
    'Italian': Italian_top_positive_words.index.tolist(),
}
cuisine_positive_words = (
    pd.DataFrame.from_dict(_top_positive_by_cuisine, orient='index')
    # Always expose ten rank columns, even if a cuisine produced fewer than ten words.
    .reindex(columns=range(10))
)
cuisine_positive_words.columns = ['0','1','2','3','4','5','6','7','8','9']
cuisine_positive_words.index.name = 'cuisine'

In [102]:
# Start from a one-column frame listing the seven cuisines (row labels 0-6).
all_category = {'cuisine':['Korean','Japanese','Chinese','Thai','Vietnamese','French','Italian']}
cuisine_negative_words = pd.DataFrame(all_category)
# Assigning through .loc with integer column labels that do not exist yet adds one column per
# word position; only row 0 (Korean) receives values here.
for i,word in enumerate(Korean_top_negative_words):
    cuisine_negative_words.loc[0,i] = word

In [103]:
# Build the comparison table in one step: one row per cuisine, one column per rank ('0'..'9').
# The table is created from scratch, so this cell does not depend on a pre-built scaffold frame
# and gives the same result when it is run again.
_top_negative_by_cuisine = {
    'Korean': list(Korean_top_negative_words),
    'Japanese': Japanese_top_negative_words.index.tolist(),
    'Chinese': Chinese_top_negative_words.index.tolist(),
    'Thai': Thai_top_negative_words.index.tolist(),
    'Vietnamese': Vietnamese_top_negative_words.index.tolist(),
    'French': French_top_negative_words.index.tolist(),
    'Italian': Italian_top_negative_words.index.tolist(),
}
cuisine_negative_words = (
    pd.DataFrame.from_dict(_top_negative_by_cuisine, orient='index')
    # Always expose ten rank columns, even if a cuisine produced fewer than ten words.
    .reindex(columns=range(10))
)
cuisine_negative_words.columns = ['0','1','2','3','4','5','6','7','8','9']
cuisine_negative_words.index.name = 'cuisine'

In [101]:
# Top positive words per cuisine, one row per cuisine.
cuisine_positive_words

Across all cuisines, "delicious" is the most common top positive word, which suggests that taste is the primary concern in the restaurant business. In Japanese, Chinese and Vietnamese cuisines the second most important factor is the freshness of the food, as expected; in Korean and Thai cuisines, however, friendliness ranks ahead of freshness, which was unexpected for Far East cuisines. In French cuisine, exaggerated words appear to be used more often than in other cuisines.
The high scores in the French cuisine category are related to a romantic and beautiful appearance or environment. While French restaurants receive positive reviews for their sweet food, sweet food is a reason for negative reviews of Korean restaurants.

In [104]:
# Top negative words per cuisine, one row per cuisine.
cuisine_negative_words

In Korean and Japanese restaurants, customers' first concern is pricing. In most cuisines, cold food is not well received; sour and sweet foods are not well received either. Slow service is another important concern for all cuisines. For Italian and French cuisines, cold and warm food are the most important concerns in negative reviews, which is not expected in general restaurant reviews. In Vietnamese cuisine, "hard" is the main concern, which can be interpreted from different perspectives, for example hard to find, hard to order or hard to communicate.

# Recommendation
- For Korean and Japanese restaurants, pricing should be reconsidered.
- For Chinese restaurants, the sourness of the food should be adjusted and the coldness of the food should be balanced.
- For Thai restaurants, the sweetness of the food should be adjusted, and care should be taken not to serve wrong orders.
- For Vietnamese restaurants, owners should find out what customers find hard in their restaurant and try to fix it.
- French and Italian restaurants should be more careful about serving cold and warm food.