## 0_Imports & Configs

In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import clone

## 1_Reading & Cleaning Data

In [3]:
# Load the Yelp reviews CSV and discard the two columns this analysis never reads.
df_0 = (
    pd.read_csv('../yelp_dataset/yelp_reviews_PA.csv')
    .drop(columns=['state', 'is_open'])
)

In [4]:
# Keep only rows whose 'categories' string mentions 'Restaurants';
# na=False treats rows with a missing category as non-matching.
df_1 = df_0.loc[df_0['categories'].str.contains('Restaurants', na=False)]

## 2_Exploratory Analysis

In [5]:
# How many of the most-reviewed restaurants to keep in the ranking computed below.
N_restaurants = 12

In [6]:
# Rank restaurants by their number of review rows and keep the top N_restaurants.
# size() counts rows per business, whereas count() on an arbitrary column
# silently skips rows where that column happens to be null.
most_reviewd_restaurants = (
    df_1.groupby(by='business_id')
    .size()
    .sort_values(ascending=False)
    .iloc[:N_restaurants]
)
most_reviewd_IDs = most_reviewd_restaurants.index

# Tally good (> 3 stars) and bad (< 3 stars) reviews once for the selected
# restaurants instead of re-filtering the whole frame twice per ID.
# 3-star reviews are counted in neither group.
top_reviews = df_1[df_1['business_id'].isin(most_reviewd_IDs)]
good_counts = top_reviews[top_reviews['review_stars'] > 3].groupby('business_id').size()
bad_counts = top_reviews[top_reviews['review_stars'] < 3].groupby('business_id').size()

print('                          G    B')
for _id in most_reviewd_IDs:
    # .get(..., 0) covers a restaurant with no review in that group
    good_reviews = good_counts.get(_id, 0)
    bad_reviews = bad_counts.get(_id, 0)
    print(f'{_id}: {good_reviews}; {bad_reviews}')

                          G    B
JLbgvGM4FXh9zNP4O5ZWjQ: 1641; 212
u4sTiCzVeIHZY8OlaL346Q: 1800; 98
woXlprCuowrLJswWere3TQ: 1223; 122
lKom12WnYEjH5FFemK3M1Q: 794; 323
dLc1d1zwd1Teu2QED5TmlA: 1167; 80
ejaUQ1hYo7Q7xCL1HdPINw: 679; 193
SmkYLXEYhzwUZdS6TAevHg: 813; 77
sMzNLdhJZGzYirIWt-fMAg: 630; 115
Voeq7aGDmCGMjE_SQiHwRA: 678; 65
LQFmktF43j2NPncKdNd9mg: 629; 71
w_UCGMgok7N9p0XdYBx1VQ: 457; 192
wmCBxE0PfLZD8sxIwAY59Q: 584; 87


In [7]:
# Show one row (the first encountered) per top restaurant with its categories and star rating.
# NOTE(review): head(10) displays only 10 rows while N_restaurants is 12 — confirm this is intended.
(
    df_1.loc[df_1['business_id'].isin(most_reviewd_IDs)]
    .drop_duplicates(subset='business_id', keep='first')
    [['business_id', 'categories', 'stars']]
    .head(10)
)

Unnamed: 0,business_id,categories,stars
26404,woXlprCuowrLJswWere3TQ,"American (New), Mexican, Restaurants",4.5
39260,ejaUQ1hYo7Q7xCL1HdPINw,"Bars, Restaurants, Breweries, Nightlife, Food,...",3.5
40801,SmkYLXEYhzwUZdS6TAevHg,"Butcher, American (New), Nightlife, Cocktail B...",4.5
43432,lKom12WnYEjH5FFemK3M1Q,"Sandwiches, Restaurants, Botanical Gardens, Ar...",3.5
82288,sMzNLdhJZGzYirIWt-fMAg,"Pizza, Cocktail Bars, American (New), Restaura...",4.0
85295,wmCBxE0PfLZD8sxIwAY59Q,"Breweries, Pubs, French, Food, German, Bars, R...",4.0
89038,dLc1d1zwd1Teu2QED5TmlA,"Thai, Noodles, Food, Restaurants",4.5
126458,u4sTiCzVeIHZY8OlaL346Q,"Seafood, Latin American, Argentine, Restaurant...",4.5
134349,w_UCGMgok7N9p0XdYBx1VQ,"Restaurants, Sandwiches, Bars, Chicken Wings, ...",3.5
163464,Voeq7aGDmCGMjE_SQiHwRA,"American (New), Seafood, Cocktail Bars, Nightl...",4.0


## 3_Topic Modeling

In [8]:
# Business whose reviews are topic-modeled in this section.
ID = 'lKom12WnYEjH5FFemK3M1Q'
# All restaurant review rows belonging to that business.
df_2 = df_1.loc[df_1['business_id'].eq(ID)]

### 3.1_Preprocessing

In [9]:
# Split the review texts by rating: more than 3 stars is positive, fewer than 3 is negative.
# Reviews with exactly 3 stars fall into neither list.
# Despite the df_ prefix, both results are plain Python lists of review texts.
review_stars = df_2['review_stars']
df_1positive = df_2.loc[review_stars > 3, 'text'].tolist()
df_1negative = df_2.loc[review_stars < 3, 'text'].tolist()

#### 3.1.1_Setting stopwords

In [10]:
# Extra tokens to ignore on top of scikit-learn's built-in English stop words.
extra_words = [
    've', 'like', 'got', 'just',
    'restaurant', 'great',
    'topping', 'toppings',
    'don', 'really', 'said', 'told', 'ok',
    'came', 'went', 'did', 'didn', 'good', 'll',
]
# Union of the built-in list and the extras; the result is a frozenset.
stop_words = text.ENGLISH_STOP_WORDS | frozenset(extra_words)

#### 3.1.2_Replacing similar words with a common term

In [11]:
def replace_words(text, dict_words):
    """Replace whole words in ``text`` according to ``dict_words``.

    Matching is case-sensitive and restricted to whole words, so a key such
    as ``'waited'`` does not rewrite the inside of ``'awaited'``. All keys
    are applied in a single pass over the original text, so the output of
    one replacement is never rewritten by another key.

    Args:
        text: The string to normalize.
        dict_words: Mapping from a word to the term that should replace it.

    Returns:
        A new string with every whole-word occurrence of a key replaced by
        its mapped value. ``text`` is returned unchanged when the mapping
        contains no non-empty keys.
    """
    # An empty key would match at every position; ignore it.
    keys = [key for key in dict_words if key]
    if not keys:
        return text
    # Longest keys first so the most specific alternative wins when several
    # keys could match at the same position.
    keys.sort(key=len, reverse=True)
    # Lookarounds rather than \b so keys that start or end with a
    # non-word character are still delimited correctly.
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(key) for key in keys) + r')(?!\w)'
    )
    return pattern.sub(lambda match: dict_words[match.group(0)], text)

In [12]:
# Word variants mapped to the single term they should be counted under,
# so inflected forms contribute to the same TF-IDF feature.
dict_words = {'ordered':'order',
              'prices':'price', 
              'pizzas':'pizza', 
              # was 'burguer', a misspelling that never merged with 'burger'
              'burgers': 'burger',
              'waiting' : 'wait',
              'waited' : 'wait',
              'waitress' : 'waiter',
              'sandwiches' : 'sandwich'
             }


In [13]:
# Normalize word variants in every review of both lists before vectorizing.
df_1positive = [replace_words(review, dict_words) for review in df_1positive]
df_1negative = [replace_words(review, dict_words) for review in df_1negative]

### 3.1_Dividing Dataset

In [14]:
# Shared TF-IDF configuration, cloned separately for the positive and negative reviews.
# max_df=0.95 drops terms present in more than 95% of reviews; min_df=8 drops
# terms present in fewer than 8 reviews.
# scikit-learn >= 1.2 only accepts 'english', a list or None for stop_words,
# so the frozenset is converted to a (sorted, hence deterministic) list.
vectorizer_template = TfidfVectorizer(max_df=0.95, min_df=8, stop_words=sorted(stop_words))

In [15]:
# Number of NMF components (topics) extracted for each group of reviews.
no_topics = 3
# Number of highest-weighted terms printed for each topic.
no_top_words = 8

### 3.2_Modeling the Positive Topic

In [16]:
def display_topics(model, feature_names, num_topics, no_top_words):
    """Print the strongest terms for the leading topics of a fitted model.

    Args:
        model: Object exposing ``components_``, iterated row by row
            (one row of term weights per topic).
        feature_names: Indexable of term strings aligned with the columns
            of ``model.components_``.
        num_topics: Only topics whose index is below this value are printed.
        no_top_words: How many of the highest-weighted terms to list per topic.

    Each printed line has the form ``Topic k:   w*term, w*term, ...`` with
    the terms sorted by descending weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        if topic_idx >= num_topics:
            # enumerate counts upward, so no later topic can qualify either
            break
        # argsort is ascending; walking it backwards yields the largest weights first
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        terms = []
        for i in top_indices:
            terms.append('{:04.3f}*'.format(weights[i]) + feature_names[i])
        label = "{:11}".format("Topic %d:" % (topic_idx))
        print(label + ", ".join(terms))

#### 3.2.1_Preprocessing

In [17]:
## First pass: fit once only to learn how many terms survive the template's filters
aux_vectorizer = clone(vectorizer_template)
aux_vectorizer.fit(df_1positive)
# vocabulary_ holds one entry per retained term. It replaces get_feature_names(),
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep 20% of the terms, but never fewer than one: max_features must be a positive integer.
max_features = max(1, int(0.2*len(aux_vectorizer.vocabulary_)))

## Second pass: refit with the capped vocabulary and build the TF-IDF matrix
positive_vectorizer = clone(vectorizer_template).set_params(max_features=max_features)
positive_vectors = positive_vectorizer.fit_transform(df_1positive)
# get_feature_names_out() (scikit-learn >= 1.0) already returns an array of terms
positive_words = np.asarray(positive_vectorizer.get_feature_names_out())

#### 3.2.2_Decomposing

In [18]:
# Factorize the positive TF-IDF matrix into no_topics components.
# A fixed random_state makes the extracted topics reproducible across kernel restarts.
positive_nmf = NMF(n_components=no_topics, solver="mu", random_state=42)
# W: transformed reviews (one row per review); H: the fitted components (one row per topic).
W = positive_nmf.fit_transform(positive_vectors)
H = positive_nmf.components_

In [19]:
# Reuse the term array built during preprocessing; get_feature_names() no longer
# exists on vectorizers in scikit-learn >= 1.2.
display_topics(positive_nmf, positive_words, no_topics, no_top_words)

Topic 0:   1.580*sandwich, 1.062*fries, 0.708*slaw, 0.687*cheese, 0.579*pittsburgh, 0.543*bread, 0.520*pastrami, 0.513*coleslaw
Topic 1:   1.309*place, 1.190*food, 0.652*service, 0.544*pittsburgh, 0.450*love, 0.438*atmosphere, 0.328*time, 0.317*people
Topic 2:   1.186*primanti, 0.715*original, 0.698*location, 0.586*bros, 0.571*strip, 0.440*brothers, 0.421*district, 0.390*time


### 3.3_Modeling the Negative Topic

#### 3.3.1_Preprocessing

In [20]:
## First pass: fit once only to learn how many terms survive the template's filters
aux_vectorizer = clone(vectorizer_template)
aux_vectorizer.fit(df_1negative)
# vocabulary_ holds one entry per retained term. It replaces get_feature_names(),
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep 20% of the terms, but never fewer than one: max_features must be a positive integer.
max_features = max(1, int(0.2*len(aux_vectorizer.vocabulary_)))

## Second pass: refit with the capped vocabulary and build the TF-IDF matrix
negative_vectorizer = clone(vectorizer_template).set_params(max_features=max_features)
negative_vectors = negative_vectorizer.fit_transform(df_1negative)
# get_feature_names_out() (scikit-learn >= 1.0) already returns an array of terms
negative_words = np.asarray(negative_vectorizer.get_feature_names_out())

#### 3.3.2_Decomposing

In [21]:
# Factorize the negative TF-IDF matrix into no_topics components.
# A fixed random_state makes the extracted topics reproducible across kernel restarts.
negative_nmf = NMF(n_components=no_topics, solver="mu", random_state=42)
# NOTE: W and H are reassigned here, replacing the factors of the positive model.
W = negative_nmf.fit_transform(negative_vectors)
H = negative_nmf.components_

In [22]:
# Reuse the term array built during preprocessing; get_feature_names() no longer
# exists on vectorizers in scikit-learn >= 1.2.
display_topics(negative_nmf, negative_words, no_topics, no_top_words)

Topic 0:   1.519*sandwich, 1.053*fries, 0.662*bread, 0.648*meat, 0.591*slaw, 0.570*cheese, 0.475*coleslaw, 0.405*french
Topic 1:   1.419*place, 1.384*food, 0.503*service, 0.396*pittsburgh, 0.334*eat, 0.211*time, 0.155*better, 0.137*special
Topic 2:   0.971*primanti, 0.920*pittsburgh, 0.572*brothers, 0.504*location, 0.447*experience, 0.322*time, 0.315*order, 0.305*try
