In [None]:
import pandas as pd

In [None]:
# Load the Yelp business dataset (one JSON object per line) and drop the
# columns that are not needed.
business_json_path = 'data-yelp/yelp_academic_dataset_business.json'
drop_columns = [
    'name',
    'address',
    'postal_code',
    'latitude',
    'longitude',
    'review_count',
    'attributes',
    'hours',
]
data_business = pd.read_json(business_json_path, lines=True).drop(columns=drop_columns)

# Keep only the businesses whose state is Pennsylvania (PA); the filtered
# frame is the cell's displayed output.
final_business = data_business.loc[data_business['state'].eq('PA')]
final_business

In [None]:
# The commented-out code below builds the Pennsylvania review dataset: it reads
# the review JSON in chunks, inner-joins each chunk with final_business on
# 'business_id', concatenates the chunks and writes the result to CSV.
# Only the pd.read_csv call at the bottom of this cell is executed.
# NOTE(review): the commented code reads from 'data/' and writes
# "yelp_reviews_PA.csv" to the working directory, while the live code reads
# from 'data-yelp/' — confirm the paths before re-enabling it.
# size = 1000000
# review_json_path = 'data/yelp_academic_dataset_review.json'
# review = pd.read_json(review_json_path, lines=True, chunksize=size)

# # Merge the review dataset with the already-filtered business dataset
# chunk_list = []
# for chunk_review in review:
#     chunk_review = chunk_review.drop(['review_id', 'user_id','useful','funny','cool'], axis=1).rename(columns={'stars': 'review_stars'})
#     chunk_merged = pd.merge(final_business, chunk_review, on='business_id', how='inner')
#     print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
#     chunk_list.append(chunk_merged)
    
# final_df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

# # Write the new dataset to CSV
# final_csv_name = "yelp_reviews_PA.csv"
# final_df.to_csv(final_csv_name, index=False)

# Load the review dataset from CSV (presumably the file produced by the
# commented-out code above — TODO confirm) and display it.
final_df = pd.read_csv('data-yelp/yelp_reviews_PA.csv')
final_df

In [None]:
# Minimum number of reviews a business needs in order to be analysed.
number_of_reviews = 1500

# count() gives, per business, the number of non-null values in every column;
# the 'text' column is used as the number of reviews of that business.
df_count = final_df.groupby(['business_id']).count()
df_count = df_count[df_count['text'] >= number_of_reviews].sort_values(by=['text'], ascending=False)
print(df_count)

# Ids of the selected businesses, ordered from most to least reviewed.
top_reviews = df_count.index.values

# Keep only the reviews of the selected businesses. The explicit .copy()
# detaches the result from the frame it was filtered from, so later cells can
# add or overwrite columns on final_df without pandas raising
# SettingWithCopyWarning.
final_df = final_df[final_df['business_id'].isin(top_reviews)].copy()

**Análise da base**

In [None]:
import matplotlib.pyplot as plt

def word_count(x):
    """Return how many whitespace-separated tokens the string ``x`` contains."""
    tokens = x.split()
    return len(tokens)

# Add the number of words of each review as a new column. final_df was
# produced by boolean-mask filtering in an earlier cell, and writing a column
# straight into such a frame can raise SettingWithCopyWarning; assign() builds
# a new frame instead.
final_df = final_df.assign(words_count=final_df['text'].apply(word_count))
print(final_df)
print('Descrição estatística da quantidade de palavras dos reviews:\n\n', final_df['words_count'].describe())

# Distribution of review lengths. The text is set as the x-axis label: passed
# as hist(label=...) it is only shown when a legend is drawn, which this
# figure does not have.
fig_hist, ax_hist = plt.subplots()
ax_hist.hist(final_df['words_count'], bins=100)
ax_hist.set_xlabel('Quantidade de palavras')
ax_hist.set_ylabel('Quantidade de reviews')

# Split the reviews into five buckets by star rating; each bucket is the
# half-open interval (n - 1, n], with the first one holding everything <= 1.
reviews_1 = final_df[final_df['review_stars'] <= 1]
reviews_2 = final_df[(final_df['review_stars'] <= 2) & (final_df['review_stars'] > 1)]
reviews_3 = final_df[(final_df['review_stars'] <= 3) & (final_df['review_stars'] > 2)]
reviews_4 = final_df[(final_df['review_stars'] <= 4) & (final_df['review_stars'] > 3)]
reviews_5 = final_df[(final_df['review_stars'] <= 5) & (final_df['review_stars'] > 4)]

labels = '1 star', '2 stars', '3 stars', '4 stars', '5 stars'
sizes = [len(reviews_1), len(reviews_2), len(reviews_3), len(reviews_4), len(reviews_5)]

# Share of reviews per star rating.
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

# Render the figures explicitly so the cell does not end by echoing the
# value returned by ax1.axis().
plt.show()

**Pré-processamento dos textos**

In [None]:
import string
import re

# Pattern and translation table are built once and reused for every review.
digits_pattern = re.compile(r'\d+')
translator = str.maketrans('', '', string.punctuation)


def normalize_text(text):
    """Return ``text`` normalised for vectorisation.

    The steps are applied in this order: lower-casing, removal of digit runs,
    removal of every character in ``string.punctuation``, and collapsing of
    each run of whitespace into a single space (leading and trailing
    whitespace is dropped).
    """
    text = text.lower()
    text = digits_pattern.sub('', text)
    text = text.translate(translator)
    return " ".join(text.split())


# Single pass over the reviews instead of one full pass (and one temporary
# list) per normalisation step.
texts = [normalize_text(text) for text in final_df['text']]

# final_df was produced by boolean-mask filtering in an earlier cell, and
# writing a column straight into such a frame can raise SettingWithCopyWarning;
# assign() returns a new frame with the 'text' column replaced.
final_df = final_df.assign(text=texts)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

**Reviews Negativos**

In [None]:
# Topic extraction over the negative reviews (2 stars or fewer) of each
# selected business: TF-IDF vectorisation followed by NMF.
n_components = 7   # number of NMF topics extracted per business
max_df = 0.85      # float: drop terms found in more than 85% of the reviews
min_df = 3         # int: drop terms found in fewer than 3 reviews
random_state = 42  # fixed seed passed to NMF so the topics are reproducible

for business in top_reviews:
    is_negative = (final_df['business_id'] == business) & (final_df['review_stars'] <= 2)
    business_reviews = final_df.loc[is_negative, 'text']
    print('\n*****', business, '*****\n')
    print(data_business[data_business['business_id'] == business], ' \n')
    print('Dataset size:', len(business_reviews))

    # Use the settings defined at the top of the cell rather than repeating
    # their literal values, so changing them there actually takes effect.
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words='english')

    try:
        vectors = vectorizer.fit_transform(business_reviews)
    except ValueError as error:
        # The vectorizer raises ValueError when the reviews leave it with no
        # usable vocabulary (for example when there are too few of them for
        # the max_df / min_df thresholds). Skip this business instead of
        # aborting the whole loop.
        print('Skipping topic extraction:', error)
        continue

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = np.asarray(vectorizer.get_feature_names_out())
    else:
        words = np.array(vectorizer.get_feature_names())
    print('words', len(words), vectors.shape, '\n')

    nmf = NMF(n_components=n_components, solver="mu", random_state=random_state)
    W = nmf.fit_transform(vectors)  # transformed data returned by the fit
    H = nmf.components_             # one row of term weights per topic

    # For each topic print its five highest-weighted terms, in ascending
    # order of weight (argsort sorts ascending, so the last five are the top).
    for i, topic in enumerate(H):
        print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in words[topic.argsort()[-5:]]])))

**Reviews Positivos**

In [None]:
# Topic extraction over the positive reviews (more than 4 stars) of each
# selected business: TF-IDF vectorisation followed by NMF.
n_components = 7   # number of NMF topics extracted per business
max_df = 0.85      # float: drop terms found in more than 85% of the reviews
min_df = 5         # int: drop terms found in fewer than 5 reviews
random_state = 42  # fixed seed passed to NMF so the topics are reproducible

for business in top_reviews:
    is_positive = (final_df['business_id'] == business) & (final_df['review_stars'] > 4)
    business_reviews = final_df.loc[is_positive, 'text']
    print('\n*****', business, '*****\n')
    print(data_business[data_business['business_id'] == business], ' \n')
    print('Dataset size:', len(business_reviews))

    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words='english')

    try:
        vectors = vectorizer.fit_transform(business_reviews)
    except ValueError as error:
        # The vectorizer raises ValueError when the reviews leave it with no
        # usable vocabulary (for example when there are too few of them for
        # the max_df / min_df thresholds). Skip this business instead of
        # aborting the whole loop.
        print('Skipping topic extraction:', error)
        continue

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = np.asarray(vectorizer.get_feature_names_out())
    else:
        words = np.array(vectorizer.get_feature_names())
    print('words', len(words), vectors.shape, '\n')

    nmf = NMF(n_components=n_components, solver="mu", random_state=random_state)
    W = nmf.fit_transform(vectors)  # transformed data returned by the fit
    H = nmf.components_             # one row of term weights per topic

    # For each topic print its five highest-weighted terms, in ascending
    # order of weight (argsort sorts ascending, so the last five are the top).
    for i, topic in enumerate(H):
        print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in words[topic.argsort()[-5:]]])))
    