## Pacotes Necessários

In [1]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import re
import string
from spacy import load


## Carregando o modelo pré-treinado
nlp = load('en_core_web_trf')

: 

## Funções

In [2]:
###############################################################
#### AVALIANDO O % NAS COLUNAS QUE CONTEM VALORES NULOS #######
###############################################################
def percentual_nulos(df,lista_avaliar):
    total_linhas = df.shape[0]
    lista_percentual = []

    ## SALVANDO A SAIDA DA FUNCAO QUE RETORNA A QTDE DE NULOS POR COLUNA
    teste_nulos = df.isnull().sum()

    ## SALVA O RESULTADO EM UM DATAFRAME
    teste_nulos = pd.DataFrame({'COLUNA':teste_nulos.index, 'VALOR':teste_nulos.values})

    ## SELECIONA APENAS AS COLUNAS QUE PRECISAM SER AVALIADAS
    teste_nulos = teste_nulos.query('COLUNA == @lista_avaliar')

    ## SALVA O NOME DAS COLUNAS EM UMA LISTA
    colunas = teste_nulos.COLUNA.tolist()

    ## SALVA OS VALORES DE MISSINGS POR COLUNA EM UMA LISTA
    valor = teste_nulos.VALOR.tolist()

    # obtenha a lista de tuplas de duas listas
    # e mescle-as usando zip().
    lista_de_tuplas = list(zip(colunas, valor))

    # converte uma lista de tuplas num DataFrame
    df_avaliado = pd.DataFrame(lista_de_tuplas, columns=['colunas', 'valor'])

    ## FILTRANDO NO DF APENAS COLUNAS QUE POSSUEM DADOS
    for i,coluna in enumerate(colunas):
        percentual = round((valor[i]/total_linhas)*100,2)
        lista_percentual.append(percentual)

    ## Transformando a lista de percentuais em um df pandas
    lista_percentual = pd.DataFrame(lista_percentual, columns = ['percentual'])

    ## Juntando a informação de percentual ao df 
    df_avaliado = pd.concat([df_avaliado,lista_percentual], axis = 1)

    return(df_avaliado)

In [3]:
## Carregando o conjunto de dados
df = pd.read_csv('C:/Users/clari/OneDrive/Documentos/python_codigos/nlp_starbucks/nlp_starbucks/dados/reviews_data.csv')
df.head()

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images']
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,['https://media.consumeraffairs.com/files/cach...


In [4]:
###############################################################################
########### AVALIANDO O PERCENTUAL DE NULOS NAS COLUNAS #######################
###############################################################################

lista_avaliar = list(df.columns)
df_avaliado = percentual_nulos(df,lista_avaliar)
df_avaliado.head(10)

Unnamed: 0,colunas,valor,percentual
0,name,0,0.0
1,location,0,0.0
2,Date,0,0.0
3,Rating,145,17.06
4,Review,0,0.0
5,Image_Links,0,0.0


Como é possível observar, há 145 valores nulos na coluna Rating que é a variável target. Esses dados nulos seriam correspondentes aos dados de teste para previsão depois do modelo treinado. No final deste código, os dados serão separados em treino e teste e, nos dados de treino não deverá conter nenhum dado nulo na variável target.

In [5]:
### Dados Gerais sobre os dados
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         850 non-null    object 
 1   location     850 non-null    object 
 2   Date         850 non-null    object 
 3   Rating       705 non-null    float64
 4   Review       850 non-null    object 
 5   Image_Links  850 non-null    object 
dtypes: float64(1), object(5)
memory usage: 40.0+ KB


A coluna Date contém valores de data, contudo o tipo identificado é do tipo object, é necessário alterar o tipo da coluna para utilizá-la de forma adequada em gráficos.

In [6]:
## Copiando a coluna original e tratando em uma coluna adicionada
df['date_treat'] = df['Date']
df['date_treat'] = df['date_treat'].str.split('Reviewed').str[1] # separando a string usando como criterio o /
df['date_treat'] = df['date_treat'].str.strip() # retirando os espacos em branco no inicio e final da string

## Retirando o ponto de algumas strings
df['date_treat'] = df['date_treat'].str.replace('.', '')

## Passando o tipo de dado para datetime
df['date_treat'] = pd.to_datetime(df['date_treat'], format = "mixed")

## Alterando a ordem das colunas e retirando a coluna Image_Links que não será utilizada
df = df[['name',
         'location',
         'Date',
         'date_treat',
         'Rating',
         'Review']]

df.head()

Unnamed: 0,name,location,Date,date_treat,Rating,Review
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",2023-09-13,5.0,Amber and LaDonna at the Starbucks on Southwes...
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",2023-07-16,5.0,** at the Starbucks by the fire station on 436...
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",2023-07-05,5.0,I just wanted to go out of my way to recognize...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",2023-05-26,5.0,Me and my friend were at Starbucks and my card...
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",2023-01-22,5.0,I’m on this kick of drinking 5 cups of warm wa...


Agora que o campo de data foi ajustado, faz-se o tratamento do campo location separando a string em cidade e estado.

In [7]:
### Criando a coluna city com a informação que vem antes da vírgula na coluna location
df.insert(2, "city", df["location"].str.split(",").str[0])
df['city'] = df['city'].str.strip() # retirando os espacos em branco no inicio e final da string

### Criando a coluna state com a informação que vem depois da vírgula na coluna location
df.insert(3, "state", df["location"].str.split(",").str[1])
df['state'] = df['state'].str.strip() # retirando os espacos em branco no inicio e final da string

df.head()

Unnamed: 0,name,location,city,state,Date,date_treat,Rating,Review
0,Helen,"Wichita Falls, TX",Wichita Falls,TX,"Reviewed Sept. 13, 2023",2023-09-13,5.0,Amber and LaDonna at the Starbucks on Southwes...
1,Courtney,"Apopka, FL",Apopka,FL,"Reviewed July 16, 2023",2023-07-16,5.0,** at the Starbucks by the fire station on 436...
2,Daynelle,"Cranberry Twp, PA",Cranberry Twp,PA,"Reviewed July 5, 2023",2023-07-05,5.0,I just wanted to go out of my way to recognize...
3,Taylor,"Seattle, WA",Seattle,WA,"Reviewed May 26, 2023",2023-05-26,5.0,Me and my friend were at Starbucks and my card...
4,Tenessa,"Gresham, OR",Gresham,OR,"Reviewed Jan. 22, 2023",2023-01-22,5.0,I’m on this kick of drinking 5 cups of warm wa...


In [8]:
## A coluna Rating deveria possuir apenas valores inteiros pois trata-se de um score. 
## Alterando o tipo de dado
df = df.astype({'Rating': 'Int64'})
df.Rating.unique()

<IntegerArray>
[5, 1, 2, 3, 4, <NA>]
Length: 6, dtype: Int64

Um score de 1 a 5 possui características que podem ser resumidas em um score de 0 a 2, sendo:

- Negative : 0

- Neutral : 1

- Positive : 2

Isto pode ajudar na performance do modelo de classificação e facilita também a compreensão dos resultados, ou seja, se o consumidor está satisfeito (positive), neutro (neutral) ou insatisfeito (negative) em relação ao serviço prestado.

In [9]:
df["rating_treat"] = df["Rating"]
df["rating_treat"]=df["rating_treat"].replace([1,2],0)
df["rating_treat"]=df["rating_treat"].replace(3,1)
df["rating_treat"]=df["rating_treat"].replace([4,5],2)
df.head()

Unnamed: 0,name,location,city,state,Date,date_treat,Rating,Review,rating_treat
0,Helen,"Wichita Falls, TX",Wichita Falls,TX,"Reviewed Sept. 13, 2023",2023-09-13,5,Amber and LaDonna at the Starbucks on Southwes...,2
1,Courtney,"Apopka, FL",Apopka,FL,"Reviewed July 16, 2023",2023-07-16,5,** at the Starbucks by the fire station on 436...,2
2,Daynelle,"Cranberry Twp, PA",Cranberry Twp,PA,"Reviewed July 5, 2023",2023-07-05,5,I just wanted to go out of my way to recognize...,2
3,Taylor,"Seattle, WA",Seattle,WA,"Reviewed May 26, 2023",2023-05-26,5,Me and my friend were at Starbucks and my card...,2
4,Tenessa,"Gresham, OR",Gresham,OR,"Reviewed Jan. 22, 2023",2023-01-22,5,I’m on this kick of drinking 5 cups of warm wa...,2


In [10]:
## Variáveis de tempo úteis na análise gráfica
df['month'] = df['date_treat'].dt.month
df['year'] = df['date_treat'].dt.year
df['day_of_week'] = df['date_treat'].dt.dayofweek

## Ordem dos dias da semana
day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

df.head()

Unnamed: 0,name,location,city,state,Date,date_treat,Rating,Review,rating_treat,month,year,day_of_week
0,Helen,"Wichita Falls, TX",Wichita Falls,TX,"Reviewed Sept. 13, 2023",2023-09-13,5,Amber and LaDonna at the Starbucks on Southwes...,2,9,2023,2
1,Courtney,"Apopka, FL",Apopka,FL,"Reviewed July 16, 2023",2023-07-16,5,** at the Starbucks by the fire station on 436...,2,7,2023,6
2,Daynelle,"Cranberry Twp, PA",Cranberry Twp,PA,"Reviewed July 5, 2023",2023-07-05,5,I just wanted to go out of my way to recognize...,2,7,2023,2
3,Taylor,"Seattle, WA",Seattle,WA,"Reviewed May 26, 2023",2023-05-26,5,Me and my friend were at Starbucks and my card...,2,5,2023,4
4,Tenessa,"Gresham, OR",Gresham,OR,"Reviewed Jan. 22, 2023",2023-01-22,5,I’m on this kick of drinking 5 cups of warm wa...,2,1,2023,6


In [None]:
### Gráficos:
## 1. 
### Tratar a coluna Review com nlp

In [11]:
def nlp_function(text):

    ## Apply lower case
    text = str(text).lower()

    ## Removing caracteres
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    # print('Texto antes: \n', text)
    ## Applying the trained model
    doc = nlp(text)

    ## Lemmatization functions: its primary objective is to break down words into their essential roots, making it easier to identify resemblances.
    text = " ".join([text.lemma_ for text in doc])
    # print('Depois de lemma: \n', text)

    # Remove stop words: some words are not meaningful, so we remove them.
    filtered_tokens = [token.text for token in doc if not token.is_stop]
    text=" ".join(filtered_tokens)
    # print('Depois de stopwords: \n', text)

    return text
df["review_treat"] = df["Review"].apply(nlp_function)

In [12]:
df.head()

Unnamed: 0,name,location,city,state,Date,date_treat,Rating,Review,rating_treat,month,year,day_of_week,review_treat
0,Helen,"Wichita Falls, TX",Wichita Falls,TX,"Reviewed Sept. 13, 2023",2023-09-13,5,Amber and LaDonna at the Starbucks on Southwes...,2,9,2023,2,amber ladonna starbucks southwest parkway warm...
1,Courtney,"Apopka, FL",Apopka,FL,"Reviewed July 16, 2023",2023-07-16,5,** at the Starbucks by the fire station on 436...,2,7,2023,6,starbucks fire station altamonte springs f...
2,Daynelle,"Cranberry Twp, PA",Cranberry Twp,PA,"Reviewed July 5, 2023",2023-07-05,5,I just wanted to go out of my way to recognize...,2,7,2023,2,wanted way recognize starbucks employee billy ...
3,Taylor,"Seattle, WA",Seattle,WA,"Reviewed May 26, 2023",2023-05-26,5,Me and my friend were at Starbucks and my card...,2,5,2023,4,friend starbucks card work thankful worker pai...
4,Tenessa,"Gresham, OR",Gresham,OR,"Reviewed Jan. 22, 2023",2023-01-22,5,I’m on this kick of drinking 5 cups of warm wa...,2,1,2023,6,kick drinking cups warm water work instacart...


In [37]:
text = df.iloc[0,7]
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)
print('Texto antes: \n', text)
doc = nlp(text)

text = " ".join([text.lemma_ for text in doc])
print('Depois de lemma: \n', text)

# Remove stop words
text = " ".join([token.text for token in doc if not token.is_stop])
print('Depois de stopwords: \n', text)



Texto antes: 
 amber and ladonna at the starbucks on southwest parkway are always so warm and welcoming there is always a smile in their voice when they greet you at the drivethru and their customer service is always spoton they always get my order right and with a smile i would actually give them more than  stars if they were available
Depois de lemma: 
 amber and ladonna at the starbucks on southwest parkway be always so warm and welcoming there be always a smile in their voice when they greet you at the drivethru and their customer service be always spoton they always get my order right and with a smile I would actually give they more than   star if they be available
Depois de stopwords: 
 amber ladonna starbucks southwest parkway warm welcoming smile voice greet drivethru customer service spoton order right smile actually   stars available


In [14]:
text = "This is a sample sentence with some stop words"
doc = nlp(text)

# Remove stop words
filtered_tokens = [token.text for token in doc if not token.is_stop]

# Print the text excluding stop words
print(filtered_tokens)

['sample', 'sentence', 'stop', 'words']


In [None]:
## Rodar depois que retirar os NaN
df_graph = df.copy()
df_graph = df_graph.groupby(['day_of_week'])['Rating'].mean().reset_index()

plt.figure(figsize=(10, 6))
plt.bar(df['day_of_week'], df['Rating'], color='skyblue')
plt.ylabel('Mean Rating')
plt.title('Total Rating Count Per Day of the Week')
plt.xticks(df['day_of_week'], labels=day_order)