In [32]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [4]:
# First, let's take a look at the dataset

In [5]:
# Load the raw reviews; the relative path is resolved against the current working directory.
df = pd.read_csv("reviews_data.csv")

In [6]:
# Preview the first ten rows of the raw data.
df.iloc[:10]

Unnamed: 0,name,location,Date,Rating,Review,Image_Links
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,['No Images']
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,['No Images']
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,['https://media.consumeraffairs.com/files/cach...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,['No Images']
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,['https://media.consumeraffairs.com/files/cach...
5,Alyssa,"Sunnyvale, TX","Reviewed Sept. 14, 2023",1.0,We had to correct them on our order 3 times. T...,['No Images']
6,ken,"Spring Hill, FL","Reviewed Sept. 8, 2023",1.0,I have tried Starbucks several different times...,['No Images']
7,Nikki,"Asheville, NC","Reviewed Aug. 25, 2023",1.0,Starbucks near me just launched new fall foods...,['No Images']
8,Alex,"Reisterstown, MD","Reviewed Aug. 5, 2023",1.0,"I ordered online for the Reisterstown Rd, St T...",['No Images']
9,Sunny,"Zionville, Other","Reviewed Aug. 4, 2023",1.0,Staff at the Smythe St. Superstore location in...,['No Images']


In [7]:
# Print each column's non-null count and dtype to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         850 non-null    object 
 1   location     850 non-null    object 
 2   Date         850 non-null    object 
 3   Rating       705 non-null    float64
 4   Review       850 non-null    object 
 5   Image_Links  850 non-null    object 
dtypes: float64(1), object(5)
memory usage: 40.0+ KB


In [8]:
# Image links are not used in this analysis, so drop the column.
# errors="ignore" keeps this cell re-runnable: once the column is gone,
# running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns="Image_Links", errors="ignore")

In [9]:
# A negative count means "everything except the last 10 rows";
# the notebook display truncates the middle of the frame.
df.iloc[:-10]

Unnamed: 0,name,location,Date,Rating,Review
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...
...,...,...,...,...,...
835,C.,"Allston, MA","Reviewed May 31, 2008",,For the past several years I've stopped in to ...
836,Lynette,"Los Angeles, CA","Reviewed May 13, 2008",,Found a hair inside of a rice crispy square.
837,Tenzin,"Woodside, NY","Reviewed March 17, 2008",,On 3/15/08 I was at the Starbucks on 42nd stre...
838,Rob,"Las Vegas, NV","Reviewed Feb. 18, 2008",,No Review Text


In [10]:
# Count the missing values in each column.
df.isnull().sum()

name          0
location      0
Date          0
Rating      145
Review        0
dtype: int64

In [11]:
'''
    The main focus analysis: 
    1. Sentiment analysis from "Review Text":
        Step one: Data Cleaning. Need to verify "No Review Text" on Review Column
        Step two: Apply sentiment analysis
        


Data Cleaning issues for more analysis:
    2. Need to solve null data on Rating columns (This is interesting if I want to understand the average rating or the region in which people are giving Starbucks the most excellent ratings)
    
    3. Need to solve datetime problem (This will be necessary if I want to understand the evolution of feedback over the years)

'''


'\n    The main focus analysis: \n    1. Sentiment analysis from "Review Text":\n        Step one: Data Cleaning. Need to verify "No Review Text" on Review Column\n        Step two: Apply sentiment analysis\n\n\n\nData Cleaning issues for more analysis:\n    2. Need to solve null data on Rating columns (This is interesting if I want to understand the average rating or the region in which people are giving Starbucks the most excellent ratings)\n\n    3. Need to solve datetime problem (This will be necessary if I want to understand the evolution of feedback over the years)\n\n'

In [12]:
# Some reviews contain only the placeholder "No Review Text". The analysis needs
# real review text, so count those rows first before dropping them.
is_placeholder = df["Review"] == "No Review Text"
df_filtered = df[is_placeholder]
len(df_filtered)

37

In [13]:
# Keep only the rows that carry real review text.
# .copy() makes df an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning (it would otherwise be a slice of the old frame).
df = df[df["Review"] != "No Review Text"].copy()

In [14]:
# Re-inspect the frame after filtering: every row except the last 10.
df.iloc[:-10]

Unnamed: 0,name,location,Date,Rating,Review
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...
...,...,...,...,...,...
829,Misty,"Lincoln, CA","Reviewed July 17, 2008",,I then decided to call and ask her for her ma...
830,Mike,"New York, NY","Reviewed July 14, 2008",,Walked into buy a cup of coffee today and the ...
831,Raudys,"Miami Beach, FL","Reviewed July 13, 2008",,"Ordered two dopio machiato coffees, however I ..."
832,Lou,"Hackesnack, NJ","Reviewed June 16, 2008",,I stop at this Starbucks quite often. They ha...


In [20]:
# First cleaning problem solved: only rows with review text remain.
# By the counts shown earlier (850 rows loaded, 37 "No Review Text" rows removed)
# that leaves 813 rows. First goal: sentiment analysis.
text = df["Review"]

In [37]:
# Preprocessing the review text for training:

# Substitutions applied in order by clean_review, compiled once up front.
_REVIEW_SUBSTITUTIONS = (
    (re.compile(r'http\S+'), ''),  # links
    (re.compile(r'@\w+'), ''),  # mentions
    (re.compile(r'#\w+'), ''),  # hashtags
    # Anything that is not a lowercase (optionally accented) letter or whitespace:
    # symbols, punctuation, digits and emojis.
    (re.compile(r'[^a-záéíóúâêîôûãõç\s]'), ''),
    (re.compile(r'\s+'), ' '),  # collapse runs of whitespace into one space
)


def clean_review(text):
    """Normalize one review into lowercase, letters-only text.

    Parameters
    ----------
    text : object
        The raw review. Null values (None, NaN, pd.NA) yield an empty string;
        any other non-string value is converted with ``str()`` before cleaning
        instead of failing on ``.lower()``.

    Returns
    -------
    str
        The lowercased review with links, @mentions, #hashtags and every
        character other than letters/whitespace removed, whitespace collapsed
        to single spaces, and leading/trailing whitespace stripped.
    """
    if pd.isnull(text):
        return ''
    if not isinstance(text, str):
        # e.g. a review that read_csv parsed as a number
        text = str(text)

    # Lowercase first: the letter whitelist above only contains lowercase letters.
    text = text.lower()
    for pattern, replacement in _REVIEW_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text.strip()

# Apply the cleaning function to the Review column
df['review_cleaned'] = df['Review'].apply(clean_review)

# Show the first rows of the raw text next to the cleaned text
print(df[['Review', 'review_cleaned']].head())


                                              Review  \
0  Amber and LaDonna at the Starbucks on Southwes...   
1  ** at the Starbucks by the fire station on 436...   
2  I just wanted to go out of my way to recognize...   
3  Me and my friend were at Starbucks and my card...   
4  I’m on this kick of drinking 5 cups of warm wa...   

                                        review_limpo  
0  amber and ladonna at the starbucks on southwes...  
1  at the starbucks by the fire station on in alt...  
2  i just wanted to go out of my way to recognize...  
3  me and my friend were at starbucks and my card...  
4  im on this kick of drinking cups of warm water...  


In [38]:
# Vectorizing: TF-IDF over the cleaned reviews, with English stop words removed
# and the vocabulary capped at 5000 terms.
# The same mask selects the rows that are vectorized and the rows that receive
# a label, so the two always line up.
has_text = df['review_cleaned'].notna()
vetor = TfidfVectorizer(stop_words='english', max_features=5000)
X = vetor.fit_transform(df.loc[has_text, 'review_cleaned'])

# Apply clustering
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X)

# Add the cluster label to the DataFrame; -1 marks rows that were not clustered
df['cluster'] = -1
df.loc[has_text, 'cluster'] = kmeans.labels_

In [46]:
# Inspect the frame after cleaning and clustering.
# NOTE(review): the saved output of this cell lists a 'sentimento' column that no
# visible cell creates — likely left over from a deleted or out-of-order cell;
# confirm the notebook still runs top to bottom on a fresh kernel.
df

Unnamed: 0,name,location,Date,Rating,Review,review_cleaned,sentimento,cluster
0,Helen,"Wichita Falls, TX","Reviewed Sept. 13, 2023",5.0,Amber and LaDonna at the Starbucks on Southwes...,amber and ladonna at the starbucks on southwes...,positivo,1
1,Courtney,"Apopka, FL","Reviewed July 16, 2023",5.0,** at the Starbucks by the fire station on 436...,at the starbucks by the fire station on in alt...,positivo,0
2,Daynelle,"Cranberry Twp, PA","Reviewed July 5, 2023",5.0,I just wanted to go out of my way to recognize...,i just wanted to go out of my way to recognize...,positivo,1
3,Taylor,"Seattle, WA","Reviewed May 26, 2023",5.0,Me and my friend were at Starbucks and my card...,me and my friend were at starbucks and my card...,positivo,1
4,Tenessa,"Gresham, OR","Reviewed Jan. 22, 2023",5.0,I’m on this kick of drinking 5 cups of warm wa...,im on this kick of drinking cups of warm water...,positivo,1
...,...,...,...,...,...,...,...,...
841,Mary,"Hillsboro, OR","Reviewed Dec. 16, 2007",,"When using my debit card, how much and for how...",when using my debit card how much and for how ...,,1
842,Angie,"Innsbruck, OR","Reviewed Sept. 17, 2007",,I just bought a starbucks thermos mug about a ...,i just bought a starbucks thermos mug about a ...,,1
844,Sabine,"Redondo Beach, CA","Reviewed Dec. 19, 2006",,I'm (was) a regular customer who ordered a lat...,im was a regular customer who ordered a latte ...,,1
845,Becky,"Agoura Hills, CA","Reviewed July 13, 2006",,I ordered two venti frappacino's without whipp...,i ordered two venti frappacinos without whippe...,,1


In [None]:
# Show the full review text instead of the truncated default column width.
pd.set_option('display.max_colwidth', None)

# Print the first cleaned reviews of each cluster under its own heading.
for cluster_id, heading in (
    (0, "\n🟢 Exemplos do Cluster 0:"),
    (1, "\n🔴 Exemplos do Cluster 1:"),
):
    print(heading)
    print(df.loc[df['cluster'] == cluster_id, 'review_cleaned'].head())



🟢 Exemplos do Cluster 0:
1                                                                                                                                           at the starbucks by the fire station on in altamonte springs fl made my day and finally helped me figure out the way to make my drink so id love it she took time out to talk to me for minutes to make my experience better than what im used to it was much appreciated ive had bad experiences one after another at the starbucks thats closest to me in my work building with my drinks not being great along with not great customer service from specific baristas niko was refreshing to speak to and pleasant the drink was perfect store
6                                                                                                                                                                                                                                                                                                               