In [None]:
import pandas as pd

## Show all review CSV files

In [None]:
google_df = pd.read_csv("csv_files/google.csv", encoding='latin-1') #need latin-1 encoding for the google reviews found through trial & error

In [None]:
google_df.head()

In [None]:
yelp_df = pd.read_csv("csv_files/yelp.csv")

In [None]:
yelp_df.head()

In [None]:
trip_df = pd.read_csv("csv_files/trip.csv")

In [None]:
trip_df

### We want to consolidate all of the reviews into one big CSV file

In [None]:
len(google_df), len(yelp_df), len(trip_df)

Convert 'date' column to standard format

In [None]:
google_df['date'] = pd.to_datetime(google_df['date'])

In [None]:
google_df.head()

In [None]:
yelp_df['date'] = pd.to_datetime(yelp_df['date'])

In [None]:
yelp_df.head()

In [None]:
trip_df['date'] = pd.to_datetime(trip_df['date'])

In [None]:
trip_df.head()

In [None]:
consolidated_df = pd.concat([google_df, yelp_df, trip_df], ignore_index=True)

In [None]:
consolidated_df

642 rows match the summed lengths of the individual dataframes.

In [None]:
consolidated_df[490: 510] # Where google reviews meet yelp reviews in the new consolidated df

In [None]:
consolidated_df['review_text'].isna()

In [None]:
consolidated_df.isna().sum()

Let's get rid of the rows with NaN values.

In [None]:
# dropna() is called with its defaults here, i.e. a row is dropped as soon as ANY of its
# columns is NaN - not only when 'review_text' is missing.
cleaned_df = consolidated_df.dropna()

In [None]:
len(cleaned_df) # Should be 642 - 217 = 425

Now our data is all consolidated and the rows with missing review text have been removed. All that's left is to sort it by date.

In [None]:
sorted_df = cleaned_df.sort_values(by='date')

In [None]:
sorted_df

In [None]:
# Save sorted df to csv so that the next cell can read it back on a fresh kernel.
# An already existing sorted.csv is left untouched, so a previously saved file is never overwritten.
import os

if not os.path.exists("sorted.csv"):
    sorted_df.to_csv("sorted.csv", index=False)

In [None]:
# Load the saved, date-sorted reviews and derive a binary sentiment label from the 'rating' column:
# a rating of 4 or more is labelled 'positive', everything else 'negative'.
df = pd.read_csv("sorted.csv")
df['sentiment'] = (df['rating'] >= 4).map({True: 'positive', False: 'negative'})

In [None]:
df

In [None]:
# Let's make a copy of the dataframe in case we need to revert back.
# A plain `df_copy = df` would only create a second name for the same object, so every
# column added to `df` later on would also appear in the "copy"; .copy() makes it independent.
df_copy = df.copy()

### Text Cleaning
Let's clean the text to prepare it for analysis

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Resources needed by the text-cleaning function below.
# word_tokenize() relies on the Punkt tokenizer models, which are not bundled with NLTK:
# 'punkt' is the resource name used by older NLTK releases, 'punkt_tab' by newer ones.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

In [None]:
def _get_cleaning_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only on first use."""
    if not hasattr(_get_cleaning_resources, "_cached"):
        _get_cleaning_resources._cached = (
            set(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _get_cleaning_resources._cached


def clean_text(text):
    """Normalize a single review text.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize, remove English stop words, lemmatize, and join the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. None or NaN) is treated
        as an empty review.

    Returns
    -------
    str
        The cleaned, space-separated tokens ('' for non-string input).
    """
    # Guard against missing values so that .apply() does not crash on them
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # The stop-word set and lemmatizer are the same for every review, so they are
    # created once and reused instead of being rebuilt on each call
    stop_words, lemmatizer = _get_cleaning_resources()

    # Remove stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into a string
    return ' '.join(tokens)

In [None]:
df['cleaned_review_text'] = df['review_text'].apply(clean_text)
df.head()

### Exploratory Data Analysis (EDA)
Let's explore the data a bit.

In [None]:
import matplotlib.pyplot as plt

In [None]:
sentiment_distribution = df['sentiment'].value_counts(normalize=True) # `normalize=True` returns percentages, `False` returns raw counts
sentiment_distribution

In [None]:
# Plot sentiment distribution
plt.figure(figsize=(8, 6))
sentiment_distribution.plot(kind='bar', color='skyblue')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.tight_layout()

In [None]:
# Convert 'date' column to 'datetime' dtype if needed
df['date'] = pd.to_datetime(df['date'])

# Compare sentiment distribution across different time periods (e.g., months or years)
df['year_month'] = df['date'].dt.to_period('M')
sentiment_by_month = df.groupby(['year_month', 'sentiment']).size().unstack(fill_value=0)
# Divide every row by its own total so that each month sums to 1
sentiment_by_month_percentage = sentiment_by_month.div(sentiment_by_month.sum(axis=1), axis=0)

# Plot sentiment distribution over time.
# DataFrame.plot() opens a figure of its own unless it is handed an Axes, so the axes are
# created explicitly and passed in; with a bare plt.figure(...) the figsize would be ignored
# and an empty extra figure would be left behind.
fig, ax = plt.subplots(figsize=(10, 6))
sentiment_by_month_percentage.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Sentiment Distribution Over Time')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Proportion')
ax.legend(title='Sentiment')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def extract_ngrams(text, sentiment, pos_or_neg, ngram_range=(1, 2)):
    """Count the n-grams of all reviews carrying one sentiment label.

    `text` is indexed with the boolean mask `sentiment == pos_or_neg`, the
    selected documents are vectorized with a CountVectorizer using
    `ngram_range`, and the per-n-gram totals over all selected documents are
    returned as a list of (ngram, count) pairs, highest count first.
    """
    selected = text[sentiment == pos_or_neg]

    counter = CountVectorizer(ngram_range=ngram_range)
    doc_term_matrix = counter.fit_transform(selected)

    # Column sums = total occurrences of every n-gram across the selected documents
    totals = doc_term_matrix.sum(axis=0).A1
    vocabulary = counter.get_feature_names_out()

    pairs = dict(zip(vocabulary, totals)).items()
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)

In [None]:
# Bigram counts for the reviews labelled 'positive', most frequent first
ngrams = extract_ngrams(df['cleaned_review_text'], df['sentiment'], 'positive', ngram_range=(2, 2))

# Keep the ten most frequent bigrams and list them
top_ngrams = ngrams[:10]
for ngram, freq in top_ngrams:
    print(f'{ngram}: {freq}')

# Bar chart of those ten bigrams: one bar per bigram, bar height = its count
labels, counts = zip(*top_ngrams)
plt.figure(figsize=(10, 6))
plt.bar(labels, counts, color="coral")
plt.title('Top 10 Most Frequent Bigrams in Positive Reviews')
plt.xlabel('Bigram')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Bigram counts for the reviews labelled 'negative', most frequent first
ngrams = extract_ngrams(df['cleaned_review_text'], df['sentiment'], 'negative', ngram_range=(2, 2))

# Keep the ten most frequent bigrams and list them
top_ngrams = ngrams[:10]
for ngram, freq in top_ngrams:
    print(f'{ngram}: {freq}')

# Bar chart of those ten bigrams: one bar per bigram, bar height = its count
labels, counts = zip(*top_ngrams)
plt.figure(figsize=(10, 6))
plt.bar(labels, counts)
plt.title('Top 10 Most Frequent Bigrams in Negative (<=3 stars) Reviews')
plt.xlabel('Bigram')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

#### Let's try out some Latent Dirichlet Allocation (LDA) and Non-Negative Matrix Factorization (NMF) to see if we can find any latent topics

In [None]:
# LDA

from gensim import corpora, models
# Filter for negative reviews
negative_reviews = df[df['sentiment'] == 'negative']

# Tokenize the cleaned review text
tokenized_reviews = negative_reviews['cleaned_review_text'].apply(lambda x: x.split())

# Create a dictionary mapping words to unique ids
dictionary = corpora.Dictionary(tokenized_reviews)

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

# Train the LDA model
num_topics = 5 # Specify the number of topics
# LDA training is randomized; fixing the seed keeps the discovered topics identical between
# runs (the NMF model further down is seeded with the same value).
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

# Print the topics
for topic in lda_model.print_topics():
    print(topic)

Now let's try to visualize.

In [None]:
# Install wordcloud
import sys
print(sys.executable)

In [None]:
!C:\Users\rwynn\anaconda3\python.exe -m pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Draw one word cloud per LDA topic from that topic's (word, weight) pairs
for topic_id, word_weights in lda_model.show_topics(num_topics=num_topics, formatted=False):
    cloud = WordCloud(background_color='white').generate_from_frequencies(dict(word_weights))
    plt.figure(figsize=(8, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title('Topic ' + str(topic_id))
    plt.axis('off')
    plt.show()

In [None]:
# NMF

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Number of topics to factorize into
num_topics = 5

# TF-IDF features of the cleaned negative reviews
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(negative_reviews['cleaned_review_text'])

# Fit the NMF model (seeded)
nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf_matrix)

# For every topic, list the ten terms with the largest component values, largest first
feature_names = vectorizer.get_feature_names_out()
for topic_index, topic in enumerate(nmf_model.components_):
    strongest = topic.argsort()[::-1][:10]
    print(f"Topic {topic_index}:")
    print(" ".join(feature_names[i] for i in strongest))

Let's visualize the NMF output.

In [None]:
# Collect the ten highest-weighted words of every NMF topic together with their weights
top_words = []
top_weights = []
for topic_idx, topic in enumerate(nmf_model.components_):
    top_indices = topic.argsort()[:-11:-1]  # indices of the largest weights, largest first
    top_words.append([feature_names[i] for i in top_indices])
    top_weights.append(topic[top_indices])

# Plot the top words for each topic.
# squeeze=False always returns a 2-D array of axes, so a single topic can be plotted as well.
fig, axs = plt.subplots(nrows=num_topics, figsize=(10, 8), squeeze=False)
axs = axs[:, 0]
for i, ax in enumerate(axs):
    # Sized from the data rather than a fixed 10, in case the vocabulary has fewer terms
    positions = list(range(len(top_words[i])))
    # Bar length is the word's weight in the topic; the words themselves are only tick labels
    ax.barh(positions, top_weights[i], color='skyblue')
    ax.set_title(f'Topic {i}')
    ax.set_xlabel('Weight')
    ax.set_yticks(positions)
    ax.set_yticklabels(top_words[i])
    ax.invert_yaxis()  # highest-weighted word at the top
plt.tight_layout()
plt.show()

#### Let's try some keyword extraction and see what we can see.

In [None]:
import re

# Define keywords for aspect extraction
aspect_keywords = {'food': ['food', 'meal', 'dish', 'pizza', 'pizzas', 'wings', 'bread', 'catch a fire', 'catch-a-fire', 'caf', 'appetizer', 'appetizers', 'pie'],
                   'service': ['service', 'waiter', 'waitress', 'staff', 'bar staff', 'bartender', 'bartenders'],
                   'parking': ['parking', 'parking lot', 'park'],
                   'beer': ['beer', 'beers', 'ipa', 'ipas', 'lager', 'stout', 'stouts', 'wine', 'brew', 'brews', 'drink', 'drinks'],
                   'cocktails': ['cocktail', 'cocktails', 'sway', 'mocktail', 'mocktails', 'mixed drinks'],
                   'dogs': ['dog', 'dogs', 'puppy'],
                   'kids': ['kid', 'kids', 'child', 'children'],
                   'price': ['price', 'prices', 'value', 'cost'],
                   'atmosphere': ['atmosphere', 'place', 'vibe', 'space', 'venue', 'crowd', 'ambiance', 'spot', 'brewery'],}

# One compiled, case-insensitive pattern per aspect that matches any of its keywords as a
# whole word/phrase. Keywords are escaped so they are always treated as literal text, and
# the patterns are built once here instead of on every call.
_aspect_patterns = {
    aspect: re.compile(r'\b(?:{})\b'.format('|'.join(re.escape(keyword) for keyword in keywords)),
                       flags=re.IGNORECASE)
    for aspect, keywords in aspect_keywords.items()
    if keywords  # an aspect without keywords can never match
}

# Function to extract aspects from review text
def extract_aspects(review_text):
    """Return the aspects mentioned in a review.

    An aspect is reported when at least one of its keywords from
    `aspect_keywords` occurs in `review_text` as a whole word/phrase
    (case-insensitive). Each aspect appears at most once, in the order in
    which the aspects are defined in `aspect_keywords`.

    Parameters
    ----------
    review_text : str
        Raw review text. Non-string values (e.g. None or NaN) yield no aspects.

    Returns
    -------
    list of str
        Names of the matched aspects.
    """
    if not isinstance(review_text, str):
        return []
    return [aspect for aspect, pattern in _aspect_patterns.items() if pattern.search(review_text)]

In [None]:
# Apply aspect extraction function to review text column
df['aspects'] = df['review_text'].apply(extract_aspects)
df

Now we have extracted the various aspects of each review. Let's try to analyze the sentiments associated with each of them.

In [None]:
# Sample DataFrame with review text and extracted aspects
test_df = pd.DataFrame({'review_text': ["The food was excellent but the service was slow.",
                                   "Great atmosphere but parking was a nightmare."],
                   'aspects': [['food', 'service'], ['atmosphere', 'parking']]})


# Function to perform sentiment analysis at aspect level
def analyze_sentiment_aspects(review_text, aspects):
    """Count lexicon hits in a review, separately for each requested aspect.

    Every whitespace-separated word of `review_text` is compared (lowercased,
    with surrounding punctuation removed) against the positive and negative
    word lists of each aspect. Note that the whole review is scored for every
    aspect; words are not attributed to the aspect they stand next to.

    Parameters
    ----------
    review_text : str
        The review to score.
    aspects : iterable of str
        Aspect names to score. Aspects without a lexicon get only neutral counts.

    Returns
    -------
    dict
        Maps each aspect to {'positive': int, 'negative': int, 'neutral': int}.
    """
    import string

    # Placeholder sentiment lexicons (replace with your actual lexicons)
    aspect_sentiment_lexicons = {
        'food': {'positive': ['excellent', 'tasty'], 'negative': ['slow', 'bland']},
        'service': {'positive': ['excellent', 'friendly'], 'negative': ['slow', 'poor']},
        'atmosphere': {'positive': ['great', 'pleasant'], 'negative': ['noisy', 'crowded']},
        'parking': {'positive': ['convenient', 'ample'], 'negative': ['nightmare', 'limited']}
    }

    # Strip surrounding punctuation so that e.g. "slow." still matches the lexicon entry "slow"
    words = [word.strip(string.punctuation).lower() for word in review_text.split()]

    aspect_sentiments = {}
    for aspect in aspects:
        lexicon = aspect_sentiment_lexicons.get(aspect, {})
        positive_words = lexicon.get('positive', [])
        negative_words = lexicon.get('negative', [])

        sentiment_scores = {'positive': 0, 'negative': 0, 'neutral': 0}
        for word in words:
            if word in positive_words:
                sentiment_scores['positive'] += 1
            elif word in negative_words:
                sentiment_scores['negative'] += 1
            else:
                sentiment_scores['neutral'] += 1
        aspect_sentiments[aspect] = sentiment_scores

    return aspect_sentiments

# Apply sentiment analysis function to each row in the DataFrame
test_df['aspect_sentiments'] = test_df.apply(lambda row: analyze_sentiment_aspects(row['review_text'], row['aspects']), axis=1)
test_df

I think we're going to need something a little more in-depth than this simple rules-based analysis. Let's try a BERT model.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the pre-trained tokenizer that belongs to the sentiment checkpoint
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Load pre-trained AUTO model for sequence classification (AutoModel will look at the checkpoint's configuration and choose the appropriate base model architecture to use)
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment', num_labels=5, ignore_mismatched_sizes=True)  # 5 labels: one per star rating (1 = worst ... 5 = best)

# Encode some review text into a tensor
tokens = tokenizer.encode("this place is the worst. terrible", return_tensors="pt")
print(tokens)

# Decode the tensor back into review text
decoded = tokenizer.decode(tokens[0])
print(decoded)

In [None]:
# Pass tokens to model to return sentiment
result = model(tokens)
result

In [None]:
result.logits

In [None]:
torch.argmax(result.logits) + 1  # 1 is the worst, 5 is the best (as in star reviews)

Let's continue with the sentiment analysis. We can see that when the statement is positive:

In [None]:
tokens = tokenizer.encode("i love this product so much", return_tensors="pt")
result = model(tokens)
torch.argmax(result.logits) + 1

The result is a '5', as in '5 stars'. But when the statement is negative:

In [None]:
tokens = tokenizer.encode("the food is the worst", return_tensors="pt")
result = model(tokens)
torch.argmax(result.logits) + 1

It results in just 1 star. And with a neutral statement:

In [None]:
tokens = tokenizer.encode("this place is just ok", return_tensors="pt")
result = model(tokens)
torch.argmax(result.logits) + 1

It lands right in the middle with 3 stars. So how can we go about applying this to individual aspect statements? Meaning, not an entire review, but specifically to individual phrases that reference specific aspects of the business. For example, if the review text were: "The food was good but the service was awful", we would want to first extract the aspects 'food' and 'service', and then also apply a sentiment to each aspect. In this case, we'd want 'food' to get a 'positive' sentiment and 'service' to get a negative sentiment. How can we achieve this?

## Aspect Term Extraction and Sentiment Analysis via [PyABSA](https://github.com/yangheng95/PyABSA)

In [None]:
pip install pyabsa

In [None]:
from pyabsa import available_checkpoints
from pyabsa import TaskCodeOption
# Check available checkpoints for a given task code (https://github.com/yangheng95/PyABSA/blob/v2/pyabsa/framework/checkpoint_class/checkpoint_utils.py)
# for current version
checkpoint_map = available_checkpoints(task_code=TaskCodeOption.Aspect_Term_Extraction_and_Classification, show_ckpts=True)
checkpoint_map

### Extract aspect terms and classify sentiments

In [72]:
from pyabsa import ATEPCCheckpointManager

# Init aspect extractor from online checkpoint or local checkpoint.
# The inference cell below calls `aspect_extractor.extract_aspect(...)`, so the object has to
# be bound to exactly that name; otherwise a fresh kernel raises a NameError there.
aspect_extractor = ATEPCCheckpointManager.get_aspect_extractor(checkpoint='english',
                                                               auto_device=False  # False means load model on CPU
                                                              )

[2024-03-22 21:03:21] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-03-22 21:03:21] (2.4.1.post1) ********** [32mAvailable ATEPC model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-03-22 21:03:21] (2.4.1.post1) [32mDownloading checkpoint:english [0m
[2024-03-22 21:03:21] (2.4.1.post1) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m
[2024-03-22 21:03:21] (2.4.1.post1) Checkpoint already downloaded, skip
[2024-03-22 21:03:21] (2.4.1.post1) Load aspect extractor from checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
[2024-03-22 21:03:21] (2.4.1.post1) config: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43\fast_lcf_atepc.config
[2024-03-22 21:03:21] (2.4.1.post1) state_dict: checkpoints\ATEPC_ENGLISH_CHECK

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have b

<pyabsa.tasks.AspectTermExtraction.prediction.aspect_extractor.AspectExtractor at 0x253c902abc0>

In [73]:
# Aspect term extraction & sentiment inference
# NOTE(review): the recorded run of this cell ended in a RuntimeError because the spaCy model
# `en_core_web_sm` could not be downloaded. Installing it beforehand with
# `python -m spacy download en_core_web_sm` (the command suggested by the log) is presumably
# required - TODO confirm.
examples = ['the food is good but the service is terrible']
inference_source = examples
atepc_result = aspect_extractor.extract_aspect(inference_source=inference_source,
                                               pred_sentiment=True # Predict the sentiment of the extracted aspect terms
                                              )

[2024-03-22 21:08:32] (2.4.1.post1) Can not load en_core_web_sm from spacy, try to download it in order to parse syntax tree: [32m
python -m spacy download en_core_web_sm[0m


RuntimeError: Download failed, you can download en_core_web_sm manually.