In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Download VADER lexicon (only required once)
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\Fabio_UofT
[nltk_data]     SCS\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
# Read the dataset
restaurants_df = pd.read_csv("Restaurants_Yelp_Reviews.csv")

In [5]:
# Print DF
restaurants_df.head()

Unnamed: 0,Restaurant_name,Category,Address,Province,Latitude,Longitude,Rating,Review
0,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,4,Busy place - strongly suggest a reservation. G...
1,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,4,I recommend Squid is the best part. I ordered ...
2,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,5,I really enjoyed my lunch at this cozy little ...
3,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,5,Ambiance is great with a great mixture of wood...
4,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,5,We went to the Flying Pig for dinner last nigh...


In [6]:
# Text Preprocessing
restaurants_df['Review'] = restaurants_df['Review'].astype(str)
restaurants_df['Review'] = restaurants_df['Review'].apply(lambda review: review.lower())
restaurants_df['Review'] = restaurants_df['Review'].str.replace('[^\w\s]', '')


  restaurants_df['Review'] = restaurants_df['Review'].str.replace('[^\w\s]', '')


In [7]:
# Perform Sentiment Analysis using VADER
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_polarity(review):
    return analyzer.polarity_scores(review)['compound']

restaurants_df['Sentiment_score'] = restaurants_df['Review'].apply(get_sentiment_polarity)


In [8]:
# Categorize reviews as positive or negative based on sentiment score
threshold = 0.5  # You can adjust this threshold based on your preference
restaurants_df['Sentiment'] = restaurants_df['Sentiment_score'].apply(lambda score: 'Good' if score >= threshold else 'Bad')


In [9]:
# Print DF
restaurants_df.head()

Unnamed: 0,Restaurant_name,Category,Address,Province,Latitude,Longitude,Rating,Review,Sentiment_score,Sentiment
0,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,4,busy place strongly suggest a reservation gre...,0.9485,Good
1,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,4,i recommend squid is the best part i ordered g...,0.8979,Good
2,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,5,i really enjoyed my lunch at this cozy little ...,0.9966,Good
3,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,5,ambiance is great with a great mixture of wood...,0.9892,Good
4,The Flying Pig,Canadian (New),"1168 Hamilton Street Unit 104 Vancouver, BC V6...",BC,49.275039,-123.122038,5,we went to the flying pig for dinner last nigh...,0.9845,Good


In [10]:
# Download the necessary resources for NLTK (run this once)
nltk.download('vader_lexicon')

# Initialize the SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Custom sentiment lexicon with positive and negative words and their scores
custom_lexicon = {
    'good': 0.5, 'great': 0.6, 'excellent': 0.7, 'awesome': 0.8, 'fantastic': 0.9,
    'bad': -0.5, 'poor': -0.6, 'terrible': -0.7, 'awful': -0.8, 'horrible': -0.9,
    'busy': 0.1,  # Custom sentiment score for 'busy'
    'delicious': 0.8,
    'yummy': 0.7,
    'tasty': 0.6,
    'delectable': 0.7,
    'flavorful': 0.6,
    'mouthwatering': 0.8,
    'savory': 0.6,
    'scrumptious': 0.7,
    'heavenly': 0.8,
    'exquisite': 0.7,
    'disappointing': -0.6,
    'tasteless': -0.5,
    'bland': -0.4,
    'overcooked': -0.6,
    'undercooked': -0.6,
    'greasy': -0.5,
    'salty': -0.4,
    'soggy': -0.5,
    'stale': -0.4,
    'unappetizing': -0.6,
    'joyful': 0.7,
    'happy': 0.6,
    'delighted': 0.7,
    'pleased': 0.6,
    'content': 0.5,
    'sad': -0.6,
    'unhappy': -0.5,
    'miserable': -0.7,
    'disheartened': -0.6,
    'gloomy': -0.5,
    'angry': -0.6,
    'frustrated': -0.5,
    'irritated': -0.5,
    'enraged': -0.7,
    'excited': 0.7,
    'thrilled': 0.8,
    'ecstatic': 0.9,
    'nervous': -0.5,
    'anxious': -0.6,
    'afraid': -0.6,
    'terrified': -0.8,
    'calm': 0.5,
    'relaxed': 0.6,
    'peaceful': 0.7
}

def extract_positive_negative_words(review):
    review = str(review)  # Ensure review is in string format
    words = review.split()
    positive_words = [word for word in words if custom_lexicon.get(word.lower(), 0) > 0]
    negative_words = [word for word in words if custom_lexicon.get(word.lower(), 0) < 0]
    return positive_words, negative_words, len(positive_words), len(negative_words)

# Assuming you have your DataFrame loaded and named 'restaurants_df'
restaurants_df['Positive Words'], restaurants_df['Negative Words'], restaurants_df['Positive Count'], restaurants_df['Negative Count'] = zip(*restaurants_df['Review'].apply(extract_positive_negative_words))

# Display the new DataFrame
print(restaurants_df)

[nltk_data] Downloading package vader_lexicon to C:\Users\Fabio_UofT
[nltk_data]     SCS\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


     Restaurant_name        Category  \
0     The Flying Pig  Canadian (New)   
1     The Flying Pig  Canadian (New)   
2     The Flying Pig  Canadian (New)   
3     The Flying Pig  Canadian (New)   
4     The Flying Pig  Canadian (New)   
...              ...             ...   
5503          Subway      Sandwiches   
5504          Subway      Sandwiches   
5505          Subway      Sandwiches   
5506          Subway      Sandwiches   
5507          Subway      Sandwiches   

                                                Address Province   Latitude  \
0     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
1     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
2     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
3     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
4     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
...                                          

In [11]:
restaurants_df.sample(10)

Unnamed: 0,Restaurant_name,Category,Address,Province,Latitude,Longitude,Rating,Review,Sentiment_score,Sentiment,Positive Words,Negative Words,Positive Count,Negative Count
482,Cafe FortyOne,Mediterranean,"5750 Oak Street Vancouver, BC V6M 2V9 Canada",BC,49.233718,-123.127746,5,i was looking for a place to eat falafel and m...,0.9299,Good,[tasty],[],1,0
121,So Hyang Korean Cuisine,Korean,"6345 Fraser Street Vancouver, BC V5W 3A3 Canada",BC,49.227,-123.090927,4,lots of tables were available at 1130 for lunc...,0.9559,Good,"[tasty, good]",[],2,0
3161,Fourchette Antillaise,Cajun/Creole,"5968 Rue de Salaberry Montreal, QC H4J 1J8 Canada",QC,45.526716,-73.713305,5,1010 ive been going to fourchette antillaise e...,0.9824,Good,"[great, great]",[],2,0
3941,Saltwater Restaurant,Seafood,"320 Water Street St. John's, NL A1C 1C1 Canada",NL,47.561479,-52.710602,5,we had a great experience food was great and w...,0.9451,Good,"[great, great]",[],2,0
2563,The Old Triangle Irish Alehouse,Pubs,"5136 Prince Street Halifax, NS B3J 1L4 Canada",NS,44.647489,-63.572405,4,great bar that reminds me of a local pub back ...,0.9806,Good,"[great, good]",[],2,0
3310,MR PATTY,Caribbean,"5312 Patricia Avenue Montreal, QC H4V 1Z2 Canada",QC,45.46021,-73.654373,5,best home made jamaican pattys in westmounttea...,0.7579,Good,[],[],0,0
5234,Kazoku,Sushi Bars,"30-2583 Pembina hwy Winnipeg, MB R3T Canada",MB,49.798829,-97.157773,4,fabulous hangover cure as you can see i comple...,0.9167,Good,"[great, great]",[],2,0
3707,Browns Socialhouse Eastgate,Canadian (New),"3610 Eastgate Drive Regina, SK S4Z 1A5 Canada",SK,50.451668,-104.531638,4,decent place to eat dinner food quality was go...,0.4678,Bad,[good],[],1,0
1465,Silver Streams Restaurant,Chinese,"7 St. Peters Road Charlottetown, PE C1A 5N3 Ca...",PEI,46.246684,-63.123657,2,the staff was kind but almost to extremes comi...,0.8765,Good,[],[],0,0
568,Sawasdee Thai Restaurant,Thai,"4250 Main Street Vancouver, BC V5V 3P9 Canada",BC,49.247056,-123.100828,5,all of the dishes are delicious and i highly r...,0.7574,Good,[delicious],[],1,0


In [12]:
# Function to get the length of a review
def get_review_length(review):
    return len(review)

# Apply the function to the 'Review' column to get the review lengths
restaurants_df['Review Length'] = restaurants_df['Review'].apply(get_review_length)

# Display the DataFrame with the review lengths
print(restaurants_df[['Review', 'Review Length']])

                                                 Review  Review Length
0     busy place  strongly suggest a reservation gre...            157
1     i recommend squid is the best part i ordered g...            179
2     i really enjoyed my lunch at this cozy little ...           1025
3     ambiance is great with a great mixture of wood...            626
4     we went to the flying pig for dinner last nigh...            501
...                                                 ...            ...
5503  worst sandwich ever given a 6 veggie with fing...            441
5504  subway is another must stop for my family when...            247
5505  the service here has been on the decline for s...            414
5506  this place has gone downhill and staff dont ca...            506
5507  ive only been here twice both times ordered th...            300

[5508 rows x 2 columns]


In [15]:
# !pip install summa

import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from summa.summarizer import summarize as summa_summarize

# Download the necessary resources for NLTK (run this once)
nltk.download('punkt')

# Function to extract the summary for each review
def extract_summary(review):
    # Tokenize the review into sentences
    sentences = sent_tokenize(review)
    
    # Combine sentences into a single string
    text = " ".join(sentences)
    
    # Save the original review length
    original_length = len(text)
    
    # Use summa's TextRank algorithm for extractive summarization
    summary = summa_summarize(text, ratio=0.5)
    
    # Save the summary length
    summary_length = len(summary) if summary else 0
    
    return summary, original_length, summary_length

# Apply the summarization function to the 'Review' column
restaurants_df['Review Summary'], restaurants_df['Original Review Length'], restaurants_df['Summary Length'] = zip(*restaurants_df['Review'].apply(extract_summary))

# Display the DataFrame with the summary for each review and the review lengths
print(restaurants_df)

[nltk_data] Downloading package punkt to C:\Users\Fabio_UofT
[nltk_data]     SCS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


     Restaurant_name        Category  \
0     The Flying Pig  Canadian (New)   
1     The Flying Pig  Canadian (New)   
2     The Flying Pig  Canadian (New)   
3     The Flying Pig  Canadian (New)   
4     The Flying Pig  Canadian (New)   
...              ...             ...   
5503          Subway      Sandwiches   
5504          Subway      Sandwiches   
5505          Subway      Sandwiches   
5506          Subway      Sandwiches   
5507          Subway      Sandwiches   

                                                Address Province   Latitude  \
0     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
1     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
2     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
3     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
4     1168 Hamilton Street Unit 104 Vancouver, BC V6...       BC  49.275039   
...                                          

In [16]:
restaurants_df.sample(10)

Unnamed: 0,Restaurant_name,Category,Address,Province,Latitude,Longitude,Rating,Review,Sentiment_score,Sentiment,Positive Words,Negative Words,Positive Count,Negative Count,Review Length,Review Summary,Original Review Length,Summary Length
3889,The Green Spot Cafe,Vegetarian,"1812 Hamilton Street Regina, SK S4P 2B8 Canada",SK,50.44999,-104.609645,4,the coffee is good and it has a great view i w...,0.7906,Good,"[good, great]",[],2,0,132,,132,0
854,Sookjai Thai Restaurant,Thai,"893 Fort Street Victoria, BC V8W 1H7 Canada",BC,48.423961,-123.360217,4,wow amazing quality of food and a really exoti...,0.8688,Good,[good],[],1,0,672,nonetheless a really memorable restaurant expe...,672,130
395,Micha,Chinese,"120-4751 Garden City Road Richmond, BC V6X 3M7...",BC,49.177868,-123.125541,5,if youre looking for the best sweet and sour d...,0.9857,Good,"[delicious, delicious]",[],2,0,648,if youre looking for the best sweet and sour d...,648,484
1154,Victoria Harbour House Restaurant,Steakhouses,"607 Oswego St Victoria, BC V8V 4W9 Canada",BC,48.421132,-123.37443,5,what a fabulous meal and experience the best ...,0.9852,Good,[yummy],[],1,0,325,,325,0
2776,Happy Veal Hot Pot,Chinese,"1333 S Park Street Halifax, NS B3J 2K9 Canada",NS,44.64057,-63.578214,5,great place to get hot pot in halifax although...,0.7974,Good,[great],[],1,0,356,,356,0
1307,Superior Pizza,Pizza,"455 Simcoe Street Victoria, BC V8V 4T3 Canada",BC,48.415536,-123.374854,3,some days its very good other days the amount ...,0.9381,Good,[good],[],1,0,188,,188,0
239,Le Crocodile Restaurant,French,"909 Burrard Street Vancouver, BC V6Z 2N2 Canada",BC,49.282539,-123.125205,5,fantastic place with topnotch service since th...,0.7783,Good,"[fantastic, great]",[],2,0,544,,544,0
3234,Brazas Portugal,Portuguese,"169A Boulevard Sainte-Rose Laval, QC H7L 1L2 C...",QC,45.615823,-73.784145,3,very small restaurant they give complimentary ...,0.8937,Good,"[good, good]",[salty],2,1,637,very small restaurant they give complimentary ...,637,77
3774,Mercury Cafe,Diners,"2936 13th Ave Regina, SK S4T 1N7 Canada",SK,50.445688,-104.623709,1,i wanted to like it so bad its an iconic locat...,-0.798,Bad,[excited],[bad],1,1,346,,346,0
2725,Happy Veal Hot Pot,Chinese,"1333 S Park Street Halifax, NS B3J 2K9 Canada",NS,44.64057,-63.578214,5,delightful experience always wanted to try hot...,0.8642,Good,"[good, delicious]",[],2,0,141,,141,0


In [17]:
# Save the updated DataFrame to a new CSV file
restaurants_df.to_csv("restaurants_df.csv", index=False)