# Sentiment Analysis

### Import dependencies

In [156]:
import csv
import re
from math import ceil

import numpy as np
import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### Load model

In [157]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so token ids and model vocabulary agree.
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
# The test cells below show this model emitting five logits per input; the
# index of the largest one plus 1 is used as the predicted score.
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

### Test model

In [158]:
tokens = tokenizer.encode('Amazing. It was a once in a lifetime experience. Truly a memory to cherish.', return_tensors='pt')

In [159]:
tokens

tensor([[  101, 39854,   119, 10197, 10140,   143, 12983, 10104,   143, 36597,
         16277,   119, 69434,   143, 19350, 10114, 31528, 16551,   119,   102]])

In [160]:
tokenizer.decode(tokens[0])

'[CLS] amazing. it was a once in a lifetime experience. truly a memory to cherish. [SEP]'

In [161]:
result = model(tokens)

In [162]:
result

SequenceClassifierOutput(loss=None, logits=tensor([[-2.5367, -2.8164, -1.4307,  1.4542,  4.2075]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [163]:
result.logits

tensor([[-2.5367, -2.8164, -1.4307,  1.4542,  4.2075]],
       grad_fn=<AddmmBackward0>)

In [164]:
torch.argmax(result.logits)

tensor(4)

In [165]:
int(torch.argmax(result.logits))+1

5

### Check if `requests` works with Tripadvisor

In [166]:
url = 'https://www.tripadvisor.in//Restaurant_Review-g297635-d1205738-Reviews-Hotel_Paragon_Restaurant-Kozhikode_Kozhikode_District_Kerala.html'

In [167]:
# response = requests.get(url)
# response.status_code

In [168]:
# # Output :
# ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

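A plain `requests.get(url)` had its connection reset by Tripadvisor (see the error above). We can set custom, browser-like headers instead; first we check on httpbin.org which headers `requests` sends by default and that our custom headers are actually transmitted.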

In [169]:
# testing requests
url = 'https://httpbin.org/headers'

In [170]:
response = requests.get(url)
response.status_code

200

In [171]:
print(response.text)

{
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.31.0", 
    "X-Amzn-Trace-Id": "Root=1-64c9eaf3-5d4c46273d5bfe8a011662de"
  }
}



In [172]:
# Browser-like request headers passed to requests.get(..., headers=headers)
# in the cells below, replacing the default "python-requests/..." User-Agent.
headers = {
    # Alternative (Safari) User-Agent string, kept for reference but disabled:
    # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5.2 Safari/605.1.15',
    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Accept-Language' : 'en-US,en;q=0.9',
    'Referer' : 'https://google.com',
    'DNT' : '1'
}

In [173]:
response = requests.get(url, headers=headers)
response.status_code

200

In [174]:
print(response.text)

{
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Accept-Language": "en-US,en;q=0.9", 
    "Dnt": "1", 
    "Host": "httpbin.org", 
    "Referer": "https://google.com", 
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-64c9eaf7-3f010a8c01edbafa724224ba"
  }
}



In [175]:
url = 'https://www.tripadvisor.in//Restaurant_Review-g297635-d1205738-Reviews-Hotel_Paragon_Restaurant-Kozhikode_Kozhikode_District_Kerala.html'

In [176]:
response = requests.get(url, headers=headers)
response.status_code

200

In [177]:
print(response.text)

<!DOCTYPE html><html lang="en-IN" xmlns:og="http://opengraphprotocol.org/schema/"><head><meta http-equiv="content-type" content="text/html; charset=utf-8"/><link rel="icon" id="favicon" href="https://static.tacdn.com/favicon.ico?v2" type="image/x-icon" /><link rel="mask-icon" sizes="any" href="https://static.tacdn.com/img2/brand_refresh/application_icons/mask-icon.svg" color="#000000" /><meta name="theme-color" content="#34e0a1" /><meta name="format-detection" content="telephone=no" /><script type="text/javascript">window.taRollupsAreAsync = true;</script><link rel="stylesheet" href="https://static.tacdn.com/css2/webfonts/TripSans/TripSans.css?v1.002" crossorigin><link rel="preload" as="fetch" href="/static/decodeKey.txt" crossorigin="anonymous" /><title>HOTEL PARAGON RESTAURANT, Kozhikode - Menu, Prices &amp; Restaurant Reviews - Tripadvisor</title><meta property="al:ios:app_name" content="TripAdvisor"><meta property="al:ios:app_store_id" content="284876795"><meta property="twitter:ap

### Collect reviews

In [178]:
soup = BeautifulSoup(response.text, 'html.parser')

In [179]:
regex = re.compile('.*partial_entry.*')
results = soup.find_all('p', {'class':regex})

In [180]:
results[0]

<p class="partial_entry">One of the oldest restaurants in town, I was instantly transported back in time with its charming old interiors.

The service was simply outstanding, as the staff was attentive and courteous in spite of the heavy rush, there were at least a 50 people waiting...<span class="postSnippet">outside to get in. And everyone got a smile from the staffs.

Now, let's talk about the food. The food was beyond amazing! Prepared with authentic flavors, the biriyani was so good that my kids just loved it.
The most delightful surprises, they had Vegetable biriyani and i didnt leave a tiny grain of rice behind. Every bite was a pure delight.

What impressed me the most was the unbelievable affordability. Despite its longstanding reputation and excellent service, the prices were incredibly cheap.

It's a place where time slows down, and you can savor both history and culinary artistry.

I can't recommend it highly enough, and I'm already looking forward to my next visit!</span><

In [181]:
print(results[0].text)

One of the oldest restaurants in town, I was instantly transported back in time with its charming old interiors.

The service was simply outstanding, as the staff was attentive and courteous in spite of the heavy rush, there were at least a 50 people waiting...outside to get in. And everyone got a smile from the staffs.

Now, let's talk about the food. The food was beyond amazing! Prepared with authentic flavors, the biriyani was so good that my kids just loved it.
The most delightful surprises, they had Vegetable biriyani and i didnt leave a tiny grain of rice behind. Every bite was a pure delight.

What impressed me the most was the unbelievable affordability. Despite its longstanding reputation and excellent service, the prices were incredibly cheap.

It's a place where time slows down, and you can savor both history and culinary artistry.

I can't recommend it highly enough, and I'm already looking forward to my next visit!More


In [182]:
len(results)

15

In [183]:
# Match every <span> whose class contains "ratingDate".
# BeautifulSoup filters with pattern.search(), so a plain substring is enough.
# The previous pattern '.*ratingDate*' applied '*' to the final 'e' and thus
# also accepted the misspelling "ratingDat".
regex_date = re.compile('ratingDate')
results_date = soup.find_all('span', {'class': regex_date})

In [184]:
results_date[0].attrs['title']

'31 July 2023'

In [185]:
results_date[0]['title']

'31 July 2023'

In [186]:
results_date[0].get('title')

'31 July 2023'

In [187]:
# Match every <span> whose class contains "noQuotes" (the review titles).
# BeautifulSoup filters with pattern.search(), so a plain substring is enough.
# The previous pattern '.*noQuotes*' applied '*' to the final 's' and thus
# also accepted "noQuote".
regex_title = re.compile('noQuotes')
results_title = soup.find_all('span', {'class': regex_title})

In [188]:
results_title[0].text

'Timeless Charm, Excellent Service, Amazing Food, Unbeatable Prices!'

In [189]:

results_rating = soup.find_all('span', {'class':'ui_bubble_rating'})

In [190]:
results_rating[0]['class'][1].split('_')[-1]

'45'

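The approach above is wrong: the first `ui_bubble_rating` element on the page yields `45` (i.e. 4.5), which does not match the first review's own rating of 4 — it appears to be the restaurant's overall score rather than a reviewer's rating.

The approach below is right: for every review date, take the closest `ui_bubble_rating` element that precedes it, which is the rating given in that review.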

In [191]:
# Match every <span> whose class contains "ratingDate". BeautifulSoup filters
# with pattern.search(), so no leading '.*' is needed; the old trailing 'e*'
# also accepted the misspelling "ratingDat".
regex_date = re.compile('ratingDate')
results_date = soup.find_all('span', {'class': regex_date})

# Loop through the results and extract the review dates and corresponding ratings
for date in results_date:
    # Extract the review date (stored in the span's "title" attribute)
    review_date = date.get('title')
    print("Review Date:", review_date)

    # Find the rating element closest before the review date
    overall_rating = date.find_previous('span', {'class': 'ui_bubble_rating'})
    if overall_rating is None:
        # No rating element precedes this date; report it instead of crashing.
        print("Overall Rating:", None)
        continue

    # The second class token ends in the rating times ten (e.g. "..._45" -> 4.5).
    rating_value = float(overall_rating['class'][1].split('_')[-1]) / 10.0
    print("Overall Rating:", rating_value)

Review Date: 31 July 2023
Overall Rating: 4.0
Review Date: 31 May 2023
Overall Rating: 5.0
Review Date: 15 May 2023
Overall Rating: 5.0
Review Date: 26 April 2023
Overall Rating: 4.0
Review Date: 1 April 2023
Overall Rating: 2.0
Review Date: 8 March 2023
Overall Rating: 5.0
Review Date: 22 February 2023
Overall Rating: 5.0
Review Date: 19 January 2023
Overall Rating: 5.0
Review Date: 16 December 2022
Overall Rating: 5.0
Review Date: 8 October 2022
Overall Rating: 5.0
Review Date: 4 August 2022
Overall Rating: 1.0
Review Date: 17 March 2022
Overall Rating: 5.0
Review Date: 9 March 2022
Overall Rating: 5.0
Review Date: 26 November 2021
Overall Rating: 3.0
Review Date: 16 June 2021
Overall Rating: 1.0


### Collect data

In [232]:
def collect_review(soup):
    """Extract dates, user ratings, titles and texts of the reviews on one page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a review page.

    Returns
    -------
    dict
        Four equally long lists under the keys ``'review_date'``,
        ``'user_rating'``, ``'review_title'`` and ``'review'``. A rating is an
        ``int`` (rating value times ten, integer-divided by ten), or ``None``
        when no parsable rating element precedes the review date.
    """
    # BeautifulSoup filters with pattern.search(), so plain substrings suffice.
    # The former patterns '.*ratingDate*' / '.*noQuotes*' applied '*' to the
    # last letter only and therefore also matched "ratingDat" / "noQuote".
    results_date = soup.find_all('span', {'class': re.compile('ratingDate')})
    results_title = soup.find_all('span', {'class': re.compile('noQuotes')})
    results_comment = soup.find_all('p', {'class': re.compile('partial_entry')})

    reviews = {'review_date': [],
               'user_rating': [],
               'review_title': [],
               'review': []}

    # The n-th date, title and comment are assumed to belong to the same
    # review; zip() stops at the shortest of the three result lists.
    for date, title, review in zip(results_date, results_title, results_comment):
        # The review date is stored in the span's "title" attribute.
        reviews['review_date'].append(date.get('title'))

        # The user's rating is the closest rating element before the date.
        bubble = date.find_previous('span', {'class': 'ui_bubble_rating'})
        reviews['user_rating'].append(_parse_bubble_rating(bubble))

        reviews['review_title'].append(title.text)

        # NOTE(review): .text also includes the text of nested elements, so the
        # label of an expand link (a trailing "More") ends up in the review.
        reviews['review'].append(review.text)

    return reviews


def _parse_bubble_rating(bubble):
    """Turn a rating tag whose second class token ends in e.g. ``_45`` into ``4``.

    Returns ``None`` when the tag is missing or its class list has an
    unexpected shape.
    """
    if bubble is None:
        return None
    try:
        return int(bubble['class'][1].split('_')[-1]) // 10
    except (KeyError, IndexError, TypeError, ValueError):
        return None

In [233]:
def get_soup(url, timeout=10):
    """Download ``url`` with browser-like headers and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely. Defaults to 10 seconds.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or ``None`` when the request fails (connection error,
        timeout, ...) or the server answers with a status other than 200.
    """
    headers = {
        'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Accept-Language' : 'en-US,en;q=0.9',
        'Referer' : 'https://google.com',
        'DNT' : '1'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network-level failures (e.g. "Connection reset by peer") are reported
        # through the same None value as a non-200 status.
        return None

    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')

In [234]:
def combine_dicts(dict1, dict2):
    """Join two dicts key by key, using ``+`` on the paired values.

    Only the keys of ``dict1`` are visited: every one of them must also be
    present in ``dict2`` (otherwise ``KeyError`` is raised), and keys that
    exist only in ``dict2`` do not appear in the result. Neither input is
    modified.
    """
    return {key: dict1[key] + dict2[key] for key in dict1}

In [265]:
def get_reviews(base_url, max_reviews=15):
    """Collect up to ``max_reviews`` reviews by walking the paginated review pages.

    Parameters
    ----------
    base_url : str
        URL of any review page of the restaurant, of the form
        ``<site>/Restaurant_Review-g<id>-d<id>-Reviews-[or<offset>-]<slug>``.
        An existing ``or<offset>-`` page marker is dropped so that collection
        always starts from the first page, and repeated slashes between the
        host and the path are tolerated.
    max_reviews : int, optional
        Upper bound on the number of reviews returned. Defaults to 15.

    Returns
    -------
    dict
        Lists under the keys ``'review_date'``, ``'user_rating'``,
        ``'review_title'`` and ``'review'``, each cut to at most
        ``max_reviews`` entries. If a page cannot be fetched, whatever was
        collected before it is returned.

    Raises
    ------
    ValueError
        If ``base_url`` does not look like a restaurant review URL.
    """
    # Offset step used by the 'or{offset}-' page markers built below.
    reviews_per_page = 15

    pattern = r'(https?://[^/]+)/+(Restaurant_Review-g\d+-d\d+-Reviews-)(?:or\d+-)?(.*)'
    url_split = re.search(pattern, base_url)
    if url_split is None:
        raise ValueError('Not a recognised restaurant review URL: {!r}'.format(base_url))
    website, listing_prefix, listing_suffix = url_split.groups()

    reviews = {'review_date': [],
               'user_rating': [],
               'review_title': [],
               'review': []}

    for page in range(ceil(max_reviews / reviews_per_page)):
        # The first page carries no offset marker; later pages use or15-, or30-, ...
        offset = '' if page == 0 else 'or{}-'.format(page * reviews_per_page)
        url = '{}/{}{}{}'.format(website, listing_prefix, offset, listing_suffix)

        soup = get_soup(url)
        if soup is None:
            # Fetch failed: keep what has been collected so far.
            break
        reviews = combine_dicts(reviews, collect_review(soup))

    # Whole pages are fetched, so trim the surplus of the last page.
    limit = max(int(max_reviews), 0)
    return {key: values[:limit] for key, values in reviews.items()}

In [236]:
# Review page of the restaurant to scrape; get_reviews derives the URLs of the
# follow-up pages from it.
url = 'https://www.tripadvisor.in/Restaurant_Review-g297635-d1205738-Reviews-Hotel_Paragon_Restaurant-Kozhikode_Kozhikode_District_Kerala.html'
# Number of reviews requested; get_reviews fetches them in pages of 15.
max_reviews = 300

# get_reviews returns a dict of parallel lists, which becomes one column per key.
reviews_df = pd.DataFrame(get_reviews(url, max_reviews))

In [237]:
reviews_df

Unnamed: 0,review_date,user_rating,review_title,review
0,31 July 2023,4,"Timeless Charm, Excellent Service, Amazing Foo...","One of the oldest restaurants in town, I was i..."
1,31 May 2023,5,Great place for non-veg Kerala food in Kozhiko...,Been here for breakfast and late lunch on our ...
2,15 May 2023,5,Best restaurant since 1939,Paragon is a Kozhikode institution with an unm...
3,26 April 2023,4,Very good to check out Indian cuisine,Indeed very good food (at a price). Went here ...
4,1 April 2023,2,Lunch at Paragon Kozhikkode,Went for lunch with family . A very crowded pl...
...,...,...,...,...
295,6 September 2017,5,Re - Paragon Calicut,Ambiance is good and staff is courteous same a...
296,6 September 2017,5,Superb quality food,"My target was to have a great lunch ""non-veg b..."
297,6 September 2017,3,Good non veg restaurant,This is a very famous and old and we'll establ...
298,6 September 2017,5,Lovely food,Very interesting unique food.it is really good...


In [238]:
# Save reviews
csv_file = 'paragon_reviews_300.csv'
reviews_df.to_csv(csv_file, index=False, quotechar='"', quoting=1)

### Predict sentiments

In [239]:
# NOTE(review): this loads the same checkpoint as the "Load model" section near
# the top of the notebook and rebinds the same two names; on a top-to-bottom
# run it appears to be redundant.
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [267]:
def sentiment_score(review, max_length=512):
    """Predict a 1-5 sentiment score for one review text.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    review : str
        Text to classify.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer reviews are
        truncated. Defaults to 512, the input limit of BERT-base models.

    Returns
    -------
    int
        Index of the largest logit plus one (logit index 0 maps to score 1).
    """
    # Without truncation an over-long review exceeds the model's input limit
    # and makes the forward pass fail.
    tokens = tokenizer.encode(review, return_tensors='pt',
                              truncation=True, max_length=max_length)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [266]:
# Score every review text; the function is handed to apply() directly, no
# lambda wrapper is needed for a single-argument call.
reviews_df['HF_BERT_sentiment'] = reviews_df['review'].apply(sentiment_score)

tensor([[-1.1236, -0.0024, -0.0394,  0.3510,  0.6503]],
       grad_fn=<AddmmBackward0>)
tensor([[-1.9949, -1.0410, -0.2063,  1.4310,  1.1391]],
       grad_fn=<AddmmBackward0>)
tensor([[-1.0259, -0.4539,  0.1414,  0.7542,  0.3184]],
       grad_fn=<AddmmBackward0>)
tensor([[-3.3422, -2.8228, -0.5890,  2.7137,  3.1378]],
       grad_fn=<AddmmBackward0>)
tensor([[-1.1664,  1.0608,  2.0801,  0.5377, -2.2339]],
       grad_fn=<AddmmBackward0>)
tensor([[-3.2115, -2.5808, -0.6630,  2.2938,  3.1597]],
       grad_fn=<AddmmBackward0>)
tensor([[-2.6415, -2.6942, -1.5277,  1.3921,  4.4265]],
       grad_fn=<AddmmBackward0>)
tensor([[-1.9834, -1.4116, -0.6206,  1.2229,  2.0379]],
       grad_fn=<AddmmBackward0>)
tensor([[-2.6252, -0.1541,  2.0242,  1.9806, -1.1154]],
       grad_fn=<AddmmBackward0>)
tensor([[-2.7462, -2.7445, -1.2027,  1.7191,  4.0681]],
       grad_fn=<AddmmBackward0>)
tensor([[ 3.8963,  2.3968,  0.1225, -2.5321, -3.0522]],
       grad_fn=<AddmmBackward0>)
tensor([[-2.3938, -1.

In [268]:
reviews_df

Unnamed: 0,review_date,user_rating,review_title,review,HF_BERT_sentiment
0,31 July 2023,4,"Timeless Charm, Excellent Service, Amazing Foo...","One of the oldest restaurants in town, I was i...",5
1,31 May 2023,5,Great place for non-veg Kerala food in Kozhiko...,Been here for breakfast and late lunch on our ...,4
2,15 May 2023,5,Best restaurant since 1939,Paragon is a Kozhikode institution with an unm...,4
3,26 April 2023,4,Very good to check out Indian cuisine,Indeed very good food (at a price). Went here ...,5
4,1 April 2023,2,Lunch at Paragon Kozhikkode,Went for lunch with family . A very crowded pl...,3
...,...,...,...,...,...
295,6 September 2017,5,Re - Paragon Calicut,Ambiance is good and staff is courteous same a...,4
296,6 September 2017,5,Superb quality food,"My target was to have a great lunch ""non-veg b...",5
297,6 September 2017,3,Good non veg restaurant,This is a very famous and old and we'll establ...,5
298,6 September 2017,5,Lovely food,Very interesting unique food.it is really good...,5
