### references
- Aspect-based sentiment analysis: https://github.com/ScalaConsultants/Aspect-Based-Sentiment-Analysis

### import packages

In [1]:
import ast
import os

import pandas as pd

from utils import data_scraping, absa_english_text

2023-09-07 09:38:49.196548: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-09-07 09:38:49.196594: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-09-07 09:38:51.821239: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-09-07 09:38:51.821293: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-09-07 09:38:51.821317: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (CID-GiangTD13): /proc/driver/nvidia/version does not exist
2023-09-07 09:38:51.821687: I tensorflow/core/platform/cpu_feat

### read & transform data

In [2]:
# read data
# Try the CSV copy of the reviews first; if it cannot be loaded or parsed,
# fall back to scraping the hotel page and write a fresh CSV.
file_path = 'https://raw.githubusercontent.com/dinhgiangltk/stored_data/main/text_data/the_grace_dalat_reviews.csv'
try:
    # profileId is read as text so it is not coerced to a number
    df = pd.read_csv(file_path, converters={'profileId':str})
    eval_cols = ['travelPurpose','travelKeywords','photoDataDisplaysList','reactionSummaries']
    # These columns presumably hold stringified Python literals (dicts/lists).
    # ast.literal_eval parses literals only, so text coming from the remote
    # file can never execute code the way eval() would. Non-string cells
    # (e.g. NaN) are passed through unchanged.
    for col in eval_cols:
        df[col] = df[col].map(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
except Exception as exc:
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate; the reason for the fallback is shown instead of being hidden.
    print('could not load cached reviews, scraping instead:', repr(exc))
    data = data_scraping(url_hotel="https://www.traveloka.com/vi-vn/hotel/vietnam/the-grace-hotel-dalat-3000010042556", reviews_per_page=10)
    df = data.get_all_reviews()
    # NOTE(review): absolute, machine-specific path — this write fails on any
    # host without this directory; consider a configurable data directory.
    df.to_csv('/home/tdjiang/github/stored_data/text_data/the_grace_dalat_reviews.csv', index=False)

In [3]:
def _dict_field(value, key):
    """Return value[key] when value is a dict; otherwise return value unchanged."""
    if isinstance(value, dict):
        return value[key]
    return value


def _sorted_csv(items, pick):
    """Join the picked field of every dict in a list, sorted, with commas.

    Non-dict entries contribute an empty string; a non-list input is
    returned unchanged.
    """
    if not isinstance(items, list):
        return items
    labels = [pick(item) if isinstance(item, dict) else '' for item in items]
    labels.sort()
    return ','.join(labels)


# turn empty strings into missing values
df = df.mask(df == '')

# keep only the columns that contain at least one value
df = df.dropna(how='all', axis=1)

# keep only the rows that have text in at least one of the two review columns
df = df.dropna(how='all', axis=0, subset=['originalReviewText','reviewText'])

# derived columns
# travelPurposeText must be extracted first: the next statement overwrites
# travelPurpose with a plain value taken from the same dict.
df['travelPurposeText'] = df.travelPurpose.apply(lambda cell: _dict_field(cell, 'travelPurposeText'))
df['travelPurpose'] = df.travelPurpose.apply(lambda cell: _dict_field(cell, 'travelPurpose'))
df['travelKeywords'] = df.travelKeywords.apply(
    lambda cell: _sorted_csv(cell, lambda entry: entry['travelKeyword'])
)
df['reviewLikes'] = df.reactionSummaries.apply(
    lambda cell: cell['reactionSummaryMap']['LIKE']['reactionCount'] if isinstance(cell, dict) else cell
)
df['photoCategories'] = df.photoDataDisplaysList.apply(
    lambda cell: _sorted_csv(cell, lambda entry: entry['photoCategoryDisplay']['photoCategory'])
)

### aspect-based sentiment analysis
- Prefer the English review text; if a review has none, translate the original text to English first.

In [6]:
# Per-review results: one dict per review with its reviewId and the
# sentiments returned by absa_by_np.
output = []

file_name = 'output_the_grace_dalat_reviews.csv'
if os.path.exists(file_name):
    # reuse the saved results so the per-review analysis is skipped on re-runs
    df_output = pd.read_csv(file_name)

else:
    # `row_no` instead of `id`: a top-level loop variable named `id` would
    # shadow the builtin id() for the rest of the kernel session.
    for row_no, row in enumerate(df.to_dict(orient='records')):
        reviewTextFn = row['reviewText']
        absa_class = absa_english_text(reviewTextFn)
        # NOTE(review): presumably `translated` is falsy when reviewText is
        # not yet English — confirm. A NaN flag is truthy, so such rows skip
        # the translation step.
        if not row['translated']:
            tokenized = absa_class.words_tokenized()
            reviewTextFn = absa_class.translate_vi_to_en(tokenized)

        # shorten the text before analysis (original note: "first 400
        # characters"; the actual limit is set inside truncate_first_words)
        reviewTextFn = absa_class.truncate_first_words(reviewTextFn)
        sentiments = absa_class.absa_by_np(reviewTextFn)
        output.append({
            'reviewId': row['reviewId'],
            'sentiment':sentiments
        })
        print('---', row_no)

    # One frame per review, tagged with its reviewId, stacked into a single
    # table. ignore_index=True gives a clean 0..n-1 index, matching what
    # read_csv produces when the cached file is loaded instead.
    sentiment_frames = [
        pd.DataFrame(item['sentiment']).assign(reviewId=item['reviewId'])
        for item in output
    ]
    df_output = pd.concat(sentiment_frames, ignore_index=True)
    df_output.to_csv(file_name, index=False)

--- 0
--- 1
--- 2
--- 3
--- 4
--- 5
--- 6
--- 7
--- 8
--- 9
--- 10
--- 11
--- 12
--- 13
--- 14
--- 15
--- 16
--- 17
--- 18
--- 19
--- 20
--- 21
--- 22
--- 23
--- 24
--- 25
--- 26
--- 27
--- 28
--- 29
--- 30
--- 31
--- 32
--- 33
--- 34
--- 35
--- 36
--- 37
--- 38
--- 39
--- 40
--- 41
--- 42
--- 43
--- 44
--- 45
--- 46
--- 47
--- 48
--- 49
--- 50
--- 51
--- 52
--- 53
--- 54
--- 55
--- 56
--- 57
--- 58
--- 59
--- 60
--- 61
--- 62
--- 63
--- 64
--- 65
--- 66
--- 67
--- 68
--- 69
--- 70
--- 71
--- 72
--- 73
--- 74
--- 75
--- 76
--- 77
--- 78
--- 79
--- 80
--- 81
--- 82
--- 83
--- 84
--- 85
--- 86
--- 87
--- 88
--- 89
--- 90
--- 91
--- 92
--- 93
--- 94
--- 95
--- 96
--- 97
--- 98
--- 99
--- 100
--- 101
--- 102
--- 103
--- 104
--- 105
--- 106
--- 107
--- 108
--- 109
--- 110
--- 111
--- 112
--- 113
--- 114
--- 115
--- 116
--- 117
--- 118
--- 119
--- 120
--- 121
--- 122
--- 123
--- 124
--- 125
--- 126
--- 127
--- 128
--- 129
--- 130
--- 131
--- 132
--- 133
--- 134
--- 135
--- 136
--- 137
--- 13

In [15]:
# the 20 most frequent values of the `aspect` column (value_counts sorts descending)
df_output.aspect.value_counts().iloc[:20]

room             145
hotel            127
staff             85
center            44
market            42
price             35
time              33
service           26
location          25
bit               23
night             22
receptionist      19
car               19
soundproofing     19
water             17
guest             16
point             15
window            15
view              14
day               14
Name: aspect, dtype: int64