In [30]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\junka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Import the file and dropped some less useful columns.

side note: for the dates, I just assumed that 'Stay Date' is more important rather than 'Created Date' or 'Published Date'.

In [31]:
df = pd.read_csv("tripadvisor_20250213222526.csv")
df = df.drop(columns=['Review Id', 'Display Name', 'User Name', 'User Profile',
                      'User Avatar', 'User Is Verified','Additional Ratings', 'Photos',
                       'Location Id', 'URL', 'Created Date', 'Published Date', 'Location'])

In [32]:
print(df.columns)

Index(['User ID', 'User Location', 'Rating', 'Review Title', 'Review Text',
       'Helpful Votes', 'Stay Date', 'Language'],
      dtype='object')


Observed that there are reviews not in english

In [33]:
print(df["Language"].unique())

['en' 'zhCN' 'es' 'ru' 'pl' 'fr' 'sv' 'ja' 'ko' 'da' 'in' 'it' 'nl' 'th'
 'de' 'ar' 'pt' 'el' 'zhTW' 'tr' 'no' 'vi' 'fi' 'iw' 'sk' 'hu' 'sr' 'cs']


After removing non-english reviews, there are 7076 rows left

In [34]:
df = df[df['Language'] == 'en']
#drop Language col after filtering for only english reviews
df = df.drop(columns = ['Language'])


print(df.columns)  #Displayes the columns in the df
print(df.shape[0])  # Displays the number of rows


Index(['User ID', 'User Location', 'Rating', 'Review Title', 'Review Text',
       'Helpful Votes', 'Stay Date'],
      dtype='object')
7076


There are 2 Locations 'Universal Studios Singapore' and 'Universal Studios Singapore Tickets' but after looking at the reviews, 
they seem to be used interchangebly by reviewers.

I.e. There are reviewers who put the location as 'Universal Studios Singapore Tickets' that also talking about the park as a whole as opposed to just talking about the ticketing system or just the queue.

Therefore, will drop 'Location' column as well. (Do it above with the rest to be neater)

In [35]:
''' 

print(df["Location"].unique())
count = df[df['Location'] == 'Universal Studios Singapore Tickets'].shape[0]
print(count)
count2 = df[df['Location'] == 'Universal Studios Singapore'].shape[0]
print(count2)

pd.set_option('display.max_colwidth', None)  # Set to None to show full text without truncation

filtered_reviews = df[df['Location'] == 'Universal Studios Singapore Tickets']['Review Title']
print(filtered_reviews)

'''


' \n\nprint(df["Location"].unique())\ncount = df[df[\'Location\'] == \'Universal Studios Singapore Tickets\'].shape[0]\nprint(count)\ncount2 = df[df[\'Location\'] == \'Universal Studios Singapore\'].shape[0]\nprint(count2)\n\npd.set_option(\'display.max_colwidth\', None)  # Set to None to show full text without truncation\n\nfiltered_reviews = df[df[\'Location\'] == \'Universal Studios Singapore Tickets\'][\'Review Title\']\nprint(filtered_reviews)\n\n'

The column 'User Location' has 1654 missing values, we might want to consider removing these too, especially if we're not doing a demographic by location analysis.

In [36]:
print(df.isnull().sum())  # Check for missing values in each column
# Print the data types of each column


User ID             0
User Location    1654
Rating              0
Review Title        0
Review Text         0
Helpful Votes       0
Stay Date           0
dtype: int64


Converting to the right data types


In [37]:
print(df.dtypes)

#Convert 'Stay Date' to a date type variable
df['Stay Date'] = pd.to_datetime(df['Stay Date'], errors='coerce')



User ID          object
User Location    object
Rating            int64
Review Title     object
Review Text      object
Helpful Votes     int64
Stay Date        object
dtype: object


Text preprocessing:

Remove special characters, numbers, and extra spaces.

Convert text to lowercase.

Tokenization (split text into words).

Remove stop words (common words like "the", "is", "and", etc. that don’t add much meaning).


In [38]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Lowercase, remove special characters, and extra spaces."""
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text) # Keep only alphanumeric characters
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words] # Remove stopwords + Lemmatization
    return ' '.join(words)

df['Clean Review Title'] = df['Review Title'].apply(clean_text)
df['Clean Review Text'] = df['Review Text'].apply(clean_text)

Finally, we can preview the dataset below

In [39]:
print(df.head())

                            User ID  User Location  Rating  \
0  2165A55827C624A5FA8BA42A63DA343C  United States       3   
1  87364D4E6E971222E18CB6C485C9209E            NaN       5   
2  DC02DAA61A0CD9DC77B4C83FC2A79A09      Edinburgh       4   
3  3E7A5D8CEA2FB93CCB415499204B9E77            NaN       1   
4  C9CC6FD8B4C7A6EC72E0088AF811CA82         Sydney       4   

                    Review Title  \
0                      It’s okay   
1  Great Staff, Great Experience   
2    Great day out, pretty small   
3                      Pointless   
4                        Fun day   

                                         Review Text  Helpful Votes  \
0  This was relatively small, ~24 rides(?), lines...              0   
1  Staff at Universal and RWS were so lovely and ...              0   
2  Great day out, with plenty of shade and seats ...              0   
3  Pointless place.\nWait times are over 60 mins ...              0   
4  Not as good as Universal in LA. \n\nHighly rec...   