In [2]:
import pandas as pd

df2 = pd.read_csv("UNITENReview.csv", encoding="latin1")
pd.set_option('display.max_colwidth', None)

print(df2.head())
print(df2.columns)   # Make sure column name is "Review"

                     Timestamp  \
0  2025/02/10 7:40:54 pm GMT+8   
1  2025/02/10 7:41:00 pm GMT+8   
2  2025/02/10 7:41:19 pm GMT+8   
3  2025/02/10 7:46:40 pm GMT+8   
4  2025/02/10 7:46:43 pm GMT+8   

                                                                                                                                                                                                                                                                                                                                                         Review  
0                                                                                                                                                                                                                                                                                                          Im happy with uniten actually, even the people are W  
1                                                                                      

In [3]:
def convert_to_lowercase(text):
    return str(text).lower()

df2["lowercased"] = df2["Review"].apply(convert_to_lowercase)
print(df2["lowercased"].head())

0                                                                                                                                                                                                                                                                                                            im happy with uniten actually, even the people are w
1                                                                                                                                                                                                                                                                                        iâm having a pretty good time here, happy to meet all of the w people.
2                                                                                                                                                                                                                                                                                                   

In [4]:
import re

def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', str(text))

df2["urls_removed"] = df2["lowercased"].apply(remove_urls)
print(df2["urls_removed"].head())

0                                                                                                                                                                                                                                                                                                            im happy with uniten actually, even the people are w
1                                                                                                                                                                                                                                                                                        iâm having a pretty good time here, happy to meet all of the w people.
2                                                                                                                                                                                                                                                                                                   

In [5]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    return BeautifulSoup(str(text), "html.parser").get_text()

df2["html_removed"] = df2["urls_removed"].apply(remove_html_tags)
print(df2["html_removed"].head())

0                                                                                                                                                                                                                                                                                                            im happy with uniten actually, even the people are w
1                                                                                                                                                                                                                                                                                        iâm having a pretty good time here, happy to meet all of the w people.
2                                                                                                                                                                                                                                                                                                   

In [6]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile(
        "["u"\U0001F600-\U0001F64F"
         u"\U0001F300-\U0001F5FF"
         u"\U0001F680-\U0001F6FF"
         u"\U0001F1E0-\U0001F1FF"
         "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', str(text))

df2["emojis_removed"] = df2["html_removed"].apply(remove_emojis)
print(df2["emojis_removed"].head())

0                                                                                                                                                                                                                                                                                                            im happy with uniten actually, even the people are w
1                                                                                                                                                                                                                                                                                        iâm having a pretty good time here, happy to meet all of the w people.
2                                                                                                                                                                                                                                                                                                   

In [7]:
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "i do not know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "i know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing"
}

def replace_slang(text):
    escaped_words = [re.escape(word) for word in slang_dict.keys()]
    slang_pattern = r'\b(' + '|'.join(escaped_words) + r')\b'

    def replace_match(match):
        return slang_dict[match.group(0).lower()]

    return re.sub(slang_pattern, replace_match, str(text), flags=re.IGNORECASE)

df2["slangs_replaced"] = df2["emojis_removed"].apply(replace_slang)
print(df2["slangs_replaced"].head())

0                                                                                                                                                                                                                                                                                                            im happy with uniten actually, even the people are w
1                                                                                                                                                                                                                                                                                        iâm having a pretty good time here, happy to meet all of the w people.
2                                                                                                                                                                                                                                                                                                   

In [8]:
contractions_dict = {
    "wasn't": "was not", "isn't": "is not", "aren't": "are not",
    "weren't": "were not", "doesn't": "does not", "don't": "do not",
    "didn't": "did not", "can't": "cannot", "couldn't": "could not",
    "shouldn't": "should not", "wouldn't": "would not", "won't": "will not",
    "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
    "i'm": "i am", "you're": "you are", "it's": "it is", "they're": "they are"
}

escaped_contractions = [re.escape(word) for word in contractions_dict.keys()]
pattern = r'\b(' + '|'.join(escaped_contractions) + r')\b'
compiled_pattern = re.compile(pattern, flags=re.IGNORECASE)

def replace_contractions(text):
    def replace_match(match):
        return contractions_dict[match.group(0).lower()]
    return compiled_pattern.sub(replace_match, str(text))

df2["contractions_replaced"] = df2["slangs_replaced"].apply(replace_contractions)
print(df2["contractions_replaced"].head())

0                                                                                                                                                                                                                                                                                                             im happy with uniten actually, even the people are w
1                                                                                                                                                                                                                                                                                         iâm having a pretty good time here, happy to meet all of the w people.
2                                                                                                                                                                                                                                                                                                 

In [9]:
import string

def remove_punctuation(text):
    return str(text).translate(str.maketrans('', '', string.punctuation))

df2["punctuations_removed"] = df2["contractions_replaced"].apply(remove_punctuation)
print(df2["punctuations_removed"].head())

0                                                                                                                                                                                                                                                                                                     im happy with uniten actually even the people are w
1                                                                                                                                                                                                                                                                                  iâm having a pretty good time here happy to meet all of the w people
2                                                                                                                                                                                                                                                                                                             a very

In [10]:
def remove_numbers(text):
    return re.sub(r'\d+', '', str(text))

df2["numbers_removed"] = df2["punctuations_removed"].apply(remove_numbers)
print(df2["numbers_removed"].head())

0                                                                                                                                                                                                                                                                                                    im happy with uniten actually even the people are w
1                                                                                                                                                                                                                                                                                 iâm having a pretty good time here happy to meet all of the w people
2                                                                                                                                                                                                                                                                                                            a very ne

In [11]:
df2["spelling_corrected"] = df2["numbers_removed"]
print(df2["spelling_corrected"].head())

0                                                                                                                                                                                                                                                                                                    im happy with uniten actually even the people are w
1                                                                                                                                                                                                                                                                                 iâm having a pretty good time here happy to meet all of the w people
2                                                                                                                                                                                                                                                                                                            a very ne

In [25]:
import re

def grammar_polish(text):
    text = str(text)
    
    # Remove duplicated 'with'
    text = re.sub(r'\bwith\s+with\b', 'with', text, flags=re.IGNORECASE)
    
    # Fix common grammar in your dataset
    text = re.sub(r'\bare close\b', 'are closed', text, flags=re.IGNORECASE)
    text = re.sub(r'\bit is make\b', 'it makes', text, flags=re.IGNORECASE)
    
    return text

df2["final_clean"] = df2["final_clean"].apply(grammar_polish)
print(df2["final_clean"].head())

0                                                                                                                                                                                                                                                                                                  im happy with uniten actually even the people are with
1                                                                                                                                                                                                                                                                                 i'm having a pretty good time here happy to meet all of the with people
2                                                                                                                                                                                                                                                                                                             a very

In [27]:
import nltk
from nltk.corpus import stopwords

# Download once (safe if already downloaded)
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = str(text).split()
    filtered_words = [w for w in words if w.lower() not in stop_words]
    return " ".join(filtered_words)

df2["stopwords_removed"] = df2["final_clean"].apply(remove_stopwords)
print(df2["stopwords_removed"].head())

0                                                                                                                                                                                                                           im happy uniten actually even people
1                                                                                                                                                                                                                             pretty good time happy meet people
2                                                                                                                                                                                                                                 neutral place terms everything
3                                                                                                                                                              would say uniten good university issue need improved transportation wi

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/17f50d36-54d6-4c27-a64f-
[nltk_data]     e7f6ec902d19/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# Downloads (safe if already done)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    tokens = word_tokenize(str(text))
    pos_tags = pos_tag(tokens)
    lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in pos_tags]
    return " ".join(lemmatized_words)

df2["processed_review"] = df2["stopwords_removed"].apply(lemmatize_text)
print(df2["processed_review"].head())

0                                                                                                                                                                                                               im happy uniten actually even people
1                                                                                                                                                                                                                 pretty good time happy meet people
2                                                                                                                                                                                                                      neutral place term everything
3                                                                                                                                                      would say uniten good university issue need improve transportation wifi network facility well
4    uniten well reg

[nltk_data] Downloading package punkt to
[nltk_data]     /home/17f50d36-54d6-4c27-a64f-
[nltk_data]     e7f6ec902d19/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/17f50d36-54d6-4c27-a64f-
[nltk_data]     e7f6ec902d19/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/17f50d36-54d6-4c27-a64f-
[nltk_data]     e7f6ec902d19/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/17f50d36-54d6-4c27-a64f-
[nltk_data]     e7f6ec902d19/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/17f50d36-54d6-4c27-a64f-
[nltk_data]     e7f6ec902d19/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [29]:
df2.to_csv("UNITENReview_Processed.csv", index=False)
print("Saved as UNITENReview_Processed.csv")

Saved as UNITENReview_Processed.csv
