## Add All Dependencies and Libraries

In [4]:
import kagglehub
import shutil
import os
import pandas as pd
import re
import string
import nltk
from tqdm import tqdm
tqdm.pandas()

## Import Dataset from Kaggle Using Kagglehub

In [5]:
path = kagglehub.dataset_download("bwandowando/shopee-app-reviews-from-google-store")

for file in os.listdir(path):
    full_path = os.path.join(path, file)
    shutil.copy(full_path, "./dataset/raw/shopee-app-reviews-from-google-store.csv")

print("Dataset downloaded and copied to dataset/raw/shopee-app-reviews-from-google-store.csv")

Dataset downloaded and copied to dataset/raw/shopee-app-reviews-from-google-store.csv


## Download and Use NLTK's Stopwords

In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gusti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read Dataset

In [7]:
df = pd.read_csv("./dataset/raw/shopee-app-reviews-from-google-store.csv")
df = df[['review_text', 'review_rating']] 

## Add Preprocessing Function
This function includes:
1. Lowercasing
2. Remove punctuation
3. Remove non-alphabetic characters
4. Remove stopwords

In [8]:
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^a-z\s]', '', text)
    stop_words = set(stopwords.words('english'))  # You can change 'english' to another language if needed
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

## Add Sentiment Label based on review_rating Using label_sentiment Function

In [9]:
def label_sentiment(rating):
    if rating >= 4:
        return "positive"
    elif rating == 3:
        return "neutral"
    else:
        return "negative"

## Apply All Function for the Dataset 

In [10]:
df['cleaned_review'] = df['review_text'].astype(str).progress_apply(preprocess)
df['sentiment'] = df['review_rating'].progress_apply(label_sentiment)

100%|████████████████████████████████████████████████████████████████████████| 782079/782079 [07:17<00:00, 1787.51it/s]
100%|██████████████████████████████████████████████████████████████████████| 782079/782079 [00:01<00:00, 731059.81it/s]


## Save Cleaned Dataset

In [11]:
final_df = df[['cleaned_review', 'sentiment']]
final_df.to_csv("./dataset/cleaned/shopee_app_reviews_cleaned.csv", index=False)
print("Dataset cleaned and saved to dataset/cleaned/shopee_app_reviews_cleaned.csv")

Dataset cleaned and saved to dataset/cleaned/shopee_app_reviews_cleaned.csv
