### Add All Dependencies and Libraries

In [21]:
import kagglehub
import shutil
import os
import pandas as pd
import re
import string
import nltk
from tqdm import tqdm
import numpy as np
tqdm.pandas()

### Import Dataset from Kaggle Using Kagglehub

In [5]:
path = kagglehub.dataset_download("bwandowando/shopee-app-reviews-from-google-store")

for file in os.listdir(path):
    full_path = os.path.join(path, file)
    shutil.copy(full_path, "./dataset/raw/shopee-app-reviews-from-google-store.csv")

print("Dataset downloaded and copied to dataset/raw/shopee-app-reviews-from-google-store.csv")

Dataset downloaded and copied to dataset/raw/shopee-app-reviews-from-google-store.csv


### Download and Use NLTK's Stopwords

In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gusti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Read Dataset

In [8]:
df = pd.read_csv("./dataset/raw/shopee-app-reviews-from-google-store.csv")
df = df[['review_text', 'review_rating']]
df 

Unnamed: 0,review_text,review_rating
0,The next level in buying and selling! Everyone...,5
1,User friendly ecommerce site.,5
2,This is such a cool app. I find it easy to bro...,5
3,"Finally, they're here! Hooray!",5
4,"Very cool app. Shop anytime, anywhere. Hassle-...",5
...,...,...
782074,All goods,5
782075,Worse lipat na kayoas priority nila seller kes...,1
782076,❤❤❤😊😊😊,5
782077,Legit ang mga product at maganda ang service. ...,5


### Add Preprocessing Function
This function includes:
1. Lowercasing
2. Remove punctuation
3. Remove non-alphabetic characters
4. Remove stopwords

In [9]:
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^a-z\s]', '', text)
    stop_words = set(stopwords.words('english'))  # You can change 'english' to another language if needed
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

### Add Sentiment Label based on review_rating Using label_sentiment Function

In [10]:
def label_sentiment(rating):
    if rating >= 4:
        return "positive"
    elif rating == 3:
        return "neutral"
    else:
        return "negative"

### Apply All Function for the Dataset 

In [13]:
df['cleaned_review'] = df['review_text'].astype(str).progress_apply(preprocess)
df['sentiment'] = df['review_rating'].progress_apply(label_sentiment)

100%|██████████| 782079/782079 [07:32<00:00, 1729.01it/s]
100%|██████████| 782079/782079 [00:01<00:00, 750497.75it/s]


### Save Cleaned Dataset

In [None]:
final_df = df[['cleaned_review', 'sentiment']]
final_df.loc[:, "cleaned_review"] = final_df["cleaned_review"].replace(["", " ", "NaN", "nan"], np.nan)
final_df.dropna(inplace=True)
final_df.to_csv("./dataset/cleaned/shopee_app_reviews_cleaned.csv", index=False)
print("Dataset cleaned and saved to dataset/cleaned/shopee_app_reviews_cleaned.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


Dataset cleaned and saved to dataset/cleaned/shopee_app_reviews_cleaned.csv


In [43]:
final_df

Unnamed: 0,cleaned_review,sentiment
0,next level buying selling everyone try must ap...,positive
1,user friendly ecommerce site,positive
2,cool app find easy browse might skip going mal...,positive
3,finally theyre hooray,positive
4,cool app shop anytime anywhere hasslefree buyi...,positive
...,...,...
782073,hard find appropriate size classes due sellers...,neutral
782074,goods,positive
782075,worse lipat na kayoas priority nila seller kes...,negative
782077,legit ang mga product maganda ang service than...,positive
