<a href="https://colab.research.google.com/github/gussgary/ShopeeLazadaSentimentAnalysis/blob/main/SentimentAppsComparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from warnings import filterwarnings

import numpy as np
import pandas as pd

filterwarnings('ignore')

In [None]:
from google_play_scraper import reviews, Sort

In [None]:
def get_reviews(app_id, app_name, count = 1000):
  """Fetch the newest Google Play reviews for one app as a DataFrame.

  Parameters
  ----------
  app_id : str
      Package identifier handed to ``reviews`` (e.g. 'com.shopee.my').
  app_name : str
      Label written into the ``app`` column of every returned row.
  count : int, default 1000
      Number of reviews requested from ``reviews``.

  Returns
  -------
  pandas.DataFrame
      Columns ``content``, ``score``, ``at`` and ``app``, in that order.
      When the scraper returns no reviews the frame has zero rows but
      still carries all four columns.
  """
  # The second element returned by `reviews` is not needed here.
  result, _ = reviews(
      app_id,
      lang = 'en',
      country = 'my',
      sort = Sort.NEWEST,
      count = count,
      filter_score_with = None
  )
  # Naming the columns up front keeps only the fields used downstream and
  # guarantees they exist even when `result` is an empty list (a bare
  # pd.DataFrame([]) has no columns, so selecting them would raise KeyError).
  df = pd.DataFrame(result, columns = ['content', 'score', 'at'])
  df['app'] = app_name
  return df

# Data Loading and Preprocessing

In [None]:
# Scrape the reviews for each app through get_reviews (one call per app)
df_shopee = get_reviews('com.shopee.my', 'Shopee')
df_lazada = get_reviews('com.lazada.android', 'Lazada')

# Unpack (rows, columns) once per frame for the summary below
shopee_rows, shopee_cols = df_shopee.shape
lazada_rows, lazada_cols = df_lazada.shape

print('Dataset Overview and Preprocessing')
print('-' * 30)
print(f'For Shopee, there are {shopee_rows} reviews in the dataset and consist of {shopee_cols} columns')
print(f'For Lazada, there are {lazada_rows} reviews in the dataset and consist of {lazada_cols} columns')

Dataset Overview and Preprocessing
------------------------------
For Shopee, there are 1000 reviews in the dataset and consist of 4 columns
For Lazada, there are 1000 reviews in the dataset and consist of 4 columns


In [None]:
# Merge both datasets.
# ignore_index = True gives the merged frame a fresh 0..n-1 index. Without it
# each app keeps its own 0-based labels, so every label appears twice and a
# label-based lookup such as df_both.loc[0] would return two rows.
df_both = pd.concat([df_shopee, df_lazada], ignore_index = True)
print(f'After merging, there are {df_both.shape[0]} reviews in the dataset and consist of {df_both.shape[1]} columns')

df_both.head()

After merging, there are 2000 reviews in the dataset and consist of 4 columns


Unnamed: 0,content,score,at,app
0,Shopee make my shopping routine easier. And no...,5,2025-08-07 07:07:06,Shopee
1,ok ok,5,2025-08-07 07:04:01,Shopee
2,Fair price and more choice of products. Let's ...,5,2025-08-07 06:39:27,Shopee
3,👍🏻👍🏻👍🏻,5,2025-08-07 06:25:13,Shopee
4,Mudah nak cari dan beli barang. Tq shopee.,5,2025-08-07 06:16:31,Shopee


In [None]:
# Write the merged reviews to disk; the row index is left out of the file
df_both.to_csv(
    'ShopeeLazadaReview.csv',
    index=False,
)

# Data Cleaning

In [None]:
# Work on a copy so df_both keeps the merged, uncleaned reviews
df_deduped = df_both.copy().drop_duplicates(subset=['content'])

# NOTE(review): duplicates are matched on `content` alone, so identical text
# posted for both apps collapses to the first occurrence in df_both.
# Confirm this is intended for a per-app comparison.

# Drop rows that lack the review text or the star rating
df_clean = df_deduped.dropna(subset=['content', 'score'])

# Report the row count after each stage
print(f'Original size before removing the duplicates: {len(df_both)}')
print(f'New size after removing the duplicates: {len(df_deduped)}')
print(f'New size after removing the missing value: {len(df_clean)}')

Original size before removing the duplicates: 2000
New size after removing the duplicates: 1528
New size after removing the missing value: 1528


In [None]:
def clean_text(text):
    """Normalise a review string for downstream text analysis.

    Steps, in order:
      1. Coerce the input to ``str``.
      2. Collapse every run of whitespace (spaces, tabs, newlines) to one space.
      3. Cap runs of the *same* mark among ``.``, ``?`` and ``!`` at three.
      4. Strip leading and trailing whitespace.

    Parameters
    ----------
    text : Any
        Raw review content; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text)
    text = re.sub(r'\s+', ' ', text)                     # Normalize whitespace
    # The backreference \1 makes the run consist of one repeated character.
    # Matching any mix of marks would rewrite e.g. "...?" as "???".
    text = re.sub(r'([.?!])\1{2,}', r'\1\1\1', text)     # Limit repeated punctuation to 3
    text = text.strip()                                  # Remove leading/trailing whitespace
    return text

# Run clean_text over every review and keep the result next to the raw text
cleaned_reviews = df_clean['content'].map(clean_text)
df_clean['content_clean'] = cleaned_reviews
df_clean.head()

Unnamed: 0,content,score,at,app,content_clean
0,Shopee make my shopping routine easier. And no...,5,2025-08-07 07:07:06,Shopee,Shopee make my shopping routine easier. And no...
1,ok ok,5,2025-08-07 07:04:01,Shopee,ok ok
2,Fair price and more choice of products. Let's ...,5,2025-08-07 06:39:27,Shopee,Fair price and more choice of products. Let's ...
3,👍🏻👍🏻👍🏻,5,2025-08-07 06:25:13,Shopee,👍🏻👍🏻👍🏻
4,Mudah nak cari dan beli barang. Tq shopee.,5,2025-08-07 06:16:31,Shopee,Mudah nak cari dan beli barang. Tq shopee.
