In [1]:
!pip3 install nltk
!pip3 install transformers
!pip3 install torch torchvision torchaudio



In [2]:
import requests
import json
import pandas as pd
import nltk
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


## Testing

In [3]:
itemid, shopid = "19043502047", "1358899"
limit = 50

In [4]:
url = "https://shopee.sg/api/v2/item/get_ratings"

querystring = {
    "exclude_filter":"1",
    "filter":"1", # "1" for ratings with comments; "0" for ratings w/o comments
    "filter_size":"0",
    "flag":"1",
    "fold_filter":"0",
    "itemid":itemid,
    "limit":str(limit),
    "offset":"2124",
    "relevant_reviews":"false",
    "request_source":"2",
    "shopid":shopid,
    "tag_filter":"",
    "type":"0",
    "variation_filters":""}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
}

response = requests.request("GET", url, headers=headers, params=querystring)

response

<Response [200]>

In [87]:
response.json()['data']['item_rating_summary']

{'rating_total': 4339,
 'rating_count': [20, 18, 85, 328, 3888],
 'rcount_with_context': 2083,
 'rcount_with_image': 1702,
 'rcount_with_media': 1714,
 'rcount_local_review': 4339,
 'rcount_repeat_purchase': 0,
 'rcount_overall_fit_small': 0,
 'rcount_overall_fit_fit': 0,
 'rcount_overall_fit_large': 0,
 'rcount_oversea_review': 0,
 'rcount_folded': 0}

In [89]:
response.json()['data']

{'ratings': None,
 'item_rating_summary': {'rating_total': 4339,
  'rating_count': [20, 18, 85, 328, 3888],
  'rcount_with_context': 2083,
  'rcount_with_image': 1702,
  'rcount_with_media': 1714,
  'rcount_local_review': 4339,
  'rcount_repeat_purchase': 0,
  'rcount_overall_fit_small': 0,
  'rcount_overall_fit_fit': 0,
  'rcount_overall_fit_large': 0,
  'rcount_oversea_review': 0,
  'rcount_folded': 0},
 'is_sip_item': False,
 'rcmd_algo': 'BUNDLE:comment_search,RECALLER:comment_search_default,QUEUE:comment_search_default,NEWABTEST:0,QUEUES:comment_search_default|comment_search_default',
 'downgrade_switch': False,
 'has_more': False,
 'show_local_review': False,
 'browsing_ui': '',
 'enable_buyer_gallery_media': True,
 'user_latest_rating': None,
 'size_info_abt': '',
 'top_ratings': [],
 'resize_image_abt': False,
 'purchase_bar_abt': 'bucket_a',
 'tag_filters': [],
 'signature': ''}

In [7]:
test_df = pd.json_normalize(response.json()['data']['ratings'])
test_df.columns

Index(['orderid', 'itemid', 'cmtid', 'ctime', 'rating', 'userid', 'shopid',
       'comment', 'rating_star', 'status', 'mtime', 'editable', 'opt',
       'filter', 'mentioned', 'is_hidden', 'can_follow_up', 'follow_up',
       'submit_time', 'author_username', 'author_portrait', 'author_shopid',
       'anonymous', 'images', 'videos', 'product_items', 'delete_reason',
       'delete_operator', 'ItemRatingReply', 'tags', 'editable_date',
       'show_reply', 'like_count', 'liked', 'sync_to_social',
       'exclude_scoring_due_low_logistic', 'loyalty_info', 'template_tags',
       'has_template_tag', 'sync_to_social_toggle', 'is_repeated_purchase',
       'display_variation_filter', 'overall_fit', 'is_normal_item', 'viewed',
       'show_view', 'sync_to_social_detail', 'profile', 'size_info_tags',
       'size_info_abt', 'image_data', 'is_super_review', 'super_reviewer_tag',
       'is_newly_created', 'template_hints', 'template_tags_hints', 'region',
       'template_abt', 'is_repeat_ed

## Scraping

In [6]:
def get_reviews_shopee(itemid, shopid, limit=None, limit_per_req = 59, offset=0): 
    url = "https://shopee.sg/api/v2/item/get_ratings"

    querystring = {
        "exclude_filter":"1",
        "filter":"1", #! 1 only includes those with comments
                      #! 0 includes all with/without comments
        "filter_size":"0",
        "flag":"1",
        "fold_filter":"0",
        "itemid":itemid,
        "limit":str(limit_per_req),
        "offset":str(offset),
        "relevant_reviews":"false",
        "request_source":"2",
        "shopid":shopid,
        "tag_filter":"",
        "type":"0",
        "variation_filters":""}

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
    }

    response = requests.request("GET", url, headers=headers, params=querystring)

    rating_total = response.json()['data']['item_rating_summary']['rating_total']
    rating_count = response.json()['data']['item_rating_summary']['rating_count']
    rcount_with_context = response.json()['data']['item_rating_summary']['rcount_with_context']

    # return response.json()['data']['ratings']
    # return rating_count, rcount_with_context

    ratings_list = list()
    if limit is None:
        limit = rcount_with_context

    pages_to_scrape = limit//limit_per_req + 1
    for i in range(pages_to_scrape):
        querystring['offset'] = str(offset)
        response = requests.request("GET", url, headers=headers, params=querystring)
        # print(offset)
        # if 'ratings' in response.json()['data']:
        ratings_list += response.json()['data']['ratings']
        # print(response.json()['data']['ratings'])
        offset += limit_per_req
    
    ratings_df = pd.json_normalize(ratings_list)
    # ratings_df.columns
    return ratings_df

In [8]:
itemid, shopid = "19043502047", "1358899"
df = get_reviews_shopee(itemid, 
                        shopid, 
                        # limit =100,
                        )
df["comment"]

0       Quality:very good\n\nFast delivery. Next day! ...
1       Quality:honestly not bad 😋😋\n\nbought 2 during...
2       Fast delivery, around 2 days. Packaging is goo...
3       Received very quickly. Great quality and super...
4       It took 3 days to arrive with good condition. ...
                              ...                        
2078                                                Great
2079                                           Quality:ok
2080                                                Small
2081                                               good👍🏻
2082                                                 Nice
Name: comment, Length: 2083, dtype: object

In [92]:
print(df.shape)

(2083, 91)


In [10]:
df['comment'].value_counts()
#df['comment'].value_counts().shape

Quality:good                                                                        13
Fast delivery                                                                        5
Good                                                                                 3
Quality:good\n\ngood                                                                 3
Good quality                                                                         3
                                                                                    ..
Quality:Good\n\nThe colour is so pretty. Looks really good. Happy with purchase.     1
Fast delivery. Good quality. Feels like a size bigger. But nice                      1
Item received with good condition, fast delivery. But too small …                    1
Quality:good\n\nDelivery is fast. The slipper fits perfectly and is so cute!         1
good👍🏻                                                                               1
Name: comment, Length: 2040, dtype: int64

In [96]:
df[df['rating_star'] <3]

Unnamed: 0,orderid,itemid,cmtid,ctime,rating,userid,shopid,comment,rating_star,status,...,ItemRatingReply.shopid,ItemRatingReply.comment,ItemRatingReply.rating_star,ItemRatingReply.status,ItemRatingReply.mtime,ItemRatingReply.editable,ItemRatingReply.opt,ItemRatingReply.filter,ItemRatingReply.mentioned,ItemRatingReply.is_hidden
262,143705603294962,19043502047,12102988478,1690466037,0,7225494,1358899,Received with good condition. But size run sma...,2,2,...,,,,,,,,,,
415,116896532260836,19043502047,9598486296,1663518442,-1,295988533,1358899,lots of defects. tried to contact seller but i...,1,2,...,,"Dear buyer, we are sorry for your experience. ...",,,1663638000.0,,,,,False
693,137893252247352,19043502047,11605905386,1685286448,0,15172543,1358899,Quality:decent\n\nnah the sizes just ain’t it ...,2,2,...,,,,,,,,,,
742,131429283259533,19043502047,10913575420,1677923607,0,153382131,1358899,Quality:overall can see minor quality defects;...,2,2,...,,,,,,,,,,
981,141980705236581,19043502047,11913445722,1688623094,0,159794894,1358899,"Quality:styrofoamy, very light\n\nSent out lat...",2,2,...,,,,,,,,,,
1040,124978738239001,19043502047,10392152799,1671852591,0,22740961,1358899,Quality:ok 7/10\n\nDelivery was fast. Need to ...,2,2,...,,,,,,,,,,
1045,131551300295465,19043502047,10934737494,1678120364,-1,134053990,1358899,Quality:quality is gd but one of the grey one ...,1,2,...,,,,,,,,,,
1059,131983120214119,19043502047,10975968453,1678526619,-1,563646162,1358899,Quality:sucks booty\n\nmaterial is hard and u ...,1,2,...,,,,,,,,,,
1095,152642831318917,19043502047,13232032838,1700133767,-1,696876926,1358899,I thought it would be good but difficult to ru...,1,2,...,,,,,,,,,,
1096,130753126298652,19043502047,10857490968,1677291327,-1,118714741,1358899,Quality:seems bad\n\nafter one wear the grip i...,1,2,...,,,,,,,,,,


## Sentiment Analysis

In [21]:
ROBERTA_MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_MODEL)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [101]:
BERT_MODEL = f"nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BERT_MODEL)

In [100]:
def roberta_classification(text, debug=False):
    # Run for Roberta Model
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    if debug:
        print(text)
    return scores_dict
roberta_classification(df['comment'][262], debug=True)

Received with good condition. But size run small. My previous pairs size 42-43 and now size 44-45 is almost the same size. And is not comfortable to wear .  look at the slippers, the new slippers the front is so high up. It does not feel secure when walk. Not nice, some more I bought 2 pairs. Sigh….


{'roberta_neg': 0.85082173,
 'roberta_neu': 0.12977569,
 'roberta_pos': 0.019402608}

In [102]:
def bert_classification(text, debug=False):
    # Run for BERT Model
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    if debug:
        print(text)
    return scores
bert_classification(df['comment'][262], debug=True)

Received with good condition. But size run small. My previous pairs size 42-43 and now size 44-45 is almost the same size. And is not comfortable to wear .  look at the slippers, the new slippers the front is so high up. It does not feel secure when walk. Not nice, some more I bought 2 pairs. Sigh….


array([-0.00566801,  2.0982053 ,  2.1872063 , -0.11072581, -3.3476822 ],
      dtype=float32)

## Comment Analysis

In [94]:
from transformers import pipeline
bart_summarizer=pipeline(task="summarization",model="facebook/bart-large-cnn")
pegasus_summarizer=pipeline(task="summarization",model="google/bigbird-pegasus-large-pubmed")
default_summarizer=pipeline(task="summarization",model="sshleifer/distilbart-cnn-12-6")

Downloading config.json: 100%|██████████| 1.58k/1.58k [00:00<00:00, 1.67MB/s]
Downloading model.safetensors: 100%|██████████| 1.63G/1.63G [02:36<00:00, 10.4MB/s]
Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading generation_config.json: 100%|██████████| 363/363 [00:00<00:00, 21.8kB/s]
Downloading vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.23MB/s]
Downloading merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.08MB/s]
Downloading tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.24MB/s]
Downloading config.json: 100%|██████████| 1.05k/1.05k [00:00<00:00, 816kB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.31G/2.31G [03:23<00:00, 11.3MB/s]
Downloading generation_config.json: 100%|██████████| 232/232 [00:00<?, ?B/s] 
Do

In [82]:
STAR_MAX_COUNT,comment_agg=5,{}
for star_count in range(1,STAR_MAX_COUNT+1):
    comment_agg[star_count]=list(df[df['rating_star']==star_count]['comment'])


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download("punkt")
nltk.download("stopwords")
def preprocess(text):
    sentences=sent_tokenize(text)
    stop_words=set(stopwords.words("english"))

In [92]:
def summarize_groupwise(summarizer_type,text_list,subgroup_size):
    curr_text_list,next_text_list,i=text_list,[],0
    while len(curr_text_list)>1 and i<len(curr_text_list):
        substring_to_summarize=','.join(text_list[i:i+subgroup_size]).translate(str.maketrans("\n\t","  "))
        next_text_list.append(summarizer_type(substring_to_summarize,max_length=100,do_sample=False))
        i+=subgroup_size
    curr_text_list=next_text_list
    print(f"Test: {curr_text_list}")
    return curr_text_list[0]
"""
for star_count in range(1,STAR_MAX_COUNT+1):
    print(summarize_groupwise(comment_agg[star_count],15))
"""

'\nfor star_count in range(1,STAR_MAX_COUNT+1):\n    print(summarize_groupwise(comment_agg[star_count],15))\n'

In [93]:
summarize_groupwise(bart_summarizer,comment_agg[5],15)

Your max_length is set to 100, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 100, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 100, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 100, but your input_length is only 86. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your

Test: [[{'summary_text': " Quality: Very good  Fast delivery. No smell of plastic and it is sturdy and non slip. Comfy too! The slippers I need at home so that the dog fur won’t stick to it!!! Bought 2 during the sale and it's SOOO cute . Bought for my son. his feet is one size bigger then me. Bought 3 days ago and arrived super fast ."}], [{'summary_text': ' Seller ships super fast and i recieved the next day from ordering from sg. Quality:10/10  I usually wear US size 7/UK size 37-38 . Advise to go one size up, apart from that don’t think it is as wide as some comments claim .'}], [{'summary_text': ' Sharky Slippers are so cute! One for my boy and one for my girl. Feels comfortable and looks good! Delivery was also fast and I would have received it the next day if I didn’t order it right before Chinese New Year . The fit would have been better if I had more meaty legs but overall I recommend it!'}], [{'summary_text': ' The colour arrived as expected, the colour is exactly as the phot

[{'summary_text': " Quality: Very good  Fast delivery. No smell of plastic and it is sturdy and non slip. Comfy too! The slippers I need at home so that the dog fur won’t stick to it!!! Bought 2 during the sale and it's SOOO cute . Bought for my son. his feet is one size bigger then me. Bought 3 days ago and arrived super fast ."}]

In [91]:
df[df['comment'].str.contains("The slippers I need at home so that the dog fur won’t stick to it!!!")].comment[0]
# df[df['comment']==" Quality: Very good  Fast delivery. No smell of plastic and it is sturdy and non slip. Comfy too! The slippers I need at home so that the dog fur won’t stick to it!!! Bought 2 during the sale and it's SOOO cute . Bought for my son. his feet is one size bigger then me. Bought 3 days ago and arrived super fast ."]

'Quality:very good\n\nFast delivery. Next day! Very good quality. No smell of plastic and it is sturdy and non slip. Comfy too! The slippers I need at home so that the dog fur won’t stick to it!!!'