In [6]:
import json 
import urllib.request
import yaml

# Load the scraper settings from config.yml. The file is opened in a `with`
# block so the handle is closed as soon as parsing finishes.
with open('config.yml', 'r') as config_file:
    cfg = yaml.safe_load(config_file)
# Turn the parsed mapping into a class whose attributes are the mapping's keys,
# so settings are read as attributes (e.g. cfg.country_code).
cfg = type('Namespace', (object, ), cfg)

def get_json_product(itemid, limit, offset, shopid, type=0):
    '''Fetch one page of ratings for a product and return the decoded JSON.

    The request goes to the `get_ratings` endpoint on the Shopee domain
    selected by `cfg.country_code` (e.g. 'vn' or 'sg').

    Args:
        itemid: Product id placed in the `itemid` query parameter.
        limit: Page size placed in the `limit` query parameter.
        offset: Paging offset placed in the `offset` query parameter.
        shopid: Shop id placed in the `shopid` query parameter.
        type: Rating filter; 0 requests all ratings, 1..5 requests only the
            ratings with that many stars. The name shadows the builtin inside
            this function and is kept only for caller compatibility.

    Returns:
        The response body parsed with `json.loads`.
    '''
    url = 'https://shopee.{}/api/v2/item/get_ratings?filter=0&flag=1&itemid={}&limit={}&offset={}&shopid={}&type={}'.format(
        cfg.country_code, itemid, limit, offset, shopid, type)
    # Close the HTTP response deterministically instead of leaving the
    # connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read())


def get_json_recommend(limit, offset):
    '''Fetch one page of the `daily_discover_main` recommendation bundle.

    The request goes to the Shopee domain selected by `cfg.country_code`.

    Args:
        limit: Page size placed in the `limit` query parameter.
        offset: Paging offset placed in the `offset` query parameter.

    Returns:
        The response body parsed with `json.loads`.
    '''
    url = 'https://shopee.{}/api/v4/recommend/recommend?bundle=daily_discover_main&limit={}&offset={}'.format(
        cfg.country_code, limit, offset)
    # Close the HTTP response deterministically instead of leaving the
    # connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read())


def get_json_campaign(label, limit, offset):
    '''Fetch one page of the `daily_discover_campaign` recommendation bundle.

    The request goes to the Shopee domain selected by `cfg.country_code`.

    Args:
        label: Campaign label placed verbatim in the `label` query parameter
            (no URL-escaping is applied here).
        limit: Page size placed in the `limit` query parameter.
        offset: Paging offset placed in the `offset` query parameter.

    Returns:
        The response body parsed with `json.loads`.
    '''
    # The host suffix must be the country code string; formatting the whole
    # `cfg` object into the URL would produce an invalid host name.
    url = 'https://shopee.{}/api/v4/recommend/recommend?bundle=daily_discover_campaign&label={}&limit={}&offset={}'.format(
        cfg.country_code, label, limit, offset)
    # Close the HTTP response deterministically instead of leaving the
    # connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read())

In [7]:
import re
import pandas as pd


def remove_adjacent_duplicates(str):
    '''Shorten every run of a repeated character to exactly two copies.

    Runs of length two are left as they are. Newlines are never collapsed,
    because `.` in the pattern does not match a line break.
    '''
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', str)


def format_string(str):
    '''Clean a comment: keep allowed characters and tidy the punctuation.

    Falsy input (None, empty string) is returned unchanged. Otherwise:

    1. Every character outside the alphabet selected by `cfg.country_code`
       ('vn' or 'sg') is dropped; any other country code keeps nothing.
    2. Runs of a repeated character are shortened by
       `remove_adjacent_duplicates`.
    3. A fixed list of substring replacements is applied once each, in order
       (tab -> ', ', newline -> '. ', doubled space/punctuation -> single).
    4. Leading and trailing whitespace is stripped.
    '''
    if not str:
        return str

    alphabets = {
        'vn': ' ,.\n\tABCDEGHIKLMNOPQRSTUVXYabcdeghiklmnopqrstuvxyÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ',
        'sg': ' ,.\n\tABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
    }
    allowed = alphabets.get(cfg.country_code, '')

    # Ordered (old, new) pairs; the order matters because later pairs clean up
    # what earlier pairs produce.
    replacements = [('\t', ', '), ('\n', '. '), ('  ', ' '), (' .', '.'),
                    (' ,', ','), ('..', '.'), (',,', ','), (',.', '.'),
                    ('.,', ',')]

    cleaned = ''.join(filter(lambda ch: ch in allowed, str))
    cleaned = remove_adjacent_duplicates(cleaned)
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()


def export_to_text_file(array_of_json,
                        product_info,
                        filename,
                        only_header=False):
    '''Append either the header line or one row per review to a TSV file.

    The file is opened in append mode, so repeated calls keep adding lines.

    Args:
        array_of_json: Iterable of review dicts providing the keys `userid`,
            `cmtid`, `mtime`, `rating_star` and `comment`. Ignored when
            `only_header` is true; None is treated as "no rows".
        product_info: Dict providing `shopid`, `itemid`, `product_name`,
            `shop_name`, `shop_rating`, `rating_star`, `rating_count` and
            `sold`. Ignored when `only_header` is true.
        filename: Path of the tab-separated output file.
        only_header: When true, write only the column header line.

    Raises:
        KeyError: If a review or `product_info` lacks one of the keys above.
    '''
    # `with` guarantees the handle is closed even if a row raises midway.
    with open(filename, 'a+', encoding='utf-8') as f:
        if only_header:
            f.write(
                'userid\tshopid\titemid\tcmtid\tmtime\trating_star\tcomment\tproduct_name\tshop_name\tshop_rating\tavg_rating\trating_count\tsold\n'
            )
            return
        # NOTE(review): values are written verbatim, so a tab or newline
        # inside e.g. product_name or shop_name would break the row layout.
        for j in array_of_json or ():
            f.write(
                '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    j['userid'],
                    product_info['shopid'],
                    product_info['itemid'],
                    j['cmtid'],
                    j['mtime'],
                    j['rating_star'],
                    j['comment'],
                    product_info['product_name'],
                    product_info['shop_name'],
                    product_info['shop_rating'],
                    product_info['rating_star'],
                    product_info['rating_count'],
                    product_info['sold'],
                ))


def remove_duplicate_column(filename, col_check, filename_out=None):
    '''Drop rows whose `col_check` value(s) repeat and save the result.

    The per-star row counts are printed before and after the de-duplication.

    Args:
        filename: Tab-separated input file; must have a `rating_star` column.
        col_check: Column label (or list of labels) used to detect repeats.
        filename_out: Destination file; the input file is overwritten when
            this is None.
    '''

    def star_counts(frame):
        return frame['rating_star'].value_counts().sort_index(ascending=True)

    reviews = pd.read_csv(filename, delimiter='\t')
    print(star_counts(reviews))
    reviews = reviews.drop_duplicates(col_check)
    print(star_counts(reviews))

    target = filename_out if filename_out is not None else filename
    reviews.to_csv(target, sep='\t', index=False)


def prune(filename):
    '''Balance a review file so every star level keeps the same number of rows.

    The row budget is the smallest per-star count of non-null comments. For
    each star level 1..5 the rows with the longest comments are kept, up to
    that budget. The result is written next to the input file, with
    `pruned_` prefixed to the file name.

    Args:
        filename: Tab-separated input file with `rating_star` and `comment`
            columns.
    '''
    import os  # function-scope import: only this helper needs path handling

    df = pd.read_csv(filename, delimiter='\t')
    per_star = df.groupby('rating_star')['comment'].count()
    # An input without data rows has no groups; keep nothing instead of
    # passing NaN to head().
    keep = int(per_star.min()) if len(per_star) else 0

    # Prefix the file name only, so paths with a directory part stay valid.
    out_path = os.path.join(os.path.dirname(filename),
                            'pruned_' + os.path.basename(filename))

    for position, star in enumerate([1, 2, 3, 4, 5]):
        rows = df.loc[df['rating_star'] == star]
        rows = rows.sort_values(by='comment',
                                key=lambda x: x.str.len(),
                                ascending=False)
        rows = rows.head(keep)
        # The first write truncates the file and emits the header, so running
        # prune() again does not stack a second copy onto the old output.
        first = position == 0
        rows.to_csv(out_path,
                    mode='w' if first else 'a',
                    index=False,
                    sep='\t',
                    header=first)

In [8]:
import datetime


def get_ratings_from_json(json_data, min_len_str=4):
    '''Turn a ratings response into a list of cleaned review dicts.

    Args:
        json_data: Decoded response; `json_data['data']['ratings']` is read.
            Either level may be None, which yields an empty list.
        min_len_str: Minimum length the cleaned comment must have for the
            review to be kept.

    Returns:
        A list of dicts with the keys `itemid`, `shopid`, `userid`, `cmtid`,
        `mtime` (formatted as 'dd-mm-YYYY HH:MM:SS' from the local-time
        conversion of the timestamp), `rating_star` and `comment` (passed
        through `format_string`).
    '''
    payload = json_data['data']
    if payload is None:
        return []
    raw_ratings = payload['ratings']
    if raw_ratings is None:
        return []

    kept = []
    for entry in raw_ratings:
        record = {
            'itemid': entry['itemid'],
            'shopid': entry['shopid'],
            'userid': entry['userid'],
            'cmtid': entry['cmtid'],
            'mtime': datetime.datetime.fromtimestamp(
                entry['mtime']).strftime('%d-%m-%Y %H:%M:%S'),
            'rating_star': entry['rating_star'],
            'comment': format_string(entry['comment']),
        }
        text = record['comment']
        # Skip reviews without a comment or with one that is too short.
        if text is None or len(text) < min_len_str:
            continue
        kept.append(record)
    return kept


def _product_record(entry):
    '''Pick the product fields used by the collector out of one raw entry.'''
    rating = entry['item_rating']
    return {
        'shopid': entry['shopid'],
        'itemid': entry['itemid'],
        'product_name': entry['name'],
        'shop_name': entry['shop_name'],
        'shop_rating': entry['shop_rating'],
        'sold': entry['sold'],
        'rating_star': rating['rating_star'],
        'rating_count': rating['rating_count']
    }


def get_products_from_json(json_data, get_top_product=False):
    '''Extract product records from a recommendation response.

    For every section, the entries under `data.item` are collected. When
    `get_top_product` is true, the entries under
    `data.top_product[*].list.data.item_lite` are collected as well.
    The containers `sections`, `item`, `top_product` and `item_lite` may be
    None or absent; they are then treated as empty.

    Args:
        json_data: Decoded response; `json_data['data']` may be None.
        get_top_product: Also include the "top product" entries.

    Returns:
        A list of dicts with the keys `shopid`, `itemid`, `product_name`,
        `shop_name`, `shop_rating`, `sold`, `rating_star`, `rating_count`,
        in the order the entries appear in the response.
    '''
    payload = json_data['data']
    sections = (payload.get('sections') or []) if payload is not None else []

    result = []
    for section in sections:
        section_data = section['data']

        items = section_data.get('item')
        if items is not None:
            result.extend(_product_record(entry) for entry in items)

        if not get_top_product:
            continue
        top_products = section_data.get('top_product')
        if top_products is None:
            continue
        for top in top_products:
            lite_items = top['list']['data'].get('item_lite')
            if lite_items is not None:
                result.extend(_product_record(entry) for entry in lite_items)
    return result


In [9]:
import time
from tqdm import tqdm


class Collector:
    '''Crawl recommended products and their reviews, and export them.'''

    def __init__(self):
        pass

    def get_all_reviews(self,
                        itemid,
                        shopid,
                        limit=6,
                        offset=0,
                        max_cmt=100,
                        min_len_cmt=4,
                        type=0):
        '''Page through a product's ratings until enough reviews are collected.

        Args:
            itemid: Product id passed to `get_json_product`.
            shopid: Shop id passed to `get_json_product`.
            limit: Page size; must be positive.
            offset: Paging offset of the first request.
            max_cmt: Maximum number of reviews to return.
            min_len_cmt: Minimum cleaned-comment length, passed to
                `get_ratings_from_json`.
            type: Rating filter passed to `get_json_product`.

        Returns:
            At most `max_cmt` review dicts; an empty list if `max_cmt <= 0`.

        Raises:
            ValueError: If `limit` is not positive.
        '''
        if limit <= 0:
            raise ValueError('limit must be a positive integer')
        if max_cmt <= 0:
            return []
        result = []
        with tqdm(total=(max_cmt // limit + 1) * limit) as pbar:
            while len(result) < max_cmt:
                json_data = get_json_product(itemid, limit, offset, shopid,
                                             type)
                ratings = get_ratings_from_json(json_data, min_len_cmt)
                # NOTE(review): get_ratings_from_json drops comments shorter
                # than min_len_cmt, so a page whose comments are all filtered
                # out also ends the crawl here, even if later pages exist.
                if not ratings:
                    break
                result += ratings
                offset += limit
                pbar.update(limit)
        return result[:max_cmt]

    def get_all_recommended_products(self,
                                     max_products=100,
                                     limit=10,
                                     offset=0,
                                     get_top_product=False):
        '''Page through the recommendation feed and collect product records.

        Args:
            max_products: Maximum number of products to return.
            limit: Page size; must be positive. Lowered to `max_products`
                when that is smaller.
            offset: Paging offset of the first request.
            get_top_product: Passed to `get_products_from_json`.

        Returns:
            At most `max_products` product dicts; an empty list if
            `max_products <= 0`.

        Raises:
            ValueError: If `limit` is not positive.
        '''
        if limit <= 0:
            raise ValueError('limit must be a positive integer')
        if max_products <= 0:
            return []
        limit = min(limit, max_products)
        result = []
        with tqdm(total=(max_products // limit + 1) * limit) as pbar:
            # The cap is checked before fetching, so no page is requested
            # only to be thrown away.
            while len(result) < max_products:
                start_time = time.time()
                # One page can yield a different number of products than
                # `limit`, so the total may overshoot; it is trimmed on return.
                json_data = get_json_recommend(limit, offset)
                products = get_products_from_json(json_data, get_top_product)
                if not products:
                    break
                result += products
                pbar.set_description(
                    'Đã lấy về {} sản phẩm trên tổng số tối đa {} sản phẩm. Mất {:0.2f} mili giây'
                    .format(len(result), max_products,
                            (time.time() - start_time) * 1000))
                offset += limit
                pbar.update(limit)

        return result[:max_products]

    def collect_reviews_product(self,
                                filename,
                                max_products,
                                min_len_cmt=4,
                                ratetypes=None):
        '''Collect reviews of recommended products and append them to a file.

        Writes the header line first, then one batch of rows per product, and
        finally prints how many reviews were recorded per star value.

        Args:
            filename: Output file passed to `export_to_text_file`.
            max_products: Maximum number of recommended products to crawl.
            min_len_cmt: Minimum cleaned-comment length of a kept review.
            ratetypes: Rating filters to crawl one after another (e.g.
                [1, 2, 3, 4, 5]). None or an empty list performs a single
                crawl with the default filter of `get_all_reviews`.
        '''
        products = self.get_all_recommended_products(max_products=max_products,
                                                     get_top_product=True)
        length_products = len(products)
        export_to_text_file(None, None, filename, True)
        stat = {'total': 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        for p in products:
            itemid = p['itemid']
            shopid = p['shopid']
            shopname = p['shop_name']
            reviews = []
            if ratetypes:
                for t in ratetypes:
                    reviews += self.get_all_reviews(itemid,
                                                    shopid,
                                                    min_len_cmt=min_len_cmt,
                                                    type=t)
            else:
                reviews += self.get_all_reviews(itemid,
                                                shopid,
                                                min_len_cmt=min_len_cmt)
            export_to_text_file(array_of_json=reviews,
                                product_info=p,
                                filename=filename)
            length_products -= 1

            print(
                'Đã thu thập và ghi {} đánh giá của sản phẩm {} tại shop \"{}\". Còn {} sản phẩm nữa'
                .format(len(reviews), itemid, shopname, length_products))

            stat['total'] += len(reviews)
            for r in reviews:
                # Count star values outside 1..5 too, instead of raising.
                star = r['rating_star']
                stat[star] = stat.get(star, 0) + 1
        print('Thống kê số lượng đánh giá ghi nhận:')
        for k, v in stat.items():
            print('{}*: {} reviews'.format(k, v))

In [None]:
if __name__ == '__main__':
    # Crawl the reviews of every star level for one recommended product, then
    # save a second file in which repeated comments are removed.
    raw_path = 'sentiments.txt'
    star_levels = [1, 2, 3, 4, 5]
    c = Collector()
    c.collect_reviews_product(raw_path,
                              max_products=1,
                              min_len_cmt=10,
                              ratetypes=star_levels)
    remove_duplicate_column(raw_path, 'comment', 'sentiments_nondup.txt')