References:
- https://www.kaggle.com/code/ashishkumarak/google-play-reviews-scraping-daily-update

- https://pypi.org/project/google-play-scraper/

In [None]:
!pip install google-play-scraper

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [None]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

import pandas as pd
from datetime import datetime
from tqdm import tqdm
import time
import json
from time import sleep
from typing import List, Optional, Tuple

In [None]:
# Identifier of the app whose reviews are requested by the cells below.
app_id = "com.facebook.katana"


In [None]:
# Upper bound on the `count` sent in a single request: reviews() clamps every
# fetch to this value and reviews_all() passes it as its per-call count.
MAX_COUNT_EACH_FETCH = 199


class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    return json.loads(match[0][2])[0], json.loads(match[0][2])[-2][-1]


def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: int = None,
    filter_device_with: int = None,
    continuation_token: _ContinuationToken = None,
    max_retries: int = 10,
) -> Tuple[List[dict], _ContinuationToken]:
    """Fetch up to ``count`` reviews, optionally resuming from a previous call.

    Args:
        app_id: App identifier to fetch reviews for.
        lang: Language used to build the request URL.
        country: Country used to build the request URL.
        sort: ``Sort`` member; its ``.value`` is sent with the request.
        count: Total number of reviews wanted from this call. Requests are
            issued in pages of at most ``MAX_COUNT_EACH_FETCH``.
        filter_score_with: Optional score filter (``None`` disables it).
        filter_device_with: Optional device filter (``None`` disables it).
        continuation_token: Token returned by a previous call. When given, its
            stored lang/country/sort/count/filters REPLACE the arguments above.
            A token whose ``.token`` is ``None`` returns ``([], token)`` at once.
        max_retries: Number of consecutive failed page fetches that are retried
            before giving up on further pages.

    Returns:
        ``(result, token)``: the extracted review dicts and a
        ``_ContinuationToken`` for the next call. The returned token's
        ``.token`` is ``None`` when no further page can be fetched.
    """
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    _fetch_count = count

    result = []

    consecutive_failures = 0

    # `<= 0` (not `== 0`) so that a page larger than requested cannot produce a
    # negative remaining count that keeps the loop running.
    while _fetch_count > 0:
        if _fetch_count > MAX_COUNT_EACH_FETCH:
            _fetch_count = MAX_COUNT_EACH_FETCH

        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            # The response could not be parsed. The assignment above did not
            # run, so `token` still identifies the page that failed: retry that
            # same page instead of rewinding to the start of this call (which
            # would duplicate pages already collected, and would raise
            # AttributeError when no continuation_token was passed).
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Give up: report "no more pages" rather than looping forever.
                token = None
                break
            continue

        consecutive_failures = 0

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        _fetch_count = count - len(result)

        # A list in the token position means there is no next page.
        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        ),
    )


def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    """Collect every available review by calling reviews() until it is exhausted.

    Args:
        app_id: App identifier forwarded to reviews().
        sleep_milliseconds: Pause between consecutive calls, in milliseconds;
            a falsy value disables the pause.
        **kwargs: Forwarded to reviews(). ``count`` and ``continuation_token``
            are discarded because this function manages them itself.

    Returns:
        A single list with the reviews of all pages, in fetch order.
    """
    # Pagination is driven here, so caller-supplied values for these are dropped.
    for managed_key in ("count", "continuation_token"):
        kwargs.pop(managed_key, None)

    collected = []
    cursor = None

    while True:
        page, cursor = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=cursor,
            **kwargs
        )
        collected.extend(page)

        # A None token marks the end of pagination.
        if cursor.token is None:
            return collected

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

In [None]:
# Target number of reviews for the scraping loop.
reviews_count = 25_000


In [None]:
result = []
continuation_token = None

# Fetch page by page until `reviews_count` reviews are collected or a call
# returns nothing.
with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    while len(result) < reviews_count:
        new_result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token,
            lang='en',  # The language of review
            country='in',  # Country for which you want to scrape
            sort=Sort.NEWEST,
            filter_score_with=None,
            count=MAX_COUNT_EACH_FETCH,  # one full page per call
        )
        if not new_result:
            break
        # Pages arrive whole, so the last one can exceed the target; keep only
        # what is still needed so `result` and the progress bar stop exactly at
        # `reviews_count`.
        new_result = new_result[: reviews_count - len(result)]
        result.extend(new_result)
        pbar.update(len(new_result))

25074it [00:53, 468.34it/s]


In [None]:
# Build a DataFrame from the list of review dicts collected by the scraping loop.
df = pd.DataFrame(result)



In [None]:
# Display the collected reviews as the cell output.
df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,3fd80e44-3430-4bd3-9817-fef13ba76c4e,Mim khatun,https://play-lh.googleusercontent.com/a/ACg8oc...,Good app,5,0,468.1.0.56.78,2024-06-21 13:01:17,,,468.1.0.56.78
1,0dd66745-b532-413a-81d6-90fb79b3daaa,Ashok Kumar,https://play-lh.googleusercontent.com/a-/ALV-U...,so,5,0,468.1.0.56.78,2024-06-21 13:01:15,,,468.1.0.56.78
2,1493feb3-0e19-4472-aab3-47cfb92f5b95,Kmuhiypppan K.Muhiyppau,https://play-lh.googleusercontent.com/a/ACg8oc...,K.Muhiyppan,1,0,469.0.0.39.80,2024-06-21 13:01:15,,,469.0.0.39.80
3,d2a52167-f661-4811-a69d-e9b3e28c6f55,Meet Mali,https://play-lh.googleusercontent.com/a/ACg8oc...,Great 👍,5,0,469.0.0.39.80,2024-06-21 13:00:52,,,469.0.0.39.80
4,60098f93-1810-4c00-b4af-8c5a343b133c,Pawan Kundu,https://play-lh.googleusercontent.com/a/ACg8oc...,nice,5,0,,2024-06-21 13:00:27,,,
...,...,...,...,...,...,...,...,...,...,...,...
25069,a0046954-b3d8-46ca-93e5-d811d16e005d,gaurav singh,https://play-lh.googleusercontent.com/a-/ALV-U...,Super,5,0,467.1.0.52.83,2024-06-13 15:43:30,,,467.1.0.52.83
25070,562f83c3-db17-4823-b36b-ca515e2fcb79,MR MONEY,https://play-lh.googleusercontent.com/a-/ALV-U...,love this app ❣️,5,0,465.0.0.63.83,2024-06-13 15:43:27,,,465.0.0.63.83
25071,38a3d74f-3831-4f48-bc77-9c4fe00543b5,Enam Khan,https://play-lh.googleusercontent.com/a-/ALV-U...,very good service meta thanks a lot❤️,5,0,467.1.0.52.83,2024-06-13 15:43:18,,,467.1.0.52.83
25072,8000c2d0-c3bc-42a6-9db1-ef80aa1560df,Pink Kat 79,https://play-lh.googleusercontent.com/a-/ALV-U...,it's nobody's business our location,1,0,467.1.0.52.83,2024-06-13 15:43:16,,,467.1.0.52.83


In [None]:
# Write the reviews to reviews.csv; index=False leaves the row index out of the file.
df.to_csv('reviews.csv', index=False)