In [0]:
#%pip install google_play_scraper
#%pip install langdetect

In [0]:
from google_play_scraper import Sort, reviews_all
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Google Play package name of each delivery app, keyed by the short label
# that the collection cell stores on every review as `appName`.
# NOTE(review): the ids are reproduced exactly as given; confirm each one
# against its Play Store listing if a fetch comes back empty.
app_ids = dict(
    coupangEats="com.coupang.mobile.eats",
    baemin="com.sampleapp",
    yogiyo="com.fineapp.yogiyo",
)

1. 초기 접근법: 다국가 리뷰 수집 시도 <br/>
프로젝트 초기에는 외국인 리뷰를 폭넓게 확보하기 위해, 한국과 교류가 많거나 주요 언어를 사용하는 여러 국가의 리뷰를 동시에 수집하는 전략을 세웠습니다. 아래 코드는 당시 최초로 설계했던 데이터 수집 로직입니다.

In [0]:
# ------------ This code is no longer in use -----------------

# Countries whose Play Store listing was going to be checked for reviews.
# The selection covers countries:
# 1) using English, French, Spanish, and Chinese as official or major secondary languages
# 2) relatively near Korea (assuming more tourists come from these countries because they are easy to travel from)
# 3) with a high level of exchange with Korea
# 4) and Korea itself
language_group = [
    # English
    'us', 'gb', 'ca', 'au', 'nz', 'ie', 'za', 'in', 'sg', 'ph',
    # Spanish
    'es', 'mx', 'ar', 'co', 'cl', 'pe',
    # Chinese
    'cn', 'tw', 'hk',
    # French
    'fr', 'be', 'ch', 'sn',
]
nearby_group = ['jp', 'ru', 'mn', 'vn', 'id', 'th']
# 'au' also appears in language_group; the overlap is removed below.
exchange_scope_group = ['au', 'sa', 'de', 'ae']
korea_group = ['kr']

# Combine every group into one list (still contains the duplicate 'au').
country_codes = language_group + nearby_group + exchange_scope_group + korea_group

# Remove duplicates while keeping first-seen order. A plain set() may iterate
# in a different order after each kernel restart (string hashing is
# randomized), which would make the crawl order non-reproducible.
unique_country_codes = list(dict.fromkeys(country_codes))

# Accumulator that the (now disabled) multi-country fetch would have filled.
all_reviews = []

# Walk every (delivery app, country) pair. The fetch itself is commented out,
# so the loop body is a no-op; the disabled code is kept because the
# surrounding notebook text refers to it.
for app_name in app_ids:
    app_id = app_ids[app_name]
    for country_code in unique_country_codes:
        # reviews = reviews_all(
        #     app_id,
        #     sleep_milliseconds = 500, # Add a delay to avoid rate-limiting
        #     lang = 'en',
        #     country = country_code,
        #     sort = Sort.NEWEST,
        # )

        # for review in reviews:
        #     review['appName'] = app_name
        #     review['appId'] = app_id
        #     review['countryCode'] = country_code

        # all_reviews.extend(reviews)
        # print(f"Found {len(reviews)} reviews for {app_name} in {country_code}")
        pass

# Message shown in place of the fetch output (user-facing text, left as is).
print("위 코드를 실행하면 모든 국가에서 동일한 리뷰가 수집되는 문제가 발견됩니다.")

2. 전략 수정 <br/>
위 코드로 데이터를 수집한 결과, 모든 국가에서 동일한 개수의 리뷰가 반환되는 문제를 발견했습니다. 이는 대상 앱들이 한국 내수용이라 구글 플레이스토어가 미출시 국가에 대해 한국 스토어의 데이터를 기본값으로 보여주기 때문이었습니다. <br/>
따라서 비효율적인 다국가 수집 방식을 폐기하고, 목표에 더 직접적으로 접근할 수 있는 새로운 전략을 채택했습니다.

In [0]:
# Instead of crawling many countries, fetch reviews from the Korean Play Store
# only and keep just the ones written in a language other than Korean, to
# focus on foreigners' experience.
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is non-deterministic on short or ambiguous text unless a seed is
# set; fixing it makes every rerun classify each review the same way.
DetectorFactory.seed = 0

# Each sort order is crawled in turn to surface more reviews; duplicates
# across the sort orders are skipped below.
sort_options = [Sort.NEWEST, Sort.MOST_RELEVANT]

all_reviews = []

# get reviews from each delivery app
for app_name, app_id in app_ids.items():
    # Ids of every review already examined for this app, whether it was kept
    # or rejected, so nothing is language-detected twice across sort orders.
    unique_review_ids = set()
    for sort_option in sort_options:
        reviews = reviews_all(
            app_id,
            sleep_milliseconds=500,  # add a delay to avoid rate-limiting
            lang='en',
            country='kr',
            sort=sort_option,
        )  # -> returns a list of dictionaries, one per review

        for review in reviews:
            review_id = review['reviewId']
            # skip reviews already seen under a previous sort order
            if review_id in unique_review_ids:
                continue
            unique_review_ids.add(review_id)

            # exclude reviews that carry only a star rating (no text)
            if not review['content']:
                continue

            try:
                written_language = detect(review['content'])
            except LangDetectException:
                # detection failed: keep the review, tagged with language 'n/a'
                written_language = 'n/a'

            # drop reviews written in Korean
            if written_language == 'ko':
                continue

            # annotate the review dict in place and keep it
            review['appName'] = app_name
            review['language'] = written_language
            all_reviews.append(review)


In [0]:
# Persist the collected reviews to the bronze Delta table.

# Fail fast when nothing was collected: Spark cannot infer a schema from an
# empty list, and stopping here (before the overwrite) leaves the existing
# table untouched with a clear explanation instead of an inference error.
if not all_reviews:
    raise ValueError(
        "No reviews were collected; aborting before overwriting "
        "workspace.growth_poc.bronze_playstore_reviews"
    )

spark = SparkSession.builder.appName("playstore_review_ingest").getOrCreate()

# The schema is inferred from the review dictionaries.
df = spark.createDataFrame(all_reviews)

# Cast every column to string so the bronze layer stores the data as is.
final_df_to_write = df.select(
    [col(c).cast("string") for c in df.columns]
)

# Replace the table contents on every run; mergeSchema is enabled on the write.
final_df_to_write.write \
    .format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable("workspace.growth_poc.bronze_playstore_reviews")