In [0]:
# Uncomment the line below if `requests` is not already installed in this environment.
#%pip install requests

In [0]:
import requests
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd

# App Store id of each delivery app, keyed by the app's name
app_ids = dict(
    baemin="378084485",
    coupangEats="1445504255",
    yogiyo="543831532",
)

# Countries (two-letter codes) whose reviews are collected.
# Coverage:
# 1) countries using English, Spanish, Chinese, or French as an official or major secondary language
# 2) countries relatively near Korea (assuming their accessibility brings more tourists)
# 3) countries with high exchange scope
# 4) Korea itself
language_group = [
    # English
    'us', 'gb', 'ca', 'au', 'nz', 'ie', 'za', 'in', 'sg', 'ph',
    # Spanish
    'es', 'mx', 'ar', 'co', 'cl', 'pe',
    # Chinese
    'cn', 'tw', 'hk',
    # French
    'fr', 'be', 'ch', 'sn',
]
nearby_group = ['jp', 'ru', 'mn', 'vn', 'id', 'th']
exchange_scope_group = ['au', 'sa', 'de', 'ae']
korea_group = ['kr']

country_codes = language_group + nearby_group + exchange_scope_group + korea_group
# Remove duplicates ('au' appears in two groups) while keeping first-seen order.
# dict.fromkeys preserves insertion order, whereas set() ordering of strings can
# change from one interpreter run to the next, making the crawl order unstable.
unique_country_codes = list(dict.fromkeys(country_codes))

max_page_num = 10  # AppStore RSS provides only up to 10 pages
request_timeout_sec = 10  # give up on a stalled request instead of hanging the notebook

# one flat dict per review, filled by the crawl below
all_reviews = []


def _label(entry, *path):
    """Walk `path` through nested dicts and return the final 'label' value.

    Assumes the feed wraps each value as {'label': ...} — confirm against a
    live response. Returns None as soon as a key is missing or a non-dict
    node is reached, so a malformed entry never raises.
    """
    node = entry
    for key in (*path, 'label'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


# loop through each app
for app_name, app_id in app_ids.items():
    print(f"--Processing: {app_name}--")
    # loop through each country
    for country in unique_country_codes:
        # loop through each page
        for page in range(1, max_page_num + 1):
            url = f"https://itunes.apple.com/{country}/rss/customerreviews/id={app_id}/page={page}/json"
            # keep the try block limited to the calls that can actually fail
            try:
                response = requests.get(url, timeout=request_timeout_sec)
                response.raise_for_status()
                reviews = response.json()
            except requests.exceptions.HTTPError as http_err:
                print(f"HTTPError: {http_err}. Move to the next country.")
                break
            except json.JSONDecodeError:
                print("No review found. Move to the next country.")
                break
            except requests.exceptions.RequestException as req_err:
                # connection failures and timeouts: skip this country, keep crawling
                print(f"RequestException: {req_err}. Move to the next country.")
                break

            # 'feed' or 'entry' may be missing (None) or empty ([]); both are falsy
            feed = reviews.get('feed') if isinstance(reviews, dict) else None
            review_entries = feed.get('entry') if isinstance(feed, dict) else None
            if not review_entries:
                # no data, or past the last page: leave the page loop
                break

            # a single review comes back as a dict rather than a list
            if isinstance(review_entries, dict):
                review_entries = [review_entries]

            for review in review_entries:
                if not isinstance(review, dict):
                    continue
                # NOTE(review): the loop body was missing in the source; the column
                # names below are a reconstruction — confirm against downstream consumers.
                all_reviews.append({
                    'app_name': app_name,
                    'app_id': app_id,
                    'country': country,
                    'page': page,
                    'review_id': _label(review, 'id'),
                    'author': _label(review, 'author', 'name'),
                    'title': _label(review, 'title'),
                    'content': _label(review, 'content'),
                    'rating': _label(review, 'im:rating'),
                    'version': _label(review, 'im:version'),
                    'vote_sum': _label(review, 'im:voteSum'),
                    'vote_count': _label(review, 'im:voteCount'),
                    'updated': _label(review, 'updated'),
                })

In [0]:
# Fail fast with a clear message: Spark cannot infer a schema from an empty
# dataset, and an empty crawl should never replace the existing table.
if not all_reviews:
    raise ValueError("No reviews were collected; refusing to overwrite the bronze table.")

# create pandas dataframe from the collected review records
df = pd.DataFrame(all_reviews)

# convert to spark dataframe
spark = SparkSession.builder.appName("appstore_review_ingest").getOrCreate()
spark_df = spark.createDataFrame(df)

# replace the bronze table contents on every run (mode "overwrite"),
# written in Delta format with the mergeSchema option enabled
(
    spark_df.write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable("workspace.growth_poc.bronze_appstore_reviews")
)
 