# **Scrape Apple App Store Reviews**

In [1]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive.
# NOTE(review): `_mount` is an underscore-prefixed (private) helper — confirm
# whether the public `drive.mount` could be used here instead.
drive._mount('/content/drive', force_remount=True)

# Switch into the project folder on Drive; later cells use relative paths such as 'src'.
os.chdir('/content/drive/MyDrive/apple-app-reviews-scraper')

Mounted at /content/drive


In [2]:
import random
import requests
print(f"requests=={requests.__version__}")
import re
import time
from tqdm import tqdm
import sys
import numpy as np
import pandas as pd

# Add src folder to module search path
sys.path.append('src')

# Import custom functions
from apple_app_reviews_scraper import get_token, fetch_reviews

requests==2.27.1


In [3]:
# Pool of browser User-Agent strings passed to the scraper helpers
# (one Safari-on-macOS string, one Chrome-on-Windows string).
user_agents = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
]

In [17]:
# Apps to scrape, keyed by a display label.
# Each value is a (country, app_name, app_id) tuple; the country and app_id
# must be set correctly (app_name doesn't really matter).
app_infos = dict(
    Tidal=('sg', 'tidal-music-hifi-ad-free', '913943275'),
    Spotify=('sg', 'spotify-music-and-podcasts', '324684580'),
)

# One independent, initially empty review list per app label
ios_reviews = {label: [] for label in app_infos}
ios_reviews

{'Tidal': [], 'Spotify': []}

**Get Bearer Token**  
The token tends to stay the same for a given app.

In [21]:
%%capture
# GET TOKEN
country = 'sg'
app_name = 'chicken mcnuggets' # doesn't matter
app_id = '913943275'
token = get_token(country, app_name, app_id, user_agents)

**Scrape one page (offset) of 20 reviews for a single app**

In [38]:
# Fetch a single page of reviews for one app, starting at offset '1'.
# fetch_reviews hands back the reviews, the next offset and the response code.
country, app_name, app_id = 'sg', 'tidal', '913943275'
offset = '1'
reviews, offset, response_code = fetch_reviews(
    country=country,
    app_name=app_name,
    app_id=app_id,
    user_agents=user_agents,
    token=token,
    offset=offset,
)

Offset: 21


In [39]:
# Flatten the nested review dicts into columns and show the first five rows
pd.json_normalize(reviews).head()

Unnamed: 0,id,type,offset,n_batch,app_id,attributes.date,attributes.developerResponse.id,attributes.developerResponse.body,attributes.developerResponse.modified,attributes.review,attributes.rating,attributes.isEdited,attributes.title,attributes.userName
0,9700279414,user-reviews,21,20,913943275,2023-03-11T01:06:51Z,35337720.0,Thanks for your feedback. We're happy you're e...,2023-03-12T17:16:37Z,"After updated to the latest version, TIDAL App...",4,False,[Report] Search bug issue,Soufiaj
1,3871570145,user-reviews,21,20,913943275,2019-03-12T03:27:40Z,,,,I was slow coming to online music streaming. I...,5,False,Tidal is my new best friend.,RobPhilp
2,7680736472,user-reviews,21,20,913943275,2021-08-11T13:45:01Z,,,,Despite there were so many controversial debat...,5,False,Still the best sound quality Music Streaming,Mlok99
3,5340990717,user-reviews,21,20,913943275,2019-12-31T02:45:44Z,,,,I have been subscribing to Tidal for three yea...,5,False,Outstanding audio quality,MKrishna
4,9237029356,user-reviews,21,20,913943275,2022-10-30T11:59:17Z,,,,The app is ok on iOS. I much prefer it on the ...,3,False,"Tidal hifi is a great service, the app is ok",gamov SG


**Scraping more reviews**  

You can write a loop to iterate through the different apps.  
When `offset` is `None` and the response code is `404`, there are most likely no more reviews to fetch. Note that the number of ratings does not equal the number of reviews, because not every user who rates an app also writes a review.

In [24]:
# Upper bound on the *offset*, not on the review count: pages keep being
# requested while the next offset is <= MAX_REVIEWS.
MAX_REVIEWS = 100+21

for app in app_infos:
    # Each entry is a (country, app_name, app_id) tuple
    country, app_name, app_id = app_infos[app]
    print(f"{app:<10} {country:<2} | {app_name:<15} | {app_id:<10}")

    # Scraping loop ------------------------------------------------------------
    all_reviews = []
    offset = '1'

    start_time = time.time()
    # Stop when fetch_reviews reports no further offset or the offset bound is passed
    while offset is not None and int(offset) <= MAX_REVIEWS:
        reviews, offset, response_code = fetch_reviews(country=country,
                                                       app_name=app_name,
                                                       user_agents=user_agents,
                                                       app_id=app_id,
                                                       token=token,
                                                       offset=offset)
        print(f"Current response code: {response_code} | Next offset: {offset}.")
        all_reviews.extend(reviews)

    print(f"Completed scraping {len(all_reviews)} reviews in {(time.time() - start_time)/60:.2f} minutes.")
    # --------------------------------------------------------------------------

    # Check max offset, considering only reviews that came from a full batch of 20.
    full_batch_offsets = [int(rev['offset']) for rev in all_reviews
                          if rev['offset'] is not None and rev['n_batch'] == 20]
    if full_batch_offsets:
        max_offset = max(full_batch_offsets)
        print(f"Backed up till offset {max_offset}.")
    else:
        # Without this guard an empty list would raise and skip the store step
        # below, losing whatever was scraped for this app.
        max_offset = None
        print("No full batch of 20 reviews was scraped; no backed-up offset.")

    # Store all reviews for the app.
    # NOTE: this appends, and `offset` always restarts at '1' above, so
    # re-running this cell without re-initialising `ios_reviews` adds duplicates.
    ios_reviews[app].extend(all_reviews)

Tidal      sg | tidal-music-hifi-ad-free | 913943275 
Offset: 21
Current response code: 200 | Next offset: 21.
Offset: 41
Current response code: 200 | Next offset: 41.
Offset: 61
Current response code: 200 | Next offset: 61.
Offset: 81
Current response code: 200 | Next offset: 81.
Offset: 101
Current response code: 200 | Next offset: 101.
14 reviews scraped. This is fewer than the expected 20.
No offset found.
Current response code: 200 | Next offset: None.
Completed scraping 114 reviews in 0.05 minutes.
Backed up till offset 101.
Spotify    sg | spotify-music-and-podcasts | 324684580 
Offset: 21
Current response code: 200 | Next offset: 21.
Offset: 41
Current response code: 200 | Next offset: 41.
Offset: 61
Current response code: 200 | Next offset: 61.
Offset: 81
Current response code: 200 | Next offset: 81.
Offset: 101
Current response code: 200 | Next offset: 101.
Offset: 121
Current response code: 200 | Next offset: 121.
Offset: 141
Current response code: 200 | Next offset: 141.
Co

In [26]:
# Flatten each app's raw review dicts into its own frame, tag it with the app
# label, and stack all frames once at the end instead of re-copying a growing
# frame on every iteration.
app_frames = []
for key, reviews in ios_reviews.items():
    app_df = pd.json_normalize(reviews)
    app_df['app'] = key
    app_frames.append(app_df)

# pd.concat rejects an empty list, so fall back to an empty frame in that case
ios_df = pd.concat(app_frames, axis=0, ignore_index=True) if app_frames else pd.DataFrame()

# Normalise column names: drop the leading 'attributes.' prefix, turn the
# remaining dots into underscores, and lower-case the result.
# Raw strings keep `\.` a regex escape rather than an invalid string escape.
ios_df.columns = [
    re.sub(r'\.', '_', re.sub(r'^attributes\.', '', col)).lower()
    for col in ios_df.columns
]

# Parse the date strings into pandas datetimes
ios_df['date'] = pd.to_datetime(ios_df['date'])
ios_df.head(3)

Unnamed: 0,id,type,offset,n_batch,app_id,date,developerresponse_id,developerresponse_body,developerresponse_modified,review,rating,isedited,username,title,app
0,9700279414,user-reviews,21,20,913943275,2023-03-11 01:06:51+00:00,35337720.0,Thanks for your feedback. We're happy you're e...,2023-03-12T17:16:37Z,"After updated to the latest version, TIDAL App...",4,False,Soufiaj,[Report] Search bug issue,Tidal
1,3871570145,user-reviews,21,20,913943275,2019-03-12 03:27:40+00:00,,,,I was slow coming to online music streaming. I...,5,False,RobPhilp,Tidal is my new best friend.,Tidal
2,7680736472,user-reviews,21,20,913943275,2021-08-11 13:45:01+00:00,,,,Despite there were so many controversial debat...,5,False,Mlok99,Still the best sound quality Music Streaming,Tidal


In [30]:
# Order rows by app, then chronologically, and renumber the index from 0
ios_df = ios_df.sort_values(by=['app', 'date']).reset_index(drop=True)

# Week rollup: midnight of the review date, shifted back by its day-of-week
# number (pandas counts Monday as 0), i.e. the start of that review's week.
ios_df['week_start'] = (
    ios_df['date'].dt.normalize()
    - pd.to_timedelta(ios_df['date'].dt.dayofweek, unit='D')
)

# Per-app moving average of the rating over up to 7 consecutive reviews
# (a row-count window, not 7 calendar days). Dropping only the 'app' level
# keeps the original row labels, so the result lines up with ios_df.
ios_df['mavg'] = (
    ios_df.groupby('app')['rating']
    .rolling(window=7, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)