In [48]:
import pandas as pd
import numpy as np
from app_store_scraper import AppStore
from google_play_scraper import reviews_all, reviews, Sort, app
import time

In [52]:
def get_apple_reviews(app_name, app_id, country, max_attempts=20, how_many=100000):
    """Scrape App Store reviews for one app in one country, retrying on empty results.

    A fresh ``AppStore`` scraper is created for every attempt. An attempt counts
    as successful as soon as it yields at least two reviews, so a partial result
    (e.g. when the scrape is cut short part-way) is returned as-is.

    Parameters
    ----------
    app_name : str
        App Store slug of the app, e.g. ``'the-economist'``.
    app_id : str or int
        Numeric App Store id of the app.
    country : str
        Country code of the storefront to scrape.
    max_attempts : int, optional
        Maximum number of scrape attempts (default 20).
    how_many : int, optional
        Upper bound on reviews requested per attempt (default 100000).

    Returns
    -------
    list
        The scraper's ``reviews`` list from the last attempt; it holds fewer
        than two entries when every attempt came back (almost) empty, and is
        empty when ``max_attempts`` is less than 1.
    """
    reviews_dict = []

    # Try and scrape reviews...
    for attempt in range(max_attempts):
        # Named `scraper` so the `reviews` function imported from
        # google_play_scraper is not shadowed inside this function.
        scraper = AppStore(country = country, app_name = app_name, app_id = app_id)
        scraper.review(how_many=how_many)
        reviews_dict = scraper.reviews
        del scraper

        if len(reviews_dict) >= 2:
            print('Found some results! Moving on...')
            break

        # If no results, retry...
        print("No results found - let's wait a sec, then retry...")
        print(reviews_dict)
        # Back off a little longer each time, but do not wait after the final
        # attempt: nothing follows it.
        if attempt < max_attempts - 1:
            time.sleep(10+(attempt*2))

    return reviews_dict


def apple_reviews_to_df(reviews_dict, app_name):
    """Convert scraped App Store reviews into a DataFrame.

    Parameters
    ----------
    reviews_dict : iterable of dict
        Review records; each must provide the keys ``'date'``, ``'title'``,
        ``'review'`` and ``'rating'``.
    app_name : str
        Label written into the ``'app'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns ``date, title, review, rating, app``.
    """
    copied_fields = ('date', 'title', 'review', 'rating')
    columns = {'date': [], 'title': [], 'review': [], 'rating': [], 'app': []}

    for entry in reviews_dict:
        # Copy the review's own fields, then tag the row with the app label.
        for field in copied_fields:
            columns[field].append(entry[field])
        columns['app'].append(app_name)

    return pd.DataFrame(columns)


In [53]:
def get_google_reviews(app_id, country, lang):
    """Fetch Google Play reviews for ``app_id`` through ``reviews_all``.

    Notes carried over from the original author's observations (not verified
    here):

    * ``country`` - changing this did not appear to do anything: 3078 reviews
      showed up regardless of the country code, probably all EN reviews (after
      checking how many per country with ``app()``).
    * ``lang`` - just choosing EN filters out some English reviews (e.g. FR is
      mostly English).

    Returns whatever ``reviews_all`` returns.
    """
    scrape_options = {
        'sleep_milliseconds': 0,  # defaults to 0
        'country': country,
        'lang': lang,
    }
    return reviews_all(app_id, **scrape_options)

def google_reviews_to_df(reviews_dict, app_name):
    """Convert scraped Google Play reviews into a DataFrame.

    The output uses the same column layout as ``apple_reviews_to_df``. No title
    is read from the Google records, so ``'title'`` is always an empty string.

    Parameters
    ----------
    reviews_dict : iterable of dict
        Review records; each must provide the keys ``'at'``, ``'content'`` and
        ``'score'``.
    app_name : str
        Label written into the ``'app'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns ``date, title, review, rating, app``.
    """
    # Output column -> key of the scraped record it is copied from.
    source_keys = {'date': 'at', 'review': 'content', 'rating': 'score'}
    columns = {'date': [], 'title': [], 'review': [], 'rating': [], 'app': []}

    for entry in reviews_dict:
        for column, key in source_keys.items():
            columns[column].append(entry[key])
        columns['title'].append('')
        columns['app'].append(app_name)

    return pd.DataFrame(columns)
    

In [54]:
# Scrape App Store reviews for both apps, one storefront at a time.
# Results are kept per country code: {country: list of review dicts}.
apple_reviews_economist = {}
apple_reviews_espresso = {}
for country in ['US','GB','CA','AU','ZA','IE']:
    # Main Economist app
    apple_reviews_economist[country] = get_apple_reviews('the-economist', app_id='1239397626', country=country)

    # Espresso app
    # NOTE(review): the log output of this cell shows several scrapes cut short
    # by HTTP 429 responses, so some of these lists are partial.
    apple_reviews_espresso[country] = get_apple_reviews('espresso-from-the-economist', app_id='896628003', country=country)

    

2022-08-31 09:44:51,841 [INFO] Base - Initialised: AppStore('us', 'the-economist', 1239397626)
2022-08-31 09:44:51,842 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/us/app/the-economist/id1239397626
2022-08-31 09:44:55,632 [INFO] Base - [id:1239397626] Fetched 1486 reviews (1486 fetched in total)
2022-08-31 09:44:55,677 [INFO] Base - Initialised: AppStore('us', 'espresso-from-the-economist', 896628003)
2022-08-31 09:44:55,677 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/us/app/espresso-from-the-economist/id896628003


Found some results! Moving on...


2022-08-31 09:45:00,710 [INFO] Base - [id:896628003] Fetched 560 reviews (560 fetched in total)
2022-08-31 09:45:06,076 [INFO] Base - [id:896628003] Fetched 1140 reviews (1140 fetched in total)
2022-08-31 09:45:10,359 [INFO] Base - [id:896628003] Fetched 1571 reviews (1571 fetched in total)


Found some results! Moving on...


2022-08-31 09:45:11,251 [INFO] Base - Initialised: AppStore('gb', 'the-economist', 1239397626)
2022-08-31 09:45:11,251 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/gb/app/the-economist/id1239397626
2022-08-31 09:45:31,131 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/gb/apps/1239397626/reviews?l=en-GB&offset=180&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2022-08-31 09:45:31,137 [INFO] Base - [id:1239397626] Fetched 180 reviews (180 fetched in total)


Found some results! Moving on...


2022-08-31 09:45:32,271 [INFO] Base - Initialised: AppStore('gb', 'espresso-from-the-economist', 896628003)
2022-08-31 09:45:32,272 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/gb/app/espresso-from-the-economist/id896628003
2022-08-31 09:45:51,623 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/gb/apps/896628003/reviews?l=en-GB&offset=80&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2022-08-31 09:45:51,628 [INFO] Base - [id:896628003] Fetched 80 reviews (80 fetched in total)


Found some results! Moving on...


2022-08-31 09:45:52,259 [INFO] Base - Initialised: AppStore('ca', 'the-economist', 1239397626)
2022-08-31 09:45:52,261 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ca/app/the-economist/id1239397626
2022-08-31 09:46:10,732 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/ca/apps/1239397626/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2022-08-31 09:46:10,737 [INFO] Base - [id:1239397626] Fetched 0 reviews (0 fetched in total)


No results found - let's wait a sec, then retry...
[]


2022-08-31 09:46:20,805 [INFO] Base - Initialised: AppStore('ca', 'the-economist', 1239397626)
2022-08-31 09:46:20,806 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ca/app/the-economist/id1239397626
2022-08-31 09:46:39,436 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/ca/apps/1239397626/reviews?l=en-GB&offset=0&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2022-08-31 09:46:39,439 [INFO] Base - [id:1239397626] Fetched 0 reviews (0 fetched in total)


No results found - let's wait a sec, then retry...
[]


2022-08-31 09:46:49,533 [INFO] Base - Initialised: AppStore('ca', 'the-economist', 1239397626)
2022-08-31 09:46:49,542 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ca/app/the-economist/id1239397626
2022-08-31 09:47:08,102 [INFO] Base - [id:1239397626] Fetched 20 reviews (20 fetched in total)
2022-08-31 09:47:26,836 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/ca/apps/1239397626/reviews?l=en-GB&offset=20&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2022-08-31 09:47:26,841 [INFO] Base - [id:1239397626] Fetched 20 reviews (20 fetched in total)


Found some results! Moving on...


2022-08-31 09:47:27,392 [INFO] Base - Initialised: AppStore('ca', 'espresso-from-the-economist', 896628003)
2022-08-31 09:47:27,393 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ca/app/espresso-from-the-economist/id896628003
2022-08-31 09:47:47,119 [ERROR] Base - Something went wrong: HTTPSConnectionPool(host='amp-api.apps.apple.com', port=443): Max retries exceeded with url: /v1/catalog/ca/apps/896628003/reviews?l=en-GB&offset=100&limit=20&platform=web&additionalPlatforms=appletv%2Cipad%2Ciphone%2Cmac (Caused by ResponseError('too many 429 error responses'))
2022-08-31 09:47:47,124 [INFO] Base - [id:896628003] Fetched 100 reviews (100 fetched in total)


Found some results! Moving on...


2022-08-31 09:47:50,130 [INFO] Base - Initialised: AppStore('au', 'the-economist', 1239397626)
2022-08-31 09:47:50,131 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/au/app/the-economist/id1239397626
2022-08-31 09:47:51,963 [INFO] Base - [id:1239397626] Fetched 139 reviews (139 fetched in total)


Found some results! Moving on...


2022-08-31 09:47:52,527 [INFO] Base - Initialised: AppStore('au', 'espresso-from-the-economist', 896628003)
2022-08-31 09:47:52,527 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/au/app/espresso-from-the-economist/id896628003
2022-08-31 09:48:12,030 [INFO] Base - [id:896628003] Fetched 80 reviews (80 fetched in total)
2022-08-31 09:48:12,769 [INFO] Base - [id:896628003] Fetched 129 reviews (129 fetched in total)


Found some results! Moving on...


2022-08-31 09:48:17,040 [INFO] Base - Initialised: AppStore('za', 'the-economist', 1239397626)
2022-08-31 09:48:17,042 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/za/app/the-economist/id1239397626
2022-08-31 09:48:17,227 [INFO] Base - [id:1239397626] Fetched 18 reviews (18 fetched in total)


Found some results! Moving on...


2022-08-31 09:48:25,200 [INFO] Base - Initialised: AppStore('za', 'espresso-from-the-economist', 896628003)
2022-08-31 09:48:25,201 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/za/app/espresso-from-the-economist/id896628003
2022-08-31 09:48:25,408 [INFO] Base - [id:896628003] Fetched 17 reviews (17 fetched in total)


Found some results! Moving on...


2022-08-31 09:48:25,941 [INFO] Base - Initialised: AppStore('ie', 'the-economist', 1239397626)
2022-08-31 09:48:25,942 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ie/app/the-economist/id1239397626
2022-08-31 09:48:26,362 [INFO] Base - [id:1239397626] Fetched 27 reviews (27 fetched in total)


Found some results! Moving on...


2022-08-31 09:48:27,045 [INFO] Base - Initialised: AppStore('ie', 'espresso-from-the-economist', 896628003)
2022-08-31 09:48:27,046 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/ie/app/espresso-from-the-economist/id896628003
2022-08-31 09:48:27,249 [INFO] Base - [id:896628003] Fetched 17 reviews (17 fetched in total)


Found some results! Moving on...


In [5]:
# Scrape Google Play reviews for both apps. A single 'All' key is used so these
# dicts can be iterated the same way as the per-country Apple dicts.
google_reviews_economist = {}
google_reviews_espresso = {}

google_reviews_economist['All'] = get_google_reviews('com.economist.lamarr', country='US', lang='EN')
google_reviews_espresso['All'] = get_google_reviews('com.economist.darwin', country='US', lang='EN')

    

In [None]:
def get_all_reviews(country_list = None):
    """Scrape Apple and Google reviews for both Economist apps and combine them.

    Parameters
    ----------
    country_list : list of str, optional
        App Store country codes to scrape. When ``None``, a built-in list of
        English-speaking and other populous countries is used. Repeated codes
        are scraped only once; the original order is kept.

    Returns
    -------
    pandas.DataFrame
        All per-store frames produced by ``apple_reviews_to_df`` and
        ``google_reviews_to_df``, concatenated with a fresh 0..n-1 index.
    """
    # Load default countries
    if country_list is None:
        countries_english = ['US','GB','CA','AU','ZA','IE','NZ'] # English speaking countries
        countries_other = ['IN','US','ID','BR','DE','ES','NL','SE','SG'] # Populous countries where vast majority of reviews are in english
        country_list = countries_english + countries_other

    # Drop repeated codes ('US' is in both default lists) so no storefront is
    # scraped twice; dict.fromkeys keeps first-seen order.
    countries_list = list(dict.fromkeys(country_list))

    # Scrape apple reviews
    apple_reviews_economist = {}
    apple_reviews_espresso = {}
    print('Getting apple reviews...')
    for country in countries_list:
        apple_reviews_economist[country] = get_apple_reviews('the-economist', app_id='1239397626', country=country)
        apple_reviews_espresso[country] = get_apple_reviews('espresso-from-the-economist', app_id='896628003', country=country)
    print('Apple reviews downloaded!')

    # Scrape google reviews
    google_reviews_economist = {}
    google_reviews_espresso = {}
    print('Getting google reviews...')
    google_reviews_economist['All'] = get_google_reviews('com.economist.lamarr', country='US', lang='EN') # I think this is all english reviews worldwide
    google_reviews_espresso['All'] = get_google_reviews('com.economist.darwin', country='US', lang='EN') # I think this is all english reviews worldwide
    print('Google reviews downloaded!')

    # Turn to dataframe and combine
    all_reviews_list = []

    for r in apple_reviews_economist:
        all_reviews_list.append(apple_reviews_to_df(apple_reviews_economist[r], 'The Economist (Apple)'))
    for r in apple_reviews_espresso:
        all_reviews_list.append(apple_reviews_to_df(apple_reviews_espresso[r], 'Espresso (Apple)'))
    for r in google_reviews_economist:
        all_reviews_list.append(google_reviews_to_df(google_reviews_economist[r], 'The Economist (Google)'))
    # Iterate the espresso dict itself, not the economist dict's keys.
    for r in google_reviews_espresso:
        all_reviews_list.append(google_reviews_to_df(google_reviews_espresso[r], 'Espresso (Google)'))

    final_reviews_df = pd.concat(all_reviews_list).reset_index(drop=True)

    return final_reviews_df

In [7]:
# Convert every scraped batch into a DataFrame, tagging each row with its
# app/store label, and collect the frames for concatenation.
all_reviews_list = []

for r in apple_reviews_economist:
    all_reviews_list.append(apple_reviews_to_df(apple_reviews_economist[r], 'The Economist (Apple)'))

for r in apple_reviews_espresso:
    all_reviews_list.append(apple_reviews_to_df(apple_reviews_espresso[r], 'Espresso (Apple)'))

for r in google_reviews_economist:
    all_reviews_list.append(google_reviews_to_df(google_reviews_economist[r], 'The Economist (Google)'))

# Iterate the espresso dict itself: looping over the economist dict's keys only
# worked because both dicts happen to share the single key 'All'.
for r in google_reviews_espresso:
    all_reviews_list.append(google_reviews_to_df(google_reviews_espresso[r], 'Espresso (Google)'))


In [11]:
# Stack all per-app frames into one table and replace the repeated per-frame
# indices with a single 0..n-1 index.
final_reviews_df = pd.concat(all_reviews_list).reset_index(drop=True)


In [33]:
# Persist the combined reviews as a parquet file.
# NOTE(review): this is an absolute path on one user's machine, so the cell will
# not run elsewhere - consider a path relative to a configurable data directory.
fname = '/Users/dominicbates/Documents/GitHub/app-review-classifier/data/reviews_sample.parquet'
final_reviews_df.to_parquet(fname)

In [7]:
def get_apple_reviews(app_name, app_id, country):
    """Single-attempt App Store scrape that also returns the scraper object.

    Debugging variant without retry logic: it performs exactly one scrape and
    returns ``(scraper, reviews)`` where ``reviews`` is the scraper's
    ``reviews`` attribute.

    NOTE(review): this reuses the name of the retrying ``get_apple_reviews``
    defined earlier in the notebook but returns a tuple instead of a list;
    whichever definition was executed last is the one in effect.
    """
    scraper = AppStore(country = country, app_name = app_name, app_id = app_id)
    scraper.review(how_many=100000)
    return scraper, scraper.reviews

In [8]:
# Reference only: this bare list expression is evaluated and discarded.
['US','GB','CA','AU','ZA','IE'] # Arbitrary list of English-speaking countries
    
# Uses the tuple-returning debug variant of get_apple_reviews.
# NOTE(review): the log output under this cell shows a 'us' scrape although the
# call passes country='IE' - the output looks stale; re-run to confirm.
test, test_dict = get_apple_reviews('the-economist', app_id='1239397626', country='IE')

2022-08-30 10:52:17,664 [INFO] Base - Initialised: AppStore('us', 'the-economist', 1239397626)
2022-08-30 10:52:17,665 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/us/app/the-economist/id1239397626
2022-08-30 10:52:22,759 [INFO] Base - [id:1239397626] Fetched 620 reviews (620 fetched in total)
2022-08-30 10:52:28,044 [INFO] Base - [id:1239397626] Fetched 1260 reviews (1260 fetched in total)
2022-08-30 10:52:30,082 [INFO] Base - [id:1239397626] Fetched 1486 reviews (1486 fetched in total)


In [10]:
# Display the scraped review records.
# NOTE(review): this dumps the whole list; a slice such as test_dict[:3] would
# keep the output readable.
test_dict

[{'rating': 5,
  'isEdited': False,
  'review': 'I’m new to The Economist after a suggestion from a relative.  My goal was to be more purposeful and engaged in the content that I consume and break the habit of endless scrolling of news headlines via Twitter, Google’s and Apple’s customized news outlets.  I find myself actually wanting to read the content published by The Economist rather than skimming headlines.  I enjoy learning about what’s going on around the world rather than being inundated by insignificant headlines that make many of the other news outlets’ front pages just to get clicks.  It’s a completely different experience.  For times when I can’t read the content, like while driving, or if I want to consume some of their longer form content but want to give my eyes a rest from screen fatigue, it’s nice they offer users the option to listen to the content as well.  The daily Espresso emails are short bursts of content in the AM while I have my coffee and then throughout the 

In [103]:
# Scratch cell: candidate country lists plus a trial scrape of the HK storefront.
tt = []  # not used in this cell
english_speaking = ['US','GB','CA','AU','ZA','IE','NZ']
big_population = ['IN','US','ID','BR','DE','ES','NL','SE','SG','HK']

# Longer list of country codes; a bare expression, evaluated and discarded.
['TR','IR','TH','FR','GB','IT','ZA','TZ','MM','KR','KE','CO','ES','UA','AR','UG','DZ','SD','IQ','PL','AF','CA','MA','SA','UZ','PE','MY','AO','GH','MZ','VE','YE','NP','MG','KP','CM','CI','AU','TW','NE','LK','BF','RO','ML','CL','KZ','MW','ZM','GT','NL','EC','SY','KH','SN','TD','SO','ZW','GN','RW','TN','BJ','BE','BO','CU','BI','HT','GR','DO','CZ','PT','SE','JO','AZ','HU','AE','HN','BY','TJ','IL','AT','PG','CH','SS','TG','SL','HK','LA','BG','RS','PY','LB','LY','NI','SV','KG','TM','DK','SG','FI','SK','NO','CG','CR','NZ','IE','OM','LR','CF','PS','MR','PA','KW','HR','GE','MD','UY','BA','PR','MN','AM','JM','AL','LT','QA','NA','GM','BW','GA','LS','MK','SI']    
    
    
# NOTE(review): the output below prints "Found some results!", which only the
# retrying get_apple_reviews emits - presumably that definition (returning a
# plain list) was active when this ran; confirm before re-running.
test_economist = get_apple_reviews('the-economist', app_id='1239397626', country='HK')

test_espresso = get_apple_reviews('espresso-from-the-economist', app_id='896628003', country='HK')

        

2022-08-31 11:32:50,242 [INFO] Base - Initialised: AppStore('hk', 'the-economist', 1239397626)
2022-08-31 11:32:50,243 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/hk/app/the-economist/id1239397626
2022-08-31 11:32:50,682 [INFO] Base - [id:1239397626] Fetched 28 reviews (28 fetched in total)


Found some results! Moving on...


2022-08-31 11:32:52,014 [INFO] Base - Initialised: AppStore('hk', 'espresso-from-the-economist', 896628003)
2022-08-31 11:32:52,016 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/hk/app/espresso-from-the-economist/id896628003
2022-08-31 11:32:52,241 [INFO] Base - [id:896628003] Fetched 19 reviews (19 fetched in total)


Found some results! Moving on...


In [109]:

# Show the text of the last review in the HK Espresso scrape.
test_espresso[-1]['review']

'未經授權自動續購引起爭論'

In [26]:
# Show the first record of test_dict2.
# NOTE(review): test_dict2 is not defined in any visible cell - this relies on
# leftover kernel state and will fail on a fresh run.
test_dict2[0]

{'rating': 5,
 'title': 'Great App, fantastic content and journalism.',
 'review': 'I’m new to The Economist after a suggestion from a relative.  My goal was to be more purposeful and engaged in the content that I consume and break the habit of endless scrolling of news headlines via Twitter, Google’s and Apple’s customized news outlets.  I find myself actually wanting to read the content published by The Economist rather than skimming headlines.  I enjoy learning about what’s going on around the world rather than being inundated by insignificant headlines that make many of the other news outlets’ front pages just to get clicks.  It’s a completely different experience.  For times when I can’t read the content, like while driving, or if I want to consume some of their longer form content but want to give my eyes a rest from screen fatigue, it’s nice they offer users the option to listen to the content as well.  The daily Espresso emails are short bursts of content in the AM while I have

In [42]:
# Fraction of 10,000 Poisson draws with mean 3.52/4 that equal zero.
# No random seed is set, so the value changes from run to run.
(np.random.poisson((3.52/4), 10000)==0).sum()/10000

0.4166

In [44]:
# Scratch arithmetic. NOTE(review): the meaning of these constants is not
# recorded; 1.6664 is presumably 4 x 0.4166 - confirm.
1.6664+2

3.6664000000000003

In [46]:
# Scratch arithmetic. NOTE(review): 0.4166 matches the Poisson estimate printed
# earlier; the other constants (0.49, 5, 0.39, 3) are undocumented.
2+(0.4166)+((0.49*5) + (0.39*3))

6.0366

In [47]:
# Difference between the two scratch results above (6.0366 and 3.6664), typed in
# by hand rather than computed from variables.
6.0366-3.6664

2.3702