In [None]:
import time
import requests
import pandas as pd
import numpy as np
import re

### Open the list of podcast IDs

In [None]:
# Read the CSV of podcasts to scrape and keep its ID column as a plain list
id_list_path = '../data/podcast_ID_list.csv'
df = pd.read_csv(id_list_path)
ids = df.loc[:, 'ID'].tolist()

print(f"Podcast to scrape ready")

### Initialize lists to hold scraped info

In [None]:
# Scraped columns, kept as parallel lists: the scraping loop appends to all
# four together, so position k in each list belongs to the same review
podcast_ids, review_titles, review_content, review_rating = [], [], [], []

# Bookkeeping: podcasts that could not be scraped / podcasts without reviews
error_log, noReview_log = [], []

### Parsing function

In [None]:
def parse(podcast):
    '''
    Pull the title, body text and rating out of one review entry.

    `podcast` is indexed as podcast[<field>]['label'] for the fields
    'title', 'content' and 'im:rating'.

    Every run of non-word characters in the title and in the body is
    collapsed to a single space and the result is stripped; the body is
    lower-cased first, the title keeps its case. The rating label is
    returned untouched.

    Returns the tuple (title, content, rating).
    '''
    def words_only(text):
        # Collapse each run of non-word characters to one space, then trim
        return re.sub(r'[\W]+', ' ', text).strip()

    title = words_only(podcast['title']['label'])
    content = words_only(podcast['content']['label'].lower())

    return title, content, podcast['im:rating']['label']

### Main scraping logic

In [None]:
# Browser-like User-Agent sent with every request
h={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0'}

for ind in range(len(ids)):

    # Take the next ID off the front of the list (this consumes `ids`)
    pod_id = ids.pop(0)

    # Retry budget shared by all pages of this podcast
    podcast_err = 0

    # At most 10 pages are requested per podcast
    for page in range(1, 11):

        # Fill in URL
        url = f"https://itunes.apple.com/us/rss/customerreviews/page={page}/id={pod_id}/sortby=mostrecent/json"

        # Get request, include user-agent in headers
        r = requests.get(url, headers=h)
        status = r.status_code

        # If not successful, pause 30 seconds and try again (up to twice per podcast)
        while (status != 200) and (podcast_err < 2):
            time.sleep(30)

            # Count the unsuccessful attempt
            podcast_err += 1

            # Request again and check status
            r = requests.get(url, headers=h)
            status = r.status_code

        # Decide on the final status rather than on the retry counter:
        # a retry that ends up succeeding must not be logged as a failure.
        if status != 200:
            print(f"Could not scrape: {pod_id}")
            error_log.append(ind)
            # End process for this podcast, move to the next
            break

        # The request succeeded: get the JSON
        info = r.json()['feed']

        # No "entry" key means this page holds no reviews
        if "entry" not in info:
            # Only an empty FIRST page means the podcast has no reviews at all;
            # an empty later page just follows a previous page that was full.
            if page == 1:
                noReview_log.append(pod_id)
            break

        # Anything that is not a list is treated as one single review
        entries = info['entry']
        if not isinstance(entries, list):
            entries = [entries]

        for review in entries:
            title, content, rating = parse(review)
            podcast_ids.append(pod_id)
            review_titles.append(title)
            review_content.append(content)
            review_rating.append(rating)

        # Anything short of a full page (50 reviews) is treated as the last page
        if len(entries) != 50:
            break

        # Another page may follow: proceed after a randomized pause
        time.sleep(np.random.uniform(low=0.5, high=2.5))

    # Stop everything after 3 errors on consecutive podcasts or more than 20 errors in total
    too_many_total = len(error_log) > 20
    three_in_a_row = (
        len(error_log) >= 3
        and error_log[-1] == error_log[-2] + 1 == error_log[-3] + 2
    )
    if too_many_total or three_in_a_row:
        print(f"Too many consecutive or total errors, whole process terminated")
        break

    # Randomized delay between podcasts, longer at every 10th/25th/100th/500th index
    if ind % 500 == 0:
        low, high = 7.0, 8.0
    elif ind % 100 == 0:
        low, high = 5.0, 6.5
    elif ind % 25 == 0:
        low, high = 3.2, 4.8
    elif ind % 10 == 0:
        low, high = 1.5, 2.5
    else:
        low, high = 0.2, 1.2

    time.sleep(np.random.uniform(low=low, high=high))

### Write reviews to the SQLite database

In [None]:
# Bundle the four scraped columns, in order: podcast id, title, content, rating
data = [
    podcast_ids,
    review_titles,
    review_content,
    review_rating,
]

In [None]:
import sqlite3

# Connect to the SQLite database file
con = sqlite3.connect('../data/GuidePod.sqlite')

# Create a cursor from that same connection
# (the original called the undefined name `conn`, which raises NameError)
cur = con.cursor()

In [None]:
# `data` is column-oriented (one list per field), so transpose it with zip()
# to get one (podcast id, title, content, rating) tuple per review.
rows = list(zip(*data))

# Insert every review in one call; the ? placeholders keep the values parameterized
cur.executemany('''INSERT INTO podcast_reviews values (?,?,?,?)''', rows)

# Closing a sqlite3 connection does not commit, so persist the inserts explicitly
con.commit()

cur.close()
con.close()