In [None]:
### Pull genres ###

from bs4 import BeautifulSoup
import requests

# Scrape data from genres page; See BeautifulSoup docs: https://beautiful-soup-4.readthedocs.io/en/latest/
# The timeout stops the cell from hanging forever if the server never answers.
html_genres = requests.get('https://apps.apple.com/us/genre/ios/id36', timeout=30)
soup_genres = BeautifulSoup(html_genres.text, 'html.parser')

# Find all genres
genres = soup_genres.find_all(class_="top-level-genre")

# Isolate genre links; elements without an href are skipped so that later
# substring checks on the links never hit a None value
genre_links = [genre.get('href') for genre in genres if genre.get('href')]

In [None]:
### Pull apps in particular genre ###

# Pick the first genre link that mentions "social-networking"
# (raises IndexError when no genre link matches)
social_link = list(filter(lambda href: "social-networking" in href, genre_links))[0]

# Letter labels used to iterate over pages: 'A'..'Z' followed by '#'
# (alphabet range idea: https://stackoverflow.com/questions/16060899/alphabet-range-in-python)
import string
alphabet = [*string.ascii_uppercase, '#']

# Page numbers to iterate over: 1 through 99 inclusive
numbers = [*range(1, 100)]

# Get app links for given genre
def get_app_links(genre_link):
  """Collect the distinct app URLs listed under one genre.

  Walks the (letter, page) combinations built from the module-level
  `alphabet` and `numbers` lists and keeps every anchor whose href contains
  the App Store app prefix.

  Args:
    genre_link: URL of the genre listing page.

  Returns:
    A list of unique app URLs; the order is not guaranteed.
  """
  app_prefix = "https://apps.apple.com/us/app/"
  app_links = set()
  for letter in alphabet:
    for number in numbers:
      try:
        # Send the query through `params` so that '#' is percent-encoded;
        # written raw into the URL it would start a fragment and the letter
        # and page values would never reach the server.
        html_letter = requests.get(
          genre_link,
          params={'letter': letter, 'page': number},
          timeout=30,
        )
      except requests.RequestException:
        continue # Network failure on this page; try the next one
      soup_letter = BeautifulSoup(html_letter.text, 'html.parser')
      page_links = [
        link['href']
        for link in soup_letter.find_all("a", href=True)
        if app_prefix in link['href']
      ]
      if html_letter.ok and not page_links:
        break # Successful response without apps: ran past the last page for this letter
      app_links.update(page_links)
  return list(app_links) # Duplicates are removed by the set, but a list is returned

# Gather every app URL listed under the Social Networking genre
app_links = get_app_links(genre_link=social_link)

In [None]:
### Pull other apps for given developer ###

other_app_links = []
seen_developers = set() # Developer pages already scraped; several apps can share one developer
for app_link in app_links:
  try:
    # Find developer
    html_app = requests.get(app_link, timeout=30)
    soup_app = BeautifulSoup(html_app.text, 'html.parser')
    developer_links = [
      link['href']
      for link in soup_app.find_all("a", href=True)
      if "https://apps.apple.com/us/developer/" in link['href']
    ]
    if not developer_links:
      continue # No developer link on this app page
    developer = developer_links[0]
    if developer in seen_developers:
      continue # This developer's apps were already collected

    # Pull other apps for that developer
    html_dev = requests.get(developer, timeout=30)
    soup_dev = BeautifulSoup(html_dev.text, 'html.parser')
    other_app_links += [
      link['href']
      for link in soup_dev.find_all("a", href=True)
      if "https://apps.apple.com/us/app/" in link['href']
    ]
    # Marked as seen only after a successful scrape, so a failed request
    # can be retried through another app of the same developer
    seen_developers.add(developer)
  except requests.RequestException:
    # Only network errors are skipped; KeyboardInterrupt still stops the cell
    continue

In [None]:
### Combine app lists ###
# Appends the developer-derived links to the genre-derived list in place
app_links.extend(other_app_links)

In [None]:
### Take unique app links ###
# Unpacking into a set literal drops repeated URLs
unique_app_links = {*app_links}

In [None]:
### Get app-specific data ###

import pandas as pd

# Imports for version history scrape; See Selenium Docs: https://selenium-python.readthedocs.io/getting-started.html#simple-usage; See this YouTube tutorial: https://www.youtube.com/watch?v=ATigYVyCoAE&list=PLz-0BiySzeQUuXoD4_mT3lex6HNi7f-YH
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Column order of the final dataframe. One record per app is collected in
# `rows` and the frame is built once at the end, instead of concatenating a
# new frame for every app.
columns = ['name', 'url', 'category', 'age', 'ratings_score', 'ratings_count', 'privacy_headings', 'developer', 'price', 'purchases', 'version_history']
rows = []

# Set up driver for version history scrape
PATH = r"C:\chromedriver.exe" # Raw string: "\c" is not a valid escape sequence
# NOTE(review): newer Selenium 4 releases may expect a Service object rather
# than a positional driver path - confirm against the installed version
driver = webdriver.Chrome(PATH)


def _first_text(soup, class_name):
  """Return the stripped text of the first element matching `class_name`, or '' if there is none."""
  matches = soup.find_all(class_=class_name)
  return matches[0].text.strip() if matches else ''


def _first_href(hrefs, prefix):
  """Return the first href containing `prefix`, or '' if none does."""
  return next((href for href in hrefs if prefix in href), '')


def _version_history(driver, app_link):
  """Read the version history entries of one app page through Selenium.

  Makes a single attempt: open the page, click the "see all" link and read
  the entries found. Each entry contributes the second line of its text, or
  the full list of lines when it has only one line. If anything fails, the
  marker "version history not found" is appended to what was gathered.
  """
  history = []
  try:
    driver.get(app_link)
    driver.find_element(By.CLASS_NAME, 'we-modal__show.link.section__nav__see-all-link').click()
    for version in driver.find_elements(By.CLASS_NAME, 'version-history__item'):
      lines = version.text.split("\n")
      history.append(lines[1] if len(lines) > 1 else lines)
  except Exception:
    # Best effort: keep the marker as a whole string rather than indexing into it
    history.append("version history not found")
  return history


# Collect data for all apps
try:
  for app_link in unique_app_links:
    # Scrape data for certain app
    try:
      html_app = requests.get(app_link, timeout=30)
    except requests.RequestException:
      continue # Page could not be fetched
    soup_app = BeautifulSoup(html_app.text, 'html.parser')

    # All link targets on the page, shared by the category and developer lookups
    hrefs = [link['href'] for link in soup_app.find_all("a", href=True)]

    # Name and age come from the same header element: first line is the
    # name, second line (when present) is the age. Both are derived from
    # this app's header only, so nothing carries over from a previous app.
    header_lines = _first_text(soup_app, "product-header__title app-header__title").split('\n')
    name = header_lines[0].strip()
    age = header_lines[1].strip() if len(header_lines) > 1 else ''

    # Ratings: "<score> • <count> Ratings"; both stay '' unless the separator is present
    ratings_parts = _first_text(soup_app, "we-rating-count star-rating__count").split('•')
    if len(ratings_parts) > 1:
      ratings_score = ratings_parts[0].strip()
      ratings_count = ratings_parts[1].strip().split(' ')[0].strip()
    else:
      ratings_score = ''
      ratings_count = ''

    rows.append({
      'name': name,
      'url': app_link,
      'category': _first_href(hrefs, "https://itunes.apple.com/us/genre/"),
      'age': age,
      'ratings_score': ratings_score,
      'ratings_count': ratings_count,
      'privacy_headings': [heading.text for heading in soup_app.find_all(class_="privacy-type__heading")],
      'developer': _first_href(hrefs, "https://apps.apple.com/us/developer/"),
      'price': _first_text(soup_app, "inline-list__item inline-list__item--bulleted app-header__list__item--price"),
      'purchases': _first_text(soup_app, "inline-list__item inline-list__item--bulleted app-header__list__item--in-app-purchase"),
      'version_history': _version_history(driver, app_link),
    })
finally:
  # Always release the browser, and keep whatever was collected even when
  # the loop is interrupted or fails part-way through
  driver.quit()
  df = pd.DataFrame(rows, columns=columns)

In [None]:
### Export data ###
# Write the collected frame (index included) to raw_data.csv in the working directory
df.to_csv(path_or_buf='raw_data.csv')