In [None]:
import numpy as np
import pandas as pd
import json

import random
import requests
import re
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException

**Set up ChromeDriver for Selenium**

In [None]:
# Path to the local ChromeDriver executable, also published in the process
# environment under the "webdriver.chrome.driver" key.
# NOTE(review): the later webdriver.Chrome() calls are made without this path;
# confirm that this environment variable is actually consulted.
chromedriver = "/Applications/chromedriver"
os.environ.update({"webdriver.chrome.driver": chromedriver})

### Gather data from sites with a DIV class and script HTML element 
* Links sit in `div` elements that use the `kib-grid__item` class format
* Page data is embedded in `<script>` tags of type `application/ld+json`

In [None]:
# Collect the links shown on the listing page and save them to a text file.
links = []

# Listing page whose cards hold the links to collect.
url = '<url of webpage with links to collect from>'

driver = webdriver.Chrome()
try:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Each link lives inside a kib-grid item <div>, selected by its class attribute.
    tags = soup.find_all('div', attrs={'class':'kib-grid__item kib-grid__item--span-4@min-xs kib-grid__item--span-4@md kib-grid__item--span-4@min-lg'})

    # "x" mode creates the file and raises FileExistsError if it already
    # exists, so links saved by an earlier run are never overwritten.
    with open("<filename to collect links in>.txt", "x") as file:
        for tag in tags:
            anchor = tag.find('a')
            # Skip cards without a usable <a href>; indexing a missing anchor
            # would raise TypeError and abort the whole collection.
            if anchor is None or not anchor.get('href'):
                continue

            link = anchor['href']
            links.append(link)
            file.write(f'{link}\n')
finally:
    # Always close the browser, even if loading, parsing or writing fails.
    driver.quit()

print(f'Finished retrieving {len(links)} links')

**Gather actual data from collected links**

In [None]:
# The collected links are only partial URLs, so prepend the site's base
# address to each one to obtain a complete URL.
full_links = [
    '<input url without endpoint here>' + partial_link
    for partial_link in links
]
print(f'Check number of links: {len(full_links)}')

In [None]:
# Visit every collected URL and save the JSON-LD blocks found on each page.
savepath = '<folder path to save scraped data>'
links_count = 0  # number of URLs the scraper has processed so far

# A single browser is reused for every page and is always closed in the
# `finally` clause, so no Chrome instance is left running if a page fails.
driver = webdriver.Chrome()
try:
    # NOTE(review): an implicit wait applies to element lookups
    # (find_element*); nothing in this loop performs one, so confirm that this
    # setting is still needed.
    driver.implicitly_wait(3)

    for url in full_links:
        driver.get(url)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        data = [
            json.loads(script.string)
            for script in soup.find_all("script", type="application/ld+json")
            if script.string  # an empty <script> tag has no text to parse
        ]

        # The output file is named after the last two path segments of the URL.
        filename = '_'.join(url.split('/')[-2:])
        # "x" mode refuses to overwrite: FileExistsError is raised if this
        # page was already saved by an earlier run.
        with open(f"{savepath}/{filename}.json", "x") as outfile:
            json.dump(data, outfile)

        links_count += 1
finally:
    driver.quit()

print(f'Data collection is complete for {links_count} URLs.')

### Gather data from sites with article format
* Links sit in `div` elements that use the article-card class (`article_card_articleCard__UmssU`)
* Page data is embedded in `<script>` tags of type `application/ld+json`

In [None]:
# Walk every page of the paginated listing, collecting the article-card links
# into `links` and into a text file (one link per line).
links = []

# First page of the listing; later pages are addressed as `<url>/p/<n>#all-articles`.
url = '<url of webpage with links to collect from>'

total_pages = 17  # total number of paginated listing pages

# "x" mode creates the file and raises FileExistsError if it already exists,
# so this fails before any browser is launched when the file is present.
with open("<filename to collect links in>.txt", "x") as file:
    # One browser is reused for all pages and always closed in `finally`.
    driver = webdriver.Chrome()
    try:
        for page in range(1, total_pages + 1):
            # Page 1 is the bare listing URL; every other page has a /p/<n> suffix.
            url_page = url if page == 1 else f'{url}/p/{page}#all-articles'
            driver.get(url_page)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            tags = soup.find_all('div', attrs={'class':'article_card_articleCard__UmssU'})
            for tag in tags:
                anchor = tag.find('a')
                # Skip cards without a usable <a href>; indexing a missing
                # anchor would raise TypeError and abort the whole collection.
                if anchor is None or not anchor.get('href'):
                    continue

                link = anchor['href']
                links.append(link)
                file.write(f'{link}\n')
    finally:
        driver.quit()

print(f'Finished retrieving {len(links)} links')

**Gather data from collected links**

In [None]:
# The collected links are only partial URLs, so prepend the site's base
# address to each one to obtain a complete URL.
full_links = [
    '<input url without endpoint>' + partial_link
    for partial_link in links
]
print(f'Check number of links: {len(full_links)}')

In [None]:
# Visit every collected URL and save the JSON-LD blocks found on each page.
savepath = '<folder path to save scraped data>'
links_count = 0  # number of URLs the scraper has processed so far

# A single browser is reused for every page and is always closed in the
# `finally` clause, so no Chrome instance is left running if a page fails.
driver = webdriver.Chrome()
try:
    # NOTE(review): an implicit wait applies to element lookups
    # (find_element*); nothing in this loop performs one, so confirm that this
    # setting is still needed.
    driver.implicitly_wait(3)

    for url in full_links:
        driver.get(url)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        data = [
            json.loads(script.string)
            for script in soup.find_all("script", type="application/ld+json")
            if script.string  # an empty <script> tag has no text to parse
        ]

        # The output file is named after the last two path segments of the URL.
        filename = '_'.join(url.split('/')[-2:])
        # "w" mode overwrites any file left by an earlier run for the same page.
        with open(f"{savepath}/{filename}.json", "w") as outfile:
            json.dump(data, outfile)

        links_count += 1
finally:
    driver.quit()

print(f'Data collection is complete for {links_count} URLs.')