In [25]:
#scraping for multiple pages in a loop

#import libraries used for accessing pages randomly (mimicking human behavior)
from time import sleep
from time import time
from random import randint
from warnings import warn
from IPython.core.display import clear_output

import pandas as pd

from requests import get
from bs4 import BeautifulSoup

headers = {"Accept-Language": "en-US, en;q=0.5"}

#an array of 5-tuples: (Company Ticker, Violation Date, Start Date, End Date)
#for now, our array is random and contains UNIX dates for convenience
violations = [("AAPL", 0, 1528095600, 1559631600, 0)]

#a list of dataframes, each corresponding to a violation
dataframes = {}

# For every (ticker, sdate, edate) triple make new get request
for v in violations:
    
    # Lists to store the scraped data in (for each violation)
    dates = []
    opens = []
    highs = []
    lows = []
    closes = []
    adjcloses = []
    volumes = []

    # Make a get request for daily stock values in the date range specified
    response = get('https://finance.yahoo.com/quote/' + str(v[0]) + 
                   '/history?period1=' + str(v[2]) +
                   '&period2=' + str(v[3]) + 
                   '&interval=1mo&filter=history&frequency=1mo',
                   headers = headers)
    
    print('Scraping for: ' + str(v[0]) + ', ' + str(v[2]) + ', ' + str(v[3]))

    # Pause the loop
    sleep(randint(3,8))

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the date rows on page (including dividends)
    date_containers = page_html.find_all('tr', {'class' : ["BdT Bdc($c-fuji-grey-c) Ta(end) Fz(s) Whs(nw)"]})

    # Extract data from individual book container
    for container in date_containers:
        cells = container.find_all('td', {'class' : ["Py(10px) Pstart(10px)"]})
        if len(cells) > 0:
            # The date
            date = container.td.text
            dates.append(date)
            # if the row is not a dividend            
            # The open price
            opn = cells[0].text
            opens.append(opn)
            # The high price
            high = cells[1].text
            highs.append(high)
            # The low price
            low = cells[2].text
            lows.append(low)
            # The close price
            close = cells[3].text
            closes.append(close)
            # The adjusted close price
            adjclose = cells[4].text
            adjcloses.append(adjclose)
            # The volume
            volume = cells[5].text
            volumes.append(volume)
            
    # Make a dataframe for this iteration
    # Each dataframe corresponds to a specific violation
    violation_df = pd.DataFrame({'date': dates,
                              'open': opens,
                              'high': highs,
                              'close': closes,
                              'adjusted close': adjcloses,
                              'volume' : volumes
    })
    
    # Add dataframe to dictionary called dataframes
    dataframes[v] = violation_df

Scraping for: AAPL, 1528095600, 1559631600


In [22]:
dataframes

{('AAPL',
  '1528095600',
  '1559631600'):             date    open    high   close adjusted close       volume
 0   Jun 01, 2019  175.60  185.47  185.22         185.22  121,852,700
 1   May 01, 2019  209.88  215.31  175.07         174.40  739,456,600
 2   Apr 01, 2019  191.64  208.48  200.67         199.90  506,117,700
 3   Mar 01, 2019  174.28  197.69  189.95         189.22  650,945,900
 4   Feb 01, 2019  166.96  175.87  173.15         171.75  472,540,600
 5   Jan 01, 2019  154.89  169.00  166.44         165.09  828,019,300
 6   Dec 01, 2018  184.46  184.94  157.74         156.46  898,922,500
 7   Nov 01, 2018  219.05  222.36  178.58         176.52  961,326,400
 8   Oct 01, 2018  227.95  233.47  218.86         216.33  789,748,500
 9   Sep 01, 2018  228.41  229.67  225.74         223.14  678,972,000
 10  Aug 01, 2018  199.13  228.87  227.63         224.22  700,273,700
 11  Jul 01, 2018  183.82  195.96  190.29         187.44  393,691,400
 12  Jun 01, 2018       -       -       -       

In [23]:
for v in violations:
    

[('AAPL', '1528095600', '1559631600')]