In [1]:
from datetime import datetime, timedelta
import pickle
from time import sleep

from bs4 import BeautifulSoup
from requests_html import HTMLSession
import yfinance as yf

In [2]:
EXCHANGE_PATHS = {
    'nasdaq': 'nasdaq',
    'nyse': 'newyorkstockexchange',
    'amex': 'americanstockexchange'}
base_url = 'https://www.advfn.com'
session = HTMLSession()

In [3]:
def get_soup(url):
    try:
        res = session.get(url)
        html = res.html.html
        soup = BeautifulSoup(html, 'html.parser')
        return soup
    except BaseException as e:
        print(f'Problem retrieving page: {url}\n{e}')

In [4]:
def get_table(soup):
    try:
        table = soup.find('table', class_='market')
        return table
    except BaseExcetpion as e:
        print('Unable to obtain table')
        print(e)

In [5]:
def get_rows(table):
    try:
        rows = table.find_all('tr')
        rows = [
            row for row in rows if 'class' in row.attrs 
            and row.attrs['class'][0].startswith('ts')]
        return rows
    except BaseException as e:
        print('Unable to extract rows')
        print(e)

In [6]:
def is_valid(symbol):
    MAXLEN = 4
    return len(symbol) <= MAXLEN and symbol.isalpha()

In [7]:
def get_symbol(row):
    SYMBOL_COL_IDX = 1
    try:
        symbol = row.find_all('a')[SYMBOL_COL_IDX].text
        if is_valid(symbol):
            return symbol
    except BaseException as e:
        print('Unable to get symbol from row:', row)
        print(e)

In [8]:
def extract_symbols(soup):
    symbols = []
    table = get_table(soup)
    rows = get_rows(table)
    for row in rows:
        symbol = get_symbol(row)
        if symbol is not None:
            symbols.append(symbol)
    return symbols

In [9]:
def extract_all_symbols():
    all_symbols = []
    for exchange, path in EXCHANGE_PATHS.items():
        print(f'Beggining {exchange}\nLetter:')
        for letter in list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            print(letter, end='')
            url = f'{base_url}/{exchange}/{path}.asp?companies={letter}'
            soup = get_soup(url)
            symbols = extract_symbols(soup)
            all_symbols += symbols
        print()
    return sorted(all_symbols)

In [10]:
all_symbols = extract_all_symbols()

Beggining nasdaq
Letter:
ABCDEFGHIJKLMNOPQProblem retrieving page: https://www.advfn.com/nasdaq/nasdaq.asp?companies=Q



NameError: name 'BaseExcetpion' is not defined

In [None]:
all_symbols = sorted(list(set(all_symbols)))

In [None]:
all_symbols[:10]

In [None]:
all_symbols[-10:]

In [None]:
with open('../data/all_symbols.pkl', 'wb') as f:
    pickle.dump(all_symbols, f)

In [None]:
n = len(all_symbols)
n

In [None]:
END = datetime.now()
START = END - timedelta(7)
test = yf.download(['AAPL', 'TSLA', 'ZQZZ'], start=str(START.date()), end=str(END.date()))['Adj Close']
test

In [None]:
test2 = test.loc[:, test.iloc[-1, :].notnull()]
test2

In [None]:
test2.columns

In [None]:
start = str(START.date())
end = str(END.date())
start, end

### Next Time
Code below is inefficient, bc it retries symbols if < $2 instead of removing them. 
Add to `omit` instead and do not rerun. (Update for all code chunks below.)

In [None]:
i = 0
active = []
failed = []
omit = []

while i <= n:
    first = i
    last = min(i + 100, n)
    batch = all_symbols[first:last]
    print(f'Batch: {batch[0]} - {batch[-1]}', end='\r')
    df = yf.download(batch, start=start, end=end)['Adj Close']
    df = df.loc[:, df.iloc[-1, :].notnull()]
    batch_active = df.columns
    # Exclude if current val < 2.00
    df = df.loc[:, df.iloc[-1, :] >= 2.]
    batch_omit = [sym for sym in batch_active if sym not in df.columns]
    omit += list(batch_omit)
    active += list(df.columns)
    fails = [sym for sym in batch if sym not in df.columns and sym not in batch_omit]
    failed += fails
    sleep(3)
    i += 100

In [None]:
'ACAI' in active, 'ACAI' in failed, 'ACAI' in omit 

### 2nd Attempt

In [None]:
len(active), len(omit)

In [None]:
n = len(failed)
n

In [None]:
failed_again = []
i = 0
while i <= n:
    first = i
    last = min(i + 100, n)
    batch = failed[first:last]
    print(f'Batch: {batch[0]} - {batch[-1]}', end='\r')
    df = yf.download(batch, start=start, end=end)['Adj Close']
    print(df.shape)
    if len(df):
        df = df.loc[:, df.iloc[-1, :].notnull()]
        batch_active = df.columns
        # Exclude if current val < 2.00
        df = df.loc[:, df.iloc[-1, :] >= 2.]
        batch_omit = [sym for sym in batch_active if sym not in df.columns]
        omit += list(batch_omit)
        active += list(df.columns)
        fails = [sym for sym in batch if sym not in df.columns and sym not in batch_omit]
        failed_again += fails
    else:
        failed_again += batch
    sleep(3)
    i += 100

### 3rd Attempt

In [None]:
'ACAI' in active, 'ACAI' in failed, 'ACAI' in omit 

In [None]:
len(active), len(omit)

In [None]:
failed = failed_again[:]
n = len(failed)
n

In [None]:
failed_again = []
i = 0
while i <= n :
    first = i
    last = min(i + 100, n)
    batch = failed[first:last]
    print(f'Batch: {batch[0]} - {batch[-1]}', end='\r')
    df = yf.download(batch, start=start, end=end)['Adj Close']
    print(df.shape)
    if len(df):
        df = df.loc[:, df.iloc[-1, :].notnull()]
        batch_active = df.columns
        # Exclude if current val < 2.00
        df = df.loc[:, df.iloc[-1, :] >= 2.]
        batch_omit = [sym for sym in batch_active if sym not in df.columns]
        omit += list(batch_omit)
        active += list(df.columns)
        fails = [sym for sym in batch if sym not in df.columns and sym not in batch_omit]
        failed_again += fails
    else:
        failed_again += batch
    sleep(3)
    i += 100

### 4th Attempt

In [None]:
len(active), len(omit)

In [None]:
failed = failed_again[:]
n = len(failed)
n

In [None]:
with open('../data/all_symbols.pkl', 'wb') as f:
    pickle.dump(active, f)

In [None]:
'INTI' in active, 'INTI' in omit

In [None]:
!say complete

In [None]:
# Run...?
'''
failed_again = []
i = 0
while i <= n - 100:
    first = i
    last = min(i + 100, n)
    batch = failed[first:last]
    print(f'Batch: {batch[0]} - {batch[-1]}', end='\r')
    df = yf.download(batch, start=start, end=end)['Adj Close']
    print(df.shape)
    if len(df):
        df = df.loc[:, df.iloc[-1, :].notnull()]
        batch_active = df.columns
        # Exclude if current val < 2.00
        df = df.loc[:, df.iloc[-1, :] >= 2.]
        batch_omit = [sym for sym in batch_active if sym not in df.columns]
        omit += list(batch_omit)
        active += list(df.columns)
        fails = [sym for sym in batch if sym not in df.columns and sym not in batch_omit]
        failed_again += fails
    else:
        failed_again += batch
    sleep(3)
    i += 100
'''