In [None]:
import datetime
import sys
import time
from collections import namedtuple
from urllib.parse import quote

from pyquery import PyQuery
from requests import get
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from tqdm.notebook import tqdm

In [None]:
# One scraped lookup-table row; the field order is the order in which cell
# texts are passed to TickerTuple._make().
TickerTuple = namedtuple('Ticker', 'symbol name industry type exchange')

In [None]:
class YahooLookupBrowser:
    """Headless-Firefox scraper for the Yahoo Finance symbol lookup pages.

    Instances own a WebDriver session. Use the class as a context manager so
    the session is always shut down::

        with YahooLookupBrowser() as yahoo:
            tickers, total = yahoo.lookup('a')
    """

    def __init__(self):
        self.browser = self._open_browser()
        self.base_url = "https://finance.yahoo.com/lookup/{category}?s={key}&t=A&b={start}&c={size}"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # quit() ends the whole WebDriver session (and its driver process);
        # close() would only close the current window and leak the session.
        self.browser.quit()

    def _open_browser(self):
        """Start and return a new headless Firefox WebDriver."""
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        return webdriver.Firefox(options=options)

    @staticmethod
    def _parse_total(title):
        """Extract the integer found between the first '(' and the next ')'.

        Args:
            title: Text containing a parenthesised count.

        Returns:
            The count as an int.

        Raises:
            ValueError: If no parenthesised integer is present.
        """
        open_idx = title.find('(')
        close_idx = title.find(')', open_idx + 1)
        if open_idx < 0 or close_idx < 0:
            raise ValueError(f"no result count found in {title!r}")
        # Thousands separators are stripped defensively; whether the page ever
        # emits them is not known from this code.
        return int(title[open_idx + 1:close_idx].replace(',', ''))

    def lookup(self, key, category='all', start=0, size=100) -> tuple:
        """Load one lookup result page and parse its table.

        Args:
            key: Search string; it is URL-encoded before being placed in the query.
            category: Path segment of the lookup URL (default 'all').
            start: Value of the ``b`` query parameter (presumably the row offset).
            size: Value of the ``c`` query parameter (presumably the page size).

        Returns:
            ``(tickers, total)`` where ``tickers`` is a list of TickerTuple
            parsed from the first ``<tbody>`` and ``total`` is the count parsed
            from the first lookup link. ``([], 0)`` when the count is zero.

        Raises:
            ValueError: If the count link or its ``<span>`` is missing, or the
                count cannot be parsed.
        """
        url = self.base_url.format(
            category=category, key=quote(str(key), safe=''), start=start, size=size)
        self.browser.get(url)
        raw_data = PyQuery(self.browser.page_source)

        # The first link pointing at /lookup/ holds the count in a <span>
        # (presumably text like "All (1234)" -- confirm against the live page).
        links = raw_data('a[href*="/lookup/"]')
        if not links:
            raise ValueError(f"no lookup result links found at {url}")
        span = links[0].find('span')
        if span is None:
            raise ValueError(f"lookup link without a count <span> at {url}")
        total = self._parse_total(span.text_content())

        if total == 0:
            return ([], 0)

        tbody = raw_data("tbody")
        ans = []
        for row in tbody[0].findall('tr'):
            td = row.findall("td")
            # Drop the third cell: TickerTuple has no field for it.
            td.pop(2)
            ans.append(TickerTuple._make(x.text_content() for x in td))

        return (ans, total)

In [None]:
%%time
# Smoke test: fetch the first page for the query 'a'.
# The context manager shuts the headless browser down when the cell finishes,
# so no Firefox process is left behind once the name is rebound further down.
with YahooLookupBrowser() as lookup_browser:
    ans, total = lookup_browser.lookup(key='a')

In [None]:
# Number of rows parsed from the page, then the total reported by the page.
print(f"{len(ans)} {total}")

In [None]:
%%time

# Crawl tickers by prefix: start with the single letters a-z and, whenever a
# query reports more results than one page can hold, queue the 26 longer
# prefixes <query>a .. <query>z so the overflow is fetched in smaller slices.
MAX_RETRIES = 3  # attempts per query before it is skipped

perpage = 10000
idx = 0
letters = list('abcdefghijklmnopqrstuvwxyz')
queue = list(letters)
failed_queries = []  # queries given up on after MAX_RETRIES attempts
attempts = 0         # failed attempts for the query at queue[idx]

answers = set()
# Both the browser and the progress bar are released even if the crawl is
# interrupted or raises.
with YahooLookupBrowser() as lookup_browser, tqdm(total=len(queue)) as pbar:
    while idx < len(queue):
        query = queue[idx]
        pbar.set_description(f"[Query {query}]")
        pbar.refresh()

        try:
            ans, total = lookup_browser.lookup(key=query, category='all', start=0, size=perpage)
        except Exception as e:
            attempts += 1
            pbar.write(f"error: idx = {idx}")
            pbar.write(str(e))
            if attempts < MAX_RETRIES:
                time.sleep(attempts)  # short linear back-off before retrying
                continue
            # Give up on this query instead of retrying it forever.
            pbar.write(f"Skip query {query} after {MAX_RETRIES} failed attempts")
            failed_queries.append(query)
            attempts = 0
            pbar.update()
            idx += 1
            continue

        attempts = 0
        pbar.write(f"query = {query} | count = {len(ans)} | total = {total}")

        if total > perpage:
            queue += [query + letter for letter in letters]
            pbar.write(f"Add new queries {query}[a-z] to queue")
            # reset() zeroes the bar, so restore the progress made so far.
            pbar.reset(total=len(queue))
            pbar.update(n=idx)

        # The set drops tickers returned by more than one prefix.
        answers.update(ans)
        pbar.update()
        idx += 1

In [None]:
# Order the de-duplicated tickers by symbol and preview the first ten.
# (There is no builtin sort(); sorted() builds the ordered list directly.)
answer_list = sorted(answers, key=lambda ticker: ticker.symbol)
print(answer_list[0:10])

In [None]:
# Record the result count Yahoo reports for every one- and two-character
# prefix built from a-z plus '^', '.', '=' and '-'.
letters0 = list('abcdefghijklmnopqrstuvwxyz^.=-')
letters = letters0 + [x + y for x in letters0 for y in letters0]
base_url = "https://finance.yahoo.com/lookup/all?s={key}&t=A&b=0&c=100"
ans = []

browser = lookup_browser._open_browser()
try:
    for x in tqdm(letters):
        # Percent-encode the key: it may contain '^' or '=', which must not be
        # placed raw inside the query string.
        browser.get(base_url.format(key=quote(x, safe='')))
        raw_data = PyQuery(browser.page_source)
        # The first link pointing at /lookup/ holds the count in parentheses.
        text = raw_data('a[href*="/lookup/"]')[0].find('span').text_content()
        num = int(text[text.find('(')+1:text.find(')')])
        ans.append((x, num))
finally:
    # Always end the WebDriver session, even if a page fails to parse.
    browser.quit()

In [None]:
# Show the (prefix, count) pairs whose reported count is at least 10000.
print([(prefix, count) for prefix, count in ans if count >= 10000])