In [None]:
from datetime import datetime
import sys
import time
from collections import namedtuple
from pyquery import PyQuery
from requests import get
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from tqdm.notebook import tqdm
import pandas as pd
from requests import get

In [None]:
# Immutable record for one scraped result row. The generated class is named
# 'Ticker'; its five fields are given as a whitespace-separated string.
TickerTuple = namedtuple(
    'Ticker',
    'symbol name industry type exchange',
)

In [None]:
class YahooLookupBrowser:
    """Fetch and parse result pages of the Yahoo Finance symbol lookup.

    Pages are downloaded with ``requests.get`` and parsed with PyQuery.
    A headless Firefox instance is still available through ``browser``, but
    it is only started the first time that attribute is read, so creating an
    instance no longer spawns a browser process that ``lookup`` never uses.

    The object is a context manager; leaving the ``with`` block (or calling
    ``close()``) quits the browser if one was started.

    Args:
        timeout: Seconds to wait for the HTTP response in ``lookup``.
    """

    def __init__(self, timeout=30):
        # Started on demand by the ``browser`` property.
        self._browser = None
        self.timeout = timeout
        self.base_url = "https://finance.yahoo.com/lookup/{category}?s={key}&t=A&b={start}&c={size}"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @property
    def browser(self):
        """Headless Firefox driver, created on first access."""
        if self._browser is None:
            self._browser = self._open_browser()
        return self._browser

    @browser.setter
    def browser(self, value):
        self._browser = value

    def close(self):
        """Quit the browser if it was started; safe to call repeatedly."""
        if self._browser is not None:
            self._browser.quit()
            self._browser = None

    def _open_browser(self):
        """Start and return a headless Firefox WebDriver."""
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        return webdriver.Firefox(options=options)

    def lookup(self, key, category='all', start=0, size=100) -> "tuple[list, int]":
        """Fetch one lookup page and return its rows plus the reported total.

        Args:
            key: Search string, inserted into the URL verbatim. It is not
                URL-encoded here, so callers may pass pre-encoded text
                such as ``'a%20b'``.
            category: Lookup category path segment of the URL.
            start: Offset of the first result (``b`` query parameter).
            size: Requested number of results (``c`` query parameter).

        Returns:
            ``(rows, total)`` where ``rows`` is a list of ``TickerTuple`` and
            ``total`` is the count parsed from the page; ``([], 0)`` when the
            page reports zero matches.

        Raises:
            requests.RequestException: On timeout, connection failure or a
                non-2xx HTTP status.
            IndexError, AttributeError, ValueError: If the page does not have
                the expected structure.
        """
        url = self.base_url.format(category=category, key=key, start=start, size=size)
        # A timeout keeps a stalled connection from hanging the caller forever.
        source = get(url, timeout=self.timeout)
        # Fail with the HTTP status instead of an obscure parsing error later.
        source.raise_for_status()
        raw_data = PyQuery(source.text)

        # The first link pointing at /lookup carries the total in parentheses.
        title = raw_data("a[href*=\\/lookup]")[0].find('span').text_content()
        count_text = title[title.find('(') + 1:title.find(')')]
        # Strip separators in case the count is rendered like "1,234".
        total = int(count_text.replace(',', ''))

        if total == 0:
            return ([], 0)

        tbody = raw_data("tbody")
        ans = []
        for row in tbody[0].findall('tr'):
            td = row.findall("td")
            # Drop the third cell so the remaining five match TickerTuple's
            # fields (presumably the price column -- confirm against the page).
            td.pop(2)
            ans.append(TickerTuple._make(x.text_content() for x in td))

        return (ans, total)

In [None]:
%%time
lookup_browser = YahooLookupBrowser()
ans, total = lookup_browser.lookup(key='a')
print(len(ans), total)

In [None]:
%%time

ylb = YahooLookupBrowser()
answers = set()

perpage = 10000
idx = 0
letters = list('abcdefghijklmnopqrstuvwxyz')
queue = list(letters)
pbar = tqdm(total=len(queue))

while idx < len(queue):
    pbar.set_description(f"[query = {queue[idx]}]")
    pbar.refresh()

    try:
        t = datetime.now()
        ans, total = ylb.lookup(key=queue[idx], category='all', start=0, size=perpage)
        seconds = (datetime.now() - t).total_seconds()
        pbar.write(f"query = {queue[idx]} | count = {len(ans)} | total = {total} | seconds = {seconds}")
    except Exception as e:
        pbar.write(f"error (idx = {idx}, query = {queue[idx]}): " + str(e))
        pbar.write("wait for 10 seconds...")
        time.sleep(10)  # s
        continue

    if total > perpage:
        add = [queue[idx] + '%20' + letter for letter in letters] + [queue[idx] + letter for letter in letters]
        queue += add
        pbar.write(f"Add new queries {queue[idx]}[%20][a-z] to queue")
        pbar.reset(total=len(queue))
        pbar.update(n=idx)

    answers.update(ans)
    pbar.update()
    idx += 1

In [None]:
# Sort for a stable row order, build the frame and persist it.
answer_list = sorted(answers)
# Explicit columns keep the schema intact even when nothing was scraped.
df = pd.DataFrame(answer_list, columns=list(TickerTuple._fields))
# index=False: the positional index carries no information, and writing it
# makes it come back from read_csv as a spurious "Unnamed: 0" column.
df.to_csv('answers.csv', index=False)
df

In [None]:
# Distinct exchange codes, in order of first appearance.
print(df['exchange'].drop_duplicates().tolist())

In [None]:
# Rows that are Technology stocks listed on the NMS exchange.
a = df.loc[
    df['industry'].eq('Technology')
    & df['type'].eq('Stocks')
    & df['exchange'].eq('NMS')
]
a

In [None]:
# Reload the saved results and order them for browsing.
# keep_default_na=False stops pandas from turning text such as "NA", "NAN"
# or "NULL" into NaN, which would silently erase symbols spelled that way.
# Empty cells therefore stay empty strings and sort first.
df = (
    pd.read_csv('answers.csv', keep_default_na=False)
    .sort_values(by=['type', 'exchange', 'industry', 'symbol'])
)
df

In [None]:
# Rows whose symbol already appeared earlier in the frame (first one is kept).
df.loc[df.duplicated(subset='symbol', keep='first')]