In [None]:
from datetime import datetime
import sys
import time
from collections import namedtuple
from pyquery import PyQuery
from requests import get
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from tqdm.notebook import tqdm
import pandas as pd
from requests import get

In [None]:
# One row of the lookup results table: the text of each kept column, in page order.
TickerTuple = namedtuple('Ticker', 'symbol name industry type exchange')
# One lookup category: position of its tab on the page, its URL code and its tab label.
CategoryTuple = namedtuple('Category', 'index code name')

In [None]:
# Exploratory request: fetch one lookup page and list the labels of its category tabs.
test_url = 'https://finance.yahoo.com/lookup/all?s=a'
# requests never times out on its own; without this a stalled server blocks the kernel.
source = get(test_url, timeout=30)
raw_data = PyQuery(source.text)
y = raw_data[0]
# Every link whose href contains "/lookup" wraps its label in a <span>.
titles = [x.find('span').text_content() for x in raw_data("a[href*=\\/lookup]")]
titles

In [None]:
class YahooLookupBrowser:
    """The browser simulator to lookup tickers in Yahoo Finance.

    Instance fields:
        base_url: URL template with ``category``, ``key``, ``start`` and ``size`` slots.
        timeout: Timeout in seconds handed to every HTTP request.
        categories: Maps a category code to its ``CategoryTuple``
            (tab position on the page, URL code, tab label).
    """

    def __init__(self, timeout: float = 30.0):
        """Set up the URL template and the table of known categories.

        Args:
            timeout (float, optional): Seconds to wait on the server before a
                request is abandoned. Defaults to 30.0.
        """
        self.base_url = "https://finance.yahoo.com/lookup/{category}?s={key}&t=A&b={start}&c={size}"
        self.timeout = timeout
        self.categories = {
            'all': CategoryTuple(0, 'all', 'All'),
            'equity': CategoryTuple(1, 'equity', 'Stocks'),
            'mutualfund': CategoryTuple(2, 'mutualfund', 'Mutual Funds'),
            'etf': CategoryTuple(3, 'etf', 'ETFs'),
            'index': CategoryTuple(4, 'index', 'Indices'),
            'future': CategoryTuple(5, 'future', 'Futures'),
            'currency': CategoryTuple(6, 'currency', 'Currencies'),
        }

    def lookup(self, key: str, category: str = 'all', start: int = 0, size: int = 100):
        """Lookup tickers in Yahoo Finance.

        Args:
            key (str): The keyword of the lookup. It is placed into the URL
                as-is, so the caller is responsible for percent-encoding
                (e.g. ``%20`` for a space).
            category (str, optional): Category of the tickers. Defaults to 'all'.
            start (int, optional): Start index of the lookup page. Defaults to 0.
            size (int, optional): Size of the lookup page. Defaults to 100.

        Returns:
            (list, int): The pair of lookup results in the page and total tickers matching the keyword.

        Raises:
            KeyError: If ``category`` is not a key of ``self.categories``.
                Raised before any request is sent.
            ConnectionRefusedError: If the response is not OK or carries the
                "Will be right back" placeholder.
            ValueError: If the page lacks the expected category tab or the
                tab label carries no parsable result count.
        """
        # Resolve the category before touching the network so a typo fails immediately.
        ct = self.categories[category]

        url = self.base_url.format(category=category, key=key, start=start, size=size)
        # requests never times out on its own; a stalled server would block forever.
        response = get(url, timeout=self.timeout)

        if not response.ok or 'Will be right back' in response.text:
            # page temporarily unavailable
            raise ConnectionRefusedError("Lookup page is temporarily unavailable")

        raw_data = PyQuery(response.text)
        total = self._parse_total(self._tab_title(raw_data, ct))

        if total == 0:
            # nothing is in the lookup page, so don't parse it
            return ([], 0)

        tbody = raw_data("tbody")
        if len(tbody) == 0:
            # A count was reported but no table was rendered; report an empty
            # page rather than failing on tbody[0].
            return ([], total)

        ans = []
        for row in tbody[0].findall('tr'):
            td = row.findall("td")
            td.pop(2)  # remove latest price
            ans.append(TickerTuple._make(x.text_content() for x in td))

        return (ans, total)

    @staticmethod
    def _tab_title(page, ct):
        """Return the label text of the tab at position ``ct.index``, checked against ``ct.name``."""
        tabs = page("a[href*=\\/lookup]")
        # Compare with None explicitly: element truthiness is not a presence test.
        span = tabs[ct.index].find('span') if ct.index < len(tabs) else None
        if span is None:
            raise ValueError(f"Lookup page has no tab for category {ct.code!r}")
        title = span.text_content()
        # Explicit check instead of assert, which is removed when Python runs with -O.
        if not title.startswith(ct.name):
            raise ValueError(f"Expected tab {ct.name!r} at position {ct.index}, found {title!r}")
        return title

    @staticmethod
    def _parse_total(title: str) -> int:
        """Extract the integer between the first '(' and the following ')' of a tab label."""
        open_idx = title.find('(')
        close_idx = title.find(')', open_idx + 1)
        if open_idx < 0 or close_idx < 0:
            raise ValueError(f"No result count found in tab title {title!r}")
        # Thousands separators are stripped defensively; whether the page uses them is unverified.
        return int(title[open_idx + 1:close_idx].replace(',', ''))

In [None]:
# Smoke test: run one lookup per known category for the keyword 'a', timing each
# request and printing the category code, the number of rows parsed from the page
# and the total reported by the page.
lookup_browser = YahooLookupBrowser()
for cat in lookup_browser.categories:
    %time ans, total = lookup_browser.lookup(key='a', category=cat)
    print(cat, len(ans), total)

In [None]:
%%time

# Crawl the lookup pages breadth-first over query strings, starting from the
# single letters a-z, and collect every ticker seen into `answers`.
ylb = YahooLookupBrowser()
answers = set()

perpage = 10000
max_retries = 10  # consecutive failures tolerated for one query before giving up
retry_wait = 10  # s
idx = 0
retries = 0
letters = list('abcdefghijklmnopqrstuvwxyz')
queue = list(letters)
pbar = tqdm(total=len(queue))

try:
    while idx < len(queue):
        pbar.set_description(f"[query = {queue[idx]}]")
        pbar.refresh()

        try:
            # perf_counter is monotonic, so the measurement is immune to clock adjustments
            t = time.perf_counter()
            ans, total = ylb.lookup(key=queue[idx], category='all', start=0, size=perpage)
            seconds = time.perf_counter() - t
            pbar.write(f"query = {queue[idx]} | count = {len(ans)} | total = {total} | seconds = {seconds}")
        except Exception as e:
            retries += 1
            pbar.write(f"error (idx = {idx}, query = {queue[idx]}): " + str(e))
            if retries >= max_retries:
                # A failure that never clears would otherwise retry the same query
                # forever; `answers` and `idx` keep the progress made so far.
                raise RuntimeError(
                    f"giving up on query {queue[idx]!r} after {retries} failed attempts"
                ) from e
            pbar.write(f"wait for {retry_wait} seconds...")
            time.sleep(retry_wait)
            continue

        retries = 0

        if total > perpage:
            # More matches than one page can return: refine the query by appending
            # each letter, both after an encoded space and directly.
            # NOTE(review): if the server returns fewer rows than `perpage` allows,
            # results are truncated without expansion — confirm the real page cap.
            add = [queue[idx] + '%20' + letter for letter in letters] + [queue[idx] + letter for letter in letters]
            queue += add
            pbar.write(f"Add new queries {queue[idx]}[%20][a-z] to queue")
            # reset() zeroes the counter, so restore the number of finished queries
            pbar.reset(total=len(queue))
            pbar.update(n=idx)

        answers.update(ans)
        pbar.update()
        idx += 1
finally:
    pbar.close()

In [None]:
# A set has no order; sort so the same set of tickers always yields the same file.
answer_list = sorted(answers)
# Explicit columns keep the schema even when no ticker was collected.
df = pd.DataFrame(answer_list, columns=list(TickerTuple._fields))
# index=False: the positional index carries no information and would come back
# from read_csv as an extra "Unnamed: 0" column.
df.to_csv('answers.csv', index=False)
df

In [None]:
# Distinct exchange codes, in order of first appearance.
print([*df['exchange'].unique()])

In [None]:
# Rows whose industry, type and exchange all match the chosen values.
a = df.loc[
    df['industry'].eq('Technology')
    & df['type'].eq('Stocks')
    & df['exchange'].eq('NMS')
]
a

In [None]:
# Reload the saved tickers. Every column is text, so read it as text and switch off
# NaN inference: by default read_csv turns values such as "NA", "NULL" or "nan"
# into missing values, which would corrupt symbols or names spelled that way.
df = pd.read_csv('answers.csv', dtype=str, keep_default_na=False).sort_values(
    by=['type', 'exchange', 'industry', 'symbol']
)
df

In [None]:
# Rows whose symbol already appeared in an earlier row (the first occurrence is not listed).
df.loc[df['symbol'].duplicated()]