**We download stock market data from the following website.**

Source: https://query1.finance.yahoo.com

In [1]:
import sys
import requests

import pandas as pd

from tqdm import tqdm_notebook as tqdm
from concurrent.futures import ThreadPoolExecutor

# Helpers

In [2]:
BASEURL = 'https://query1.finance.yahoo.com'

In [3]:
notnone = lambda x: x is not None

# Datasets

In [4]:
stocks = pd.read_csv('../datasets/stocks.csv.gz', index_col=0)

stocks.head()

Unnamed: 0,ticker,name,exchange,category name,country
0,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA
1,AAPL,Apple Inc.,NMS,Electronic Equipment,USA
2,BAC,Bank of America Corporation,NYQ,Money Center Banks,USA
3,AMZN,"Amazon.com, Inc.",NMS,Catalog & Mail Order Houses,USA
4,T,AT&T Inc.,NYQ,Telecom Services - Domestic,USA


In [5]:
stocks.describe(include='all')

Unnamed: 0,ticker,name,exchange,category name,country
count,106328,96676,95092,20728,95091
unique,106328,64859,67,214,41
top,BWI.SG,FINANC/TERMO,PNK,Industrial Metals & Minerals,USA
freq,1,439,14602,1292,22169


In [6]:
tickers = stocks['ticker']

tickers.head()

0    OEDV
1    AAPL
2     BAC
3    AMZN
4       T
Name: ticker, dtype: object

# Data infos

In [7]:
def download_info(ticker):
    """Download meta information from the given ticker."""
    url = f"{BASEURL}/v7/finance/quote?symbols={ticker}"
    
    try:
        res = requests.get(url=url).json()
        result = res['quoteResponse']['result'][0]
        
        return result
    except Exception as e:
        # fail silently
        return None

In [8]:
with ThreadPoolExecutor() as executor:
    gen = executor.map(download_info, tickers)
    infos = pd.DataFrame(filter(notnone, tqdm(gen, total=len(tickers))))

print(infos.shape)

infos.head()

HBox(children=(IntProgress(value=0, max=106328), HTML(value='')))


(82108, 75)


Unnamed: 0,language,region,quoteType,quoteSourceName,exchangeDataDelayedBy,fullExchangeName,esgPopulated,tradeable,triggerable,regularMarketTime,...,forwardPE,ytdReturn,trailingThreeMonthReturns,trailingThreeMonthNavReturns,openInterest,underlyingSymbol,contractSymbol,expireDate,expireIsoDate,headSymbolAsString
0,en-US,US,MUTUALFUND,Delayed Quote,0.0,YHD,False,False,False,1561760000.0,...,,,,,,,,,,
1,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,NasdaqGS,False,True,True,1567016000.0,...,15.996875,,,,,,,,,
2,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,NYSE,False,True,True,1567016000.0,...,8.879934,,,,,,,,,
3,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,NasdaqGS,False,True,True,1567016000.0,...,53.036423,,,,,,,,,
4,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,NYSE,False,True,True,1567016000.0,...,9.62431,,,,,,,,,


In [9]:
infos.describe(include='all')

Unnamed: 0,language,region,quoteType,quoteSourceName,exchangeDataDelayedBy,fullExchangeName,esgPopulated,tradeable,triggerable,regularMarketTime,...,forwardPE,ytdReturn,trailingThreeMonthReturns,trailingThreeMonthNavReturns,openInterest,underlyingSymbol,contractSymbol,expireDate,expireIsoDate,headSymbolAsString
count,82108,82108,82108,37384,78098.0,78098,82108,82108,82108,77135.0,...,11058.0,159.0,161.0,110.0,550.0,461,529,87.0,87,17
unique,1,1,11,2,,68,1,2,2,,...,,,,,,461,2,,15,17
top,en-US,US,EQUITY,Delayed Quote,,YHD,False,False,False,,...,,,,,,HDFCSENETF.NS,True,,2029-12-31T00:00:00Z,ZWTKRB-A
freq,82108,82108,68145,33164,,11424,82108,68415,77472,,...,,,,,,1,512,,62,1
mean,,,,,1.716433,,,,,1547038000.0,...,40121.57,9.962453,1.242112,0.558909,0.0,,,1799777000.0,,
std,,,,,5.226918,,,,,167828500.0,...,6404696.0,13.732888,6.146069,5.171344,0.0,,,148747100.0,,
min,,,,,0.0,,,,,0.0,...,-386282500.0,-22.38,-20.6,-20.73,0.0,,,1473293000.0,,
25%,,,,,0.0,,,,,1565330000.0,...,5.033942,0.71,-1.44,-1.575,0.0,,,1573560000.0,,
50%,,,,,0.0,,,,,1566973000.0,...,10.92902,8.16,1.32,0.96,0.0,,,1893370000.0,,
75%,,,,,0.0,,,,,1567005000.0,...,19.09479,18.44,4.12,3.0775,0.0,,,1893370000.0,,


# Data extras

In [10]:
extras = pd.merge(stocks, infos, how='inner', left_on='ticker', right_on='symbol')

extras.head()

Unnamed: 0,ticker,name,exchange_x,category name,country,language,region,quoteType,quoteSourceName,exchangeDataDelayedBy,...,forwardPE,ytdReturn,trailingThreeMonthReturns,trailingThreeMonthNavReturns,openInterest,underlyingSymbol,contractSymbol,expireDate,expireIsoDate,headSymbolAsString
0,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA,en-US,US,MUTUALFUND,Delayed Quote,0.0,...,,,,,,,,,,
1,AAPL,Apple Inc.,NMS,Electronic Equipment,USA,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,...,15.996875,,,,,,,,,
2,BAC,Bank of America Corporation,NYQ,Money Center Banks,USA,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,...,8.879934,,,,,,,,,
3,AMZN,"Amazon.com, Inc.",NMS,Catalog & Mail Order Houses,USA,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,...,53.036423,,,,,,,,,
4,T,AT&T Inc.,NYQ,Telecom Services - Domestic,USA,en-US,US,EQUITY,Nasdaq Real Time Price,0.0,...,9.62431,,,,,,,,,


In [11]:
extras.describe(include='all')

Unnamed: 0,ticker,name,exchange_x,category name,country,language,region,quoteType,quoteSourceName,exchangeDataDelayedBy,...,forwardPE,ytdReturn,trailingThreeMonthReturns,trailingThreeMonthNavReturns,openInterest,underlyingSymbol,contractSymbol,expireDate,expireIsoDate,headSymbolAsString
count,82099,80216,79005,20570,79005,82099,82099,82099,37375,78089.0,...,11058.0,159.0,161.0,110.0,550.0,461,529,87.0,87,17
unique,82073,54594,64,214,41,1,1,11,2,,...,,,,,,461,2,,15,17
top,M,ICICI Prudential Mutual Fund,PNK,Industrial Metals & Minerals,USA,en-US,US,EQUITY,Delayed Quote,,...,,,,,,HDFCSENETF.NS,True,,2029-12-31T00:00:00Z,ZWTKRB-A
freq,9,34,11285,1281,18020,82099,82099,68145,33155,,...,,,,,,1,512,,62,1
mean,,,,,,,,,,1.716631,...,40121.57,9.962453,1.242112,0.558909,0.0,,,1799777000.0,,
std,,,,,,,,,,5.227186,...,6404696.0,13.732888,6.146069,5.171344,0.0,,,148747100.0,,
min,,,,,,,,,,0.0,...,-386282500.0,-22.38,-20.6,-20.73,0.0,,,1473293000.0,,
25%,,,,,,,,,,0.0,...,5.033942,0.71,-1.44,-1.575,0.0,,,1573560000.0,,
50%,,,,,,,,,,0.0,...,10.92902,8.16,1.32,0.96,0.0,,,1893370000.0,,
75%,,,,,,,,,,0.0,...,19.09479,18.44,4.12,3.0775,0.0,,,1893370000.0,,


In [12]:
extras.to_csv('../datasets/extras.csv.gz')

# Data points

In [13]:
def download_data(ticker, range='5y', interval='1d', events='div,splits'):
    """Download market information for the given ticker and time range."""
    url = f"{BASEURL}/v8/finance/chart/{ticker}"
    
    params = dict(
        range=range,
        events=events,
        interval=interval,
    )
    
    try:
        res = requests.get(url=url, params=params).json()
        result = res['chart']['result'][0]
        splits = result['events']['splits']
        dividends = result['events']['dividends']
        indicators = result['indicators']['quote'][0]
                
        splits = pd.DataFrame(splits.values()).assign(ticker=ticker)
        dividends = pd.DataFrame(dividends.values()).assign(ticker=ticker)
        quotes = pd.DataFrame(indicators).assign(date=result['timestamp'], ticker=ticker)
        
        return quotes, splits, dividends
    except Exception as e:
        # fail silently
        return None

In [14]:
with ThreadPoolExecutor() as executor:
    gen = executor.map(download_data, tickers)
    values = filter(notnone, tqdm(gen, total=len(tickers)))
    quotes, splits, dividends = zip(*values)

allsplits = pd.concat(splits, sort=False)
allquotes = pd.concat(quotes, sort=False)
alldividends = pd.concat(dividends, sort=False)

allquotes.shape, allsplits.shape, alldividends.shape

HBox(children=(IntProgress(value=0, max=106328), HTML(value='')))




((5816659, 7), (7086, 5), (38819, 3))

## Quotes

In [15]:
allquotes.head()

Unnamed: 0,high,close,open,low,volume,date,ticker
0,40.082039,39.817711,39.937859,39.697559,2778000.0,1409232600,AA
1,40.106071,39.91383,39.961891,39.577412,3811300.0,1409319000,AA
2,40.178162,40.05801,39.98592,39.793678,4474400.0,1409664600,AA
3,40.851002,40.682789,40.082039,40.05801,4523400.0,1409751000,AA
4,41.71608,41.091301,40.80294,40.634731,6497800.0,1409837400,AA


In [16]:
allquotes.describe(include='all')

Unnamed: 0,high,close,open,low,volume,date,ticker
count,5777200.0,5777200.0,5777200.0,5777200.0,5777200.0,5816659.0,5816659
unique,,,,,,,4969
top,,,,,,,ISFIN.IS
freq,,,,,,,1305
mean,1516.621,1476.032,1483.932,1442.154,3919745.0,1489245000.0,
std,108088.6,104519.2,105805.7,101922.8,180245900.0,45327200.0,
min,0.0,0.0,0.0,0.0,0.0,1409198000.0,
25%,6.66667,6.6,6.6,6.53,0.0,1450103000.0,
50%,19.37372,19.22,19.21,19.06,6001.0,1489712000.0,
75%,48.04,47.65,47.6596,47.25,375078.5,1528438000.0,


In [17]:
allquotes.to_csv('../datasets/quotes.csv.gz')

## Splits

In [18]:
allsplits.head()

Unnamed: 0,date,numerator,denominator,splitRatio,ticker
0,1478007000,1000,801,801/1000,AA
1,1475760600,1,3,3/1,AA
0,1559568600,4725,10000,10000/4725,DD
1,1554211800,1487,1000,1000/1487,DD
0,1412256600,14,1,1/14,MT


In [19]:
allsplits.describe(include='all')

Unnamed: 0,date,numerator,denominator,splitRatio,ticker
count,7086.0,7086.0,7086.0,7086,7086
unique,,,,532,4969
top,,,,1/2,UBFO
freq,,,,1138,11
mean,1484535000.0,1034.722,319.618261,,
std,45175370.0,56223.22,5508.402633,,
min,1409206000.0,0.0,0.0,,
25%,1441692000.0,2.0,1.0,,
50%,1482786000.0,5.0,5.0,,
75%,1523961000.0,26.0,30.0,,


In [20]:
allsplits.to_csv('../datasets/splits.csv.gz')

## Dividends

In [21]:
alldividends.head()

Unnamed: 0,amount,date,ticker
0,0.07209,1462368600,AA
1,0.09,1478093400,AA
2,0.07209,1415197800,AA
3,0.07209,1446647400,AA
4,0.07209,1431005400,AA


In [22]:
alldividends.describe(include='all')

Unnamed: 0,amount,date,ticker
count,38819.0,38819.0,38819
unique,,,4969
top,,,ITUB
freq,,,80
mean,63.660139,1490452000.0,
std,5404.631248,45621700.0,
min,0.0,1409206000.0,
25%,0.098003,1451313000.0,
50%,0.2675,1491880000.0,
75%,1.33,1529977000.0,


In [23]:
alldividends.to_csv('../datasets/dividends.csv.gz')