In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml
from openbb_terminal.sdk import openbb
import os
path="/Users/Massimiliano"
os.chdir(path)

pd.set_option('display.expand_frame_repr', False)

INFO:openbb_terminal.cryptocurrency.onchain.bitquery_model:START
INFO:openbb_terminal.cryptocurrency.onchain.bitquery_model:END


INFO:openbb_terminal.loggers:Logging configuration finished
INFO:openbb_terminal.loggers:Logging set to ['file', 'posthog']
INFO:openbb_terminal.loggers:Verbosity set to 20
INFO:openbb_terminal.loggers:LOGFORMAT: %(levelname)s-%(appName)s-%(commitHash)s-%(appId)s-%(sessionId)s-%(userId)s-%(asctime)s-%(name)s-%(funcName)s-%(lineno)s-%(message)s
INFO:openbb_terminal.core.session.sdk_session:START
INFO:openbb_terminal.core.session.sdk_session:END


In [3]:
DATA_STORE = Path('assets.h5')

In [75]:
df = (pd.read_csv('/Users/Massimiliano/wiki_stocks.csv',
        parse_dates=['date'],
        index_col=['date', 'ticker'],
        infer_datetime_format=True).sort_index())

print(df.info(null_counts=True))
with pd.HDFStore(DATA_STORE) as store:
    store.put('quandl/wiki/prices', df)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15389314 entries, (Timestamp('1962-01-02 00:00:00'), 'ARNC') to (Timestamp('2018-03-27 00:00:00'), 'ZUMZ')
Data columns (total 12 columns):
 #   Column       Non-Null Count     Dtype  
---  ------       --------------     -----  
 0   open         15388776 non-null  float64
 1   high         15389259 non-null  float64
 2   low          15389259 non-null  float64
 3   close        15389313 non-null  float64
 4   volume       15389314 non-null  float64
 5   ex-dividend  15389314 non-null  float64
 6   split_ratio  15389313 non-null  float64
 7   adj_open     15388776 non-null  float64
 8   adj_high     15389259 non-null  float64
 9   adj_low      15389259 non-null  float64
 10  adj_close    15389313 non-null  float64
 11  adj_volume   15389314 non-null  float64
dtypes: float64(12)
memory usage: 1.4+ GB
None


In [None]:
df = web.DataReader(name='SP500', data_source='fred', start=2009).squeeze().to_frame('close')
print(df.info())
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/fred', df)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2610 entries, 2013-09-09 to 2023-09-08
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   close   2518 non-null   float64
dtypes: float64(1)
memory usage: 40.8 KB
None


In [11]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
df = pd.read_html(url, header=0)[0]

In [12]:
df.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [16]:
df.columns = ['ticker', 'name', 'gics_sector', 'gics_sub_industry',
                'location', 'first_added', 'cik', 'founded']

In [17]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ticker             503 non-null    object
 1   name               503 non-null    object
 2   gics_sector        503 non-null    object
 3   gics_sub_industry  503 non-null    object
 4   location           503 non-null    object
 5   first_added        493 non-null    object
 6   cik                503 non-null    int64 
 7   founded            503 non-null    object
dtypes: int64(1), object(7)
memory usage: 31.6+ KB
None


In [78]:
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/stocks', df)

In [30]:

exchanges = ['NASDAQ.csv', 'AMEX.csv', 'NYSE.csv']
df = pd.concat([pd.read_csv(format(ex)) for ex in exchanges]).dropna(how='all', axis=1)

df = df[~df.index.duplicated()]
print(df.info()) 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4314 entries, 0 to 4313
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Symbol      4313 non-null   object 
 1   Name        4314 non-null   object 
 2   Last Sale   4314 non-null   object 
 3   Net Change  4314 non-null   float64
 4   % Change    4313 non-null   object 
 5   Market Cap  4302 non-null   float64
 6   Country     4313 non-null   object 
 7   IPO Year    2622 non-null   float64
 8   Volume      4314 non-null   int64  
 9   Sector      4184 non-null   object 
 10  Industry    4184 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 404.4+ KB
None


In [31]:
df.head()

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,AACG,ATA Creativity Global American Depositary Shares,$1.15,-0.03,-2.542%,36365500.0,China,2008.0,8843,Consumer Discretionary,Educational Services
1,AACI,Armada Acquisition Corp. I Common Stock,$10.75,0.01,0.093%,0.0,United States,2021.0,202,Finance,Blank Checks
2,AACIW,Armada Acquisition Corp. I Warrant,$0.0692,0.0,0.00%,0.0,United States,2021.0,10,Finance,Blank Checks
3,AADI,Aadi Bioscience Inc. Common Stock,$5.62,0.49,9.552%,137801600.0,United States,,136084,Health Care,Biotechnology: Pharmaceutical Preparations
4,AAL,American Airlines Group Inc. Common Stock,$14.11,0.13,0.93%,9218958000.0,United States,,17132956,Consumer Discretionary,Air Freight/Delivery Services


In [38]:
mcap = df[['Market Cap']].dropna()


### Store result

The file `us_equities_meta_data.csv` contains a version of the data used for many of the examples. Load using 
```
df = pd.read_csv('us_equities_meta_data.csv')
```
and proceed to store in HDF5 format.

In [24]:
sp500_overview = openbb.stocks.screener.screener_data(preset_loaded='sp500_filter', data_type = 'overview')
sp500_ownership = openbb.stocks.screener.screener_data(preset_loaded='sp500_filter', data_type = 'ownership')
sp500_performance = openbb.stocks.screener.screener_data(preset_loaded='sp500_filter', data_type = 'performance')
sp500_technical = openbb.stocks.screener.screener_data(preset_loaded='sp500_filter', data_type = 'technical')
sp500_valuation = openbb.stocks.screener.screener_data(preset_loaded='sp500_filter', data_type = 'valuation')

sp500_overview = sp500_overview.convert_dtypes()
sp500_ownership = sp500_ownership.convert_dtypes()
sp500_performance = sp500_performance.convert_dtypes()
sp500_technical = sp500_technical.convert_dtypes()
sp500_valuation = sp500_valuation.convert_dtypes()

sp500_overview.drop(columns = ['P/E'], inplace = True)
sp500_overview.set_index(keys = ['Ticker', 'Price', 'Change', 'Volume'], inplace = True)
sp500_performance.drop(columns = ['Avg Volume', 'Price', 'Change', 'Volume'], inplace = True)
sp500_performance.set_index(keys = ['Ticker'], inplace = True)
sp500_ownership.drop(columns = ['Price', 'Change', 'Volume', 'Market Cap'], inplace = True)
sp500_ownership.set_index(keys = ['Ticker'], inplace = True)
sp500_technical.drop(columns = ['Price', 'Change', 'Volume'], inplace = True)
sp500_technical.set_index(keys = ['Ticker'], inplace = True)
sp500_valuation.drop(columns = ['Price', 'Change', 'Volume', 'Market Cap'], inplace = True)
sp500_valuation.set_index(keys = ['Ticker'], inplace = True)


sp500_df = sp500_overview.join(sp500_valuation)
sp500_df = sp500_df.join(sp500_ownership)
sp500_df = sp500_df.join(sp500_performance)
sp500_df = sp500_df.join(sp500_technical)

sp500_df


INFO:openbb_terminal.stocks.screener.finviz_model:START
INFO:openbb_terminal.stocks.screener.finviz_model:{"INPUT": {"preset_loaded": "sp500_filter", "data_type": "overview", "limit": "-1", "ascend": "False", "chart": "False"}, "VIRTUAL_PATH": "stocks.screener.screener_data", "CHART": false}
INFO:openbb_terminal.stocks.screener.finviz_model:START


[Info] loading page [##############################] 26/26 

INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:START
INFO:openbb_terminal.stocks.screener.finviz_model:{"INPUT": {"preset_loaded": "sp500_filter", "data_type": "ownership", "limit": "-1", "ascend": "False", "chart": "False"}, "VIRTUAL_PATH": "stocks.screener.screener_data", "CHART": false}
INFO:openbb_terminal.stocks.screener.finviz_model:START


[Info] loading page [##############################] 26/26 

INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:START
INFO:openbb_terminal.stocks.screener.finviz_model:{"INPUT": {"preset_loaded": "sp500_filter", "data_type": "performance", "limit": "-1", "ascend": "False", "chart": "False"}, "VIRTUAL_PATH": "stocks.screener.screener_data", "CHART": false}
INFO:openbb_terminal.stocks.screener.finviz_model:START


[Info] loading page [##############################] 26/26 

INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:START
INFO:openbb_terminal.stocks.screener.finviz_model:{"INPUT": {"preset_loaded": "sp500_filter", "data_type": "technical", "limit": "-1", "ascend": "False", "chart": "False"}, "VIRTUAL_PATH": "stocks.screener.screener_data", "CHART": false}
INFO:openbb_terminal.stocks.screener.finviz_model:START


[Info] loading page [##############################] 26/26 

INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:START
INFO:openbb_terminal.stocks.screener.finviz_model:{"INPUT": {"preset_loaded": "sp500_filter", "data_type": "valuation", "limit": "-1", "ascend": "False", "chart": "False"}, "VIRTUAL_PATH": "stocks.screener.screener_data", "CHART": false}
INFO:openbb_terminal.stocks.screener.finviz_model:START


[Info] loading page [##############################] 26/26 

INFO:openbb_terminal.stocks.screener.finviz_model:END
INFO:openbb_terminal.stocks.screener.finviz_model:END


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Company,Sector,Industry,Country,Market Cap,P/E,Fwd P/E,PEG,P/S,P/B,...,Beta,ATR,SMA20,SMA50,SMA200,52W High,52W Low,RSI,from Open,Gap
Ticker,Price,Change,Volume,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
AAPL,175.01,-0.0042,108898991,Apple Inc.,Technology,Consumer Electronics,USA,2747560000000.0,29.41,26.54,4.62,7.16,45.58,...,1.28,3.65,-0.028,-0.0543,0.0633,-0.1171,0.4094,39.8,-0.0076,0.0035
MSFT,330.22,-0.025,36811776,Microsoft Corporation,Technology,Software - Infrastructure,USA,2516460000000.0,34.1,26.17,2.37,11.87,11.9,...,0.9,6.1,0.0068,-0.0068,0.1301,-0.0997,0.5472,49.73,-0.0201,-0.005
GOOGL,137.4,-0.0051,38790642,Alphabet Inc.,Communication Services,Internet Content & Information,USA,1746470000000.0,30.09,20.6,1.86,6.03,6.51,...,1.06,2.65,0.0296,0.0657,0.2489,-0.0093,0.6487,62.9,-0.0051,0.0
GOOG,138.3,-0.005,48628430,Alphabet Inc.,Communication Services,Internet Content & Information,USA,1737700000000.0,28.57,20.89,1.76,6.0,6.54,...,1.06,2.63,0.0299,0.0676,0.2502,-0.009,0.6573,62.88,-0.0044,-0.0006
AMZN,140.39,-0.0299,101770091,Amazon.com Inc.,Consumer Cyclical,Internet Retail,USA,1493190000000.0,111.6,44.55,,2.78,8.57,...,1.23,3.36,0.0253,0.0425,0.2651,-0.0375,0.7241,55.44,-0.0167,-0.0135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OGN,19.54,-0.0126,6070408,Organon & Co.,Healthcare,Drug Manufacturers - General,USA,5060000000.0,6.63,4.25,,0.82,,...,0.77,0.55,-0.0772,-0.0891,-0.182,-0.3975,0.0355,35.39,-0.0051,-0.0076
ALK,39.47,0.0028,3042709,Alaska Air Group Inc.,Industrials,Airlines,USA,5010000000.0,32.04,5.77,1.42,0.48,1.27,...,1.55,1.01,-0.047,-0.153,-0.1451,-0.3097,0.0612,26.29,0.0008,0.002
LNC,26.03,0.0124,44226049,Lincoln National Corporation,Financial,Insurance - Life,USA,4360000000.0,,3.33,,0.25,0.92,...,1.81,0.89,0.0223,-0.0138,-0.0302,-0.5232,0.407,52.52,0.0297,-0.0167
DXC,20.69,0.0,10523300,DXC Technology Company,Technology,Information Technology Services,USA,4250000000.0,,5.26,,0.3,1.33,...,1.97,0.55,0.005,-0.1136,-0.1898,-0.3165,0.1118,43.21,0.0044,-0.0043


In [27]:
sp500_df.to_csv('SP500_df.csv')

sp500_df = (pd.read_csv('SP500_df.csv'))
print(sp500_df.info(null_counts=True))
with pd.HDFStore(DATA_STORE) as store:
        store.put('SP500_df', sp500_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 50 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Ticker         503 non-null    object 
 1   Price          503 non-null    float64
 2   Change         503 non-null    float64
 3   Volume         503 non-null    int64  
 4   Company        503 non-null    object 
 5   Sector         503 non-null    object 
 6   Industry       503 non-null    object 
 7   Country        503 non-null    object 
 8   Market Cap     503 non-null    float64
 9   P/E            463 non-null    float64
 10  Fwd P/E        499 non-null    float64
 11  PEG            382 non-null    float64
 12  P/S            503 non-null    float64
 13  P/B            468 non-null    float64
 14  P/C            499 non-null    float64
 15  P/FCF          433 non-null    float64
 16  EPS this Y     501 non-null    float64
 17  EPS next Y     501 non-null    float64
 18  EPS past 5

## MNIST Data

In [36]:
mnist = fetch_openml('mnist_784', version=1)

In [37]:
print(mnist.DESCR)

**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  
**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  
**Please cite**:  

The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  

It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image b

In [38]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [39]:
mnist_path = Path('mnist')
if not mnist_path.exists():
    mnist_path.mkdir()

In [40]:
np.save(mnist_path / 'data', mnist.data.astype(np.uint8))
np.save(mnist_path / 'labels', mnist.target.astype(np.uint8))

## Fashion MNIST Image Data

We will use the Fashion MNIST image data created by [Zalando Research](https://github.com/zalandoresearch/fashion-mnist) for some demonstrations.

In [12]:
fashion_mnist = fetch_openml(name='Fashion-MNIST')

In [13]:
print(fashion_mnist.DESCR)

**Author**: Han Xiao, Kashif Rasul, Roland Vollgraf  
**Source**: [Zalando Research](https://github.com/zalandoresearch/fashion-mnist)  
**Please cite**: Han Xiao and Kashif Rasul and Roland Vollgraf, Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms, arXiv, cs.LG/1708.07747  

Fashion-MNIST is a dataset of Zalando's article images, consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. Fashion-MNIST is intended to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits. 

Raw data available at: https://github.com/zalandoresearch/fashion-mnist

### Target classes
Each training and test example is assigned to one of the following labels:
Label  Description  
0  T-shirt/top  
1  Trouser  
2  Pullover  
3  Dress  
4  

In [33]:
label_dict = {0: 'T-shirt/top',
              1: 'Trouser',
              2: 'Pullover',
              3: 'Dress',
              4: 'Coat',
              5: 'Sandal',
              6: 'Shirt',
              7: 'Sneaker',
              8: 'Bag',
              9: 'Ankle boot'}

In [34]:
fashion_path = Path('fashion_mnist')
if not fashion_path.exists():
    fashion_path.mkdir()

In [35]:
pd.Series(label_dict).to_csv(fashion_path / 'label_dict.csv', index=False, header=None)

In [31]:
np.save(fashion_path / 'data', fashion_mnist.data.astype(np.uint8))
np.save(fashion_path / 'labels', fashion_mnist.target.astype(np.uint8))


## Bond Price Indexes

The following code downloads several bond indexes from the Federal Reserve Economic Data service ([FRED](https://fred.stlouisfed.org/))

> Warning: Unfortunately, most of this data has been [recently removed](https://news.research.stlouisfed.org/2022/01/ice-benchmark-administration-ltd-iba-data-to-be-removed-from-fred/) from the FRED service. It is not important for the examples in the book, so you can just ignore this.

In [27]:
securities = {'BAMLCC0A0CMTRIV'   : 'US Corp Master TRI',
              'BAMLHYH0A0HYM2TRIV': 'US High Yield TRI',
              'BAMLEMCBPITRIV'    : 'Emerging Markets Corporate Plus TRI',
              'GOLDAMGBD228NLBM'  : 'Gold (London, USD)',
              'DGS10'             : '10-Year Treasury CMR',
              }

df = web.DataReader(name=list(securities.keys()), data_source='fred', start=2000)
df = df.rename(columns=securities).dropna(how='all').resample('B').mean()

with pd.HDFStore(DATA_STORE) as store:
    store.put('fred/assets', df)