In [1]:
import yfinance as yf
import kmapper as km
from kmapper.jupyter import display
from umap import UMAP
import sklearn
import sklearn.manifold as manifold
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read text file with ticker names
filename = open("SP500_tickernames.txt", "r")
raw_tickernames = filename.read()
ticker_names = raw_tickernames.split("\n")
ticker_names = ticker_names[:len(ticker_names)-1]

In [3]:
# define date range
start_date_string = "2020-01-01"
end_date_string = "2022-04-02"

# pull historical data
raw_data = yf.download(ticker_names, start=start_date_string, end=end_date_string)

[*********************100%***********************]  495 of 495 completed

31 Failed downloads:
['NLOK', 'DISCA', 'NLSN', 'DISH', 'PEAK', 'FLT', 'PBCT', 'BLL', 'ABC', 'FBHS', 'DRE', 'WRK', 'ABMD', 'CTXS', 'DISCK', 'RE', 'FRC', 'CDAY', 'CERN', 'XLNX', 'VIAC', 'ANTM', 'FB', 'ATVI', 'PXD', 'INFO', 'PKI', 'TWTR']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')
['SIVB']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2020-01-01 -> 2022-04-02)')
['FISV', 'SBNY']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2020-01-01 -> 2022-04-02) (Yahoo error = "Data doesn\'t exist for startDate = 1577854800, endDate = 1648872000")')


In [4]:
# get daily close prices and drop missing columns
df_close = raw_data['Adj Close'].dropna(axis='columns')

In [5]:
# convert pandas dataframe to numpy array, standardize ticker data, and transpose array
data = df_close.to_numpy()
data = data-np.mean(data, axis=0)/np.std(data, axis=0)
data = data.transpose()

In [6]:
# calculate percent return of each ticker over date range
per_return = (df_close.to_numpy().transpose()[:,504] - df_close.to_numpy().transpose()[:,0])/df_close.to_numpy().transpose()[:,0]

In [7]:
# initialize mapper
mapper = km.KeplerMapper(verbose=1)

KeplerMapper(verbose=1)


In [8]:
# project data into 2D subsapce via 2 step transformation, 1)isomap 2)UMAP
projected_data = mapper.fit_transform(data, projection=[manifold.Isomap(n_components=100, n_jobs=-1), UMAP(n_components=2,random_state=1)])

..Composing projection pipeline of length 2:
	Projections: Isomap(n_components=100, n_jobs=-1)
		UMAP(random_state=1)
	Distance matrices: False
False
	Scalers: MinMaxScaler()
MinMaxScaler()
..Projecting on data shaped (464, 568)

..Projecting data using: 
	Isomap(n_components=100, n_jobs=-1)



  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])



..Scaling with: MinMaxScaler()

..Projecting on data shaped (464, 100)

..Projecting data using: 
	UMAP(random_state=1, verbose=1)

UMAP(n_jobs=1, random_state=1, verbose=1)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Mon Sep 23 14:17:54 2024 Construct fuzzy simplicial set
Mon Sep 23 14:17:54 2024 Finding Nearest Neighbors
Mon Sep 23 14:17:55 2024 Finished Nearest Neighbor Search
Mon Sep 23 14:17:56 2024 Construct embedding


Epochs completed:  25%| ██▍        124/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs


Epochs completed: 100%| ██████████ 500/500 [00:00]

	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Mon Sep 23 14:17:57 2024 Finished embedding

..Scaling with: MinMaxScaler()






In [9]:
# cluster data using DBSCAN
G = mapper.map(projected_data, data, clusterer=sklearn.cluster.DBSCAN(metric="cosine"))

Mapping on data shaped (464, 568) using lens shaped (464, 2)

Creating 100 hypercubes.

Created 59 edges and 46 nodes in 0:00:00.053290.


In [None]:
fileID = "test"

In [None]:
# visualize graph
mapper.visualize(G, 
                path_html="mapper_example_" + fileID + ".html",
                title=fileID,
                custom_tooltips = df_close.columns.to_numpy(),
                color_values = np.log(per_return+1),
                color_function_name = 'Log Percent Returns',
                node_color_function = np.array(['average', 'std', 'sum', 'max', 'min']))

# display mapper in jupyter
km.jupyter.display("mapper_example_" + fileID + ".html")