In [None]:
#!pip install yfinance

In [None]:
import glob
import os
import warnings
from collections import Counter

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf

import graphistry
from graphistry.feature_utils import get_dtypes_for_dataframe

warnings.filterwarnings('ignore')

In [None]:
# Credentials are read from the environment so they are never saved inside the
# notebook; with the variables unset this falls back to the original empty
# placeholders.
graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    username=os.environ.get("GRAPHISTRY_USERNAME", ""),
    password=os.environ.get("GRAPHISTRY_PASSWORD", ""),
)

In [None]:
# Base graphistry plotter; the cells below branch off it with .nodes()/.edges()
# so each experiment starts from the same instance.
g = graphistry.bind()

# Load in 100 leading industry Tickers

In [None]:
# read_html returns every table found on the page. Select the constituents table
# by its 'Symbol' column rather than by a fixed position, which shifts whenever
# the article layout changes.
res = pd.read_html('https://en.wikipedia.org/wiki/S%26P_100')
ndf = next((table for table in res if 'Symbol' in table.columns), None)
if ndf is None:
    raise ValueError("No table with a 'Symbol' column was found on the S&P 100 page")

In [None]:
# preview the constituents table (first rows only, to keep the output short)
ndf.head()

In [None]:
# Keep every S&P 100 symbol except BRK.B (the original author notes it as delisted).
# NOTE(review): the symbol may simply use a different spelling on Yahoo Finance -- confirm.
tickers = ndf.loc[ndf['Symbol'] != 'BRK.B', 'Symbol']

# We add in 36 distinct Electric Car Stocks from different parts of the supply chain 
## -- Manufacturers, Battery Systems, Material Stocks, Charging Stations and Equipment

In [None]:
# get Electric Car Stocks + supply chain heuristics
# Each symbol is listed exactly once: a repeated symbol would be fetched twice by
# the per-ticker metadata download and produce duplicate node rows.
ev_tickers = ['GM', 'F', 'XPEV', 'LI', 'NIO', 'TSLA', 'NKLA', 'RIVN', 'RIDE', 'TM',
              'GOEV', 'FSR', 'APTV', 'FUV', 'BLNK', 'LCID', 'CHPT', 'GELYF', 'MGA', 'DDAIF',
             'PCRFY', 'VWAGY', 'BMWYY', 'ZEV', 'SLDP', 'RMO', 'PCAR', 'LAC', #battery systems
             'NNDM', 'BWA', 'MP', 'FCX', 'ALSN', 'ALB',     #Materials Stocks
             'BEEM', 'VLTA' # Charging stations + equipment (BLNK is already listed above)
             ] # add your own here

In [None]:
# Combine both lists, dropping repeats while keeping first-seen order
# (a symbol can appear in both the S&P 100 table and the EV list).
all_tickers = list(dict.fromkeys(tickers.to_list() + ev_tickers))

In [None]:
# Toggle: True re-downloads hourly prices, False reuses the cached CSV.
fetch = True
if fetch:
    # download their price data
    # NOTE(review): intraday history may be limited by the data provider; if the
    # download comes back empty, try a more recent `start` -- confirm.
    df_prices_all = yf.download(all_tickers, start='2021-01-01', interval='1h')
    os.makedirs('data', exist_ok=True)  # to_csv does not create missing directories
    df_prices_all.to_csv('data/stock_prices_all.csv')
else:
    # The frame is saved with two header rows (price field, ticker); read both so
    # attribute access such as `.Open` / `.Close` works as it does after a download.
    df_prices_all = pd.read_csv('data/stock_prices_all.csv', header=[0, 1], index_col=0)
    # Restore a real DatetimeIndex (needed by .resample later). utc=True copes with
    # timestamps whose UTC offset changes across daylight-saving boundaries.
    df_prices_all.index = pd.to_datetime(df_prices_all.index, utc=True)

In [None]:
# preview the downloaded prices (first rows only)
df_prices_all.head()

This is a DataFrame with MultiIndex columns.

In [None]:
# Inspect the column labels of the price table
df_prices_all.columns

In [None]:
def enrich_from_tickers(tickers):
    """
        Gets enrichment metadata for each ticker via `yf.Ticker(...).info`.

        Runs slow -- one lookup per ticker, several minutes for ~130 tickers.
        A ticker whose lookup fails is reported and kept as a placeholder
        record instead of aborting the whole run.

        Parameters
        ----------
        tickers : iterable of str
            Ticker symbols to look up.

        Returns
        -------
        list of dict
            One record per input ticker, in input order. Every record has a
            'symbol' key (filled with the requested ticker when the lookup
            did not provide one).
    """
    data = []
    for tick in tickers:
        print(f'Downloading info for {tick}')
        try:
            # copy so the added key below never touches the library's own dict
            datum = dict(yf.Ticker(tick).info)
        except Exception as err:
            # keep going: losing one ticker is better than losing minutes of downloads
            print(f'  could not fetch info for {tick}: {err!r}')
            datum = {}
        datum.setdefault('symbol', tick)
        data.append(datum)
    return data

In [None]:
# Toggle: True re-downloads per-ticker metadata (slow), False reuses the cached CSV.
fetch = False
if fetch:
    data = enrich_from_tickers(all_tickers)
    df = pd.DataFrame(data)
    os.makedirs('data', exist_ok=True)  # to_csv does not create missing directories
    df.to_csv('data/stocks_metadata.csv')
else:
    df = pd.read_csv('data/stocks_metadata.csv', index_col=0)
# Positional node id used by the graph cells below; set on both branches so a
# fresh download behaves the same as a cached load.
df['n'] = range(len(df))

In [None]:
# tally the companies per sector (most frequent first), shown with the table's shape
(Counter(df['sector']).most_common(), df.shape)

In [None]:
## see what we got (first rows only)
df.head()

In [None]:
# Get some volatility metrics
o = df_prices_all.Open
h = df_prices_all.High
l = df_prices_all.Low
c = df_prices_all.Close

# A zero or missing price makes the logs below +/-inf or NaN; all three are mapped
# to 0 (the original list left -inf in place).
_non_finite = [np.inf, -np.inf, np.nan]

# square of GARMAN-KLASS rv estimator from OHLC data
vol = 1/2 * np.square(np.log(h/l)) - (2*np.log(2)-1)*np.square(np.log(c/o))
vol = vol.replace(_non_finite, 0)

# square of Rogers-Satchell Volatility 
vol2 = np.log(h/c)*np.log(h/o)+ np.log(l/c)*np.log(l/o)
vol2 = vol2.replace(_non_finite, 0)

In [None]:
# Column labels of `vol` with 'BRK.B' filtered out (boolean mask over the columns)
good_tickers = vol.columns[vol.columns!='BRK.B']

In [None]:
# restrict both volatility tables to the retained tickers
vol, vol2 = vol[good_tickers], vol2[good_tickers]

In [None]:
# have to add in a node identifier, so we use add
# NOTE(review): `vol.T` builds a new transposed frame on every access, so this
# assignment lands on a temporary that is discarded straight away -- `vol` itself
# never gains an 'n' column. The Rogers-Satchell cell below passes no node
# column, so it does not depend on this line.
vol.T['n'] = range(len(vol.T)) # FIXME this breaks featurizer, why?

# Now we can UMAP Volatility Data

In [None]:
# Rogers-Satchell volatility: transpose so each row (node) is one ticker
g2 = g.nodes(vol2.T)
g3 = g2.umap(n_neighbors=15, scale=2)

In [None]:
# g2 = g.nodes(vol.T, 'n').featurize()
# g3 = g2.umap(scale = 2, n_neighbors=15)

In [None]:
# UMAP has created an implicit Edge DataFrame; display it to see what was inferred
g3.weighted_edges_df_from_nodes

In [None]:
## this will cluster groups according to Rogers-Satchell Volatility
# Shows how easy it is to cluster by a given metric of interest!

# Nodes carry the company metadata (keyed by 'n'); edges are reused from the UMAP graph.
# NOTE(review): assumes the UMAP edge endpoints and df['n'] number the tickers in the
# same order -- confirm, since `df` and `vol2` are loaded from different sources.
g4 = g.nodes(df, 'n').edges(g3._edges, '_src', '_dst')
g4.plot()

In [None]:
#vol.plot()

In [None]:
#vol2.plot()

# Let's see if we can cluster by  Adj Close value over time. We will see that this naturally clusters by industry category in a semantically useful way. 

In [None]:
# adjusted close prices, restricted to the retained tickers
df_prices = df_prices_all['Adj Close'][good_tickers]

In [None]:
# preview the adjusted close prices (first rows only)
df_prices.head()

In [None]:
# we have to run some cleanup on NaN values, so we use Imputer and MinMax Scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# A ticker with no prices at all cannot be mean-imputed: SimpleImputer drops such
# columns from its output, which would leave fewer columns than labels when the
# frame is rebuilt below. Remove them explicitly first.
df_prices = df_prices.dropna(axis=1, how='all')

colnames = df_prices.columns
dates = df_prices.index

# impute values: remaining gaps take that ticker's mean price
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
res = imputer.fit_transform(df_prices)
df_prices = pd.DataFrame(res, index=dates, columns=colnames)

# scale the resulting values to [0, 1] per ticker
scaler = MinMaxScaler()

df_prices = pd.DataFrame(scaler.fit_transform(df_prices), index=dates, columns=colnames)
df_prices.head()


In [None]:
# one row per ticker, plus a positional node id column 'n'
df_adj = df_prices.T.assign(n=range(df_prices.shape[1]))

In [None]:
# preview the per-ticker price table (first rows only)
df_adj.head()

In [None]:
# Featurize the per-ticker price rows, then embed *that* graph.
# (The original called umap on g3, i.e. it re-embedded the earlier volatility
# graph and never used the freshly featurized g2.)
g2 = g.nodes(df_adj, 'n').featurize()
g3 = g2.umap(scale=2)

In [None]:
# NOTE(review): if this plot looks identical to the volatility graph above, check that
# the preceding cell calls umap on g2 (the freshly featurized graph) and not on g3.
g3.plot()  # Why is this happening???

In [None]:
# Same edges, but drawn over the company metadata so nodes carry readable attributes.
# NOTE(review): assumes df['n'] and the UMAP edge endpoints index tickers in the
# same order -- confirm.
g4 = g.nodes(df, 'n').edges(g3._edges, '_src', '_dst')
g4.plot()

# Can we use the data from DF (rather than price flows) to cluster by sector? We will see that this does a great job.

In [None]:
# we want to separate data into Numeric and not, 
# the result is indexed by dtype name in the next cell ('float64', 'int64')
gtypes = get_dtypes_for_dataframe(df, verbose=True)

In [None]:
# keep only the float and integer metadata columns
numeric_df = df[gtypes['float64'] + gtypes['int64']]
# map every non-finite value (including -inf, which the original left in place) to 0
numeric_df = numeric_df.replace([np.inf, -np.inf, np.nan], 0)

In [None]:
# preview the numeric metadata (first rows only)
numeric_df.head()

In [None]:
# featurize the numeric metadata, with 'n' as the node id column
g2 = g.nodes(numeric_df, 'n').featurize()

In [None]:
#g2 = g.nodes(numeric_df, 'symbol').featurize()
# embed the featurized numeric metadata
g3 = g2.umap(n_neighbors=10, scale=0.5)

In [None]:
# render the graph produced by umap() on the numeric metadata
g3.plot() 

In [None]:
# or seeing it with the better label (company name)
g4 = (
    graphistry
    .nodes(df, 'n')
    .edges(g3.weighted_edges_df_from_nodes, '_src', '_dst')
)
g4.plot()

# A few amazing observations -- It has clustered, just from financial data, the EV companies by sector. 

It finds all the major EV categories we started with, and even some surprising relationships between correlated stock data. 

## Next we use Textual and Category data and see how well it clusters (sans finance data)

In [None]:
# node id plus the text / categorical metadata columns used for this experiment
summary_cols = [
    'n',
    'longBusinessSummary',
    'industry',
    'sector',
    'financialCurrency',
    'longName',
]

In [None]:
# select the summary columns and blank out missing values in one pass
meta = df[summary_cols].fillna('')

In [None]:
# featurize the text / categorical metadata, with 'n' as the node id column
g2 = g.nodes(meta, 'n').featurize()

In [None]:
# embed the featurized metadata (same umap settings as the numeric run above)
g3 = g2.umap(scale=0.5, n_neighbors=10)

## We see the similarity from textual and other categorical metadata, and it does quite well

In [None]:
# render the graph produced by umap() on the text / categorical metadata
g3.plot()

# Some stats

In [None]:
# df_prices is indexed by timestamp with one column per ticker, so resample on the
# index; there is no 'timestamp' column for `on=` to look up.
prices_by_time = df_prices
if not isinstance(prices_by_time.index, pd.DatetimeIndex):
    # e.g. prices reloaded from CSV carry string timestamps
    prices_by_time = prices_by_time.set_axis(
        pd.to_datetime(prices_by_time.index, utc=True), axis=0
    )
weekly_std = prices_by_time.resample('W').std() #.agg(['mean', 'min', 'max', 'std', 'skew'])

In [None]:
# one line per column of weekly_std
weekly_std.plot(figsize=(17,10))

# Can we run on the volatility data again, using a trick to make new features?

In [None]:
# use the Garman-Klass table (`vol`) as the target for the cells below
df_target = vol

In [None]:
# Weekly summary statistics per column of df_target, then transposed so the
# weeks become the columns.
mdf = df_target.resample('W').agg(['mean', 'min', 'max', 'std', 'skew']).T
mdf

In [None]:
# each row of mdf becomes a node
g2 = g.nodes(mdf)

In [None]:
# embed with the library's default umap settings
g3 = g2.umap()

In [None]:
# display the edge table attached to the graph after umap()
g3.weighted_edges_df_from_nodes

In [None]:
# Label every row of `mdf` with the ticker it was computed from. The rows are keyed
# by (ticker, statistic), so the first index level is the label; reading it from
# `mdf` keeps labels aligned with the rows regardless of how `df` is ordered or how
# many statistics were aggregated (the original repeated df.symbol a fixed 5 times).
mdf_node_name = mdf.index.get_level_values(0).to_numpy()

In [None]:
# node table: ticker label plus a positional id, one row per row of mdf
hdf = pd.DataFrame({'symbol': list(mdf_node_name)}).assign(n=range(len(mdf)))

In [None]:
# draw the UMAP edges over the labeled node table
g4 = (
    graphistry
    .nodes(hdf, 'n')
    .edges(g3.weighted_edges_df_from_nodes, '_src', '_dst')
)
g4.plot()

# Lastly, let's compare clustering to a useful PLS-style cross-decomposition method called CCA (Canonical Correlation Analysis)
## We will see that it doesn't do as well as Featurization + UMAP

In [None]:
from sklearn.cross_decomposition import CCA
# Fit a 2-component CCA with the scaled prices as X and the volatility table as Y.
# The result is indexed as res[0] / res[1] by the cells below.
clf = CCA(n_components=2)
res=clf.fit_transform(df_prices, df_target)

In [None]:
# shape of the second element returned by fit_transform
res[1].shape

In [None]:
# two sets of components plotted over time
lf = pd.DataFrame({'comp1_a':res[0].T[0], 
                   'comp2_a':res[0].T[1], 
                   'comp1_b':res[1].T[0], 'comp2_b':res[1].T[1]}, index=df_target.index)
lf.plot(figsize=(15, 10))

In [None]:
# show the fitted coefficient matrix as an image
plt.imshow(clf.coef_)

In [None]:
# we can use the coef_ of clf to make an edge dataframe
# a / b hold the row / column indices of every non-zero coefficient
a, b = clf.coef_.nonzero()

In [None]:
# edge list: one row per non-zero coefficient, weighted by the coefficient value
CCA_edges = pd.DataFrame({'_src':a, '_dst':b, 'weight': clf.coef_[a,b]})

In [None]:
from graphistry.feature_utils import prune_weighted_edges_df

In [None]:
# thin out the CCA edge list before plotting
wdf = prune_weighted_edges_df(CCA_edges, scale=3) # higher scale to reduce connectivity (big blob)

In [None]:
# Draw the pruned CCA edges over the metadata nodes (keyed by 'n').
# NOTE(review): '_src' / '_dst' are row / column positions in clf.coef_; this assumes
# they number tickers the same way meta['n'] does -- confirm.
g2 = g.nodes(meta, 'n').edges(wdf, '_src', '_dst')   
g2.plot()  # lots of standalone nodes...