UMAP is a popular method of dimensionality reduction, a helpful technique for meaningful analysis of large, complex datasets
UMAP is:
 * interested in the number of nearest numbers
 * non-linear, unlike longstanding methods such as PCA
 * non-scaling, which keep calculation fast
 * stochastic and thus non-deterministic -- and different libraries handle this differently as you will see in this notebook
   * `umap-learn` states that ["variance between runs will exist, however small"](https://umap-learn.readthedocs.io/en/latest/reproducibility.html)
   * `cuml` currently uses ["exact kNN"](https://docs.rapids.ai/api/cuml/stable/api.html?highlight=umap#cuml.UMAP). This may chance in [future releases](https://github.com/rapidsai/cuml/issues/1653#issuecomment-584357155)


#### clone and install graphistry, print version

In [9]:
import pandas as pd, networkx as nx
# !git clone https://github.com/graphistry/pygraphistry.git

from time import time
!pip install -U pygraphistry/ --quiet

import graphistry 
graphistry.register(api=3,protocol="https", server="hub.graphistry.com", username='***', password='***')
graphistry.__version__

[0m

'0.27.2+4.ga674343.dirty'

In [112]:
import pandas as pd, numpy as np
start_u = pd.to_datetime('2016-01-01').value//10**9
end_u = pd.to_datetime('2021-01-01').value//10**9
samples=1000
# df = pd.DataFrame(np.random.randint(,100,size=(samples, 1)), columns=['user_id', 'age', 'profile']) 
df = pd.DataFrame(np.random.randint(18,75,size=(samples, 1)), columns=['age']) 
df['user_id'] = np.random.randint(0,200,size=(samples, 1))
df['profile'] = np.random.randint(0,1000,size=(samples, 1))
df['date']=pd.to_datetime(np.random.randint(start_u, end_u, samples), unit='s').date

# df[['lat','lon']]=(np.round(np.random.uniform(, 180,size=(samples,2)), 5))
df['lon']=np.round(np.random.uniform(20, 24,size=(samples)), 2)
df['lat']=np.round(np.random.uniform(110, 120,size=(samples)), 2)
df['location']=df['lat'].astype(str) +","+ df["lon"].astype(str) 
df.drop(columns=['lat','lon'],inplace=True)
df

Unnamed: 0,age,user_id,profile,date,location
0,61,26,937,2019-04-05,"113.47,20.34"
1,30,19,972,2019-08-17,"117.61,20.24"
2,27,134,760,2020-05-30,"115.11,23.5"
3,55,44,864,2016-08-17,"119.14,21.56"
4,24,184,938,2017-09-30,"113.64,23.54"
...,...,...,...,...,...
995,69,72,887,2019-10-26,"115.18,23.8"
996,33,29,651,2020-06-15,"117.05,21.3"
997,18,101,517,2019-04-14,"111.96,23.58"
998,65,19,974,2019-05-22,"112.48,23.63"


In [113]:
g = graphistry.nodes(df)
t=time()
g2 = g.umap()
min=(time()-t)/60
lin=df.shape[0]/min
print(['time: '+str(min)+' line/min: '+str(lin)])
g2.plot()

['time: 0.03180466492970784 line/min: 31441.928478420414']


#### Parameters:  `X` and `y`, `feature_engine`, etc

In [114]:
g = graphistry.nodes(df)
t=time()
g2 = g.umap(X=['user_id'],y=['date','location'])
min=(time()-t)/60
lin=df.shape[0]/min
print(['time: '+str(min)+' line/min: '+str(lin)])
g2.plot()

['time: 0.02227895657221476 line/min: 44885.40550625031']


In [115]:
g = graphistry.nodes(df)
t=time()
g2 = g.umap(X=['user_id'],y=['date','location'], feature_engine='torch')
min=(time()-t)/60
lin=df.shape[0]/min
print(['time: '+str(min)+' line/min: '+str(lin)])
g2.plot()

['time: 0.023025786876678465 line/min: 43429.56900260569']


testing various other parameters

In [117]:
g = graphistry.nodes(df)
t=time()
g2 = g.umap(X=['user_id'],y=['date','location'], feature_engine='torch', n_neighbors= 2,min_dist=.5, spread=.1, local_connectivity=2, n_components=5,metric='hellinger')
min=(time()-t)/60
lin=df.shape[0]/min
print(['time: '+str(min)+' line/min: '+str(lin)])
g2.plot()


['time: 0.003930246829986573 line/min: 254436.94588602122']


#### test `engine` flag to see speed boost

In [87]:
g = graphistry.nodes(df)
t=time()
g2 = g.umap(engine='cuml')
min=(time()-t)/60
lin=df.shape[0]/min
print(['time: '+str(min)+' line/min: '+str(lin)])

['time: 0.004134837786356608 line/min: 241847.4560960093']


In [88]:
g = graphistry.nodes(df)
t=time()
g2 = g.umap(engine='umap_learn') ## note this will take appreciable time depending on sample count defined above
min=(time()-t)/60
lin=df.shape[0]/min
print(['time: '+str(min)+' line/min: '+str(lin)])

['time: 0.06711641947428386 line/min: 14899.483730403068']


#### Now lets look at some real data:

In [77]:
G=pd.read_csv('pygraphistry/demos/data/honeypot.csv')

g = graphistry.nodes(G)
t=time()
g3 = g.umap(engine='cuml')#-learn')
min=(time()-t)/60
lin=G.shape[0]/min
print(['time: '+str(min)+' line/min: '+str(lin)])

['time: 0.0008151054382324219 line/min: 269903.7323037323']


In [78]:
print(g3._edges.info())
g3._edges.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3728 entries, 0 to 3749
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   _src_implicit  3728 non-null   int32  
 1   _dst_implicit  3728 non-null   int32  
 2   _weight        3728 non-null   float32
dtypes: float32(1), int32(2)
memory usage: 72.8 KB
None


Unnamed: 0,_src_implicit,_dst_implicit,_weight
1046,71,144,0.205078
642,41,74,0.176112
811,53,152,0.079932
2699,171,70,0.140091
1466,101,144,0.050159


In [79]:
g3.plot()