# Dependencies

In [None]:
!pip install gensim

In [1]:
from gensim.models import KeyedVectors
from utils import getmesh,getgene
from commonalities import *

# Loading
Loading with the `'r'` (read-only memory-map) parameter means that other processes can load the same file and share its memory through mmap.

In [6]:
# Full model, opened as a read-only memory map (mode 'r').
model = KeyedVectors.load('models/cspace.kv.bin', mmap='r')

In [2]:
# Small (100k) model used for coarse exploration, opened as a read-only memory map (mode 'r').
smodel = KeyedVectors.load('models/cspace.100k.kv.bin', mmap='r')

# Coarse exploration of neighborhood
First, explore the neighborhood of the given concepts to find related diseases, genes, species, or conditions.

In [3]:
# Concept keys queried in the cells below.
c1 = 'ipf'  # Idiopathic Pulmonary Fibrosis
c2 = 'long_covid'

In [5]:
# Ten concepts most similar to c1 in the small model.
smodel.most_similar(positive=c1, topn=10)

[('idiopathic_pulmonary_fibrosis', 0.8397676944732666),
 ('ssc-ild', 0.8062280416488647),
 ('nsip', 0.8031862378120422),
 ('ae-ipf', 0.8015839457511902),
 ('idiopathic_pulmonary', 0.798287034034729),
 ('disease_mesh_d054990', 0.787049412727356),
 ('ctd-ild', 0.7767012119293213),
 ('ra-ild', 0.7679328918457031),
 ('interstitial_lung_disease', 0.7670239210128784),
 ('ild', 0.7600297927856445)]

what is ILD?

In [6]:
# Ten concepts most similar to 'ild' in the small model.
smodel.most_similar(positive='ild', topn=10)

[('interstitial_lung_disease', 0.8685597777366638),
 ('ctd-ild', 0.8181895613670349),
 ('nsip', 0.8018287420272827),
 ('disease_mesh_d017563', 0.7992202639579773),
 ('ra-ild', 0.7935575246810913),
 ('idiopathic_pulmonary_fibrosis', 0.7925955653190613),
 ('lung_disease', 0.7866021990776062),
 ('interstitial_pneumonia', 0.7826959490776062),
 ('hrct', 0.7738054394721985),
 ('ipf', 0.7600297927856445)]

what is RA-ILD?  

RA-ILD is "rheumatoid arthritis" (RA) combined with "interstitial lung disease" (ILD).

## explore concepts midway between 'RA' and 'ILD'

First on the small model (coarse-grained exploration):

In [13]:
# Query the small model with the midpoint (equal-weight average) of the two
# normalised concept vectors.
smodel.similar_by_vector(
    0.5 * (
        smodel.get_vector('rheumatoid_arthritis', norm=True)
        + smodel.get_vector('interstitial_lung_disease', norm=True)
    ),
    topn=10,
)

[('interstitial_lung_disease', 0.9068495631217957),
 ('rheumatoid_arthritis', 0.9068494439125061),
 ('rheumatic_disease', 0.8805110454559326),
 ('inflammatory_arthritis', 0.8632184267044067),
 ('rheumatoid', 0.8588052988052368),
 ('systemic_sclerosis', 0.8577644228935242),
 ('juvenile_idiopathic_arthritis', 0.8413053154945374),
 ('arthritis', 0.8406729698181152),
 ('lung_disease', 0.8402458429336548),
 ('polymyalgia_rheumatica', 0.8381184935569763)]

And then on the full model, for fine-grained exploration:

In [14]:
# Query the full model with the midpoint (equal-weight average) of the two
# normalised concept vectors.
model.similar_by_vector(
    0.5 * (
        model.get_vector('rheumatoid_arthritis', norm=True)
        + model.get_vector('interstitial_lung_disease', norm=True)
    ),
    topn=10,
)

[('rheumatoid_arthritis_interstitial_lung_disease', 0.9765331149101257),
 ('rheumatoid_arthritis–interstitial_lung_disease', 0.9700795412063599),
 ('rheumatoid_arthritis-interstitial_lung_disease', 0.9683608412742615),
 ('rheumatoid_arthritis-related_interstitial_lung_disease',
  0.9678709506988525),
 ('_rheumatoid_arthritis-associated_interstitial_lung_disease',
  0.9631861448287964),
 ('rheumatoid_arthritis-associated_interstitial_lung_disease',
  0.9590489864349365),
 ('rheumatoid_interstitial_lung_disease', 0.9576582908630371),
 ('rheumatoid_arthritis-related_lung_disease', 0.9523972272872925),
 ('rheumatoid_arthritis_lung_disease', 0.9523601531982422),
 ('ra-ild_rheumatoid_arthritis-associated_interstitial_lung_disease',
  0.9505068063735962)]

The same can be done for long-covid:

In [7]:
# Ten concepts most similar to c2 (long-covid) in the small model.
smodel.most_similar(positive=c2, topn=10)

[('long-covid', 0.8718876242637634),
 ('post-covid', 0.8640903830528259),
 ('post-covid-19', 0.8436791300773621),
 ('myalgic_encephalomyelitis', 0.8102188110351562),
 ('coronavirus_disease-19', 0.8082864880561829),
 ('coronavirus_disease-2019', 0.7872609496116638),
 ('covid', 0.7853097915649414),
 ('covid-19', 0.7601396441459656),
 ('covid-19-related', 0.7519919276237488),
 ('me_cfs', 0.7385003566741943)]

ME/CFS is Myalgic Encephalomyelitis / Chronic Fatigue Syndrome, which shares a lot of symptoms with Long-Covid!

# Find related genes
You may also discover unknown connections between apparently distant concepts, such as a disease and a gene.

In [17]:
# Genes related to c1 (IPF) according to the full model.
# The call is unchanged (topn=10000); only the displayed list is capped at its
# first 20 entries so the cell output stays readable instead of dumping the
# whole result.
list(most_similar_genes(model, c1, topn=10000))[:20]

[('1283-ild', 'gene_1283-ild', 0.7549170255661011),
 ('1283-ild.^', 'gene_1283-ild.^', 0.7437087893486023),
 ('1283-ilds', 'gene_1283-ilds', 0.7414316534996033),
 ('1283-disease_mesh_d017563',
  'gene_1283-disease_mesh_d017563',
  0.7295284867286682),
 ('5053-disease_mesh_d017563',
  'gene_5053-disease_mesh_d017563',
  0.7005946040153503),
 ('17-disease_mesh_d017563',
  'gene_17-disease_mesh_d017563',
  0.693144679069519),
 ('260431-associated', 'gene_260431-associated', 0.6875972151756287),
 ('4137ease_mesh_d054990', 'gene_4137ease_mesh_d054990', 0.6871569156646729),
 ('1283-uip', 'gene_1283-uip', 0.6868862509727478),
 ('260431-associate', 'gene_260431-associate', 0.6852938532829285),
 ('1283-uip-hlf', 'gene_1283-uip-hlf', 0.6841643452644348),
 ('260431-affecte', 'gene_260431-affecte', 0.6790813207626343),
 ('1283-pah', 'gene_1283-pah', 0.6738100051879883),
 ('1283ease_mesh_d017563', 'gene_1283ease_mesh_d017563', 0.672349750995636),
 ('COPD', 'gene_260431', 0.6720150709152222),
 ('260

discover commonalities between IPF-associated genes and Long-Covid

**WARNING:** 
- gene 1283 is named *CTD*, like the *Connective Tissue Diseases*
- gene 260431 is named *COPD*, like the *Chronic Obstructive Pulmonary Disease*.

These are PubTator3 ambiguities, carried over into CSpace, that will hopefully be resolved in future PubTator3 releases.

In [5]:
# gene_5053 (PAH) is the top-ranked gene related to IPF that is not one of the
# erroneous (ambiguously named) hits; list what it shares with c1 in the small model.
list(
    shared_similarities(
        smodel,
        c1,
        'gene_5053',
        topn=1000,
    )
)

[('p-Aminohippuric Acid', 0.5670960545539856),
 ('SFTPD', 0.5638678073883057),
 ('Iloprost', 0.5443216562271118),
 ('ELANE', 0.5402020215988159),
 ('Cystic Fibrosis', 0.5367894172668457),
 ('SFTPC', 0.5309598445892334),
 ('Beclomethasone', 0.5169974565505981),
 ('Albuterol', 0.5095176100730896),
 ('expired', 0.5091230273246765)]