# Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib_venn import venn3, venn2
import random
import plotly.io as pio
from scipy import sparse
import os
import networkx as nx

In [2]:
# Seed both Python's `random` module and NumPy's legacy global RNG with the
# same value so stochastic steps give the same results on every run.
seed = 16
random.seed(seed)
np.random.seed(seed)

# These two lines (currently disabled) force matplotlib to use white figure
# backgrounds even when the notebook theme is dark.
#plt.rcParams['axes.facecolor'] = 'white'
#plt.rcParams['figure.facecolor'] = 'white'
# Colormap object for matplotlib's "tab10" palette.
cmap = plt.get_cmap("tab10")
# Default template applied to Plotly figures.
pio.templates.default = "seaborn"

# Seaborn "darkgrid" style, with tick marks switched on for the bottom x axis.
sns.set_style("darkgrid", rc={'xtick.bottom': True})

In [3]:
def node_labels_to_numeric(G):
    """Relabel every node of ``G`` with ``int(label)``, modifying ``G`` in place.

    Parameters
    ----------
    G : graph object accepted by ``nx.relabel_nodes``
        Its current node labels must all be convertible with ``int()``
        (otherwise ``int()`` raises and the graph is left untouched).

    Returns
    -------
    None
        The graph is relabelled in place; nothing is returned.
    """
    # Build the complete old -> new mapping before touching the graph.
    mapping = {label: int(label) for label in G.nodes}
    # copy=False makes networkx relabel G itself instead of returning a copy.
    nx.relabel_nodes(G, mapping, copy=False)

In [4]:
# Project folders, relative to this notebook. Each one keeps its trailing
# slash, so file names can be appended to them directly.
data_processed = "../../../data/processed/"
data_interim = "../../../data/interim/"
data_external = "../../../data/external/"
tfidf_reports = "../../../reports/tfidf/"
reports_comunidades = "../../../reports/analisis_comunidades/"

# Node table stored as CSV inside the processed-data folder.
ruta_nodos_csv = os.path.join(data_processed, "graph_data/grafo_alternativo_CG_nodos.csv")
graph_node_data = pd.read_csv(ruta_nodos_csv)

# Network stored as GML; its node labels are converted to integers in place
# right after loading (see node_labels_to_numeric).
ruta_grafo_gml = os.path.join(data_processed, "graph_data/gda_network.gml")
G = nx.read_gml(ruta_grafo_gml)
node_labels_to_numeric(G)

In [5]:
def get_node_dict(G):
    """Return a plain ``{node: attribute_dict}`` dictionary for the nodes of ``G``.

    ``G.nodes(data=True)`` yields ``(node, data)`` pairs; they are collected
    into a regular dict keyed by node.
    """
    return dict(G.nodes(data=True))

# Jaccard de genes compartidos

In [6]:
def jaccard(set1,set2):
    """Jaccard similarity coefficient of two sets: ``|A & B| / |A | B|``.

    Parameters
    ----------
    set1, set2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value in ``[0, 1]``. When both sets are empty the union is empty
        and the ratio is undefined; ``0.0`` is returned in that case (nothing
        is shared) instead of raising ``ZeroDivisionError``.
    """
    union = len(set1 | set2)
    if union == 0:
        # Both sets are empty: avoid dividing by zero.
        return 0.0
    return len(set1 & set2) / union

In [7]:
# Node table of G: one row per node, one column per node attribute, with the
# node label exposed as the "node_index" column.
atributos_por_nodo = get_node_dict(G)
# A dict of dicts becomes one column per node; transpose so nodes are rows.
tabla_nodos = pd.DataFrame(atributos_por_nodo).T
nodos_gda = tabla_nodos.reset_index().rename(columns={"index": "node_index"})

In [8]:
# Display the node table built from the graph as this cell's output.
nodos_gda

Unnamed: 0,node_index,node_type,node_name,node_id,node_source
0,6512,gene_protein,NPBWR1,2831,disgenet
1,27291,disease,"Osteopoikilosis, Isolated",C1833699,disgenet
2,16773,gene_protein,TCAP,8557,disgenet
3,17685,gene_protein,GPR52,9293,disgenet
4,9996,gene_protein,HERC5,51191,disgenet
...,...,...,...,...,...
20851,5377,gene_protein,PANX1,24145,disgenet
20852,22935,disease,Congenital glucose-galactose malabsorption,C0268186,disgenet
20853,782,gene_protein,AGPAT1,10554,disgenet
20854,33412,disease,Spinocerebellar ataxia type 40,C4518336,disgenet


In [9]:
# node_index values of every node whose node_type is "disease".
es_enfermedad = nodos_gda.node_type == "disease"
nodos_enfermedad = nodos_gda.loc[es_enfermedad, "node_index"].values

# Map each disease node to the set of its neighbours in G
# (presumably its associated genes -- confirm against the network definition).
conjuntos_enfermedad = {
    enfermedad: set(G.neighbors(enfermedad)) for enfermedad in nodos_enfermedad
}

In [12]:
# Square matrix with one row and one column per disease node, zero-initialised.
n_enfermedades = len(nodos_enfermedad)
matrix_jaccard = np.zeros((n_enfermedades, n_enfermedades))
# (row, column) index arrays of the strict upper triangle (k=1 skips the diagonal).
indices = np.triu_indices(n_enfermedades, k=1)

In [25]:
# Resolve every disease's neighbour set once, in the same order as the rows and
# columns of matrix_jaccard, instead of repeating two array lookups and two
# dict lookups for each of the n*(n-1)/2 pairs.
conjuntos_ordenados = [conjuntos_enfermedad[nodo] for nodo in nodos_enfermedad]

# Fill only the strict upper triangle (i < j). The diagonal and the lower
# triangle of matrix_jaccard are left exactly as they were.
for i, conjunto_i in enumerate(conjuntos_ordenados):
    for j in range(i + 1, len(conjuntos_ordenados)):
        matrix_jaccard[i, j] = jaccard(conjunto_i, conjuntos_ordenados[j])