In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import pickle

In [2]:
# Load and parse the Orphanet XML export
tree = ET.parse('product6.xml')
root = tree.getroot()

# One record per (substance, disorder) pair found under a DrugRegulatoryStatus
data = []

for drug_status in root.findall('DrugRegulatoryStatusList/DrugRegulatoryStatus'):
    # Trade names and disorders depend only on the regulatory status, so they
    # are collected once here instead of once per (substance, disorder) pair.
    trade_names = []
    for trade_name_association in drug_status.findall('DrugTradeNameDrugRegulatoryStatusAssociationList/DrugTradeNameDrugRegulatoryStatusAssociation'):
        trade_name = trade_name_association.findtext('DrugTradeName', default='')
        if trade_name:
            trade_names.append(trade_name)
    # Joining an empty list already yields the empty string
    trade_names_str = ", ".join(trade_names)

    disorders = [
        (disorder.findtext('OrphaCode', default=''), disorder.findtext('Name', default=''))
        for disorder in drug_status.findall('DisorderList/Disorder')
    ]

    for substance_association in drug_status.findall('SubstanceDrugRegulatoryStatusAssociationList/SubstanceDrugRegulatoryStatusAssociation'):
        substance = substance_association.find('Substance')
        if substance is None:
            # find() returns None when the child element is absent; skip such an
            # association instead of failing with AttributeError below.
            continue

        code = substance.findtext('Code', default='')
        chemical_name = substance.findtext('ChemicalName', default='')
        name = substance.findtext('Name', default='')

        for orpha_code, disorder_name in disorders:
            data.append({
                'Code': code,
                'ChemicalName': chemical_name,
                'Name': name,
                'DrugTradeName': trade_names_str,
                'OrphaCode': orpha_code,
                'DisorderName': disorder_name
            })

# Build the table. Explicit columns keep the frame well-formed (and the
# lower-casing below valid) even when no record was extracted.
df = pd.DataFrame(
    data,
    columns=['Code', 'ChemicalName', 'Name', 'DrugTradeName', 'OrphaCode', 'DisorderName'],
)
# Lower-case the names used later as lookup keys against the knowledge graph
df['Name'] = df['Name'].str.lower()
df['DisorderName'] = df['DisorderName'].str.lower()

In [3]:
# Report the table shape before and after removing exact duplicate records
shape_before_dedup = df.shape
df = df.drop_duplicates()
print(shape_before_dedup)
print(df.shape)

(5580, 6)
(4915, 6)


In [4]:
# Load the knowledge-graph edge list; both endpoint id columns are read as strings
shep_kg = pd.read_csv(
    "kg_giant_orphanet.csv",
    sep=",",
    dtype={"x_id": str, "y_id": str},
)

In [5]:
# Lower-cased endpoint names (added first so the column order stays x, y, x, y)
for side in ("x", "y"):
    shep_kg[side + "_name_lower"] = shep_kg[side + "_name"].str.lower()

# Composite node key "<lower-cased type>_<id>" for each endpoint
for side in ("x", "y"):
    shep_kg["my_" + side + "_id"] = shep_kg[side + "_type"].str.lower() + "_" + shep_kg[side + "_id"]

In [6]:
# Number of edges before any filtering; used below to report how many are removed
bef = len(shep_kg)

In [7]:
print("Olmsted Syndrome 2 is duplicated with ids: ['disease_30961', 'disease_30965']. Removing all interactions with 'disease_30965' as x or y id.")
# Flag every edge that has the redundant node on either endpoint, then keep the rest
touches_duplicate_node = (shep_kg['my_x_id'] == 'disease_30965') | (shep_kg['my_y_id'] == 'disease_30965')
shep_kg = shep_kg[~touches_duplicate_node]
print(bef - len(shep_kg))  # Should be 14

Olmsted Syndrome 2 is duplicated with ids: ['disease_30961', 'disease_30965']. Removing all interactions with 'disease_30965' as x or y id.
14


In [8]:
def _endpoint_columns(side):
    """Return the node columns of one edge endpoint ('x' or 'y') under side-neutral names."""
    renames = {
        'my_' + side + '_id': 'my_id',
        side + '_id': 'id',
        side + '_type': 'type',
        side + '_name': 'name',
        side + '_name_lower': 'name_lower',
        side + '_source': 'source',
    }
    return shep_kg[list(renames)].rename(columns=renames)


# Stack the x and y endpoints into a single table of node occurrences
x_info = _endpoint_columns('x')
y_info = _endpoint_columns('y')
combined_info = pd.concat([x_info, y_info], ignore_index=True)

# Node count when deduplicating on the raw id alone (printed for comparison only)
print(combined_info.drop_duplicates(subset='id').shape)

# The node map itself is keyed on my_id (node type + id)
node_map = combined_info.drop_duplicates(subset='my_id').reset_index(drop=True)
print(node_map.shape)
print("Some nodes share the same id. Making my_id (node_type + id) the main id key.")

(90271, 6)
(134029, 6)
Some nodes share the same id. Making my_id (node_type + id) the main id key.


In [9]:
# Restrict the node map to drug and disease nodes
is_drug_or_disease = node_map["type"].isin(['drug', 'disease'])
node_map_drug_diseases = node_map.loc[is_drug_or_disease]

In [10]:
# Drug/disease nodes whose name is carried by more than one node (all occurrences kept)
has_shared_name = node_map_drug_diseases['name'].duplicated(keep=False)
non_unique_names = node_map_drug_diseases[has_shared_name]
non_unique_names

Unnamed: 0,my_id,id,type,name,name_lower,source


In [11]:
# Keep the edges with a drug or disease node on at least one endpoint
drug_disease_node_ids = node_map_drug_diseases['my_id']
x_is_drug_or_disease = shep_kg['my_x_id'].isin(drug_disease_node_ids)
y_is_drug_or_disease = shep_kg['my_y_id'].isin(drug_disease_node_ids)
drug_disease_kg = shep_kg[x_is_drug_or_disease | y_is_drug_or_disease]

In [12]:
# Drug nodes only
tmp = node_map_drug_diseases.loc[node_map_drug_diseases["type"] == "drug"]
# Lower-cased drug name -> my_id
kg_drug_names_dict = {
    drug_name: node_id
    for drug_name, node_id in zip(tmp['name_lower'], tmp['my_id'])
}
# my_id -> lower-cased drug name
kg_drug_ids_dict = {
    node_id: drug_name
    for node_id, drug_name in zip(tmp['my_id'], tmp['name_lower'])
}

In [13]:
# Disease nodes only
tmp = node_map_drug_diseases.loc[node_map_drug_diseases["type"] == "disease"]
# Lower-cased disease name -> my_id
kg_diseases_names_dict = {
    disease_name: node_id
    for disease_name, node_id in zip(tmp['name_lower'], tmp['my_id'])
}
# my_id -> lower-cased disease name
kg_diseases_ids_dict = {
    node_id: disease_name
    for node_id, disease_name in zip(tmp['my_id'], tmp['name_lower'])
}

In [14]:
from collections import defaultdict

# Group disease node ids by their lower-cased name
value_to_keys = defaultdict(list)
for node_id, disease_name in kg_diseases_ids_dict.items():
    value_to_keys[disease_name].append(node_id)

# Names carried by more than one disease node, with the ids that share them
duplicates = {}
for disease_name, node_ids in value_to_keys.items():
    if len(node_ids) > 1:
        duplicates[disease_name] = node_ids
duplicates

{}

In [15]:
# Attach the KG node id of each Orphanet drug and disease by lower-cased name;
# names without a match in the dictionaries map to NaN.
df = df.assign(
    drug_kg_id=df['Name'].map(kg_drug_names_dict),
    disease_kg_id=df['DisorderName'].map(kg_diseases_names_dict),
)

In [16]:
# Keep only the records whose drug AND disease were both matched to a KG node
both_matched = df['drug_kg_id'].notna() & df['disease_kg_id'].notna()
filtered_df = df[both_matched]

In [17]:
# Edges linking a drug and a disease, in either direction
kg_x_type = drug_disease_kg['x_type']
kg_y_type = drug_disease_kg['y_type']
disease_to_drug = (kg_x_type == 'disease') & (kg_y_type == 'drug')
drug_to_disease = (kg_y_type == 'disease') & (kg_x_type == 'drug')
drug_disease_pairs_kg = drug_disease_kg[disease_to_drug | drug_to_disease]

# Show which node types actually occur on each side of the selected edges
for type_column in ('x_type', 'y_type'):
    print(drug_disease_pairs_kg[type_column].unique())

['drug']
['disease']


In [18]:
# Endpoint id pairs of the KG drug-disease edges and of the matched Orphanet records
in_kg = drug_disease_pairs_kg.loc[:, ['my_x_id', 'my_y_id']]
in_orpha = filtered_df.loc[:, ['drug_kg_id', 'disease_kg_id']]
print(len(in_kg))
print(len(in_orpha))

40041
829


In [19]:
# Compare shapes: all rows, distinct rows, and distinct (drug, disease) id pairs
for frame in (
    filtered_df,
    filtered_df.drop_duplicates(),
    filtered_df[['drug_kg_id', 'disease_kg_id']].drop_duplicates(),
):
    print(frame.shape)

(829, 8)
(829, 8)
(829, 2)


In [20]:
# Row count, distinct-row count, and size of the set of (drug, disease) id tuples
print(in_orpha.shape)
print(in_orpha.drop_duplicates().shape)
orpha_combinations = {
    (drug_id, disease_id)
    for drug_id, disease_id in zip(in_orpha['drug_kg_id'], in_orpha['disease_kg_id'])
}
print(len(orpha_combinations))

(829, 2)
(829, 2)
829


In [21]:
# (drug, disease) id pairs coming from Orphanet
orpha_combinations = set(zip(in_orpha['drug_kg_id'], in_orpha['disease_kg_id']))

# (x, y) id pairs of every edge of the KG, in that order
kg_combinations = set(zip(shep_kg['my_x_id'], shep_kg['my_y_id']))

# Orphanet pairs that already exist as a KG edge ...
already_in_combinations = orpha_combinations & kg_combinations

# ... and Orphanet pairs that do not
missing_combinations = orpha_combinations - already_in_combinations

for combinations in (orpha_combinations, kg_combinations, missing_combinations, already_in_combinations):
    print(len(combinations))

829
5471423
665
164


In [22]:
# Build one 'orpha_treatment' edge for every Orphanet (drug, disease) pair that
# has no edge in the KG.

# Index the node attributes by my_id once, so that each lookup below is a direct
# index access instead of a boolean-mask scan of the whole node table.
# Keeping the first row per my_id matches the previous `.iloc[0]` selection.
node_lookup = node_map_drug_diseases.drop_duplicates(subset='my_id').set_index('my_id')

not_in_kg_rows = []

# Set iteration order is not stable across kernel restarts (string hashing is
# randomized per process); sorting makes the output row order reproducible.
for drug_id, disease_id in sorted(missing_combinations):
    drug_info = node_lookup.loc[drug_id]
    disease_info = node_lookup.loc[disease_id]

    # The first element of each pair comes from the drug-name mapping and the
    # second from the disease-name mapping, so x is the drug and y the disease.
    not_in_kg_rows.append({
        'relation': 'orpha_treatment',
        'display_relation': 'orpha_treatment',
        'x_id': drug_info['id'],
        'x_type': drug_info['type'],
        'x_name': drug_info['name'],
        'x_source': drug_info['source'],
        'y_id': disease_info['id'],
        'y_type': disease_info['type'],
        'y_name': disease_info['name'],
        'y_source': disease_info['source'],
        'x_name_lower': drug_info['name_lower'],
        'y_name_lower': disease_info['name_lower'],
        'my_x_id': drug_id,
        'my_y_id': disease_id
    })

# Convert the collected rows into a DataFrame
not_in_kg = pd.DataFrame(not_in_kg_rows)
print(not_in_kg.shape)

(665, 14)


In [23]:
# Build one 'orpha_treatment' edge for every Orphanet (drug, disease) pair that
# already has an edge in the KG.

# Index the node attributes by my_id once, so that each lookup below is a direct
# index access instead of a boolean-mask scan of the whole node table.
# Keeping the first row per my_id matches the previous `.iloc[0]` selection.
node_lookup = node_map_drug_diseases.drop_duplicates(subset='my_id').set_index('my_id')

already_in_kg_rows = []

# Set iteration order is not stable across kernel restarts (string hashing is
# randomized per process); sorting makes the output row order reproducible.
for drug_id, disease_id in sorted(already_in_combinations):
    drug_info = node_lookup.loc[drug_id]
    disease_info = node_lookup.loc[disease_id]

    # The first element of each pair comes from the drug-name mapping and the
    # second from the disease-name mapping, so x is the drug and y the disease.
    already_in_kg_rows.append({
        'relation': 'orpha_treatment',
        'display_relation': 'orpha_treatment',
        'x_id': drug_info['id'],
        'x_type': drug_info['type'],
        'x_name': drug_info['name'],
        'x_source': drug_info['source'],
        'y_id': disease_info['id'],
        'y_type': disease_info['type'],
        'y_name': disease_info['name'],
        'y_source': disease_info['source'],
        'x_name_lower': drug_info['name_lower'],
        'y_name_lower': disease_info['name_lower'],
        'my_x_id': drug_id,
        'my_y_id': disease_id
    })

# Convert the collected rows into a DataFrame
already_in_kg = pd.DataFrame(already_in_kg_rows)
print(already_in_kg.shape)

(164, 14)


In [24]:
# Drop every KG edge whose (x, y) id pair is also an Orphanet pair.
# Membership is tested in one pass over the two id columns; a row-wise
# DataFrame.apply(axis=1) builds a Series per row and is far slower on an
# edge list of this size.
is_orpha_pair = pd.Series(
    [pair in orpha_combinations for pair in zip(shep_kg['my_x_id'], shep_kg['my_y_id'])],
    index=shep_kg.index,
    dtype=bool,  # keeps the mask boolean even when the edge list is empty
)
filtered_shep_kg = shep_kg[~is_orpha_pair]

In [25]:
# Stack the remaining KG edges with the two sets of 'orpha_treatment' edges
kg_parts = [filtered_shep_kg, already_in_kg, not_in_kg]
final_shep_kg = pd.concat(kg_parts, ignore_index=True)

In [26]:
# Edge count per display relation in the final graph
final_shep_kg.loc[:, "display_relation"].value_counts()

display_relation
synergistic interaction    2672628
expression present         1518203
interacts with              343275
ppi                         321075
phenotype present           204766
parent-child                147108
associated with              96817
side effect                  79137
contraindication             28884
expression absent            19887
target                       16380
indication                    8533
enzyme                        5317
transporter                   3092
off-label use                 2457
linked to                     1795
phenotype absent              1483
carrier                        864
orpha_treatment                829
Name: count, dtype: int64

In [27]:
# Write the final edge list; note the file is tab-separated despite its .csv extension
final_shep_kg.to_csv(
    "shep_kg_with_orphan_treatments.csv",
    sep="\t",
    index=False,
)

In [28]:
# Distinct x-side ids of the edges whose display_relation is 'orpha_treatment'
is_orpha_treatment = final_shep_kg['display_relation'] == 'orpha_treatment'
orpha_treat_ids = final_shep_kg.loc[is_orpha_treatment, 'my_x_id'].unique()

# 'indication' edges whose x-side id is one of those ids
from_orpha_drug = final_shep_kg['my_x_id'].isin(orpha_treat_ids)
is_indication = final_shep_kg['display_relation'] == 'indication'
treatment_counts = final_shep_kg[from_orpha_drug & is_indication]

In [29]:
# Edge count per display relation, restricted to edges whose x-side id has an 'orpha_treatment' edge
edges_from_orpha_drugs = final_shep_kg.loc[final_shep_kg['my_x_id'].isin(orpha_treat_ids)]
edges_from_orpha_drugs["display_relation"].value_counts()

display_relation
synergistic interaction    233174
side effect                  9878
contraindication             2297
indication                   1546
target                       1206
orpha_treatment               829
enzyme                        750
transporter                   381
off-label use                 320
carrier                        93
Name: count, dtype: int64