# Harmonization

This notebook harmonizes all the different namespaces into the selected ones (mondo, pubchem.compound, and ncbigene) and applies a threshold value to normalize the log-fold change values.

# Imports

In [1]:
import pandas as pd
import os
import json
import logging
from tqdm import tqdm

from utils import harmonize_dataset, DATA_DIR

In [2]:
# Module-level logger named after this module (`__name__`).
# NOTE(review): none of the cells shown in this notebook reference it.
logger = logging.getLogger(__name__)

# Load datasets

In [3]:
# Read the four normalized expression dictionaries that the harmonization
# cells consume. The encoding is pinned to UTF-8 so the JSON decodes the same
# way on every platform instead of depending on the locale's default codec.
with open(os.path.join(DATA_DIR, 'creeds', 'normalized', 'chemical_expression.json'), encoding='utf-8') as file:
    creed_dict = json.load(file)

with open(os.path.join(DATA_DIR, 'geo', 'normalized', 'disease_expression.json'), encoding='utf-8') as file2:
    geo_dict = json.load(file2)

with open(os.path.join(DATA_DIR, 'l1000', 'normalized', 'chemical_expression.json'), encoding='utf-8') as file3:
    l1000_dict = json.load(file3)

with open(os.path.join(DATA_DIR, 'open_targets', 'normalized', 'disease_expression.json'), encoding='utf-8') as file4:
    open_target_dict = json.load(file4)

# Harmonizing data

In [4]:
# Harmonize the CREEDS chemical expression data.
# NOTE(review): `threshold=1` is forwarded as-is to `utils.harmonize_dataset`;
# per the notebook intro it relates to normalizing log-fold changes — confirm
# the exact semantics against the helper.
harmonized_creed_dict = harmonize_dataset(data_dict=creed_dict, threshold=1)

# `ensure_ascii=False` emits non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the locale default codec may be unable to encode
# them (UnicodeEncodeError) or may produce JSON that is not valid UTF-8.
with open(
    os.path.join(DATA_DIR, 'creeds', 'normalized', 'harmonized_expression-new1.json'),
    'w',
    encoding='utf-8',
) as f1:
    json.dump(harmonized_creed_dict, f1, ensure_ascii=False, indent=2)

Harmonizing dictionary: 100%|██████████| 149/149 [00:00<00:00, 8376.80it/s]


In [5]:
# Harmonize the GEO disease expression data.
# NOTE(review): `threshold=1` is forwarded as-is to `utils.harmonize_dataset`;
# per the notebook intro it relates to normalizing log-fold changes — confirm
# the exact semantics against the helper.
harmonized_geo_dict = harmonize_dataset(data_dict=geo_dict, threshold=1)

# `ensure_ascii=False` emits non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the locale default codec may be unable to encode
# them (UnicodeEncodeError) or may produce JSON that is not valid UTF-8.
with open(
    os.path.join(DATA_DIR, 'geo', 'normalized', 'harmonized_expression-new1.json'),
    'w',
    encoding='utf-8',
) as f1:
    json.dump(harmonized_geo_dict, f1, ensure_ascii=False, indent=2)

Harmonizing dictionary: 100%|██████████| 46/46 [00:00<00:00, 564.37it/s]


In [6]:
# Harmonize the L1000 chemical expression data.
# NOTE(review): `threshold=1` is forwarded as-is to `utils.harmonize_dataset`;
# per the notebook intro it relates to normalizing log-fold changes — confirm
# the exact semantics against the helper.
harmonized_l1000_dict = harmonize_dataset(data_dict=l1000_dict, threshold=1)

# `ensure_ascii=False` emits non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the locale default codec may be unable to encode
# them (UnicodeEncodeError) or may produce JSON that is not valid UTF-8.
with open(
    os.path.join(DATA_DIR, 'l1000', 'normalized', 'harmonized_expression-new1.json'),
    'w',
    encoding='utf-8',
) as f1:
    json.dump(harmonized_l1000_dict, f1, ensure_ascii=False, indent=2)

Harmonizing dictionary: 100%|██████████| 2700/2700 [00:00<00:00, 10336.63it/s]


In [7]:
# Harmonize the Open Targets disease expression data.
# NOTE(review): `threshold=1` is forwarded as-is to `utils.harmonize_dataset`;
# per the notebook intro it relates to normalizing log-fold changes — confirm
# the exact semantics against the helper.
harmonized_targets_dict = harmonize_dataset(data_dict=open_target_dict, threshold=1)

# `ensure_ascii=False` emits non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the locale default codec may be unable to encode
# them (UnicodeEncodeError) or may produce JSON that is not valid UTF-8.
with open(
    os.path.join(DATA_DIR, 'open_targets', 'normalized', 'harmonized_expression-new1.json'),
    'w',
    encoding='utf-8',
) as f1:
    json.dump(harmonized_targets_dict, f1, ensure_ascii=False, indent=2)

Harmonizing dictionary: 100%|██████████| 180/180 [00:00<00:00, 6892.10it/s]
