In [1]:
import pybel
import pandas as pd

  from tqdm.autonotebook import tqdm


In [2]:
# Location of the tab-separated BERN2 edge table read below.
BERN2_EDGES_PATH = 's3://enveda-data-kg/nlp/bern2/bern2-edges-subset/predicted_plant_disease_high_confidence_to_ingest.tsv'

bern2_data = pd.read_csv(BERN2_EDGES_PATH, sep='\t')

In [3]:
# Show the first row to inspect the columns of the table that was just loaded.
bern2_data.head(1)

Unnamed: 0,relation_prediction,confidence,pmid_sentence,entity1_curie,entity2_curie,entity1_name,entity2_name,sentence,mention1,mention2
0,True,medium,10067319_8,ncbitaxon:3369,mondo:0005324,Cryptomeria japonica,seasonal allergic rhinitis,"In conclusion, the numbers of patients with Ja...",Japanese cedar,Japanese cedar pollinosis


In [4]:
# CURIE prefixes accepted for the plant side and the disease side of an edge.
PLANT_PREFIXES = ('ncbitaxon:', 'wfo:', 'itis:')
DISEASE_PREFIX = 'mondo'


def _orient_plant_disease(row):
    """Work out which entity of a BERN2 edge is the plant and which the disease.

    :param row: mapping with the keys ``entity1_curie``, ``entity2_curie``,
        ``entity1_name`` and ``entity2_name``.
    :returns: ``(plant_curie, plant_name, disease_curie)``, or ``None`` when the
        pair is not exactly one plant CURIE plus one disease CURIE.
    """
    entity1, entity2 = row['entity1_curie'], row['entity2_curie']

    # A missing CURIE is not a string (pandas reads it as NaN), so it has no
    # .startswith; such rows cannot be classified and are skipped.
    if not isinstance(entity1, str) or not isinstance(entity2, str):
        return None

    if entity1.startswith(PLANT_PREFIXES) and entity2.startswith(DISEASE_PREFIX):
        return entity1, row['entity1_name'], entity2

    # Mirror case: the plant side is validated here as well, so that a
    # disease paired with a non-plant entity is not ingested as a plant.
    if entity1.startswith(DISEASE_PREFIX) and entity2.startswith(PLANT_PREFIXES):
        return entity2, row['entity2_name'], entity1

    return None


rows = []

# Plain dict records are enough here and avoid building a Series per row.
for row in bern2_data.to_dict(orient='records'):
    oriented = _orient_plant_disease(row)
    if oriented is None:
        continue

    plant_curie, plant_name, disease_curie = oriented
    rows.append({
        'plant_curie': plant_curie,
        'plant_name': plant_name,
        'disease_curie': disease_curie,
        'database': 'bern2',
        'evidence': row['pmid_sentence'],
    })

# Explicit columns keep the schema stable even when no row qualifies.
bern2_data = pd.DataFrame(
    rows,
    columns=['plant_curie', 'plant_name', 'disease_curie', 'database', 'evidence'],
)

In [5]:
# Show the first row of the reshaped plant-disease table.
bern2_data.head(1)

Unnamed: 0,plant_curie,plant_name,disease_curie,database,evidence
0,ncbitaxon:3369,Cryptomeria japonica,mondo:0005324,bern2,10067319_8


In [6]:
# (rows, columns) of the reshaped BERN2 table.
bern2_data.shape

(72981, 5)

In [7]:
def bel_to_df(path, database, key):
    """Collect plant-disease edges from a BEL node-link file into a DataFrame.

    The graph is read with ``pybel.from_nodelink_file``. An edge is kept only
    when its source is a ``pybel.dsl.Population`` whose CURIE starts with
    ``ncbitaxon:`` and its target is a ``pybel.dsl.Pathology`` whose CURIE
    starts with ``mondo:``.

    :param path: location of the node-link JSON file.
    :param database: label written to the ``database`` column of every row.
    :param key: edge-data entry to take the evidence from. With ``'evidence'``
        the stored value is used as-is; with any other key the ``'identifier'``
        item of that entry is used.
    :returns: DataFrame with the columns ``plant_curie``, ``disease_curie``,
        ``database`` and ``evidence`` (no columns at all if no edge matches).
    """
    bel_graph = pybel.from_nodelink_file(path)

    def _edge_evidence(edge_data):
        # Only the 'evidence' entry is used directly; other entries are
        # indexed by 'identifier'.
        entry = edge_data[key]
        if key == 'evidence':
            return entry
        return entry['identifier']

    records = []

    for plant, disease, edge_data in bel_graph.edges(data=True):
        if not isinstance(plant, pybel.dsl.Population):
            continue
        if not isinstance(disease, pybel.dsl.Pathology):
            continue
        # Other plant terminologies (e.g. itis) are left out.
        if not plant.curie.startswith('ncbitaxon:'):
            continue
        # Diseases are restricted to mondo.
        if not disease.curie.startswith('mondo:'):
            continue

        records.append({
            'plant_curie': plant.curie,
            'disease_curie': disease.curie,
            'database': database,
            'evidence': _edge_evidence(edge_data),
        })

    return pd.DataFrame(records)
        

In [8]:
# One table per BEL export. `key` names the edge-data entry that bel_to_df
# takes the evidence from for that source.
etm = bel_to_df(path='../data/raw/etm.bel.nodelink.json', database='etm', key='evidence')

imppat = bel_to_df(path='../data/raw/imppat.bel.nodelink.json', database='imppat', key='citation')

vietnam = bel_to_df(path='../data/raw/vietnam.bel.nodelink.json', database='vietnam', key='citation')

ewe = bel_to_df(path='../data/raw/ewe.bel.nodelink.json', database='ewe', key='citation')

koreantk = bel_to_df(path='../data/raw/koreantk.bel.nodelink.json', database='koreantk', key='evidence')

In [9]:
# Stack the BERN2 table and the BEL-derived tables, renumbering the index.
association_frames = [bern2_data, etm, imppat, vietnam, ewe, koreantk]

plant_disease_associations_df = pd.concat(association_frames, ignore_index=True)

In [10]:
# (rows, columns) of the combined table.
plant_disease_associations_df.shape

(97066, 5)

In [11]:
# Write the combined table as a tab-separated file without the index column.
ASSOCIATIONS_TSV = '../data/processed/plant-disease/plant_disease_associations.tsv'

plant_disease_associations_df.to_csv(ASSOCIATIONS_TSV, sep='\t', index=False)