In [1]:
from aerobot.io import FEATURE_TYPES, DATA_PATH
import os 
import pandas as pd
import numpy as np

In [2]:
def load_data_jablonska(feature_type:str, path:str=os.path.join(DATA_PATH, 'jablonska/')):
    '''Read one Jablonska feature table from disk.

    :param feature_type: Name of the feature type; used to build the file name
        jablonska_{feature_type}.csv.
    :param path: Directory which contains the Jablonska CSV files.
    :return: The result of pd.read_csv on that file, with the first column used as the index.
    '''
    filename = f'jablonska_{feature_type}.csv'
    csv_path = os.path.join(path, filename)
    return pd.read_csv(csv_path, index_col=0)
    

def load_data_madin(feature_type:str, path:str=os.path.join(DATA_PATH, 'madin/madin.h5')):
    '''Read one Madin feature table from the HDF file.

    :param feature_type: Name of the feature type to load. Must be one of FEATURE_TYPES,
        or one of the extra names handled below ('metadata', 'labels', ...).
    :param path: Location of the Madin HDF file.
    :return: Whatever pd.read_hdf returns for the key corresponding to feature_type.
    :raises KeyError: If feature_type has no entry in the key map.
    '''
    # Feature types whose key in the HDF file is set explicitly, instead of defaulting to the feature type name.
    overrides = {'embedding.genome':'WGE', 'embedding.geneset.oxygen':'OGSE', 'metadata':'AF', 'labels':'labels'}
    # Every other feature type is stored under a key equal to its own name.
    key_map = {name:name for name in FEATURE_TYPES}
    key_map.update(overrides)
    hdf_key = key_map[feature_type]
    return pd.read_hdf(path, key=hdf_key)

I am curious about how much the annotated KO groups in the Jablonska and Madin datasets overlap. I am running into a bug because one (or both) of the datasets has no proteins annotated with some of the KO groups in the list of terminal oxidase KO groups. I am trying to figure out whether it is better to fill in zeros or drop the missing KO groups.

If both datasets are missing the same KO groups, I will want to drop the KO groups that are not present. If I fill in with zeros, then the model weights corresponding to those input features will not be trained. If input data which *does* contain those KO groups is used as input to the trained model, then it could adversely affect predictions.

I first want to make sure that the columns in the Jablonska and Madin KO data are the same. 

In [3]:
# Column names of the Madin KO table, i.e. the KO groups annotated in that dataset.
madin_kos = [*load_data_madin('KO').columns]
n_madin_kos = len(madin_kos)
# A set drops repeated names, so equal sizes mean every KO group appears exactly once.
assert len(set(madin_kos)) == n_madin_kos, 'Some of the KO groups in the Madin data are duplicated.'
print('Number of KO groups in the Madin data:', n_madin_kos)

Number of KO groups in the Madin data: 10314


In [4]:
# Column names of the Jablonska KO table, i.e. the KO groups annotated in that dataset.
jablonska_kos = list(load_data_jablonska('KO').columns)
# A set drops repeated names, so equal sizes mean every KO group appears exactly once.
# The failure message names the Jablonska data, which is the dataset actually being checked here.
assert len(jablonska_kos) == len(set(jablonska_kos)), 'Some of the KO groups in the Jablonska data are duplicated.'
print('Number of KO groups in the Jablonska data:', len(jablonska_kos))

Number of KO groups in the Jablonska data: 9198


In [5]:
# Every KO group which appears in at least one of the two datasets.
all_kos = set(jablonska_kos).union(madin_kos)
print('Number of KO groups in the union:', len(all_kos))

# Write the KO groups to a file. The iteration order of a set of strings can change
# between kernel sessions, so the KO groups are sorted to keep the row order of the
# file identical on every run. Assumes the KO column names are mutually comparable
# (e.g. all strings) -- TODO confirm.
all_kos_df = pd.DataFrame({'ko':sorted(all_kos)}).set_index('ko')
all_kos_df.to_csv(os.path.join(DATA_PATH, 'kos.csv'))

Number of KO groups in the union: 10409


It seems like the best call is to use the union of the columns in each dataset. This should maximize the amount of information, while ensuring that all weights are actually updated during model training.

In [6]:
# Load the list of terminal oxidase KO groups, keeping each distinct value of the 'ko' column once.
terminal_oxidase_path = os.path.join(DATA_PATH, 'terminal_oxidase_kos.csv')
terminal_oxidase_df = pd.read_csv(terminal_oxidase_path)
TERMINAL_OXIDASE_KOS = set(terminal_oxidase_df.ko.unique())
print('Number of terminal oxidase-related KO groups:', len(TERMINAL_OXIDASE_KOS))

Number of terminal oxidase-related KO groups: 38


In [7]:
# Count the terminal oxidase KO groups which are present in each dataset, Jablonska first and then Madin.
for dataset_name, dataset_kos in (('Jablonska', jablonska_kos), ('Madin', madin_kos)):
    n_shared = len(TERMINAL_OXIDASE_KOS.intersection(dataset_kos))
    print(f'Number of terminal oxidase KO groups in the {dataset_name} data:', n_shared)

Number of terminal oxidase KO groups in the Jablonska data: 24
Number of terminal oxidase KO groups in the Madin data: 26
