In [3]:
from aerobot.utils import DATA_PATH, MODELS_PATH
import os
import numpy as np 
import pandas as pd 
from aerobot.models import NonlinearClassifier

HANNAHS_DATA_PATH = os.path.join(DATA_PATH, 'hannah')

In [6]:
embeddings = pd.read_csv(os.path.join(HANNAHS_DATA_PATH, 'embeddings.csv'), index_col=0)
model = NonlinearClassifier.load(os.path.join(MODELS_PATH, 'linear_embedding_rna16s_ternary.joblib'))

In [7]:
predictions = model.predict(embeddings.values)

In [27]:
predictions_df = pd.DataFrame({'asv_id':embeddings.index, 'prediction':predictions}).set_index('asv_id')
predictions_df.to_csv(os.path.join(HANNAHS_DATA_PATH, 'predictions.csv'))

In [13]:
total = len(predictions_df)
n_aerobe = len(predictions_df[predictions_df['prediction'] == 'aerobe'])
n_anaerobe = len(predictions_df[predictions_df['prediction'] == 'anaerobe'])
n_facultative = len(predictions_df[predictions_df['prediction'] == 'facultative'])

print('Total ASVs:', total)
print('Total number of aeroboes:', n_aerobe , f'({int(100 * n_aerobe / total)}%)')
print('Total number of anaerobes:', n_anaerobe, f'({int(100 * n_anaerobe / total)}%)')
print('Total number of facultative:', n_facultative,  f'({int(100 * n_facultative / total)}%)')


Total ASVs: 96068
Total number of aeroboes: 68666 (71%)
Total number of anaerobes: 14523 (15%)
Total number of facultative: 12879 (13%)


In [24]:
taxonomy_df = pd.read_csv(os.path.join(HANNAHS_DATA_PATH, 'ASVs_taxonomy_Mar2023.tsv'), sep='\t', index_col=0)
taxonomy_df.index.name = 'asv_id'

In [None]:
predictions_df = predictions_df.merge(taxonomy_df, how='left', left_index=True, right_index=True)
predictions_df.to_csv(os.path.join(HANNAHS_DATA_PATH, 'predictions_with_taxonomy.csv'))

In [30]:
# Might be useful to figure out the abundance of different taxa in each cat