In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import random
import matplotlib.pyplot as plt
# Bring in the standard-library helper that installs a warnings filter.
from warnings import simplefilter
# Ignore every FutureWarning raised from here on, for the rest of the kernel
# session (the filter is process-wide, not limited to this cell).
simplefilter(action='ignore', category=FutureWarning)
# HTML wraps a raw markup string so the notebook renders it instead of printing it.
from IPython.core.display import HTML
# Inject two CSS rules: widen `.container` to 95% and centre `.output_png`
# cells. This is the last expression of the cell, so it is what the cell displays.
HTML('<style>.container { width:95% !important; }</style><style>.output_png {display: table-cell;text-align: center;vertical-align: middle;}</style>')

In [None]:
import plotly.io as pio
# Select the 'iframe_connected' renderer as the default for every later `fig.show()` call.
# NOTE(review): per the Plotly renderer docs this embeds each figure in an iframe and
# loads plotly.js from a CDN, so it presumably needs network access — confirm.
pio.renderers.default = 'iframe_connected'  # Ensures plots render correctly

In [None]:
# Folder that holds the input files (data.csv, counts.csv, YES_NO_questions.xlsx)
# read by the loading cells of this notebook.
directory_name = "BacterialPathogensData"

In [None]:
from utils import load_dataframe_from_csv

# Load 'data.csv' from the data folder via the project helper and keep it as
# `data_df`; the bare name on the last line makes the notebook display it.
data_df = load_dataframe_from_csv(
    directory_name=directory_name,
    df_csv_name='data.csv',
)
data_df

In [None]:
# Load 'counts.csv' from the same data folder (the helper is imported in the
# previous cell) and display the resulting frame.
counts_df = load_dataframe_from_csv(
    directory_name=directory_name,
    df_csv_name='counts.csv',
)
counts_df

In [None]:
from utils import format_data_dict, extract_nested_dict

# The question sheet is read without a header row, turned into a nested dict by
# `extract_nested_dict`, then flattened into two strings by `format_data_dict`.
concepts_string, keys_string = format_data_dict(
    extract_nested_dict(
        pd.read_excel(directory_name+'/YES_NO_questions.xlsx', header=None)
    )
)

# `keys_string` is split on ', ' to recover the individual concept names;
# 'BACTERIUM' is appended as one extra class at the end.
unique_classes = [*keys_string.split(', '), 'BACTERIUM']

print('#concepts: %d' % len(unique_classes))
print('Concepts:')
print(*unique_classes, sep=', ')

In [None]:
from draw_graph_linear_assignment import fast_visualisation

# Overview figure: no colouring attribute and no highlighted species are passed.
# NOTE(review): going by the argument names, rows are grouped by 'CanonicalSpecies'
# and the figure is written to `filename` — confirm against the helper.
fast_visualisation(
    data_df,
    counts_df,
    # what is embedded and how rows are grouped
    column_embedding='anonymised_embedding',
    column_group_by='CanonicalSpecies',
    desired_attributes_for_embedding=None,
    unique_classes=unique_classes,
    min_num_to_show=3,
    # figure output
    figure_size=1000,
    show_legend=False,
    filename='fig3_overview.pdf',
)

In [None]:
# Same call as the overview, now coloured by the "ZOONOSIS" concept and with two
# Chlamydia species passed as elements to highlight.
fast_visualisation(
    data_df,
    counts_df,
    # what is embedded and how rows are grouped
    column_embedding='anonymised_embedding',
    column_group_by='CanonicalSpecies',
    desired_attributes_for_embedding=None,
    unique_classes=unique_classes,
    min_num_to_show=3,
    # colouring
    desired_attributes_for_color=["ZOONOSIS"],
    cmap='hot_r',
    use_linear_contrast=True,
    x_ref=5,
    z_ref=30,
    highlight_elements=['Chlamydia psittaci', 'Chlamydia trachomatis'],
    # figure output
    figure_size=1000,
    show_legend=False,
    filename='fig3D_ZOONOSIS.pdf',
)

In [None]:
# Same call as the overview, now coloured by the "SEXUALLY TRANSMITTED" concept
# and with four species passed as elements to highlight.
fast_visualisation(
    data_df,
    counts_df,
    # what is embedded and how rows are grouped
    column_embedding='anonymised_embedding',
    column_group_by='CanonicalSpecies',
    desired_attributes_for_embedding=None,
    unique_classes=unique_classes,
    min_num_to_show=3,
    # colouring
    desired_attributes_for_color=["SEXUALLY TRANSMITTED"],
    cmap='hot_r',
    use_linear_contrast=True,
    x_ref=5,
    z_ref=30,
    highlight_elements=[
        'Neisseria gonorrhoeae',
        'Treponema pallidum',
        'Shigella boydii',
        'Klebsiella granulomatis',
    ],
    # figure output
    figure_size=1000,
    show_legend=False,
    filename='fig3C_STD.pdf',
)

In [None]:
from draw_graph_linear_assignment import fast_visualisation

# Clustering variant of the overview call: a higher `min_num_to_show`, plus
# `n_clusters` and `contamination`. The return value is kept as `clustering_df`.
# NOTE(review): the meaning of n_clusters=75 and contamination=1e-6 is defined by
# the helper — confirm there before tuning.
clustering_df = fast_visualisation(
    data_df,
    counts_df,
    # what is embedded and how rows are grouped
    column_embedding='anonymised_embedding',
    column_group_by='CanonicalSpecies',
    desired_attributes_for_embedding=None,
    unique_classes=unique_classes,
    min_num_to_show=12,
    # clustering
    n_clusters=75,
    contamination=1e-6,
    # figure output
    figure_size=1000,
    show_legend=False,
    filename='fig5_pathogen_types.pdf',
)

---

In [None]:
# Concepts used as the selected axes of the entropy plot.
SYSTEM_concepts = [
    "NERVOUS SYSTEM",
    "VISUAL SYSTEM",
    "EAR",
    "CIRCULATORY SYSTEM",
    "RESPIRATORY SYSTEM",
    "DIGESTIVE SYSTEM",
    "SKIN",
    "MUSCULOSKELETAL",
    "GENITOURINARY",
    "SYSTEMIC",
    "STERILE SITE",
]
min_num_causation = 3
selected_genera_list = [
    'Mycobacterium', 'Corynebacterium', 'Nocardia',
    'Streptococcus', 'Staphylococcus', 'Clostridium',
    'Burkholderia', 'Pseudomonas', 'Legionella',
]

# Keep the rows of `counts_df` whose 'n_causation' reaches the threshold AND
# whose 'Genus' is one of the selected genera.
selected_counts_df = counts_df[
    (counts_df['n_causation'] >= min_num_causation)
    & counts_df['Genus'].isin(selected_genera_list)
]

# Restrict the per-row data to the species that survived the filter above.
selected_species_list = selected_counts_df['CanonicalSpecies'].tolist()
selected_data_df = data_df[data_df['CanonicalSpecies'].isin(selected_species_list)]

print('Selected data contains: ')
print('#genera: %d' % len(set(selected_data_df['Genus'].tolist())))
print('#species: %d' % len(set(selected_data_df['CanonicalSpecies'].tolist())))

# One averaged vector per species: stack that species' 'onehot_embedding'
# entries and take the element-wise mean over the stacked rows.
data_hot_df = selected_data_df[['CanonicalSpecies', 'onehot_embedding']]
averaged_embeddings = (
    data_hot_df
    .groupby('CanonicalSpecies')['onehot_embedding']
    .agg(lambda vectors: np.stack(vectors).mean(axis=0))
)

from draw_graph_linear_assignment import compute_entropy_distribution_by_genus

compute_entropy_distribution_by_genus(
    data_series=averaged_embeddings,
    selected_names=selected_species_list,
    axis_labels=unique_classes,
    selected_axis_names=SYSTEM_concepts,
    title='SYSTEM_concepts',
    combined_plot=True,
    scale=0.8,
    filename='fig4_entropy.pdf',
)

In [None]:
# Axes passed to the radar plot, in this order.
selected_axis_names = [
    "EAR",
    "CIRCULATORY SYSTEM",
    "NERVOUS SYSTEM",
    "VISUAL SYSTEM",
    "RESPIRATORY SYSTEM",
    "SKIN",
    "MUSCULOSKELETAL",
    "SYSTEMIC",
    "STERILE SITE",
    "DIGESTIVE SYSTEM",
    "GENITOURINARY",
]

# The two species to compare; these names overwrite the ones set by the entropy cell.
selected_species_list = ['Mycobacterium flavescens', 'Mycobacterium ulcerans']
selected_data_df = data_df[data_df['CanonicalSpecies'].isin(selected_species_list)]

# One averaged 'onehot_embedding' vector per species (element-wise mean).
data_hot_df = selected_data_df[['CanonicalSpecies', 'onehot_embedding']]
averaged_embeddings = (
    data_hot_df
    .groupby('CanonicalSpecies')['onehot_embedding']
    .agg(lambda vectors: np.stack(vectors).mean(axis=0))
)

selected_names = selected_species_list
# Rotate the list by one position so each species is paired with the next one
# (and the last with the first).
comparative_selected_names = selected_names[1:] + selected_names[:1]

from draw_graph_linear_assignment import plot_radar_from_series

plot_radar_from_series(
    averaged_embeddings,
    axis_labels=unique_classes,
    selected_axis_names=selected_axis_names,
    selected_names=selected_names,
    comparative_selected_names=comparative_selected_names,
    fix_axis_range=True,   # fix axes to [0, 1]
    log_scale=True,        # apply log(value + 1)
    num_per_row=6,
    size=6,
    filename='fig4B_radar.pdf',
)

In [None]:
from draw_graph_linear_assignment import plot_tsne_with_ellipses

# Collect the species names that occur under the chosen genus, then keep every
# row of `data_df` that carries one of those species names.
genera = ['Neisseria']
species_list = set(data_df.loc[data_df['Genus'].isin(genera), 'CanonicalSpecies'].values)
local_df = data_df[data_df['CanonicalSpecies'].isin(species_list)]

# Plot options forwarded unchanged to the helper.
kwargs = {
    'color_col': 'CanonicalSpecies',
    'confidence': 15,
    'n_std': 2,
    'min_n_instances': 8,
    'plot_width': 1000,
    'plot_height': 1000,
    'ellipse_darken_factor': 0.95,
}
embedding_column = 'anonymised_embedding'
show_scatter = True

fig = plot_tsne_with_ellipses(
    local_df,
    show_scatter=show_scatter,
    coordinate_col=embedding_column,
    **kwargs,
)
# The helper hands back the figure; it is displayed explicitly here.
fig.show()

In [None]:
from draw_graph_linear_assignment import plot_tsne_with_ellipses

def generate_all_plots(genera_list):
    """Build and display one figure per (embedding column, scatter on/off) pair.

    All genera in ``genera_list`` are pooled: the species found under any of
    them are collected once, and every figure is drawn from the same subset of
    the notebook-level ``data_df``. Three embedding columns times two scatter
    settings give six figures.

    Parameters
    ----------
    genera_list : list of str
        Genus names matched against the 'Genus' column of ``data_df``. Their
        '-'-joined form is used in each figure title.

    Returns
    -------
    None
        Figures are displayed as a side effect. Nothing is returned, so calling
        this as the last statement of a cell adds no extra output.
    """
    genera = '-'.join(genera_list)
    # Species names that occur under any of the requested genera ...
    species_list = set(data_df[data_df['Genus'].isin(genera_list)]['CanonicalSpecies'].values)
    # ... and every row carrying one of those species names.
    local_df = data_df[data_df['CanonicalSpecies'].isin(species_list)]

    # Plot options shared by all figures; forwarded unchanged to the helper.
    kwargs = dict(
        color_col='CanonicalSpecies',
        confidence=15,
        n_std=2,
        min_n_instances=8,
        plot_width=1000,
        plot_height=1000,
        ellipse_darken_factor=0.95,
    )

    for embedding_column in ['onehot_embedding', 'embedding', 'anonymised_embedding']:
        for show_scatter in [True, False]:
            title = f"Species:{genera} with embedding:{embedding_column} [with points:{show_scatter}]"

            fig = plot_tsne_with_ellipses(
                local_df,
                coordinate_col=embedding_column,
                title=title,
                show_scatter=show_scatter,
                **kwargs)
            # Fix: the figure used to be assigned and then dropped, so nothing
            # was ever rendered. Display it explicitly, as the single-genus
            # cell does with the same helper's return value.
            fig.show()
            
# Genera handed together, as a single list, to `generate_all_plots`.
genera_list = [
    'Neisseria',
    'Klebsiella',
    'Corynebacterium',
    'Campylobacter',
    'Yersinia',
    'Burkholderia',
]
generate_all_plots(genera_list)

---