# App for analyzing GECCO runs from Galaxy

1. Upload local data or query GECCO results from Galaxy.
2. Identify Biosynthetic Gene Clusters (BGCs).
3. Visualize BGCs.
4. Compare two samples with respect to each other.

Note: Sending GECCO jobs to Galaxy is handled by a separate application.

## Platform dependent part
- Resolve the platform setup.
- Differences from local imports should be resolved by configuring the VRE packages properly.

In [1]:
import os
import sys
import logging
import psutil
from IPython import get_ipython

# Notebook-wide logger; handlers are (re)configured by reconfig_logger() below.
logger = logging.getLogger(name="GECCO analyzer")

if 'google.colab' in str(get_ipython()):
    print('Setting Google colab, you will need a ngrok account to make the dashboard display over the tunnel. \
    https://ngrok.com/')
    # clone the momics-demos repository to use it to load data
    # os.system() does not raise when the command itself fails; it returns the
    # command's exit status, so the status must be checked explicitly.
    try:
        clone_status = os.system('git clone https://github.com/palec87/momics-demos.git')
    except OSError as e:
        logger.error(f"An error occurred while cloning the repository: {e}")
    else:
        if clone_status == 0:
            logger.info("Repository cloned")
        else:
            logger.error(
                f"An error occurred while cloning the repository: exit status {clone_status}"
            )

    sys.path.insert(0,'/content/momics-demos')

    # this step takes time because of many dependencies
    pip_status = os.system('pip install marine-omics')
    if pip_status != 0:
        logger.error(f"Installing marine-omics failed: exit status {pip_status}")

elif psutil.users() == []:
    # No logged-in users reported by psutil is used as the Binder heuristic.
    logger.info("Binder")

    logger.info('Binder will not allow you to upload the ".env" file')
    os.environ["GALAXY_EARTH_URL"] = "https://earth-system.usegalaxy.eu/"
    ###########################################################################################
    ### INPUT TOKEN HERE, If not using Galaxy, put any string below, but cannot stay empty ####
    ###########################################################################################
    os.environ["GALAXY_EARTH_KEY"] = ""
    # Deliberate gate: the cell fails until the user fills in the token above.
    assert os.environ["GALAXY_EARTH_KEY"] != "", "token cannot be an empty string, SET your API key."

else:
    logger.info("Local server")

# Imported only after the platform branch, because on Colab the package is
# installed by the branch above.
from momics.utils import init_setup, get_notebook_environment, memory_load, reconfig_logger

# Set up logging
reconfig_logger()
init_setup()


INFO | root | Logging.basicConfig completed successfully


## Imports

In [2]:
import pandas as pd
import panel as pn

# All low level functions are imported from the momics package
import momics.diversity as div
import momics.plotting as pl
from momics.loader import bytes_to_df

In [3]:
# clustering
from collections import defaultdict
from time import time

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Domain-specific words that carry no discriminating information in
# pfam/InterPro description texts. Each entry is listed once; the list is
# merged into scikit-learn's English stop-word set below.
extra_stop_words = [
    'domain', 'pfam', 'protein', 'family', 'superfamily', 'clan', 'interpro',
    'et', 'al', 'cite', '[[', ']]', '(', ')', 'figure', 'fig', 'table', 'tab', 'see',
    'also', 'example', 'examples', 'exampled', 'exemplary', 'exemplaryd',
    'https', 'www', 'doi', 'review', 'swissprot', 'uniprot', 'org', 'ncbi', 'pubmed', 'pubmedcentral',
    'genbank', 'refseq', 'genome', 'genomic', 'gene', 'genes', 'proteins',
    'sequence', 'sequences', 'seq', 'seqs', 'nucleotide', 'nucleotides', 'amino',
    'acid', 'acids', 'aa', 'nt', 'nts', 'ntseq', 'ntseqs', 'ntseqd',
]

# Union of the built-in English stop words and the extras above, as a list
# (the form passed to TfidfVectorizer further down in this notebook).
stop_words = list(text.ENGLISH_STOP_WORDS.union(extra_stop_words))

### User settings

In [4]:
# NOTE(review): DEBUG is not referenced anywhere else in the visible part of
# this notebook — confirm it is still consumed somewhere.
DEBUG = True  # enable stdout logging

## Loading and setup

In [5]:
# On Colab the data live in the cloned checkout under /content; everywhere
# else the root is the parent of the current working directory.
root_folder = os.path.abspath(
    '/content/momics-demos' if 'google.colab' in str(get_ipython()) else '../'
)

# Static assets are expected directly under the root folder.
assets_folder = os.path.join(root_folder, 'assets')

### DF display

In [6]:
# Load the Panel extensions used by the widgets below and enable notifications
# (pn.state.notifications is used by the callbacks in this notebook).
pn.extension("tabulator", "mathjax", "filedropper")
pn.extension(notifications=True)
# Uploaded tables keyed by file name; filled by process_uploaded_tables().
DATASETS = {}

# Domain counts above the abundance cutoff; set by filter_histogram().
FILTERED_domains = None
# pfam id -> metadata; read by the clustering step.
PFAM_dict = {}


# File picker that accepts several files at once.
upload_local = pn.widgets.FileInput(
    multiple=True,
)

literal_galaxy_url = pn.widgets.TextInput(
    name='Galaxy server URL',
    placeholder='Enter a https server address here...',
)

# NOTE(review): labelled "Password" although the variable name suggests it
# holds a Galaxy API key — confirm the intended wording.
literal_galaxy_key = pn.widgets.PasswordInput(
    name='Password',
    placeholder='Enter your password here...',
)

button_display_loaded = pn.widgets.Button(
    name="Display loaded files",
    button_type="primary",
    width=200,
)

In [10]:
# Parse the currently selected files when the button is clicked. The lambda
# looks up process_uploaded_tables at click time, so defining that function
# further down is fine.
button_display_loaded.on_click(
    lambda event: process_uploaded_tables(upload_local.filename, upload_local.value)
)

def _find_dataset_key(datasets, *markers):
    """Return the first key of *datasets* that contains any of *markers*.

    Raises:
        KeyError: if no key contains one of the markers.
    """
    for key in datasets:
        if any(marker in key for marker in markers):
            return key
    raise KeyError(
        f"No uploaded table name contains any of {markers}; available: {list(datasets)}"
    )


def display_tables_after_upload(datasets):
    """
    Publish the clusters, features and genes tables as module-level globals.

    Tables are picked by substring match on the dataset name: 'BGCs' or
    'clusters', 'features', and 'genes'.

    Args:
        datasets: mapping of table name to loaded table.

    Raises:
        KeyError: if one of the three tables cannot be found. All lookups are
            done before any global is assigned, so a failure never leaves the
            globals half-updated.
    """
    global df_clusters, df_features, df_genes
    logger.info("Displaying tables after upload...")
    clusters_key = _find_dataset_key(datasets, 'BGCs', 'clusters')
    features_key = _find_dataset_key(datasets, 'features')
    genes_key = _find_dataset_key(datasets, 'genes')

    df_clusters = datasets[clusters_key]
    df_features = datasets[features_key]
    df_genes = datasets[genes_key]

def process_uploaded_tables(file_names, file_data):
    """
    Parse the uploaded files into DATASETS and publish the known tables.

    Args:
        file_names: names of the uploaded files.
        file_data: raw contents of the uploaded files, in the same order.

    When nothing has been uploaded yet (``None`` or empty), the user is
    notified and the previously loaded DATASETS are left untouched.
    """
    logger.info("Processing uploaded tables...")
    if not file_data:
        pn.state.notifications.warning(
            'Files not loaded yet, try again soon.',
            duration=2000,
            )
        return
    logger.info(f"files: {file_names}")

    # Parse everything first so a failing file does not wipe the current data.
    loaded = {}
    for name, data in zip(file_names, file_data):
        logger.info(f"Processing {name}...")
        loaded[name] = bytes_to_df(data)

    DATASETS.clear()
    DATASETS.update(loaded)
    # Expose the clusters/features/genes tables as globals
    display_tables_after_upload(DATASETS)

In [11]:
# Cell output: the upload widgets stacked vertically.
pn.Column(
    upload_local,
    literal_galaxy_url,
    literal_galaxy_key,
    button_display_loaded,
    sizing_mode='stretch_width',
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'513ebb2a-b25c-4fec-b0ca-aa3aeb53380a': {'version…

In [16]:
# Violin plot of the clusters table; df_clusters is the global assigned by
# display_tables_after_upload(), so files must be loaded before this runs.
pl.hvplot_bgcs_violin(
        df_clusters,
        normalize=False,
        ).opts(
            height=600,
            width=1000,
        )

## pfam API calls tab

In [22]:
from urllib import request
from time import sleep
import json

# plot the domain abundance
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import holoviews as hv
import hvplot.pandas  # noqa

# NOTE(review): not referenced in the visible part of this notebook —
# presumably a background colour for plots; confirm usage.
PLOT_FACE_COLOR = "#e6e6e6"


def construct_pfam_url(pfam_id):
    """
    Build the InterPro API entry URL for the given pfam accession.
    """
    endpoint = "https://www.ebi.ac.uk/interpro/api/entry/pfam/{}"
    return endpoint.format(pfam_id)


def filter_domain(df, abundance_cutoff=500):
    """
    Count how often each value occurs in the ``domain`` column and keep only
    the values whose count is strictly greater than ``abundance_cutoff``.

    Returns the surviving counts as a Series indexed by domain.
    """
    counts = df["domain"].value_counts()
    is_abundant = counts > abundance_cutoff
    return counts[is_abundant]


def extract_from_pfam_query(api_decode):
    """
    Reduce a decoded API response to the four metadata fields used downstream.

    Raises KeyError when 'metadata' or one of the fields is missing.
    """
    meta = api_decode['metadata']
    slim = {}
    for field in ('accession', 'name', 'description', 'integrated'):
        slim[field] = meta[field]
    return slim

BokehModel(combine_events=True, render_bundle={'docs_json': {'b13c340e-3ddf-4e79-bb3f-9a8636c74f98': {'version…

In [19]:
def _fetch_json(url):
    """GET *url* and return the decoded JSON payload, or ``None`` on failure.

    HTTP, network and decoding errors are logged instead of raised, so that a
    single bad entry does not abort a whole batch of lookups.
    """
    try:
        # The context manager closes the HTTP response once the body is read.
        with request.urlopen(request.Request(url)) as res:
            return json.loads(res.read().decode())
    except request.HTTPError as e:
        # urlopen raises HTTPError for non-2xx statuses (408 included), so the
        # status never needs to be inspected on a returned response.
        logger.error(f"HTTP error: {e.code} - {e.reason}")
        if e.code == 404:
            logger.warning('Not found, skipping')
        elif e.code == 410:
            logger.warning('Discarded, Gone from the database, ie obsolete')
        elif e.code == 408:
            logger.warning('Request timed out, skipping')
        else:
            logger.warning('Unknown error, skipping')
    except request.URLError as e:
        # Must come after HTTPError, which is a subclass of URLError.
        logger.error(f"Network error: {e.reason}")
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        logger.error(f"Could not decode the response from {url}: {e}")
    return None


def _lookup_pfam(pfam_id):
    """Return ``(metadata, complete)`` for one pfam id.

    ``metadata`` is ``None`` when the pfam request itself failed. ``complete``
    is True only when a description is available, either from the pfam entry
    or from the InterPro entry it is integrated into.
    """
    payload = _fetch_json(construct_pfam_url(pfam_id))
    if payload is None:
        return None, False

    metadata = extract_from_pfam_query(payload)
    logger.info(f"metadata: {metadata}")
    if metadata['description'] is not None:
        return metadata, True

    # some pfams do not have description
    logger.info('No description, trying to fetch one from the IPR')
    ipr_id = metadata['integrated']
    logger.info(f"ipr_id: {ipr_id}")
    if ipr_id is None:
        logger.warning('No IPR ID, skipping')
        return metadata, False

    logger.info(f"Fetching data for IPR ID: {ipr_id}")
    ipr_payload = _fetch_json(f"https://www.ebi.ac.uk/interpro/api/entry/interpro/{ipr_id}")
    if ipr_payload is None:
        return metadata, False

    metadata['description'] = ipr_payload['metadata']['description']
    return metadata, metadata['description'] is not None


def api_loop_pfam(domains: pd.Series, n_calls: int = 10):
    """
    Query the InterPro API for the first ``n_calls`` pfam ids of ``domains``.

    Args:
        domains: Series whose index holds the pfam ids, in the order they
            should be queried.
        n_calls: maximum number of pfam ids to look up; capped at
            ``len(domains)``.

    Returns:
        ``(pfam_dict, discarded)``. ``pfam_dict`` maps pfam id to its metadata
        (accession, name, description, integrated). ``discarded`` maps every
        pfam id for which no complete record could be obtained to the partial
        metadata, or to ``None`` when the request itself failed.

    Progress is reported through the module-level ``tqdm`` object. A pause of
    0.5 s follows every id, failed ones included, to stay gentle on the API.
    """
    pfam_dict = {}
    discarded = {}  # not possible to get complete data from the API
    # check if n_calls is less than the number of domains
    if n_calls > len(domains):
        n_calls = len(domains)
        logger.info(f"n_calls is greater than the number of domains, setting to {n_calls}")

    for pfam_id in tqdm(domains.index[:n_calls], desc="API calls progress", leave=True, colour='#666666'):
        logger.info(f"Fetching data for PFAM ID: {pfam_id}")
        metadata, complete = _lookup_pfam(pfam_id)
        if complete:
            pfam_dict[pfam_id] = metadata
        else:
            discarded[pfam_id] = metadata
        sleep(0.5)
    return pfam_dict, discarded

In [20]:
# Help text shown at the top of the pfam tab.
markdown_pfam = pn.pane.Markdown(
"""
**Features table** contains pfam ids for the identified proteins.
- Histogram shows counts of each pfam id over all the contigs.
- To cluster the pfam domains by function, we query the pfam database for their description.
- Select how many calls (ordered by counts in the histogram) you want to make.
- Each call takes approximately 0.7 second.
- Returned values are filtered and stored in the dictionary (also in your working directory).
-   The dictionary `json` is saved in the working directory with a flag from abundance cutoff value, to facilitate loading.
- In the next tab, you can **tokenize**, **embed** and **cluster** the description of the pfam domains.
"""
)

# Minimum count a domain must exceed to be kept; its value is also part of the
# json file name used by load_pfam_dict() and run_api().
abundance_cutoff = pn.widgets.IntInput(
    name='Abundance cutoff',
    value=500, step=50, start=50, end=10000,
    align="center",
)

# How many of the filtered domains are sent to the API.
n_calls = pn.widgets.IntInput(
    name='Number of API calls',
    value=50, step=10, start=20, end=500,
)

# Placeholder pane; its .object is set by filter_histogram().
histogram = pn.pane.HoloViews(
    height=500,
    name="Histogram",
)

# buttons
button_histogram = pn.widgets.Button(
    name="Plot histogram",
    button_type="primary",
    width=200,
    align="end",
)

button_run_api = pn.widgets.Button(
    name="Run API",
    button_type="primary",
    width=200,
    align="end",
)

button_load_pfam_dict = pn.widgets.Button(
    name="Load PFAM dict",
    button_type="primary",
    width=200,
    align="end",
)

# Panel progress-bar widget. The module-level name `tqdm` is what
# api_loop_pfam() calls to wrap its iteration.
tqdm = pn.widgets.Tqdm(
    width=200,
    align=("end", 'end'),
)

# Layout of the pfam tab: help text, two control rows, then the histogram.
pfam_tab = pn.Column(
    markdown_pfam,
    pn.Row(
        abundance_cutoff,
        button_histogram,
    ),
    pn.Row(
        n_calls,
        button_load_pfam_dict,
        button_run_api,
        tqdm,
    ),
    histogram,
    scroll=True,
    sizing_mode="stretch_both",
)

def load_pfam_dict():
    """
    Read the PFAM dictionary saved for the current abundance cutoff.

    The file name is derived from the cutoff widget value and resolved
    relative to the working directory. A missing file is logged and shown as a
    notification, and the global PFAM_dict is left as it was.
    """
    global PFAM_dict
    dict_path = f'pfam_dict_cutoff_{abundance_cutoff.value}.json'
    try:
        with open(dict_path, 'r') as f:
            loaded = json.load(f)
    except FileNotFoundError:
        not_found = f"File not found: {dict_path}"
        logger.warning(not_found)
        pn.state.notifications.warning(
            not_found,
            duration=2000)
    else:
        PFAM_dict = loaded
        logger.info(f"PFAM_dict loaded from {dict_path}")


def run_api(n_calls):
    """
    Run the API calls to get the pfam descriptions

    Args:
        n_calls: maximum number of filtered domains to query.

    The result is stored in the global PFAM_dict, so the clustering step can
    use it directly, and is also written to
    ``pfam_dict_cutoff_<cutoff>.json`` in the working directory.
    """
    # Without this declaration the assignment below would only create a local
    # variable and the clustering step would keep seeing the old PFAM_dict.
    global PFAM_dict
    if not isinstance(FILTERED_domains, pd.Series):
        logger.info("No datasets loaded or FILTERED_domains is not a pandas Series")
        pn.state.notifications.warning('Plot histogram, which filters domains.', duration=2000)
        return
    PFAM_dict, discarded = api_loop_pfam(FILTERED_domains, n_calls)

    # Only report when something was actually discarded.
    if discarded:
        logger.warning(f"Discarded PFAM IDs: {len(discarded)}")
        pn.state.notifications.info(
            f"Not found and discarded PFAM IDs: {len(discarded)}",
            duration=2000)

    # save the PFAM_dict to a file
    dict_path = f'pfam_dict_cutoff_{abundance_cutoff.value}.json'
    with open(dict_path, 'w') as f:
        json.dump(PFAM_dict, f)
    logger.info(f"PFAM_dict saved to {dict_path}")


def filter_histogram(abundance_cutoff):
    """
    Filter the features table from DATASETS and plot the histogram

    Args:
        abundance_cutoff: domains must occur more often than this to be kept.

    The filtered counts are stored in the global FILTERED_domains and the
    plot is placed in the ``histogram`` pane. If no datasets are loaded, or
    none of them is a features table, the user is notified and nothing changes.
    """
    global FILTERED_domains
    if not DATASETS:
        logger.info("No datasets loaded")
        pn.state.notifications.warning('You have to load datasets first.', duration=2000)
        return

    # A default of None avoids an opaque StopIteration when no table matches.
    features_key = next((key for key in DATASETS if 'features' in key), None)
    if features_key is None:
        logger.info("No features table among the loaded datasets")
        pn.state.notifications.warning(
            'No features table found among the loaded datasets.', duration=2000)
        return

    # Filter out domains with abundance less than abundance_cutoff
    FILTERED_domains = filter_domain(DATASETS[features_key], abundance_cutoff)
    logger.info(f"Filtered domains: {FILTERED_domains}")

    # plot the histogram
    histogram.object = pl.plot_domain_abundance(FILTERED_domains, abundance_cutoff)


## Buttons ##
# Each callback reads the widget value at click time and ignores the event.
button_histogram.on_click(
    lambda event: filter_histogram(abundance_cutoff.value)
)

button_run_api.on_click(
    lambda event: run_api(n_calls.value)
)

button_load_pfam_dict.on_click(
    lambda event: load_pfam_dict()
)

Watcher(inst=Button(align='end', button_type='primary', name='Load PFAM dict', width=200), cls=<class 'panel.widgets.button.Button'>, fn=<function <lambda> at 0x7a4a1d871080>, mode='args', onlychanged=False, parameter_names=('clicks',), what='value', queued=False, precedence=0)

In [23]:
# Cell output: render the pfam tab layout.
pfam_tab

BokehModel(combine_events=True, render_bundle={'docs_json': {'1a32f95d-ec75-455c-a1aa-c2f4d3f841b7': {'version…

### Cluster page

In [24]:
def cluster_domains(pfam_dict, n_clusters=5, stop_words='english'):
    """
    Cluster the domains using KMeans clustering.

    Args:
        pfam_dict: mapping of pfam id to metadata; the text is read from
            ``metadata['description'][0]['text']``.
        n_clusters: number of KMeans clusters.
        stop_words: stop words passed to the TF-IDF vectorizer.

    Returns:
        ``(clusters, labels, X, kmeans, vectorizer)`` where ``clusters`` maps
        a cluster label to the list of pfam ids assigned to it, ``labels`` are
        the KMeans labels, ``X`` is the TF-IDF matrix, and the last two are
        the fitted estimator objects. Rows of ``X`` and ``labels`` follow the
        order of the entries of ``pfam_dict`` that have a description text.

    Raises:
        ValueError: if no entry has a usable description text.
    """
    # Collect ids and texts together so labels can be mapped back to ids even
    # when entries without a description are skipped.
    pfam_ids = []
    descriptions = []
    for pfam_id, entry in pfam_dict.items():
        try:
            raw_text = entry['description'][0]['text']
        except (KeyError, IndexError, TypeError):
            logger.warning(f"No description text for {pfam_id}, skipping")
            continue
        pfam_ids.append(pfam_id)
        # strip <p> tags
        descriptions.append(raw_text.replace('<p>', '').replace('</p>', ''))

    if not descriptions:
        raise ValueError("No PFAM descriptions available for clustering.")
    logger.info(f"Number of descriptions: {len(descriptions)}")
    logger.info(f'descriptions: {descriptions}')

    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        max_df=0.15,
    )
    X = vectorizer.fit_transform(descriptions)

    # Perform KMeans clustering
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=4, random_state=42)
    kmeans.fit(X)

    # Get the cluster labels
    labels = kmeans.labels_

    # Group the pfam ids by their assigned cluster label
    clusters = {}
    for pfam_id, label in zip(pfam_ids, labels):
        clusters.setdefault(label, []).append(pfam_id)

    return clusters, labels, X, kmeans, vectorizer


def fit_and_evaluate(km, X, labels, name=None, n_runs=5):
    """
    Refit ``km`` on ``X`` once per seed and summarise timing and scores.

    For seeds ``0 .. n_runs - 1`` the estimator's ``random_state`` is set, it
    is fitted, and its labels are scored against ``labels`` (homogeneity,
    completeness, V-measure, adjusted Rand index) and against ``X``
    (silhouette coefficient). ``km`` is modified in place and is left fitted
    with the last seed.

    Returns:
        ``(evaluation, evaluation_std)``: two dicts with the same keys
        ("estimator", "train_time", then one per score) holding the means and
        the standard deviations over the runs.
    """
    if name is None:
        name = km.__class__.__name__

    # Scores that compare the reference labels with the fitted labels.
    label_scorers = (
        ("Homogeneity", metrics.homogeneity_score),
        ("Completeness", metrics.completeness_score),
        ("V-measure", metrics.v_measure_score),
        ("Adjusted Rand-Index", metrics.adjusted_rand_score),
    )

    durations = []
    scores = defaultdict(list)
    for seed in range(n_runs):
        km.set_params(random_state=seed)
        started = time()
        km.fit(X)
        durations.append(time() - started)
        for score_name, scorer in label_scorers:
            scores[score_name].append(scorer(labels, km.labels_))
        scores["Silhouette Coefficient"].append(
            metrics.silhouette_score(X, km.labels_, sample_size=2000)
        )
    durations = np.asarray(durations)

    logger.info(f"clustering done in {durations.mean():.2f} ± {durations.std():.2f} s ")
    evaluation = {"estimator": name, "train_time": durations.mean()}
    evaluation_std = {"estimator": name, "train_time": durations.std()}
    for score_name, values in scores.items():
        evaluation[score_name] = np.mean(values)
        evaluation_std[score_name] = np.std(values)
    return evaluation, evaluation_std

In [25]:
# Help text shown at the top of the cluster tab.
markdown_cluster = pn.pane.Markdown(
"""
From the dictionary, the domain descriptions are vectorized and clustered.
- Select the number of clusters you want to create.
- The clustering is done using the `k-means` algorithm.
- The most important words from the clusters are extracted and displayed.
""")

n_clusters = pn.widgets.IntInput(
    name='Number of clusters',
    value=5, step=1, start=2, end=50,
)

# Number of top terms listed per cluster in the vip_words table.
n_important_words = pn.widgets.IntInput(
    name='Number of important words',
    value=10, step=1, start=5, end=30,
)

# t-SNE perplexity; cluster() lowers it when there are too few samples.
perplexity = pn.widgets.IntInput(
    name='Perplexity',
    value=10, step=1, start=2, end=50,
)

# TruncatedSVD components; cluster() lowers it when there are too few features.
n_components_LSA = pn.widgets.IntInput(
    name='Number of components for LSA',
    value=10, step=1, start=5, end=100,
)

button_cluster = pn.widgets.Button(
    name="Cluster",
    button_type="primary",
    width=200,
)

# Placeholder pane; its .object is set by cluster().
tsne = pn.pane.HoloViews(
    height=500,
    name="t-SNE",
)

# Tables filled by cluster(): top terms per cluster and clustering metrics.
vip_words = pn.widgets.Tabulator()
reports = pn.widgets.Tabulator(
    name="Clustering reports",
    sizing_mode="stretch_both",
)

# Layout of the cluster tab.
cluster_tab = pn.Column(
    markdown_cluster,
    pn.Row(
        n_clusters,
        perplexity,
        n_important_words,
    ),
    pn.Row(
        n_components_LSA,
    ),
    button_cluster,
    pn.Row(
        tsne,
        vip_words,
    ),
    reports,
    scroll=True,
    sizing_mode="stretch_both",
)

def cluster():
    """
    Cluster the pfam descriptions in PFAM_dict and refresh the cluster tab.

    Steps:
    1. TF-IDF + KMeans on the descriptions (via cluster_domains).
    2. 2-D t-SNE embedding of the TF-IDF matrix, shown in the ``tsne`` pane.
    3. LSA (TruncatedSVD + normalisation) followed by repeated KMeans runs,
       scored against the labels from step 1; the metrics go to ``reports``.
    4. The top terms of each LSA cluster centroid go to ``vip_words``.

    The ``perplexity`` and ``n_components_LSA`` widgets are lowered when the
    data are too small for the selected values. If PFAM_dict is empty the
    user is notified and nothing is computed.
    """
    if not PFAM_dict:
        logger.info("PFAM_dict is empty, nothing to cluster")
        pn.state.notifications.warning(
            'Load the PFAM dict or run the API first.', duration=2000)
        return

    # The first return value (cluster label -> pfam ids) is not needed here.
    _, labels, X_tfidf, kmeans, vectorizer = cluster_domains(
        PFAM_dict,
        n_clusters=n_clusters.value,
        stop_words=stop_words,
    )

    _, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    logger.info(f"Number of elements assigned to each cluster: {cluster_sizes}")
    # calculate the t-SNE
    logger.info(f"N samples: {X_tfidf.shape[0]}")
    if X_tfidf.shape[0] <= perplexity.value:
        logger.warning(f"Perplexity is greater than the number of samples, setting to {X_tfidf.shape[0] - 1}")
        perplexity.value = X_tfidf.shape[0] - 1

    X_embedded = TSNE(
        n_components=2,
        learning_rate='auto',
        init='random',
        perplexity=perplexity.value,
    ).fit_transform(X_tfidf.toarray())

    logger.info(X_embedded.shape)
    tsne.object = pl.plot_tsne(X_embedded, kmeans)

    # LSA
    logger.info(f"data shape: {X_tfidf.shape}")
    if X_tfidf.shape[1] <= n_components_LSA.value:
        logger.warning(f"Number of components is greater than the number of features, setting to {X_tfidf.shape[1] - 1}")
        n_components_LSA.value = X_tfidf.shape[1] - 1
    lsa = make_pipeline(
        TruncatedSVD(n_components=n_components_LSA.value),
        Normalizer(copy=False),
    )
    t0 = time()
    X_lsa = lsa.fit_transform(X_tfidf)
    explained_variance = lsa[0].explained_variance_ratio_.sum()

    logger.info(f"LSA done in {time() - t0:.3f} s")
    logger.info(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

    # K-means again, this time in the LSA space
    kmeans = KMeans(
        n_clusters=n_clusters.value,
        max_iter=100,
        n_init=1,
    )

    report_title = "KMeans\nwith LSA on tf-idf vectors"
    evaluation, evaluation_std = fit_and_evaluate(kmeans, X_lsa, labels, name=report_title)

    # Combine values and standard deviations into a single DataFrame. The
    # "estimator" entry is a label, not a metric, so it is left out. Building
    # the frame from the kept rows avoids assigning into a filtered slice.
    metric_names = [key for key in evaluation if "estimator" not in key]
    df = pd.DataFrame({
        "Metric": metric_names,
        "Value": [evaluation[key] for key in metric_names],
        "Std": [evaluation_std[key] for key in metric_names],
    })
    # round the values to 4 decimal places
    df["Value"] = pd.to_numeric(df["Value"], errors="coerce").round(4)
    df["Std"] = pd.to_numeric(df["Std"], errors="coerce").round(4)
    df = df.set_index("Metric")
    # change index name
    df.index.name = report_title
    # Display the DataFrame
    reports.value = df

    # Map the LSA-space centroids back to term space and rank terms per cluster
    original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()

    # Collect cluster terms into a dictionary
    cluster_terms = {}
    for i in range(kmeans.n_clusters):
        cluster_terms[f"Cluster {i}"] = [terms[ind] for ind in order_centroids[i, :n_important_words.value]]

    # Convert the dictionary to a DataFrame (one row per cluster). The local
    # name deliberately differs from the module-level df_clusters table.
    top_terms = pd.DataFrame.from_dict(cluster_terms, orient="index")
    vip_words.value = top_terms


# Run the whole clustering workflow on click; the event object is ignored.
button_cluster.on_click(
    lambda event: cluster()
)

Watcher(inst=Button(button_type='primary', name='Cluster', width=200), cls=<class 'panel.widgets.button.Button'>, fn=<function <lambda> at 0x7a4a1d38f9c0>, mode='args', onlychanged=False, parameter_names=('clicks',), what='value', queued=False, precedence=0)

In [26]:
# Cell output: render the cluster tab layout.
cluster_tab

BokehModel(combine_events=True, render_bundle={'docs_json': {'63d4b5f4-6799-4421-98b2-c3f00f8c15f2': {'version…