# Sequences

This section provides an overview of the DNA sequences collected by various observatories within the EMO-BON network. It includes details about the types of sequences, their sources, and associated metadata.

In [None]:
from conneg_functions import execute_to_df, generate_sparql
import plotly.express as px
import plotly.graph_objects as go
from pandas import DataFrame
from IPython.display import display, HTML
import pandas as pd

## Sequence Sources

The table below lists the different sources from which the sequences originate.

In [None]:
# Run the basic sequences query; the result is treated as one row per source (partof).
df_partof: DataFrame = execute_to_df('sequences_basic.sparql')

# Report the number of rows first, then render the frame itself.
source_count = len(df_partof)
print(f"Found {source_count} different sequence sources:")
display(df_partof)

## Taxonomic Annotation Summary

Summary of taxonomic annotations grouped by rank and source, showing the count of annotations.

In [None]:
# Run the species summary query.
df_taxonomy_summary: DataFrame = execute_to_df('sequences_species.sparql')

# Static <table> markup; the id is what the DataTables initialiser below selects on.
table_html = df_taxonomy_summary.to_html(
    table_id='taxonomy_summary_table',
    classes='display',
    index=False,
)

# Assemble the fragment line by line: CDN assets, styling, the table itself,
# then the DataTables bootstrap (CSV/Excel buttons, 10 rows per page).
# The leading and trailing empty strings reproduce the newline that opens
# and closes the fragment.
html_code = "\n".join([
    "",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css'>",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/buttons/2.2.2/css/buttons.dataTables.min.css'>",
    "<script src='https://code.jquery.com/jquery-3.5.1.js'></script>",
    "<script src='https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/dataTables.buttons.min.js'></script>",
    "<script src='https://cdnjs.cloudflare.com/ajax/libs/jszip/3.1.3/jszip.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/buttons.html5.min.js'></script>",
    "<style>",
    "  table.display { width: 100%; }",
    "  div.dt-buttons { margin-bottom: 10px; }",
    "</style>",
    table_html,
    "<script>",
    "$(document).ready(function() {",
    "    $('#taxonomy_summary_table').DataTable({",
    "        dom: 'Bfrtip',",
    "        buttons: ['csv', 'excel'],",
    "        pageLength: 10,",
    "        searching: true,",
    "        ordering: true",
    "    });",
    "});",
    "</script>",
    "",
])

# Render the fragment as rich output.
display(HTML(html_code))

### Visualization: Annotation Counts by Rank

In [None]:
# Bar chart of the total annotation count per taxonomic rank.
# Skipped with a message when the needed columns are missing or no usable
# (numeric) counts remain.
if {'rank', 'annotationCount'}.issubset(df_taxonomy_summary.columns):
    rank_summary = (
        df_taxonomy_summary
        # Query results may arrive as strings; summing strings would concatenate
        # them instead of adding. Coerce on a derived frame so the summary frame
        # shown earlier is left untouched.
        .assign(annotationCount=lambda frame: pd.to_numeric(frame['annotationCount'], errors='coerce'))
        .dropna(subset=['annotationCount'])
        .groupby('rank')['annotationCount']
        .sum()
        .reset_index()
        .sort_values('annotationCount', ascending=False)
    )

    if rank_summary.empty:
        # Nothing numeric to plot: report it rather than drawing an empty chart.
        print("No rank or annotationCount data available for visualization")
    else:
        fig = px.bar(rank_summary, x='rank', y='annotationCount',
                     title='Taxonomic Annotations by Rank',
                     labels={'rank': 'Taxonomic Rank', 'annotationCount': 'Number of Annotations'},
                     template='plotly_white')
        fig.update_traces(marker_color='#1f77b4')
        fig.update_layout(xaxis_tickangle=-45)
        display(HTML(fig.to_html(include_plotlyjs='cdn', full_html=False)))
else:
    print("No rank or annotationCount data available for visualization")

In [None]:
# Bar chart of the total annotation count per source (partof).
# Skipped with a message when the needed columns are missing or no usable
# (numeric) counts remain.
if {'partof', 'annotationCount'}.issubset(df_taxonomy_summary.columns):
    partof_summary = (
        df_taxonomy_summary
        # Query results may arrive as strings; summing strings would concatenate
        # them instead of adding. Coerce on a derived frame so the summary frame
        # shown earlier is left untouched.
        .assign(annotationCount=lambda frame: pd.to_numeric(frame['annotationCount'], errors='coerce'))
        .dropna(subset=['annotationCount'])
        .groupby('partof')['annotationCount']
        .sum()
        .reset_index()
        .sort_values('annotationCount', ascending=False)
    )

    if partof_summary.empty:
        # Nothing numeric to plot: report it rather than drawing an empty chart.
        print("No partof or annotationCount data available for visualization")
    else:
        fig = px.bar(partof_summary, x='partof', y='annotationCount',
                     title='Taxonomic Annotations by Source',
                     labels={'partof': 'Source', 'annotationCount': 'Number of Annotations'},
                     template='plotly_white')
        fig.update_traces(marker_color='#2ca02c')
        fig.update_layout(xaxis_tickangle=-45)
        display(HTML(fig.to_html(include_plotlyjs='cdn', full_html=False)))
else:
    print("No partof or annotationCount data available for visualization")

## Detailed Taxonomic Annotations

Detailed view of all taxonomic annotations including RNA expression values (lsuRNA, ssuRNA) and taxonomic information.

In [None]:
# Run the detailed annotations query.
df_annotations: DataFrame = execute_to_df('sequences_annotations.sparql')

# Static <table> markup; the id is what the DataTables initialiser below selects on.
table_html = df_annotations.to_html(
    table_id='annotations_table',
    classes='display',
    index=False,
)

# Assemble the fragment line by line: CDN assets, styling, the table itself,
# then the DataTables bootstrap (CSV/Excel buttons, 10 rows per page).
# The leading and trailing empty strings reproduce the newline that opens
# and closes the fragment.
html_code = "\n".join([
    "",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css'>",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/buttons/2.2.2/css/buttons.dataTables.min.css'>",
    "<script src='https://code.jquery.com/jquery-3.5.1.js'></script>",
    "<script src='https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/dataTables.buttons.min.js'></script>",
    "<script src='https://cdnjs.cloudflare.com/ajax/libs/jszip/3.1.3/jszip.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/buttons.html5.min.js'></script>",
    "<style>",
    "  table.display { width: 100%; }",
    "  div.dt-buttons { margin-bottom: 10px; }",
    "</style>",
    table_html,
    "<script>",
    "$(document).ready(function() {",
    "    $('#annotations_table').DataTable({",
    "        dom: 'Bfrtip',",
    "        buttons: ['csv', 'excel'],",
    "        pageLength: 10,",
    "        searching: true,",
    "        ordering: true",
    "    });",
    "});",
    "</script>",
    "",
])

# Render the fragment as rich output.
display(HTML(html_code))

### Visualization: Distribution of Taxonomic Ranks in Detailed Annotations

In [None]:
# Pie chart of how often each taxonRank value occurs in the detailed annotations.
if 'taxonRank' not in df_annotations.columns:
    print("No taxonRank data available for visualization")
else:
    # Tally occurrences per rank and flatten to a two-column frame
    # (taxonRank, count) for plotly.
    rank_counts = (
        df_annotations['taxonRank']
        .value_counts()
        .rename_axis('taxonRank')
        .reset_index(name='count')
    )

    fig = px.pie(
        rank_counts,
        names='taxonRank',
        values='count',
        title='Distribution of Taxonomic Ranks',
        template='plotly_white',
    )
    display(HTML(fig.to_html(include_plotlyjs='cdn', full_html=False)))

In [None]:
# Scatter plot of lsuRNA against ssuRNA for annotations that carry both values.
# Each stage that leaves nothing to plot prints its own message instead of
# drawing an empty figure.
rna_columns = ['lsuRNA', 'ssuRNA']

if set(rna_columns).issubset(df_annotations.columns):
    # Keep only the rows where both RNA values are present.
    df_rna = df_annotations.dropna(subset=rna_columns)

    if df_rna.empty:
        print("No RNA expression data available for visualization")
    else:
        # Coerce to numbers via assign(), which builds a new frame; writing
        # columns onto the dropna() result can raise SettingWithCopyWarning.
        # Values that cannot be parsed become NaN and are dropped.
        df_rna = (
            df_rna
            .assign(**{column: pd.to_numeric(df_rna[column], errors='coerce') for column in rna_columns})
            .dropna(subset=rna_columns)
        )

        if df_rna.empty:
            print("No numeric RNA expression data available for visualization")
        else:
            # Only offer hover fields that are actually present; plotly raises
            # when hover_data names a column the frame does not have.
            hover_columns = [
                column for column in ('scientificName', 'taxonRank')
                if column in df_rna.columns
            ]
            fig = px.scatter(df_rna, x='lsuRNA', y='ssuRNA',
                             hover_data=hover_columns or None,
                             title='LSU RNA vs SSU RNA Expression',
                             labels={'lsuRNA': 'LSU RNA', 'ssuRNA': 'SSU RNA'},
                             template='plotly_white')
            fig.update_traces(marker=dict(size=8, opacity=0.6))
            display(HTML(fig.to_html(include_plotlyjs='cdn', full_html=False)))
else:
    print("RNA expression columns not found in data")