# Sequences

This section provides an overview of the DNA sequences collected by various observatories within the EMO-BON network. It includes details about the types of sequences, their sources, and associated metadata.

In [None]:
from conneg_functions import execute_to_df, generate_sparql
import plotly.express as px
from pandas import DataFrame
from IPython.display import display, HTML

## Basic Sequence Information

The following table displays basic information about sequences in the EMO-BON knowledge graph.

In [None]:
# Run the basic-sequence SPARQL query and keep the result for this section.
df_sequences_basic: DataFrame = execute_to_df('sequences_basic.sparql')

# Render the frame as a plain HTML table; the id is the hook the DataTables
# initialiser below uses to find it.
table_html = df_sequences_basic.to_html(
    index=False,
    classes='display',
    table_id='sequences_table',
)

# Assemble the widget line by line: CDN assets first, then the table, then the
# script that upgrades it to a searchable/sortable DataTable with CSV and Excel
# export buttons. The empty first and last entries keep the leading and
# trailing newline around the markup.
html_code = "\n".join([
    "",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css'>",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/buttons/2.2.2/css/buttons.dataTables.min.css'>",
    "<script src='https://code.jquery.com/jquery-3.5.1.js'></script>",
    "<script src='https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/dataTables.buttons.min.js'></script>",
    "<script src='https://cdnjs.cloudflare.com/ajax/libs/jszip/3.1.3/jszip.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/buttons.html5.min.js'></script>",
    "<style>",
    "  table.display { width: 100%; }",
    "  div.dt-buttons { margin-bottom: 10px; }",
    "</style>",
    table_html,
    "<script>",
    "$(document).ready(function() {",
    "    $('#sequences_table').DataTable({",
    "        dom: 'Bfrtip',",
    "        buttons: ['csv', 'excel'],",
    "        pageLength: 10,",
    "        searching: true,",
    "        ordering: true",
    "    });",
    "});",
    "</script>",
    "",
])

# Emit the assembled markup as the cell's rich output.
display(HTML(html_code))

## Species Overview

The following table displays taxonomic information about species identified from sequences.

In [None]:
# Run the species/taxonomy SPARQL query; later cells in this notebook read
# df_species for the distribution charts.
df_species: DataFrame = execute_to_df('sequences_species.sparql')

# Render the frame as a plain HTML table; the id is the hook the DataTables
# initialiser below uses to find it.
table_html = df_species.to_html(
    index=False,
    classes='display',
    table_id='species_table',
)

# Assemble the widget line by line: CDN assets first, then the table, then the
# script that upgrades it to a searchable/sortable DataTable with CSV and Excel
# export buttons. The empty first and last entries keep the leading and
# trailing newline around the markup.
html_code = "\n".join([
    "",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css'>",
    "<link rel='stylesheet' type='text/css' href='https://cdn.datatables.net/buttons/2.2.2/css/buttons.dataTables.min.css'>",
    "<script src='https://code.jquery.com/jquery-3.5.1.js'></script>",
    "<script src='https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/dataTables.buttons.min.js'></script>",
    "<script src='https://cdnjs.cloudflare.com/ajax/libs/jszip/3.1.3/jszip.min.js'></script>",
    "<script src='https://cdn.datatables.net/buttons/2.2.2/js/buttons.html5.min.js'></script>",
    "<style>",
    "  table.display { width: 100%; }",
    "  div.dt-buttons { margin-bottom: 10px; }",
    "</style>",
    table_html,
    "<script>",
    "$(document).ready(function() {",
    "    $('#species_table').DataTable({",
    "        dom: 'Bfrtip',",
    "        buttons: ['csv', 'excel'],",
    "        pageLength: 10,",
    "        searching: true,",
    "        ordering: true",
    "    });",
    "});",
    "</script>",
    "",
])

# Emit the assembled markup as the cell's rich output.
display(HTML(html_code))

## Species Distribution Visualization

The bar charts below show how the records in the species table are distributed across taxonomic ranks: kingdom, phylum (top 10), and family (top 15).

In [None]:
# Bar chart of how many rows of df_species fall under each kingdom.
# Skip the chart when the column is absent or holds only missing values.
if 'kingdom' not in df_species.columns or df_species['kingdom'].isna().all():
    print("No kingdom data available for visualization")
else:
    # Rows per kingdom, largest group first.
    kingdom_counts = (
        df_species
        .groupby('kingdom')
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
    )

    fig = px.bar(
        kingdom_counts,
        x='kingdom',
        y='count',
        title='Number of Sequences by Kingdom',
        labels={'kingdom': 'Kingdom', 'count': 'Number of Sequences'},
        template='plotly_white',
    )
    fig.update_traces(marker_color='#2ca02c')

    # Embed as an HTML fragment that pulls plotly.js from the CDN.
    display(HTML(fig.to_html(include_plotlyjs='cdn', full_html=False)))

In [None]:
# Bar chart of the ten phyla with the most rows in df_species.
# Skip the chart when the column is absent or holds only missing values.
if 'phylum' not in df_species.columns or df_species['phylum'].isna().all():
    print("No phylum data available for visualization")
else:
    # Rows per phylum, largest group first, truncated to the top 10.
    phylum_counts = (
        df_species
        .groupby('phylum')
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
        .head(10)
    )

    fig = px.bar(
        phylum_counts,
        x='phylum',
        y='count',
        title='Top 10 Phyla by Number of Sequences',
        labels={'phylum': 'Phylum', 'count': 'Number of Sequences'},
        template='plotly_white',
    )
    fig.update_traces(marker_color='#ff7f0e')
    # Slant the tick labels so longer phylum names do not collide.
    fig.update_layout(xaxis_tickangle=-45)

    # Embed as an HTML fragment that pulls plotly.js from the CDN.
    display(HTML(fig.to_html(include_plotlyjs='cdn', full_html=False)))

In [None]:
# Bar chart of the fifteen families with the most rows in df_species.
# Skip the chart when the column is absent or holds only missing values.
if 'family' not in df_species.columns or df_species['family'].isna().all():
    print("No family data available for visualization")
else:
    # Rows per family, largest group first, truncated to the top 15.
    family_counts = (
        df_species
        .groupby('family')
        .size()
        .reset_index(name='count')
        .sort_values('count', ascending=False)
        .head(15)
    )

    fig = px.bar(
        family_counts,
        x='family',
        y='count',
        title='Top 15 Families by Number of Sequences',
        labels={'family': 'Family', 'count': 'Number of Sequences'},
        template='plotly_white',
    )
    fig.update_traces(marker_color='#d62728')
    # Slant the tick labels so longer family names do not collide.
    fig.update_layout(xaxis_tickangle=-45)

    # Embed as an HTML fragment that pulls plotly.js from the CDN.
    display(HTML(fig.to_html(include_plotlyjs='cdn', full_html=False)))