# Rendering comparative gene sketches

----

Comparative visualization of gene structures based on amino acid matches to unannotated genomes.

In [5]:
# Import necessary libraries

import pandas as pd
import altair as alt
import numpy as np

Provide data for rendering in the cell below. Make sure that the URL is correctly formed, with double slashes after `https:` (earlier versions of Galaxy had a bug where copying the link produced malformed URLs with just one slash).

In [6]:
# Paste the link to the Galaxy dataset here.
# The URL is handed to pd.read_csv as a tab-separated file, so it should
# request the tabular representation (`to_ext=tabular`).

dataset_url = 'https://usegalaxy.org/api/datasets/f9cad7b01a472135a8abd43f91f8d3cf/display?to_ext=tabular'

In [7]:
# Read data
# The file has no header row, so the column names are supplied explicitly.
# Galaxy places "." symbols in empty fields. `na_values` makes pandas treat
# them as NaN while parsing, so columns that mix numbers and "." are still
# read as numeric. (The previous `.replace('.', np.NaN)` fails on NumPy >= 2.0,
# where the `np.NaN` alias no longer exists.)

data = pd.read_csv(
    dataset_url,
    sep='\t',
    names=[
        'genome',
        'chr',
        'start',
        'end',
        'orf',
        'frame',
        'strand',
        'midpoint',
        'matchStart',
        'matchEnd',
        'exon',
        'id',
    ],
    na_values=['.'],
)

In [8]:
# Create unique identifier for grouping of images: the genome name and the
# chromosome/scaffold name joined with no separator
# (e.g. 'aGasCar1.fa' + 'scaffold_1' -> 'aGasCar1.fascaffold_1')

data['genome_chr']=data['genome']+data['chr']

In [9]:
# Extract the clade identifier from the genome name: it is taken to be the
# first character of the name (e.g. 'aGasCar1.fa' -> 'a')
# This would only work for VGP genomes

data['clade']=data['genome'].str[0]

In [10]:
# Take a peek at the data (first rows of the table)

data.head()

Unnamed: 0,genome,chr,start,end,orf,frame,strand,midpoint,matchStart,matchEnd,exon,id,genome_chr,clade
0,aGasCar1.fa,scaffold_1,552,684,aGasCar1.fascaffold_1_ORF.2,1,+,,,,,,aGasCar1.fascaffold_1,a
1,aGasCar1.fa,scaffold_1,762,1029,aGasCar1.fascaffold_1_ORF.3,1,+,,,,,,aGasCar1.fascaffold_1,a
2,aGasCar1.fa,scaffold_1,8880,9045,aGasCar1.fascaffold_1_ORF.31,1,+,8970.0,8904.0,9036.0,xbp-1u-p1,75.6,aGasCar1.fascaffold_1,a
3,aGasCar1.fa,scaffold_1,8880,9045,aGasCar1.fascaffold_1_ORF.31,1,+,8923.0,8904.0,8943.0,xbp-1s-p12,85.7,aGasCar1.fascaffold_1,a
4,aGasCar1.fa,scaffold_1,9270,9432,aGasCar1.fascaffold_1_ORF.32,1,+,,,,,,aGasCar1.fascaffold_1,a


In [11]:
# Function for rendering genome plots for individual species

def chart(df, title=None):
    """Render ORFs and their matches for one genome/chromosome group.

    The result is a layered chart made of:

    * thin horizontal rules spanning each ORF (``start``..``end``), one row
      per reading ``frame``; blue when ``strand`` is '-', red otherwise;
    * arrow markers at both ends of every ORF, rotated according to strand;
    * thick rules spanning ``matchStart``..``matchEnd`` coloured by ``id``;
    * the ``exon`` label drawn at ``midpoint``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to draw. The columns referenced are ``start``, ``end``,
        ``frame``, ``strand``, ``orf``, ``genome``, ``chr``, ``id``,
        ``matchStart``, ``matchEnd``, ``midpoint`` and ``exon``
        (plus ``genome_chr`` when ``title`` is not given).
    title : str, optional
        Chart title. When omitted, it is built from the distinct
        ``genome_chr`` values found in ``df``.

    Returns
    -------
    The layered, interactive Altair chart (800 x 100).
    """
    # Everything is derived from `df` itself, so the function does not depend
    # on variables that happen to exist in the calling cell.
    if title is None:
        title = ', '.join(str(name) for name in df['genome_chr'].unique())

    # Pad the x axis by 500 on each side of the plotted ORFs.
    x_domain = [df['start'].min() - 500, df['end'].max() + 500]

    orfs = alt.Chart(df).mark_rule(strokeWidth=2).encode(
        x=alt.X('start:Q',
                scale=alt.Scale(domain=x_domain),
                title=None
                ),
        x2=alt.X2('end:Q'),
        y=alt.Y('frame:N', title=None),
        color=alt.condition(
            alt.datum['strand'] == '-',
            alt.value('blue'),
            alt.value('red'),
        ),
        tooltip=[
            alt.Tooltip('orf', title='ORF id'),
            alt.Tooltip('genome', title='Genome'),
            alt.Tooltip('chr', title='Chromosome'),
            alt.Tooltip('id', title='% identity')
        ]
    ).properties(
        title=title)

    # Arrow marker at the ORF start; the angle flips with the strand.
    arrow_start = orfs.mark_point(shape="arrow", strokeWidth=2).encode(
        x=alt.X('start'),
        angle=alt.condition(
            alt.datum['strand'] == '-',
            alt.value(270),
            alt.value(90),
        ),
        color=alt.condition(
            alt.datum['strand'] == '-',
            alt.value('blue'),
            alt.value('red'),
        )
    )

    # Matched region drawn as a thick bar, coloured by the `id` column.
    matches = orfs.mark_rule(strokeWidth=10, opacity=.9).encode(
        x=alt.X('matchStart:Q'),
        x2=alt.X2('matchEnd:Q'),
        color=alt.Color('id:Q', scale=alt.Scale(scheme="blueorange"))
    )

    # Exon label placed slightly above the midpoint of the match.
    text = orfs.mark_text(yOffset=-10).encode(
        x=alt.X('midpoint:Q'),
        text=alt.Text('exon'),
        color=alt.value('black'),
    )

    # Same arrow marker, repositioned at the ORF end.
    arrow_end = arrow_start.encode(
        x='end:Q'
    )

    layered = orfs + arrow_start + arrow_end + matches + text
    return layered.properties(height=100, width=800).interactive()

In [12]:
# Build one chart per genome/chromosome combination, visiting genomes in
# clade order, then stack all charts vertically with a shared y scale.
charts = []
for species in data.sort_values(by=['clade'])['genome'].unique():
    of_species = data['genome'] == species
    for group in data.loc[of_species, 'genome_chr'].unique():
        src = data.loc[data['genome_chr'] == group]
        charts.append(chart(src))

alt.vconcat(*charts).resolve_scale(y='shared').configure_title(
    anchor='start',
    fontSize=10,
)

In [13]:
# Keep only the rows that have a value in the `exon` column
exons = data[data['exon'].notna()]

In [14]:
# Summary heatmap of matches: one row per genome/chromosome, one column per
# gene segment, each cell coloured by the `id` (% identity) value.

alt.Chart(exons).mark_rect().encode(
    x=alt.X('exon:N', title="gene segment"),
    y=alt.Y(
        'genome_chr:N',
        title='Species/chromosome',
        # Row order follows the clade-sorted full dataset
        sort=data.sort_values(by=['clade'])['genome_chr'].unique(),
    ),
    color=alt.Color(
        'id:Q',
        title='%AA identity',
        scale=alt.Scale(scheme="blueorange"),
    ),
    tooltip=[alt.Tooltip('id', title='% identity')],
)