# TPM data exploration with multiple interactions

In [1]:
import altair as alt
import numpy as np
import pandas as pd
import seaborn as sns

import sys
sys.path.append('../') # use modules in main directory

import genbank_utils as gu
import get_top_gene_set as gtgs

## Load expression data
Use the same data loading function from `get_top_gene_set.py`

In [2]:
# --- input files -----------------------------------------------------------
# TPM matrix, sample -> condition mapping and the GenBank annotation file
data_file = '../data/extract_TPM_counts.tsv'
sample2condition_file = '../data/sample2condition.txt'
gb_file = '../data/5GB1c_sequence.gb'

# --- optional include-lists --------------------------------------------------
# NOTE(review): None presumably means "do not filter samples" -- confirm in
# get_top_gene_set.load_data
sample_file = None #'config/samples_to_include.txt'
condition_file = '../config/conditions_to_include.txt'

# load everything with the shared loader from get_top_gene_set.py
(df, sample2condition, samples,
 conditions, pos_feats, neg_feats) = gtgs.load_data(
    data_file,
    sample2condition_file,
    sample_file,
    condition_file,
    gb_file,
)


In [3]:
# preview the loaded matrix: one row per locus, annotation columns followed by per-sample *_tpm columns
df.head()

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,gene_len,...,5GB1_pA9_red_tpm,5GB1_pA9_yellow_tpm,5GB1C-5G-La-BR1_tpm,5GB1C-5G-La-BR2_tpm,5GB1C-5G-N-BR1_tpm,5GB1C-5G-N-BR2_tpm,5GB1C-JG15-La-BR1_tpm,5GB1C-JG15-La-BR2_tpm,5GB1C-JG15-N-BR1_tpm,5GB1C-JG15-N-BR2_tpm
0,EQU24_RS00005,chromosomal replication initiator protein DnaA,CDS,dnaA,NZ_CP035467.1,0,1317,Derived by automated computational analysis us...,MSALWNNCLAKLENEISSSEFSTWIRPLQAIETDGQIKLLAPNRFV...,1318,...,38.557373,38.810668,37.444214,40.246006,40.100118,33.432274,39.880174,38.355431,30.247582,41.248441
1,EQU24_RS00010,DNA polymerase III subunit beta,CDS,,NZ_CP035467.1,1502,2603,Derived by automated computational analysis us...,MKYIINREQLLVPLQQIVSVIEKRQTMPILSNVLMVFRENTLVMTG...,1102,...,52.552767,52.461746,42.676553,49.210083,46.798476,48.142385,45.465136,46.498139,37.152951,52.90241
2,EQU24_RS00015,DNA replication/repair protein RecF,CDS,recF,NZ_CP035467.1,3060,4140,Derived by automated computational analysis us...,MSLQKLDIFNVRNIRQASLQPSPGLNLIYGANASGKSSVLEAIFIL...,1081,...,31.350991,34.914128,21.479309,24.204682,22.171104,22.006566,22.658157,22.753325,19.407103,29.834124
3,EQU24_RS00020,DNA topoisomerase (ATP-hydrolyzing) subunit B,CDS,gyrB,NZ_CP035467.1,4185,6600,Derived by automated computational analysis us...,MSENIKQYDSTNIQVLKGLDAVRKRPGMYIGDTDDGTGLHHMVFEV...,2416,...,74.848501,80.850761,54.959319,64.911376,59.653059,64.648318,69.119079,65.643179,57.590223,68.306759
4,EQU24_RS00025,hypothetical protein,CDS,,NZ_CP035467.1,6825,7062,Derived by automated computational analysis us...,VKTTKYFLTTRMRPDREIIKDEWIQYVVRFPENEHIQFDGRIRRWA...,238,...,50.324948,49.349547,34.539657,36.521074,37.789611,39.358066,38.992158,35.870964,41.462392,40.227192


In [4]:
# build a locus_tag -> metadata lookup from the GenBank file
# (the entries are later read for 'gene_symbol', 'product', 'type' and 'strand')
feat2meta = gu.get_feat2meta_dict(gb_file)
# spot-check a single locus
feat2meta['EQU24_RS19310']

{'gene_symbol': 'pmoA',
 'product': 'methane monooxygenase/ammonia monooxygenase subunit A',
 'type': 'CDS',
 'strand': -1}

In [5]:
# lookup table: locus_tag -> start coordinate on the genome
loc2pos = dict(zip(df['locus_tag'], df['start_coord']))
# spot-check a single locus
loc2pos['EQU24_RS00025']

6825

## Transform the data frame for mean TPM calculations
Pivot the data matrix and combine samples that belong to the same experimental condition. Values below are averages across samples.

In [6]:
# name of the column in the loaded TPM matrix that holds the unique gene ids
LOCUS_ID_COL = 'locus_tag'
# every gene for which expression was measured, in the row order of df
LOCI = df[LOCUS_ID_COL].tolist()

In [7]:
# mean TPM per experimental condition: the result has an 'exp_condition' column
# plus one column per locus (see the table displayed below)
df_means = gtgs.get_average_tpm_by_condition(df,samples,conditions,sample2condition,LOCI)
df_means

locus_tag,exp_condition,EQU24_RS00005,EQU24_RS00010,EQU24_RS00015,EQU24_RS00020,EQU24_RS00025,EQU24_RS00030,EQU24_RS00035,EQU24_RS00040,EQU24_RS00045,...,EQU24_RS22110,EQU24_RS22115,EQU24_RS22120,EQU24_RS22125,EQU24_RS22130,EQU24_RS22135,EQU24_RS22140,EQU24_RS22145,EQU24_RS22150,EQU24_RS22155
0,MeOH,23.333155,18.915775,18.453916,18.267805,16.960643,12.377795,43.815536,9.67095,7.302145,...,1298.257682,15.624619,20.208066,26.004364,20.960234,28.719983,93.616437,161.528124,496.990651,280.344047
1,NO3_lowO2_slow_growth,32.050358,43.65676,21.351623,62.267687,41.684925,31.921455,57.849768,16.885694,14.926147,...,6497.868109,26.273485,28.945133,23.525245,26.432667,35.167264,178.996199,164.083806,433.438735,493.895115
2,NoCu,44.348687,59.62936,28.268717,56.818319,49.839406,38.394652,81.530362,40.501969,36.5765,...,8345.785345,43.065124,34.380565,44.419579,34.601933,65.339879,253.608495,273.284694,731.05219,1087.621126
3,NoLanthanum,33.444023,43.689839,23.172675,57.297047,42.367072,41.941657,102.513601,30.226787,19.462312,...,5085.637409,16.423284,35.588138,44.623117,43.201743,21.92726,109.78333,67.277718,211.575175,328.943746
4,WithLanthanum,35.462185,41.792237,20.644554,57.130166,34.258335,46.201637,110.721781,31.813805,19.438086,...,3942.957792,15.972203,34.318829,49.216725,40.000662,21.220809,98.10061,73.116973,194.389586,319.998959
5,highCu,47.861477,79.10949,33.534043,73.330408,48.662214,33.986359,92.999818,51.950784,50.370579,...,8132.547467,48.894308,35.60873,46.1093,30.125207,89.71015,342.981435,386.493127,1021.453762,1692.401154
6,highO2_slow_growth,64.784508,99.00297,44.856281,78.997757,77.842263,56.626268,97.721756,35.735531,28.808125,...,3468.582202,40.548782,48.532405,37.1395,38.204218,52.033315,220.196691,244.139008,505.42761,561.847119
7,lowCH4,30.829331,33.532522,18.49116,42.963648,31.643505,24.308334,74.948663,18.6019,12.935098,...,7477.339715,18.356915,24.390308,20.231568,19.721043,60.397912,226.909132,297.029289,874.637567,506.211825
8,lowCu,42.973556,61.209155,28.828713,61.573321,50.966799,31.319574,75.047593,40.03867,35.331019,...,7157.344557,43.386082,33.574108,36.872718,31.899782,66.743497,293.599291,313.741841,843.607251,1123.669681
9,lowO2_fast_growth,35.73619,43.159066,27.3258,48.976066,33.463183,26.694205,72.505275,15.430765,12.595934,...,9584.028559,26.951394,26.808972,31.889009,23.88239,72.20795,334.033049,376.513322,1178.068431,1183.180057


#### Add relevant metadata to the expression df

In [8]:
# melt df to long format for the Altair parallel coordinates viz:
# one row per (exp_condition, locus_tag) pair with its mean expression.
# var_name is given explicitly so the id column is always called 'locus_tag',
# even if df_means.columns has lost its name (e.g. after a round-trip to disk).
alt_df = pd.melt(
    df_means,
    id_vars=['exp_condition'],
    value_vars=LOCI,
    var_name='locus_tag',
    value_name='mean_exp',
)

# add metadata columns from dict (a locus missing from feat2meta raises KeyError)
alt_df['gene_symbol'] = alt_df['locus_tag'].map(lambda loc: feat2meta[loc]['gene_symbol'])
alt_df['product'] = alt_df['locus_tag'].map(lambda loc: feat2meta[loc]['product'])
alt_df['type'] = alt_df['locus_tag'].map(lambda loc: feat2meta[loc]['type'])
alt_df['strand'] = alt_df['locus_tag'].map(lambda loc: feat2meta[loc]['strand'])
# legend / tooltip label; it depends only on the locus, so build it element-wise
# from the locus tag instead of constructing a Series per row with apply(axis=1)
alt_df['desc_string'] = alt_df['locus_tag'].map(
    lambda loc: f"{loc}|{feat2meta[loc]['gene_symbol']}|{feat2meta[loc]['product']}"
)
# genome start coordinate of each locus (a locus missing from loc2pos raises KeyError)
alt_df['pos'] = alt_df['locus_tag'].map(lambda loc: loc2pos[loc])

# impose a specific x-axis sort order on the df
# choose custom exp_condition order!
list_ordering = ['uMax','lowCH4','NoCu','lowCu','medCu','highCu','NO3_lowO2_slow_growth','highO2_slow_growth','lowO2_fast_growth','MeOH','NoLanthanum','WithLanthanum'] 

# fail early with a readable message: a condition missing from list_ordering
# would otherwise surface as an opaque "x is not in list" error below
unknown_conditions = sorted(set(alt_df['exp_condition']) - set(list_ordering), key=str)
if unknown_conditions:
    raise ValueError(
        f"exp_condition value(s) not present in list_ordering: {unknown_conditions}"
    )

alt_df["exp_condition_order"] = pd.Categorical(alt_df["exp_condition"], categories=list_ordering)
# integer rank of each condition within list_ordering
condition_rank = {cond: rank for rank, cond in enumerate(list_ordering)}
alt_df['exp_id'] = alt_df['exp_condition'].map(condition_rank)
alt_df.head()

Unnamed: 0,exp_condition,locus_tag,mean_exp,gene_symbol,product,type,strand,desc_string,pos,exp_condition_order,exp_id
0,MeOH,EQU24_RS00005,23.333155,dnaA,chromosomal replication initiator protein DnaA,CDS,1,EQU24_RS00005|dnaA|chromosomal replication ini...,0,MeOH,9
1,NO3_lowO2_slow_growth,EQU24_RS00005,32.050358,dnaA,chromosomal replication initiator protein DnaA,CDS,1,EQU24_RS00005|dnaA|chromosomal replication ini...,0,NO3_lowO2_slow_growth,6
2,NoCu,EQU24_RS00005,44.348687,dnaA,chromosomal replication initiator protein DnaA,CDS,1,EQU24_RS00005|dnaA|chromosomal replication ini...,0,NoCu,2
3,NoLanthanum,EQU24_RS00005,33.444023,dnaA,chromosomal replication initiator protein DnaA,CDS,1,EQU24_RS00005|dnaA|chromosomal replication ini...,0,NoLanthanum,10
4,WithLanthanum,EQU24_RS00005,35.462185,dnaA,chromosomal replication initiator protein DnaA,CDS,1,EQU24_RS00005|dnaA|chromosomal replication ini...,0,WithLanthanum,11


### For a given set of top genes, visualize the expression across conditions
As an example, use 3 sets of top loci: 1%, 3% and 10%

In [9]:
# loci returned by get_top_n_perc_by_condition for the 1%, 3% and 10% cut-offs
top_locs_1_perc, top_locs_3_perc, top_locs_10_perc = (
    gtgs.get_top_n_perc_by_condition(df_means, LOCI, perc)
    for perc in (1, 3, 10)
)

# matching long-format subsets of alt_df, one per cut-off
df_1, df_3, df_10 = (
    alt_df[alt_df['locus_tag'].isin(top_locs)]
    for top_locs in (top_locs_1_perc, top_locs_3_perc, top_locs_10_perc)
)

# Interactive Visualization!
This plot shows individual gene expression profiles across the experimental conditions used in this analysis.

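Instructions:

* Hover over lines to show gene info.
* Click colors in the legend to highlight specific genes.
* Shift-click to select multiple genes from the legend.
* Drag across the genome-position strip (upper chart) to highlight the genes located in that region.
* Pan and zoom with mouse and scroll.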

In [10]:
def multi_plot(df,n):
    """Build, save and return the linked genome / expression-profile chart.

    The figure stacks two charts that share the same data:

    * upper: one point per row at its genome position (`pos`) and `strand`,
      with an interval brush along the x axis;
    * lower: a parallel-coordinates style plot of `mean_exp` (log scale) per
      experimental condition, one colored line per gene (`desc_string`).

    Rows matched by the brush, by a legend selection, or nearest to the mouse
    pointer are emphasized (larger / more opaque) in the lower chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format expression table. The encodings below read the columns
        `locus_tag`, `product`, `pos`, `strand`, `desc_string`,
        `exp_condition`, `exp_condition_order` and `mean_exp`.
    n : str or int
        Label of the gene set (percent cut-off); only used in the chart title
        and in the output file name.

    Returns
    -------
    The vertically concatenated Altair chart. As a side effect the chart is
    also written to `multi_top{n}perc.html` in the current working directory.

    Notes
    -----
    Uses the `alt.selection(...)` / `alt.selection_multi(...)` /
    `.add_selection(...)` selection API; keep that in mind when changing the
    installed Altair version.
    """
    # split legend into multiple columns if too many genes
    col_num = 1 if len(df['locus_tag'].unique()) <=50 else 2

    # Sort the rows by condition (for a Categorical column this follows the
    # category order) and derive the explicit x-axis order from it.
    # The previous sort spec referenced the field "exp_condition_order:N";
    # type shorthand is not parsed inside a sort field, so no such field
    # existed and the axis order only emerged from tie-breaking on data order.
    df_sorted = df.sort_values("exp_condition_order")
    condition_order = df_sorted["exp_condition_order"].dropna().unique().tolist()

    # interactive selectors
    brush = alt.selection_interval(encodings=['x'])
    highlight = alt.selection(type='single', on='mousemove',
                              fields=['desc_string'], nearest=True)
    selection = alt.selection_multi(fields=['desc_string'], bind='legend')

    # upper chart: points along the genome
    points = alt.Chart(
        title=f"TPM values for genes in Top {n}% set"
    ).mark_point(
        size=100,
        opacity=0.5
    ).encode(
        x=alt.X('pos:Q',axis=alt.Axis(labelAngle=270,title="Genome position")),
        y='strand:Q',
        color='strand:N',
        tooltip=['locus_tag:O','product:N','pos:Q','strand:N']
    ).properties(
        width=600,
        height=50
    ).add_selection(
        brush
    )


    # lower chart: Parallel coords plot
    pbase = alt.Chart().encode(
        x=alt.X('exp_condition_order:N',
                # explicit list of conditions -> deterministic custom order
                sort=condition_order,
                axis=alt.Axis(
                    labelAngle=315,
                    labelLimit=0,
                    title="Experimental Condition"
                ),
        ),
        y=alt.Y('mean_exp:Q',
                scale=alt.Scale(type='log'),
                axis=alt.Axis(title='Log TPM expression')
        ),
        color=alt.Color('desc_string:N',
                        scale=alt.Scale(scheme='tableau20'),
                        legend=alt.Legend(
                            title='Gene', 
                            orient = 'right',
                            labelLimit=0,
                            columns=col_num,
                            symbolLimit=200,
                            labelFontSize=14
                        )
        ),
        tooltip=['locus_tag','product','exp_condition','mean_exp']
    )

    # a row is emphasized when any of the three selections matches it
    emphasized = (brush|selection|highlight)

    # parallel coords points (based on pbase)
    ppoints = pbase.mark_circle().encode(
        opacity=alt.condition(emphasized, alt.value(1), alt.value(0.2)),
        size=alt.condition(emphasized, alt.value(100), alt.value(3))
    ).add_selection(
        selection,highlight
    ).properties(
        width=600,
        height=400
    )

    # parallel coords lines (based on pbase); interactive() enables pan/zoom
    plines = pbase.mark_line().encode(
        size=alt.condition(emphasized, alt.value(3), alt.value(1)),
        opacity=alt.condition(emphasized, alt.value(1), alt.value(0.5))
    ).interactive()

    # combine lines and points into 1 chart
    pcoords = ppoints + plines


    # vertically concat genome and parallel coords charts into one
    chart = alt.vconcat(
        points,
        pcoords,
        data=df_sorted
    ).resolve_scale(
        color='independent' # make legends independent
    ).configure_axis(
        labelFontSize=16,
        titleFontSize=20,
        grid=False,
    ).configure_title(
        fontSize=30
    )
    
    # side effect: write a standalone HTML copy next to the notebook
    chart.save(f'multi_top{n}perc.html')
    return chart


In [11]:
# top 3% gene set: display the chart (multi_plot also saves it as multi_top3perc.html)
multi_plot(df_3,'3')

In [12]:
# top 1% gene set: display the chart (multi_plot also saves it as multi_top1perc.html)
multi_plot(df_1,'1')

In [13]:
# top 10% gene set: display the chart (multi_plot also saves it as multi_top10perc.html)
multi_plot(df_10, '10')