# Assemble volcano plots for Lian's fig

In [1]:
import altair as alt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from override import GENE_NAME_OVERRIDE, GENE_PRODUCT_OVERRIDE
from Bio import SeqIO


In [2]:
# get feature info from genbank

def get_override_gene(locus_tag,cur_gene):
    '''
    Given a locus tag, return an overridden gene
    '''
    return GENE_NAME_OVERRIDE[locus_tag] if locus_tag in GENE_NAME_OVERRIDE else cur_gene

def get_override_product(locus_tag,cur_prod):
    '''
    Given a locus tag, return an overridden gene
    '''
    return GENE_PRODUCT_OVERRIDE[locus_tag] if locus_tag in GENE_PRODUCT_OVERRIDE else cur_prod

def get_feat2meta_dict(genbank_path):
    '''
    Given a genbank file, parse it and return a dictionary of locus and the 
    gene, product and type fields
    '''
    seq_record = SeqIO.parse(genbank_path, "genbank").__next__()
    feat_list = []
    # Loop over the genome file, get the features on each of the strands
    for feature in seq_record.features:
        if feature.type != 'gene': # exclude 'gene' wrapper type
            if 'locus_tag' in feature.qualifiers: # exclude features without a locus tag
                # get  locus tag, feature name and product
                lt = feature.qualifiers['locus_tag'][0]
                g = "" if 'gene' not in feature.qualifiers else feature.qualifiers['gene'][0]
                prod = "" if 'product' not in feature.qualifiers else feature.qualifiers['product'][0]
                t = feature.type
                strand = feature.strand

                # overrides
                g = get_override_gene(lt,g)
                prod = get_override_product(lt,prod)

                metadata = {
                    'gene_symbol':g,
                    'product':prod,
                    'type':t,
                    'strand':strand
                }

                feat_list.append((lt,metadata))

    return dict(feat_list)

In [3]:
gb_file = "data/5GB1c_sequence_20220411.gb"
loc2info = get_feat2meta_dict(gb_file)

loc2info['EQU24_RS19315']

{'gene_symbol': 'pmoC',
 'product': 'methane monooxygenase/ammonia monooxygenase subunit C',
 'type': 'CDS',
 'strand': -1}

In [4]:
# differential expression files
ch500_df = pd.read_csv('data/exp_condition_CH4_500ppm_vs_lowCH4.tsv',sep='\t')
ch500_df = ch500_df.reset_index().rename(columns={'index':'locus_tag'})
ch500_df['ch4_level'] = '500ppm'
display(ch500_df.head())

ch1000_df = pd.read_csv('data/exp_condition_CH4_1000ppm_vs_lowCH4.tsv',sep='\t')
ch1000_df = ch1000_df.reset_index().rename(columns={'index':'locus_tag'})
ch1000_df['ch4_level'] = '1000ppm'
display(ch1000_df.head())

ch_df = pd.concat([ch500_df,ch1000_df])
ch_df['padj_invlog'] = ch_df['padj'].apply(lambda x: -np.log10(x))
ch_df['abs_log2fc'] = np.abs(ch_df['log2FoldChange'])


ch_df

Unnamed: 0,locus_tag,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ch4_level
0,EQU24_RS00005,978.904659,0.553172,0.155627,3.554466,0.000379,0.001,500ppm
1,EQU24_RS00010,1006.377442,-0.350918,0.182868,-1.918965,0.054989,0.087342,500ppm
2,EQU24_RS00015,522.894898,0.120708,0.157373,0.767022,0.443069,0.52083,500ppm
3,EQU24_RS00020,2573.605057,-0.108161,0.145295,-0.744421,0.456622,0.534725,500ppm
4,EQU24_RS00035,574.805889,-0.027342,0.246114,-0.111093,0.911542,0.932222,500ppm


Unnamed: 0,locus_tag,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ch4_level
0,EQU24_RS00005,978.904659,0.067246,0.155342,0.432889,0.665096,0.731059,1000ppm
1,EQU24_RS00010,1006.377442,-0.410048,0.182303,-2.249268,0.024495,0.044711,1000ppm
2,EQU24_RS00015,522.894898,-0.293357,0.15708,-1.867561,0.061823,0.099813,1000ppm
3,EQU24_RS00020,2573.605057,0.079233,0.145007,0.546409,0.584785,0.663224,1000ppm
4,EQU24_RS00035,574.805889,0.249371,0.244742,1.018914,0.308244,0.392346,1000ppm


Unnamed: 0,locus_tag,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ch4_level,padj_invlog,abs_log2fc
0,EQU24_RS00005,978.904659,0.553172,0.155627,3.554466,3.787481e-04,9.996956e-04,500ppm,3.000132,0.553172
1,EQU24_RS00010,1006.377442,-0.350918,0.182868,-1.918965,5.498878e-02,8.734158e-02,500ppm,1.058779,0.350918
2,EQU24_RS00015,522.894898,0.120708,0.157373,0.767022,4.430687e-01,5.208296e-01,500ppm,0.283304,0.120708
3,EQU24_RS00020,2573.605057,-0.108161,0.145295,-0.744421,4.566221e-01,5.347250e-01,500ppm,0.271870,0.108161
4,EQU24_RS00035,574.805889,-0.027342,0.246114,-0.111093,9.115424e-01,9.322223e-01,500ppm,0.030481,0.027342
...,...,...,...,...,...,...,...,...,...,...
4184,EQU24_RS23155,33281.507902,0.040945,0.326197,0.125521,9.001111e-01,9.224127e-01,1000ppm,0.035075,0.040945
4185,EQU24_RS23160,21328.143416,-0.103794,0.268584,-0.386447,6.991656e-01,7.597171e-01,1000ppm,0.119348,0.103794
4186,EQU24_RS23165,32819.819287,-0.309035,0.320198,-0.965138,3.344759e-01,4.185643e-01,1000ppm,0.378238,0.309035
4187,EQU24_RS23170,585.412492,-0.425610,0.271593,-1.567088,1.170942e-01,1.724603e-01,1000ppm,0.763311,0.425610


In [5]:
ch500_df.shape

(4189, 8)

## Collect metadata from TPM df

In [6]:
with open('data/sample2condition.txt','r') as f:
    sample2cond = dict([x.strip().split('\t') for x in f.readlines()])
    samples = list(sample2cond.keys())

In [7]:
# tpm file
tpm_df = pd.read_csv('data/5GB1_tpms_20221031.tsv',sep='\t').fillna("")
tpm_df

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,gene_len,5GB1_ferm_Ack_QC_tpm,...,5GB1C-5G-N-BR1_tpm,5GB1C-5G-N-BR2_tpm,5GB1C-JG15-La-BR1_tpm,5GB1C-JG15-La-BR2_tpm,5GB1C-JG15-N-BR1_tpm,5GB1C-JG15-N-BR2_tpm,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep1_tpm
0,EQU24_RS00005,chromosomal replication initiator protein DnaA,CDS,dnaA,NZ_CP035467.1,0,1317,Derived by automated computational analysis us...,1318,2.920380,...,38.638102,31.867873,30.546267,36.840627,29.198516,35.405768,56.747208,55.734395,46.812595,35.325741
1,EQU24_RS00010,DNA polymerase III subunit beta,CDS,dnaN,NZ_CP035467.1,1502,2603,Derived by automated computational analysis us...,1102,1.600865,...,45.092244,45.889651,34.824076,44.661748,35.864388,45.409001,32.721559,33.467532,34.906928,30.020538
2,EQU24_RS00015,DNA replication/repair protein RecF,CDS,recF,NZ_CP035467.1,3060,4140,Derived by automated computational analysis us...,1081,1.409423,...,21.362765,20.976809,17.355043,21.854708,18.734014,25.608242,26.409599,23.529439,21.368579,16.896055
3,EQU24_RS00020,DNA topoisomerase (ATP-hydrolyzing) subunit B,CDS,gyrB,NZ_CP035467.1,4185,6600,Derived by automated computational analysis us...,2416,3.186309,...,57.478160,61.623220,52.941842,63.050677,55.592843,58.631387,48.167231,51.249206,61.811500,54.226180
4,EQU24_RS00035,hypothetical protein,CDS,,NZ_CP035467.1,7350,7734,Derived by automated computational analysis us...,385,8.852007,...,118.910610,106.287739,102.200487,116.972791,105.924563,129.975893,86.942252,83.059104,118.967455,94.451247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4184,EQU24_RS23155,3-hexulose-6-phosphate synthase,CDS,hxlA,NZ_CP035467.1,4877662,4878310,Derived by automated computational analysis us...,649,54.488824,...,4912.583048,5746.455924,2972.807778,3975.569016,4832.782004,5503.508261,2093.320771,2189.062660,2612.122268,2556.194919
4185,EQU24_RS23160,6-phospho-3-hexuloisomerase,CDS,hxlB,NZ_CP035467.1,4882314,4882848,Derived by automated computational analysis us...,535,39.120128,...,2384.223783,2787.974601,1240.700959,1765.269056,2365.333503,2142.160596,1521.529928,1598.613630,2081.876627,1962.755769
4186,EQU24_RS23165,3-hexulose-6-phosphate synthase,CDS,hxlA,NZ_CP035467.1,4882851,4883499,Derived by automated computational analysis us...,649,49.546527,...,4663.779462,5303.775431,2831.473500,3738.866442,4470.981143,5103.421732,1613.939517,1649.074688,2003.711339,1940.573398
4187,EQU24_RS23170,transposase,CDS,,NZ_CP035467.1,4918898,4919603,Derived by automated computational analysis us...,706,5.792673,...,36.226783,48.282299,23.052263,30.934037,35.897452,41.177703,32.119364,25.035217,28.898838,26.694263


In [8]:
# apply gene override names/products
tpm_df['gene_symbol'] = tpm_df['locus_tag'].apply(lambda x: loc2info[x]['gene_symbol'])
tpm_df['product'] = tpm_df['locus_tag'].apply(lambda x: loc2info[x]['product'])
tpm_df[tpm_df['locus_tag']=='EQU24_RS12525']

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,gene_len,5GB1_ferm_Ack_QC_tpm,...,5GB1C-5G-N-BR1_tpm,5GB1C-5G-N-BR2_tpm,5GB1C-JG15-La-BR1_tpm,5GB1C-JG15-La-BR2_tpm,5GB1C-JG15-N-BR1_tpm,5GB1C-JG15-N-BR2_tpm,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep1_tpm
2285,EQU24_RS12525,transfer-messenger RNA,tmRNA,ssrA,,2807909,2808284,,376,364195.629604,...,163378.044864,130870.098679,137852.653857,178606.335282,178731.713539,81931.066756,279755.345671,272077.624131,213393.339329,272606.270476


In [9]:
# drop all sample data cols so we can keep just the few we want
tpm_subdf = tpm_df.drop(samples,axis=1)

# which tpm cols do we actually want around?
cols2keep = ['5GB1C_CH4_500ppm-Rep1_tpm','5GB1C_CH4_500ppm-Rep2_tpm','5GB1C_CH4_1000ppm-Rep1_tpm','5GB1C_CH4_1000ppm-Rep2_tpm']
for col in cols2keep:
    tpm_subdf[col] = tpm_df[col]
tpm_subdf

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,gene_len,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep1_tpm,5GB1C_CH4_1000ppm-Rep2_tpm
0,EQU24_RS00005,chromosomal replication initiator protein DnaA,CDS,dnaA,NZ_CP035467.1,0,1317,Derived by automated computational analysis us...,1318,56.747208,55.734395,35.325741,46.812595
1,EQU24_RS00010,DNA polymerase III subunit beta,CDS,dnaN,NZ_CP035467.1,1502,2603,Derived by automated computational analysis us...,1102,32.721559,33.467532,30.020538,34.906928
2,EQU24_RS00015,DNA replication/repair protein RecF,CDS,recF,NZ_CP035467.1,3060,4140,Derived by automated computational analysis us...,1081,26.409599,23.529439,16.896055,21.368579
3,EQU24_RS00020,DNA topoisomerase (ATP-hydrolyzing) subunit B,CDS,gyrB,NZ_CP035467.1,4185,6600,Derived by automated computational analysis us...,2416,48.167231,51.249206,54.226180,61.811500
4,EQU24_RS00035,hypothetical protein,CDS,,NZ_CP035467.1,7350,7734,Derived by automated computational analysis us...,385,86.942252,83.059104,94.451247,118.967455
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4184,EQU24_RS23155,3-hexulose-6-phosphate synthase,CDS,hxlA,NZ_CP035467.1,4877662,4878310,Derived by automated computational analysis us...,649,2093.320771,2189.062660,2556.194919,2612.122268
4185,EQU24_RS23160,6-phospho-3-hexuloisomerase,CDS,hxlB,NZ_CP035467.1,4882314,4882848,Derived by automated computational analysis us...,535,1521.529928,1598.613630,1962.755769,2081.876627
4186,EQU24_RS23165,3-hexulose-6-phosphate synthase,CDS,hxlA,NZ_CP035467.1,4882851,4883499,Derived by automated computational analysis us...,649,1613.939517,1649.074688,1940.573398,2003.711339
4187,EQU24_RS23170,transposase,CDS,,NZ_CP035467.1,4918898,4919603,Derived by automated computational analysis us...,706,32.119364,25.035217,26.694263,28.898838


In [10]:
tpm_subdf[cols2keep[:2]]

Unnamed: 0,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm
0,56.747208,55.734395
1,32.721559,33.467532
2,26.409599,23.529439
3,48.167231,51.249206
4,86.942252,83.059104
...,...,...
4184,2093.320771,2189.062660
4185,1521.529928,1598.613630
4186,1613.939517,1649.074688
4187,32.119364,25.035217


In [11]:
tpm_subdf['mean500_tpm'] = tpm_subdf[cols2keep[:2]].mean(axis=1)
tpm_subdf['mean1000_tpm'] = tpm_subdf[cols2keep[2:]].mean(axis=1)
tpm_subdf['mean_tpm'] = tpm_subdf[cols2keep].mean(axis=1)
tpm_subdf['log2_mean_tpm'] = np.log2(tpm_subdf['mean_tpm'])
tpm_subdf

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,gene_len,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep1_tpm,5GB1C_CH4_1000ppm-Rep2_tpm,mean500_tpm,mean1000_tpm,mean_tpm,log2_mean_tpm
0,EQU24_RS00005,chromosomal replication initiator protein DnaA,CDS,dnaA,NZ_CP035467.1,0,1317,Derived by automated computational analysis us...,1318,56.747208,55.734395,35.325741,46.812595,56.240801,41.069168,48.654985,5.604516
1,EQU24_RS00010,DNA polymerase III subunit beta,CDS,dnaN,NZ_CP035467.1,1502,2603,Derived by automated computational analysis us...,1102,32.721559,33.467532,30.020538,34.906928,33.094545,32.463733,32.779139,5.034706
2,EQU24_RS00015,DNA replication/repair protein RecF,CDS,recF,NZ_CP035467.1,3060,4140,Derived by automated computational analysis us...,1081,26.409599,23.529439,16.896055,21.368579,24.969519,19.132317,22.050918,4.462767
3,EQU24_RS00020,DNA topoisomerase (ATP-hydrolyzing) subunit B,CDS,gyrB,NZ_CP035467.1,4185,6600,Derived by automated computational analysis us...,2416,48.167231,51.249206,54.226180,61.811500,49.708219,58.018840,53.863529,5.751237
4,EQU24_RS00035,hypothetical protein,CDS,,NZ_CP035467.1,7350,7734,Derived by automated computational analysis us...,385,86.942252,83.059104,94.451247,118.967455,85.000678,106.709351,95.855015,6.582782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4184,EQU24_RS23155,3-hexulose-6-phosphate synthase,CDS,hxlA,NZ_CP035467.1,4877662,4878310,Derived by automated computational analysis us...,649,2093.320771,2189.062660,2556.194919,2612.122268,2141.191715,2584.158594,2362.675154,11.206206
4185,EQU24_RS23160,6-phospho-3-hexuloisomerase,CDS,hxlB,NZ_CP035467.1,4882314,4882848,Derived by automated computational analysis us...,535,1521.529928,1598.613630,1962.755769,2081.876627,1560.071779,2022.316198,1791.193989,10.806706
4186,EQU24_RS23165,3-hexulose-6-phosphate synthase,CDS,hxlA,NZ_CP035467.1,4882851,4883499,Derived by automated computational analysis us...,649,1613.939517,1649.074688,1940.573398,2003.711339,1631.507103,1972.142368,1801.824735,10.815243
4187,EQU24_RS23170,transposase,CDS,,NZ_CP035467.1,4918898,4919603,Derived by automated computational analysis us...,706,32.119364,25.035217,26.694263,28.898838,28.577290,27.796550,28.186920,4.816954


In [12]:
# merge this in with the deseqdf
df = pd.merge(ch_df, tpm_subdf,how='left',on='locus_tag')
df

Unnamed: 0,locus_tag,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ch4_level,padj_invlog,abs_log2fc,...,note,gene_len,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep1_tpm,5GB1C_CH4_1000ppm-Rep2_tpm,mean500_tpm,mean1000_tpm,mean_tpm,log2_mean_tpm
0,EQU24_RS00005,978.904659,0.553172,0.155627,3.554466,3.787481e-04,9.996956e-04,500ppm,3.000132,0.553172,...,Derived by automated computational analysis us...,1318,56.747208,55.734395,35.325741,46.812595,56.240801,41.069168,48.654985,5.604516
1,EQU24_RS00010,1006.377442,-0.350918,0.182868,-1.918965,5.498878e-02,8.734158e-02,500ppm,1.058779,0.350918,...,Derived by automated computational analysis us...,1102,32.721559,33.467532,30.020538,34.906928,33.094545,32.463733,32.779139,5.034706
2,EQU24_RS00015,522.894898,0.120708,0.157373,0.767022,4.430687e-01,5.208296e-01,500ppm,0.283304,0.120708,...,Derived by automated computational analysis us...,1081,26.409599,23.529439,16.896055,21.368579,24.969519,19.132317,22.050918,4.462767
3,EQU24_RS00020,2573.605057,-0.108161,0.145295,-0.744421,4.566221e-01,5.347250e-01,500ppm,0.271870,0.108161,...,Derived by automated computational analysis us...,2416,48.167231,51.249206,54.226180,61.811500,49.708219,58.018840,53.863529,5.751237
4,EQU24_RS00035,574.805889,-0.027342,0.246114,-0.111093,9.115424e-01,9.322223e-01,500ppm,0.030481,0.027342,...,Derived by automated computational analysis us...,385,86.942252,83.059104,94.451247,118.967455,85.000678,106.709351,95.855015,6.582782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,EQU24_RS23155,33281.507902,0.040945,0.326197,0.125521,9.001111e-01,9.224127e-01,1000ppm,0.035075,0.040945,...,Derived by automated computational analysis us...,649,2093.320771,2189.062660,2556.194919,2612.122268,2141.191715,2584.158594,2362.675154,11.206206
8374,EQU24_RS23160,21328.143416,-0.103794,0.268584,-0.386447,6.991656e-01,7.597171e-01,1000ppm,0.119348,0.103794,...,Derived by automated computational analysis us...,535,1521.529928,1598.613630,1962.755769,2081.876627,1560.071779,2022.316198,1791.193989,10.806706
8375,EQU24_RS23165,32819.819287,-0.309035,0.320198,-0.965138,3.344759e-01,4.185643e-01,1000ppm,0.378238,0.309035,...,Derived by automated computational analysis us...,649,1613.939517,1649.074688,1940.573398,2003.711339,1631.507103,1972.142368,1801.824735,10.815243
8376,EQU24_RS23170,585.412492,-0.425610,0.271593,-1.567088,1.170942e-01,1.724603e-01,1000ppm,0.763311,0.425610,...,Derived by automated computational analysis us...,706,32.119364,25.035217,26.694263,28.898838,28.577290,27.796550,28.186920,4.816954


In [13]:
df.columns

Index(['locus_tag', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue',
       'padj', 'ch4_level', 'padj_invlog', 'abs_log2fc', 'product', 'type',
       'gene_symbol', 'locus', 'start_coord', 'end_coord', 'note', 'gene_len',
       '5GB1C_CH4_500ppm-Rep1_tpm', '5GB1C_CH4_500ppm-Rep2_tpm',
       '5GB1C_CH4_1000ppm-Rep1_tpm', '5GB1C_CH4_1000ppm-Rep2_tpm',
       'mean500_tpm', 'mean1000_tpm', 'mean_tpm', 'log2_mean_tpm'],
      dtype='object')

## Volcano plots

In [14]:
# remove CDS
df = df[df['type']!='CDS']
df.shape

(114, 26)

In [18]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [15]:
def is_sig(pval, fc, pthresh=0.05,fc_thresh=1):
    if pval<pthresh and np.abs(fc) > fc_thresh:
        return True
    else:
        return False

In [16]:
df['is_sig'] = df.apply(lambda row: is_sig(row['padj'],row['log2FoldChange']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_sig'] = df.apply(lambda row: is_sig(row['padj'],row['log2FoldChange']),axis=1)


In [17]:
df[df['is_sig']==True]

Unnamed: 0,locus_tag,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ch4_level,padj_invlog,abs_log2fc,...,gene_len,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep1_tpm,5GB1C_CH4_1000ppm-Rep2_tpm,mean500_tpm,mean1000_tpm,mean_tpm,log2_mean_tpm,is_sig
93,EQU24_RS00525,397.524699,-3.796255,0.312569,-12.145346,6.072519e-34,4.350526e-32,500ppm,31.361458,3.796255,...,78,11.296604,14.639779,10.393897,10.989306,12.968191,10.691601,11.829896,3.564366,True
114,EQU24_RS00635,1817.371655,-2.211669,0.392583,-5.633629,1.764566e-08,1.030486e-07,500ppm,6.986958,2.211669,...,77,223.346541,198.837051,214.271039,237.204508,211.091796,225.737773,218.414785,7.770927,True
254,EQU24_RS01370,64.699786,2.699290,0.246420,10.953999,6.357709e-28,2.898537e-26,500ppm,25.537821,2.699290,...,77,317.518275,276.883066,154.294993,182.644391,297.200670,168.469692,232.835181,7.863165,True
385,EQU24_RS02110,58.210131,1.354243,0.375620,3.605358,3.117229e-04,8.359842e-04,500ppm,3.077802,1.354243,...,78,75.620794,68.376379,48.214623,65.501474,71.998587,56.858048,64.428318,6.009623,True
503,EQU24_RS02745,1777.785399,-2.211932,0.393675,-5.618678,1.924236e-08,1.117226e-07,500ppm,6.951859,2.211932,...,77,218.499962,197.557609,218.570792,235.136503,208.028785,226.853648,217.441216,7.764482,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7573,EQU24_RS18545,708.293714,-1.183275,0.424158,-2.789705,5.275615e-03,1.155337e-02,1000ppm,1.937291,1.183275,...,77,233.780150,248.211914,116.038214,206.316442,240.996032,161.177328,201.086680,7.651674,True
7574,EQU24_RS18550,1323.707643,-3.466761,0.479512,-7.229766,4.838285e-13,6.470399e-12,1000ppm,11.189069,3.466761,...,78,7.641820,8.956100,7.292054,6.471963,8.298960,6.882008,7.590484,2.924192,True
7575,EQU24_RS18555,895.673158,-1.181078,0.485400,-2.433204,1.496585e-02,2.898492e-02,1000ppm,1.537828,1.181078,...,78,149.314521,142.551258,109.544060,118.971178,145.932890,114.257619,130.095254,7.023425,True
7626,EQU24_RS18845,1814.651141,-2.141570,0.384984,-5.562758,2.655446e-08,1.749367e-07,1000ppm,6.757119,2.141570,...,77,211.162779,180.517756,215.704290,230.692494,195.840267,223.198392,209.519330,7.710940,True


In [18]:
# significance thresholds
thresh_low = alt.Chart(pd.DataFrame({'x': [-1]})).mark_rule(strokeDash=[12, 6],size=2).encode(x='x')
thresh_high = alt.Chart(pd.DataFrame({'x': [1]})).mark_rule(strokeDash=[12, 6],size=2).encode(x='x')
sig_p = 0.05
thresh_sig = alt.Chart(pd.DataFrame({'y': [-np.log10(sig_p)]})).mark_rule(strokeDash=[12, 6],size=2).encode(y='y')

bubble_max = 300

In [25]:
def volcano_all_data(df,filename='alt_out/volcano_rna.html',plot_title=False):
    points = alt.Chart(df).mark_point().encode(
        x=alt.X('log2FoldChange:Q',title="log\u2082 Fold Change"),
        y=alt.Y('padj_invlog:Q', title="-log\u2081\u2080 p-value",
                axis=alt.Axis(tickCount=3)),
        color = alt.condition(alt.datum.is_sig,
                       alt.Color('ch4_level:N',
                                 title='CH\u2084 Level',
                                 scale=alt.Scale(domain=['500ppm', '1000ppm'], range=['#1f78b4', '#d95f02'])),
                       alt.value('lightgray')
        ),
        size=alt.Size('mean_tpm:Q',
                      title='Mean TPM',
                      scale=alt.Scale(domain=[1, 100000], range=[1, bubble_max],type='log')),
        tooltip=['locus_tag:N','gene_symbol:N', 'product:N',
                 alt.Tooltip('mean500_tpm:Q',format='.2f'),
                 alt.Tooltip('mean1000_tpm:Q',format='.2f')],
    ).properties(
        title=f"{'All data' if plot_title else ''}",
    ).interactive()

    chart = points + thresh_low + thresh_high + thresh_sig

#     chart = chart.configure_axis(
#         grid=False,
#         titleFontSize=20,
#         labelFontSize=14
#     )
    
    chart.save(filename)
    return chart

In [26]:
v_all = volcano_all_data(df)
v_all

In [38]:
rna_up = df[(df['is_sig']==True) & (df['log2FoldChange']>1)][cols]
rna_up

Unnamed: 0,locus_tag,ch4_level,type,log2FoldChange,baseMean,mean500_tpm,mean1000_tpm
254,EQU24_RS01370,500ppm,tRNA,2.69929,64.699786,297.20067,168.469692
385,EQU24_RS02110,500ppm,tRNA,1.354243,58.210131,71.998587,56.858048
1164,EQU24_RS06320,500ppm,tRNA,1.173607,224.823873,210.201061,202.690839
1682,EQU24_RS09235,500ppm,ncRNA,1.797797,967.764439,1521.39903,1065.430291
1763,EQU24_RS09685,500ppm,tRNA,2.700591,497.369928,1241.700264,1015.36859
3079,EQU24_RS16875,500ppm,tRNA,4.863896,478.030596,4181.204939,2648.383427
4443,EQU24_RS01370,1000ppm,tRNA,1.873209,64.699786,297.20067,168.469692
4574,EQU24_RS02110,1000ppm,tRNA,1.002839,58.210131,71.998587,56.858048
5353,EQU24_RS06320,1000ppm,tRNA,1.078589,224.823873,210.201061,202.690839
5871,EQU24_RS09235,1000ppm,ncRNA,1.315535,967.764439,1521.39903,1065.430291


In [36]:
cols = ['locus_tag','ch4_level','type','log2FoldChange','baseMean','mean500_tpm','mean1000_tpm']
df[cols]

Unnamed: 0,locus_tag,ch4_level,type,log2FoldChange,baseMean,mean500_tpm,mean1000_tpm
93,EQU24_RS00525,500ppm,tRNA,-3.796255,3.975247e+02,12.968191,10.691601
112,EQU24_RS00625,500ppm,rRNA,-0.299228,1.787685e+06,2100.019732,1922.873408
113,EQU24_RS00630,500ppm,tRNA,0.606909,4.484981e+03,3448.756030,4378.882623
114,EQU24_RS00635,500ppm,tRNA,-2.211669,1.817372e+03,211.091796,225.737773
115,EQU24_RS00640,500ppm,rRNA,0.501696,1.480710e+06,6183.390478,4902.699690
...,...,...,...,...,...,...,...
7628,EQU24_RS18855,1000ppm,rRNA,-0.278736,1.788390e+06,2067.804988,1892.382479
7629,EQU24_RS18860,1000ppm,tRNA,0.710998,8.169533e+02,1116.294347,1006.253554
7797,EQU24_RS19765,1000ppm,ncRNA,-3.567492,1.552798e+06,3728.028742,3152.596805
7948,EQU24_RS20650,1000ppm,tRNA,-2.027753,7.186784e+02,51.155367,26.052612


In [39]:
set(rna_up['locus_tag'].values)

{'EQU24_RS01370',
 'EQU24_RS02110',
 'EQU24_RS06320',
 'EQU24_RS09235',
 'EQU24_RS09685',
 'EQU24_RS16875'}

these are the rna loci that are up in 500/1000ppm relative to low CH4

In [42]:
df[df['locus_tag']=='EQU24_RS22130']

Unnamed: 0,locus_tag,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ch4_level,padj_invlog,abs_log2fc,...,gene_len,5GB1C_CH4_500ppm-Rep1_tpm,5GB1C_CH4_500ppm-Rep2_tpm,5GB1C_CH4_1000ppm-Rep1_tpm,5GB1C_CH4_1000ppm-Rep2_tpm,mean500_tpm,mean1000_tpm,mean_tpm,log2_mean_tpm,is_sig
4029,EQU24_RS22130,64.713234,0.93673,0.322133,2.907894,0.003639,0.007704,500ppm,2.1133,0.93673,...,91,60.432083,53.244507,47.763729,53.091191,56.838295,50.42746,53.632877,5.745046,False
8218,EQU24_RS22130,64.713234,0.743236,0.318967,2.330135,0.019799,0.037032,1000ppm,1.431422,0.743236,...,91,60.432083,53.244507,47.763729,53.091191,56.838295,50.42746,53.632877,5.745046,False
