# Explore TPM data matrix

In [1]:
import altair as alt
import numpy as np
import pandas as pd

from Bio import SeqIO

import get_top_gene_set as gtgs



## Load expression data

In [2]:
# Load the TPM expression matrix together with its sample/condition metadata.
# All paths are relative to the notebook's working directory.
data_file = 'data/extract_TPM_counts.tsv'
sample2condition_file = 'data/sample2condition.txt'
# No sample include-list is passed; the commented-out path is the file that
# would otherwise be used.
# NOTE(review): presumably None means "keep every sample" -- confirm in
# get_top_gene_set.load_data.
sample_file = None #'config/samples_to_include.txt'
condition_file = 'config/conditions_to_include.txt'

# The four returned objects are reused throughout the notebook; `df` is the
# expression table inspected with df.head() in the next cell.
df, sample2condition, samples, conditions = gtgs.load_data(data_file, sample2condition_file, sample_file, condition_file)


In [3]:
df.head()

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,gene_len,...,5GB1_pA9_red_tpm,5GB1_pA9_yellow_tpm,5GB1C-5G-La-BR1_tpm,5GB1C-5G-La-BR2_tpm,5GB1C-5G-N-BR1_tpm,5GB1C-5G-N-BR2_tpm,5GB1C-JG15-La-BR1_tpm,5GB1C-JG15-La-BR2_tpm,5GB1C-JG15-N-BR1_tpm,5GB1C-JG15-N-BR2_tpm
0,EQU24_RS00005,chromosomal replication initiator protein DnaA,CDS,dnaA,NZ_CP035467.1,0,1317,Derived by automated computational analysis us...,MSALWNNCLAKLENEISSSEFSTWIRPLQAIETDGQIKLLAPNRFV...,1318,...,38.557373,38.810668,37.444214,40.246006,40.100118,33.432274,39.880174,38.355431,30.247582,41.248441
1,EQU24_RS00010,DNA polymerase III subunit beta,CDS,,NZ_CP035467.1,1502,2603,Derived by automated computational analysis us...,MKYIINREQLLVPLQQIVSVIEKRQTMPILSNVLMVFRENTLVMTG...,1102,...,52.552767,52.461746,42.676553,49.210083,46.798476,48.142385,45.465136,46.498139,37.152951,52.90241
2,EQU24_RS00015,DNA replication/repair protein RecF,CDS,recF,NZ_CP035467.1,3060,4140,Derived by automated computational analysis us...,MSLQKLDIFNVRNIRQASLQPSPGLNLIYGANASGKSSVLEAIFIL...,1081,...,31.350991,34.914128,21.479309,24.204682,22.171104,22.006566,22.658157,22.753325,19.407103,29.834124
3,EQU24_RS00020,DNA topoisomerase (ATP-hydrolyzing) subunit B,CDS,gyrB,NZ_CP035467.1,4185,6600,Derived by automated computational analysis us...,MSENIKQYDSTNIQVLKGLDAVRKRPGMYIGDTDDGTGLHHMVFEV...,2416,...,74.848501,80.850761,54.959319,64.911376,59.653059,64.648318,69.119079,65.643179,57.590223,68.306759
4,EQU24_RS00025,hypothetical protein,CDS,,NZ_CP035467.1,6825,7062,Derived by automated computational analysis us...,VKTTKYFLTTRMRPDREIIKDEWIQYVVRFPENEHIQFDGRIRRWA...,238,...,50.324948,49.349547,34.539657,36.521074,37.789611,39.358066,38.992158,35.870964,41.462392,40.227192


## Load features from genbank

In [4]:
# feature tuple indices
# Each feature record is a positional sequence such as
#   [0, 1317, 1, 'EQU24_RS00005', 'dnaA', 'CDS']
# and these constants name the position of each field within it.
LEFT_IDX = 0
RIGHT_IDX = 1
STRAND_IDX = 2
LOCUS_IDX = 3
GENE_IDX = 4
TYPE_IDX = 5

# Load features from genbank
# Two lists come back: the records printed below carry strand 1 in pos_feats
# and strand -1 in neg_feats.
gb_file = "data/5GB1c_sequence.gb"
pos_feats, neg_feats = gtgs.get_pos_neg_features(gb_file)

In [5]:


# def insert_override_gene_names(feat_list):
#     '''
#     Given a list of features as a tuple, update some of the 
#     locus info with gene names
#     '''
#     #convert tuple to list
#     feat_list = [list(x) for x in feat_list]
    
#     updated_feats = []
#     for feat in feat_list:
#         g = get_override_gene(feat[LOCUS_IDX],feat[GENE_IDX])
#         feat[GENE_IDX] = g
#         updated_feats.append(feat)
        
#     return updated_feats
    

In [6]:
# update some gene names with a set of names to override (helpful for viz)
#pos_feats = insert_override_gene_names(pos_feats)
#neg_feats = insert_override_gene_names(neg_feats)

print(pos_feats[:5])
print(neg_feats[:5])

[[0, 1317, 1, 'EQU24_RS00005', 'dnaA', 'CDS'], [1502, 2603, 1, 'EQU24_RS00010', '', 'CDS'], [3060, 4140, 1, 'EQU24_RS00015', 'recF', 'CDS'], [4185, 6600, 1, 'EQU24_RS00020', 'gyrB', 'CDS'], [6825, 7062, 1, 'EQU24_RS00025', '', 'CDS']]
[[20389, 21388, -1, 'EQU24_RS00080', '', 'CDS'], [23251, 23782, -1, 'EQU24_RS00095', '', 'CDS'], [23841, 24321, -1, 'EQU24_RS00100', '', 'CDS'], [24340, 24994, -1, 'EQU24_RS00105', 'crp', 'CDS'], [32310, 32679, -1, 'EQU24_RS00130', '', 'CDS']]


## Determine loci which may be in operons

In [7]:
# Distance threshold, in bp, handed to the operon-flagging helper.
# NOTE(review): presumably features closer together than this are flagged as
# possible operon members -- confirm in get_top_gene_set.flag_potential_operon_loci.
op_min_dist = 120
maybe_operon_loci = gtgs.flag_potential_operon_loci(pos_feats, neg_feats, op_min_dist)

# Report the flagged count against the total number of features on both strands
print(f"Number of possible operon loci at distance of {op_min_dist}bp: {len(maybe_operon_loci)}/{len(pos_feats)+len(neg_feats)}")

Number of possible operon loci at distance of 120bp: 2068/4431


## Transform the data frame for mean TPM calculations

In [8]:
# Column of the loaded TPM matrix that holds the unique gene identifiers
LOCUS_ID_COL = 'locus_tag'
# Full list of genes with an expression measurement, in the matrix's row order
LOCI = df[LOCUS_ID_COL].tolist()

In [9]:
df_means = gtgs.get_average_tpm_by_condition(df,samples,conditions,sample2condition,LOCI)
df_means

locus_tag,exp_condition,EQU24_RS00005,EQU24_RS00010,EQU24_RS00015,EQU24_RS00020,EQU24_RS00025,EQU24_RS00030,EQU24_RS00035,EQU24_RS00040,EQU24_RS00045,...,EQU24_RS22110,EQU24_RS22115,EQU24_RS22120,EQU24_RS22125,EQU24_RS22130,EQU24_RS22135,EQU24_RS22140,EQU24_RS22145,EQU24_RS22150,EQU24_RS22155
0,MeOH,23.333155,18.915775,18.453916,18.267805,16.960643,12.377795,43.815536,9.67095,7.302145,...,1298.257682,15.624619,20.208066,26.004364,20.960234,28.719983,93.616437,161.528124,496.990651,280.344047
1,NO3_lowO2_slow_growth,32.050358,43.65676,21.351623,62.267687,41.684925,31.921455,57.849768,16.885694,14.926147,...,6497.868109,26.273485,28.945133,23.525245,26.432667,35.167264,178.996199,164.083806,433.438735,493.895115
2,NoCu,44.348687,59.62936,28.268717,56.818319,49.839406,38.394652,81.530362,40.501969,36.5765,...,8345.785345,43.065124,34.380565,44.419579,34.601933,65.339879,253.608495,273.284694,731.05219,1087.621126
3,NoLanthanum,33.444023,43.689839,23.172675,57.297047,42.367072,41.941657,102.513601,30.226787,19.462312,...,5085.637409,16.423284,35.588138,44.623117,43.201743,21.92726,109.78333,67.277718,211.575175,328.943746
4,WithLanthanum,35.462185,41.792237,20.644554,57.130166,34.258335,46.201637,110.721781,31.813805,19.438086,...,3942.957792,15.972203,34.318829,49.216725,40.000662,21.220809,98.10061,73.116973,194.389586,319.998959
5,highCu,47.861477,79.10949,33.534043,73.330408,48.662214,33.986359,92.999818,51.950784,50.370579,...,8132.547467,48.894308,35.60873,46.1093,30.125207,89.71015,342.981435,386.493127,1021.453762,1692.401154
6,highO2_slow_growth,64.784508,99.00297,44.856281,78.997757,77.842263,56.626268,97.721756,35.735531,28.808125,...,3468.582202,40.548782,48.532405,37.1395,38.204218,52.033315,220.196691,244.139008,505.42761,561.847119
7,lowCH4,30.829331,33.532522,18.49116,42.963648,31.643505,24.308334,74.948663,18.6019,12.935098,...,7477.339715,18.356915,24.390308,20.231568,19.721043,60.397912,226.909132,297.029289,874.637567,506.211825
8,lowCu,42.973556,61.209155,28.828713,61.573321,50.966799,31.319574,75.047593,40.03867,35.331019,...,7157.344557,43.386082,33.574108,36.872718,31.899782,66.743497,293.599291,313.741841,843.607251,1123.669681
9,lowO2_fast_growth,35.73619,43.159066,27.3258,48.976066,33.463183,26.694205,72.505275,15.430765,12.595934,...,9584.028559,26.951394,26.808972,31.889009,23.88239,72.20795,334.033049,376.513322,1178.068431,1183.180057


In [10]:
# get top locus ids in all conditions
# n is the percentage threshold passed to get_top_n_perc_by_condition
n = 3
top_locs = gtgs.get_top_n_perc_by_condition(df_means, LOCI, n)

# Collect the top loci that ARE flagged as possibly being in an operon,
# i.e. the ones that operon filtering would remove.
top_locs_op_filter_out = [x for x in top_locs if x in maybe_operon_loci]

# The number in parentheses is the count of top loci that REMAIN once the
# operon-flagged ones are removed (total minus flagged), not the number removed.
print(f'Top {n}%: {len(top_locs)} ({len(top_locs) - len(top_locs_op_filter_out)} filtered)\n')
print("Locus\toperon?")
for loc in top_locs:
    # second column is True when the locus was flagged as a possible operon member
    print(f'{loc}\t{loc in top_locs_op_filter_out}')

Top 3%: 37 (25 filtered)

Locus	operon?
EQU24_RS21720	False
EQU24_RS02970	False
EQU24_RS07385	True
EQU24_RS19310	True
EQU24_RS15745	False
EQU24_RS12965	True
EQU24_RS19105	False
EQU24_RS04160	True
EQU24_RS19315	False
EQU24_RS02240	True
EQU24_RS15705	False
EQU24_RS03495	False
EQU24_RS18060	False
EQU24_RS18140	False
EQU24_RS18125	True
EQU24_RS10370	False
EQU24_RS19765	False
EQU24_RS07185	False
EQU24_RS02265	True
EQU24_RS16195	False
EQU24_RS12095	False
EQU24_RS15100	False
EQU24_RS15535	False
EQU24_RS22110	False
EQU24_RS21665	False
EQU24_RS22055	True
EQU24_RS18135	True
EQU24_RS18355	False
EQU24_RS21040	False
EQU24_RS18130	True
EQU24_RS02895	False
EQU24_RS09110	True
EQU24_RS07390	False
EQU24_RS21560	False
EQU24_RS19305	True
EQU24_RS12525	False
EQU24_RS21565	False


## Visualize tradeoffs in the top N% threshold used

In [11]:
# Accumulates one summary row per threshold; turned into a DataFrame at the end
tradeoff_data = []

# examine top sets using a range of top 1% to top 10%
# NOTE(review): the loop rebinds the notebook-level names `top_locs` and
# `top_locs_op_filter_out`; after this cell they hold the values for the last
# threshold (10), not the 3% set computed in the previous cell.
for i in range(1,11):
    # get top locus ids in all conditions
    top_locs = gtgs.get_top_n_perc_by_condition(df_means, LOCI, i)
    # top loci that are flagged as possibly being in an operon
    top_locs_op_filter_out = [x for x in top_locs if x in maybe_operon_loci]
    
    # get the average expression of each locus across conditions
    # (df_means has one row per condition and one column per locus)
    means = []
    for loc in top_locs:
        mean_exp = np.mean(df_means[loc].values)
        means.append(mean_exp)
        
    # add row to the data frame consisting of:
    # n: top % threshold
    # locus count: number of top loci identified
    # locus count filtered: locus count after filtering operon genes
    # min expression: the minimum avg expression value from the top gene set
    row = [i,
           len(top_locs),
           len(top_locs)-len(top_locs_op_filter_out),
           min(means)
          ]
    
    tradeoff_data.append(row)
    
# Column order matches the order of the values placed in each row above
tradeoff_df = pd.DataFrame(tradeoff_data, columns=['n','loc_count','loc_count_filt','min_exp'])


In [12]:
tradeoff_df

Unnamed: 0,n,loc_count,loc_count_filt,min_exp
0,1,14,8,3022.573901
1,2,26,17,1653.480931
2,3,37,25,1131.648318
3,4,56,35,930.223684
4,5,84,43,667.493775
5,6,107,55,469.300182
6,7,127,65,369.788442
7,8,153,78,313.001223
8,9,178,92,313.001223
9,10,197,103,259.999672


In [13]:
# Layer 1: one circle per threshold, placed at the unfiltered locus count (x)
# and the minimum average expression of that top set (y).
num_genes_layer = alt.Chart(tradeoff_df).mark_circle(size=100).encode(
    x=alt.X('loc_count:Q', axis=alt.Axis(title='Number of loci in top set')),
    y=alt.Y('min_exp:Q', axis=alt.Axis(title='Minimum ave. TPM of loci in top set')),
    #x='loc_count:Q',
    #y='min_exp:Q',
    color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
    tooltip=[alt.Tooltip('n:Q', title='Top N%'),
             alt.Tooltip('loc_count:Q', title='Num loci in top set'),
             alt.Tooltip('loc_count_filt:Q', title='Num loci in top set after operon filtering'),
             alt.Tooltip('min_exp:Q', title='Min. ave. expr. in top set')
            ]
)

# Layer 2: a point mark at the same y, but at the locus count that remains
# after operon filtering; same colour scale and tooltip as layer 1.
op_ex_layer = alt.Chart(tradeoff_df).mark_point(size=100).encode(
    x='loc_count_filt:Q',
    y='min_exp:Q',
    color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
    tooltip=[alt.Tooltip('n:Q', title='Top N%'),
             alt.Tooltip('loc_count:Q', title='Num loci in top set'),
             alt.Tooltip('loc_count_filt:Q', title='Num loci in top set after operon filtering'),
             alt.Tooltip('min_exp:Q', title='Min. ave. expr. in top set')
             ]
)

# Layer 3: a horizontal rule joining the filtered and unfiltered counts of
# each threshold (x -> x2 at a shared y).
rule = alt.Chart(tradeoff_df).mark_rule().encode(
    x='loc_count_filt:Q',
    x2='loc_count:Q',
    y='min_exp:Q',
    color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
)

# Layer 4: a rotated text label next to each unfiltered circle.
# transform_calculate replaces the numeric n with the string "Top <n>%",
# which is what the text encoding then displays.
text = alt.Chart(tradeoff_df).mark_text(
    align='left',
    baseline='middle',
    angle=335,
    dx=5,
    dy=-7,
    size=14,
    color='black'
).encode(
    x='loc_count:Q',
    y='min_exp:Q',
    text=alt.Text('n:N'),
    #color=alt.Color('n:O', scale=alt.Scale(scheme='viridis'),legend=None),
).transform_calculate(n='"Top " + datum.n + "%"')

# combine layers into chart
chart = num_genes_layer + op_ex_layer + rule + text

# final chart configs
# This expression is the cell's last statement, so its value (the configured
# chart) is what the notebook renders; `chart` itself is left unconfigured.
chart.properties(
    title='Tradeoff between top gene set % threshold'
).configure_title(
    fontSize=20,
).configure_axis(
    labelFontSize=14,
    titleFontSize=16
).interactive()

## Visualize gene expression trends across conditions

#### Add relevant metadata to the expression df

In [23]:
from override import GENE_NAME_OVERRIDE, GENE_PRODUCT_OVERRIDE

def get_override_gene(locus_tag,cur_gene):
    '''
    Look up a replacement gene name for a locus.

    When GENE_NAME_OVERRIDE has an entry for `locus_tag`, that entry is
    returned; otherwise `cur_gene` is handed back unchanged.
    '''
    if locus_tag in GENE_NAME_OVERRIDE:
        return GENE_NAME_OVERRIDE[locus_tag]
    return cur_gene

def get_override_product(locus_tag,cur_prod):
    '''
    Look up a replacement product description for a locus.

    When GENE_PRODUCT_OVERRIDE has an entry for `locus_tag`, that entry is
    returned; otherwise `cur_prod` is handed back unchanged.
    '''
    if locus_tag in GENE_PRODUCT_OVERRIDE:
        return GENE_PRODUCT_OVERRIDE[locus_tag]
    return cur_prod

def get_feat2meta_dict(genbank_path):
    '''
    Given a genbank file, parse it and return a dictionary keyed by locus tag
    whose values hold the gene, product and type fields of that feature.

    Only the first record in the file is read. Features of type 'gene' and
    features without a 'locus_tag' qualifier are skipped. Gene names and
    products are passed through get_override_gene / get_override_product, so
    an override entry replaces the genbank annotation.

    Returns:
        dict mapping locus_tag -> {'gene_symbol': ..., 'product': ..., 'type': ...}.
        Missing 'gene' / 'product' qualifiers become "" (before overrides).
        If a locus tag appears on several features, the last one wins.

    Raises:
        StopIteration if the parser yields no record at all.
    '''
    # Open the file explicitly so the handle is closed deterministically: only
    # the first record is consumed, so the parser is never run to exhaustion.
    with open(genbank_path) as handle:
        seq_record = next(SeqIO.parse(handle, "genbank"))

    feat2meta = {}
    for feature in seq_record.features:
        # exclude 'gene' wrapper type
        if feature.type == 'gene':
            continue

        qualifiers = feature.qualifiers
        # exclude features without a locus tag
        if 'locus_tag' not in qualifiers:
            continue

        # qualifier values are sequences; the first entry is used
        lt = qualifiers['locus_tag'][0]
        g = qualifiers['gene'][0] if 'gene' in qualifiers else ""
        prod = qualifiers['product'][0] if 'product' in qualifiers else ""

        feat2meta[lt] = {
            'gene_symbol': get_override_gene(lt, g),
            'product': get_override_product(lt, prod),
            'type': feature.type
        }

    return feat2meta

In [24]:
feat2meta = get_feat2meta_dict(gb_file)

In [26]:
feat2meta['EQU24_RS19310']

{'gene_symbol': 'pmoA',
 'product': 'methane monooxygenase/ammonia monooxygenase subunit A',
 'type': 'CDS'}

In [39]:
# melt df to format for Altair parallel coordinates viz:
# df_means has one row per condition and one column per locus; the melted
# frame has one row per (condition, locus) pair.
alt_df = pd.melt(
    df_means,
    id_vars=['exp_condition'],
    value_vars=LOCI,
    # name the melted column explicitly instead of relying on
    # df_means.columns.name happening to be 'locus_tag'
    var_name='locus_tag',
    value_name='mean_exp'
)

# add metadata (a locus missing from feat2meta raises KeyError)
alt_df['gene_symbol'] = alt_df['locus_tag'].map(lambda lt: feat2meta[lt]['gene_symbol'])
alt_df['product'] = alt_df['locus_tag'].map(lambda lt: feat2meta[lt]['product'])
alt_df['type'] = alt_df['locus_tag'].map(lambda lt: feat2meta[lt]['type'])
# single label used for legend entries / selections; built column-wise rather
# than with a row-wise DataFrame.apply, which is slow on the melted frame
alt_df['desc_string'] = [
    f"{lt}|{gene}|{prod}"
    for lt, gene, prod in zip(alt_df['locus_tag'], alt_df['gene_symbol'], alt_df['product'])
]

# impose a specific x-axis sort order on the df
# choose custom exp_condition order!
list_ordering = ['uMax','lowCH4','NoCu','lowCu','medCu','highCu','NO3_lowO2_slow_growth','highO2_slow_growth','lowO2_fast_growth','MeOH','NoLanthanum','WithLanthanum'] 
# pd.Categorical turns any value absent from `categories` into NaN without
# warning, so fail loudly if a loaded condition has no slot in the ordering
assert set(alt_df['exp_condition']) <= set(list_ordering), (
    f"conditions missing from list_ordering: "
    f"{sorted(set(alt_df['exp_condition']) - set(list_ordering))}"
)
alt_df["exp_condition_order"] = pd.Categorical(alt_df["exp_condition"], categories=list_ordering)

alt_df.head()

Unnamed: 0,exp_condition,locus_tag,mean_exp,gene_symbol,product,type,desc_string,exp_condition_order
0,MeOH,EQU24_RS00005,23.333155,dnaA,chromosomal replication initiator protein DnaA,CDS,EQU24_RS00005|dnaA|chromosomal replication ini...,MeOH
1,NO3_lowO2_slow_growth,EQU24_RS00005,32.050358,dnaA,chromosomal replication initiator protein DnaA,CDS,EQU24_RS00005|dnaA|chromosomal replication ini...,NO3_lowO2_slow_growth
2,NoCu,EQU24_RS00005,44.348687,dnaA,chromosomal replication initiator protein DnaA,CDS,EQU24_RS00005|dnaA|chromosomal replication ini...,NoCu
3,NoLanthanum,EQU24_RS00005,33.444023,dnaA,chromosomal replication initiator protein DnaA,CDS,EQU24_RS00005|dnaA|chromosomal replication ini...,NoLanthanum
4,WithLanthanum,EQU24_RS00005,35.462185,dnaA,chromosomal replication initiator protein DnaA,CDS,EQU24_RS00005|dnaA|chromosomal replication ini...,WithLanthanum


In [32]:
def view_df(df):
    '''
    Print how many distinct loci `df` contains and return a de-duplicated
    table of their annotation columns, ordered by locus tag.

    The index of the returned frame is renumbered; the original row labels
    are kept in an 'index' column.
    '''
    n_loci = df['locus_tag'].nunique(dropna=False)
    print(f"Gene count: {n_loci}")

    annotation_cols = ['locus_tag','gene_symbol','product','type']
    gene_table = df[annotation_cols].drop_duplicates()
    gene_table = gene_table.sort_values('locus_tag')
    return gene_table.reset_index()

In [33]:
view_df(alt_df)

Gene count: 4213


Unnamed: 0,index,locus_tag,gene_symbol,product,type
0,0,EQU24_RS00005,dnaA,chromosomal replication initiator protein DnaA,CDS
1,12,EQU24_RS00010,,DNA polymerase III subunit beta,CDS
2,24,EQU24_RS00015,recF,DNA replication/repair protein RecF,CDS
3,36,EQU24_RS00020,gyrB,DNA topoisomerase (ATP-hydrolyzing) subunit B,CDS
4,48,EQU24_RS00025,,hypothetical protein,CDS
...,...,...,...,...,...
4208,50496,EQU24_RS22135,mnmE,tRNA uridine-5-carboxymethylaminomethyl(34) sy...,CDS
4209,50508,EQU24_RS22140,yidC,membrane protein insertase YidC,CDS
4210,50520,EQU24_RS22145,yidD,membrane protein insertion efficiency factor YidD,CDS
4211,50532,EQU24_RS22150,rnpA,ribonuclease P protein component,CDS


### For a given set of top genes, visualize the expression across conditions

In [51]:
# Top-locus sets at the 1%, 3% and 10% thresholds
top_locs_1_perc, top_locs_3_perc, top_locs_10_perc = (
    gtgs.get_top_n_perc_by_condition(df_means, LOCI, perc) for perc in (1, 3, 10)
)

# Rows of the long-format expression table belonging to each top set
df_1, df_3, df_10 = (
    alt_df[alt_df['locus_tag'].isin(top_set)]
    for top_set in (top_locs_1_perc, top_locs_3_perc, top_locs_10_perc)
)

In [77]:
def altair_pcoords_plot_select_legend_and_highlight(df,n,xorder='exp_condition_order'):
    '''
    Build an interactive line + point chart of mean expression per locus
    across experimental conditions.

    Clicking a legend entry selects that locus, and hovering highlights the
    nearest one; selected/highlighted loci are drawn thicker and more opaque.

    Parameters:
        df: long-format frame with the columns 'locus_tag', 'mean_exp',
            'desc_string', 'product', 'type', 'gene_symbol' and the column
            named by `xorder`.
        n: label inserted into the chart title ("top {n}% of loci").
        xorder: column used for the x axis; rows are sorted by it before
            being handed to Altair.

    Returns:
        The layered chart with the axis font configuration applied. Because
        it carries a top-level config, it is meant to be displayed as-is
        rather than layered or concatenated into another chart.
    '''
    # Use a two-column legend once there are more than 50 distinct loci
    col_num = 1 if len(df['locus_tag'].unique()) <=50 else 2
    
    # selections
    # hover: highlight the locus nearest to the mouse pointer
    highlight = alt.selection(type='single', on='mouseover',
                              fields=['desc_string'], nearest=True)
    
    # click on legend entries to select one or more loci
    selection = alt.selection_multi(fields=['desc_string'], bind='legend')

    
    # shared encodings for both the line and the point layers
    title=f'Average TPM of top {n}% of loci across conditions'
    base = alt.Chart(
        df.sort_values(xorder), 
        title=title,
        #titleFontSize=20
    ).encode(
        # NOTE(review): the sort field string carries the ':N' type suffix;
        # confirm it resolves to the intended column and yields the intended order.
        x=alt.X(f'{xorder}:N',
                sort=alt.EncodingSortField(field=f"{xorder}:N", op="count"),
                axis=alt.Axis(title='Experimental Condition')
               ),        
        y=alt.Y('mean_exp:Q',scale=alt.Scale(type='log')),
        size=alt.value(100),
        color=alt.Color('desc_string:N',
                        legend=alt.Legend(title='Gene', 
                                          orient = 'right',
                                          labelLimit=0,
                                          columns=col_num,
                                          symbolLimit=200
                                         )), 
    )
    
    # lines: thicker and fully opaque when selected or highlighted
    lines = base.mark_line().encode(
        size=alt.condition((selection|highlight), alt.value(3), alt.value(1)),
        opacity=alt.condition((selection|highlight), alt.value(1), alt.value(0.5))
    ).add_selection(
        selection,
    ).properties(
         width=600,
         height=400
    ).interactive()
    
    # points: carry the tooltip and the hover selection
    points = base.mark_circle().encode(
        tooltip=['locus_tag','product','type','gene_symbol','desc_string'],
        opacity=alt.condition((selection|highlight), alt.value(1), alt.value(0.2)),
        size=alt.condition((selection|highlight), alt.value(100), alt.value(3))
    ).add_selection(highlight)

    chart = lines + points 
    
    # configure_axis returns a new chart instead of modifying `chart` in
    # place, so its result must be what gets returned; previously it was
    # discarded and the font sizes never took effect. Pan/zoom is already
    # enabled through lines.interactive() above, so it is not added again.
    return chart.configure_axis(
        labelFontSize=22,
        titleFontSize=20
    )

In [78]:
view_df(df_1)

Gene count: 14


Unnamed: 0,index,locus_tag,gene_symbol,product,type
0,9612,EQU24_RS04160,ssrS,6S RNA,ncRNA
1,27576,EQU24_RS12095,,cytochrome c,CDS
2,28536,EQU24_RS12525,ssrA,transfer-messenger RNA,tmRNA
3,29544,EQU24_RS12965,,hypothetical protein,CDS
4,36900,EQU24_RS16195,,hypothetical protein,CDS
5,41292,EQU24_RS18125,moxI,methanol dehydrogenase,CDS
6,41304,EQU24_RS18130,moxG,"cytochrome c(L), periplasmic",CDS
7,41328,EQU24_RS18140,moxF,"PQQ-dependent dehydrogenase, methanol/ethanol ...",CDS
8,41844,EQU24_RS18355,,hypothetical protein,CDS
9,44028,EQU24_RS19305,pmoB,methane monooxygenase/ammonia monooxygenase su...,CDS


In [79]:
altair_pcoords_plot_select_legend_and_highlight(df_1,"1")

In [80]:
view_df(df_3)

Gene count: 37


Unnamed: 0,index,locus_tag,gene_symbol,product,type
0,5148,EQU24_RS02240,,F0F1 ATP synthase subunit B,CDS
1,5208,EQU24_RS02265,,F0F1 ATP synthase subunit epsilon,CDS
2,6696,EQU24_RS02895,,"exosortase system-associated protein, TIGR0407...",CDS
3,6852,EQU24_RS02970,pqqA,pyrroloquinoline quinone precursor peptide PqqA,CDS
4,8040,EQU24_RS03495,,cold-shock protein,CDS
5,9612,EQU24_RS04160,ssrS,6S RNA,ncRNA
6,16536,EQU24_RS07185,,glutamate--ammonia ligase,CDS
7,16932,EQU24_RS07385,infC,translation initiation factor IF-3,CDS
8,16944,EQU24_RS07390,rpmI,50S ribosomal protein L35,CDS
9,20796,EQU24_RS09110,fae,formaldehyde-activating enzyme,CDS


In [81]:
altair_pcoords_plot_select_legend_and_highlight(df_3,"3")

In [15]:
def load_override_dict(filename):
    '''
    Read a two-column, tab-separated file into a dict that maps the first
    column to the second.

    Leading/trailing whitespace is stripped from every line and blank lines
    are skipped. If a key occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not contain exactly two
            tab-separated fields (the message names the file and line).
    '''
    override_dict = {}
    with open(filename, 'r') as f:
        for line_num, raw_line in enumerate(f, start=1):
            stripped = raw_line.strip()
            if not stripped:
                # tolerate blank / whitespace-only lines instead of failing
                # with an opaque "dictionary update sequence" error
                continue

            fields = stripped.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}:{line_num}: expected 2 tab-separated fields, found {len(fields)}"
                )

            key, value = fields
            override_dict[key] = value

    return override_dict

# Load the tab-separated gene-name and gene-product override tables from disk.
# NOTE(review): gene_ov / prod_ov are not referenced anywhere else in the
# visible notebook; the override lookups earlier in the notebook read
# GENE_NAME_OVERRIDE / GENE_PRODUCT_OVERRIDE imported from the `override`
# module instead -- confirm whether this cell is still needed.
gene_ov = load_override_dict("data/gene_name_override.txt")
prod_ov = load_override_dict("data/gene_product_override.txt")