**Extract GenBank annotation info and add expression data from Mandel et al.: https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1009832#sec020**

In [1]:
from Bio import SeqIO
import pandas as pd
import csv
import numpy as np

### Extracting info from GenBank files
Source records (accessions CP075068.1 through CP075072.1): https://www.ncbi.nlm.nih.gov/nuccore/CP075068.1 to https://www.ncbi.nlm.nih.gov/nuccore/CP075072.1

In [2]:
# Annotation rows keyed by locus tag. Each row is
# [product, alt_name, pfam] followed by one slot per entry in feature_types.
gene_dict = {}

# Categories extracted from the CDS /note qualifier, in output-column order.
feature_types = ['GO_function','GO_process','GO_component','TransMembrane','SECRETED','antiSMASH','MEROPS',
                 'EggNog','COG', 'BUSCO','CAZy','SMCOG']

# Locus tags seen on more than one CDS feature, mapped to every row built for
# that tag. gene_dict keeps only the first row; the later ones live only here.
multiple_annotation_genes = {}

for i in range(1,6):

    for chrom in SeqIO.parse('genbank_files/chr{}.gb'.format(i), 'gb'):
        for f in chrom.features:
            if f.type != 'CDS':
                continue

            gene_features = [None]*len(feature_types)
            gene = f.qualifiers['locus_tag'][0]

            alt_name = f.qualifiers['gene'][0] if 'gene' in f.qualifiers else None
            product = f.qualifiers['product'][0] if 'product' in f.qualifiers else None

            # PFAM accessions joined with ';'. Stays None both when there is no
            # db_xref at all and when none of the cross-references is a PFAM
            # one, so "no PFAM" is always represented the same way.
            pfam = None
            if 'db_xref' in f.qualifiers:
                pfam = ';'.join([p.replace('PFAM:','') for p in f.qualifiers['db_xref'] if 'PFAM' in p]) or None

            # Each /note value is split on its own. Concatenating the values
            # first would glue the last entry of one note onto the first entry
            # of the next and corrupt both.
            for note in f.qualifiers.get('note', []):
                for s in note.split('; '):
                    category = s.split(':')[0]

                    if category in feature_types:
                        cat_index = feature_types.index(category)
                        split_on = ':'
                    elif 'SMCOG' in category:
                        # The category text contains 'SMCOG' without being exactly
                        # 'SMCOG'; keep everything after that marker as the value.
                        cat_index = feature_types.index('SMCOG')
                        split_on = 'SMCOG'
                    else:
                        # Not one of the tracked categories.
                        continue

                    value = s.split(split_on, 1)[1]
                    # Repeated categories on one gene are accumulated with ';'.
                    if not gene_features[cat_index]:
                        gene_features[cat_index] = value
                    else:
                        gene_features[cat_index] += ';' + value

            row = [product, alt_name, pfam] + gene_features
            if gene in gene_dict:
                # First repeat seeds the list with the row already stored in gene_dict.
                multiple_annotation_genes.setdefault(gene, [gene_dict[gene]]).append(row)
            else:
                gene_dict[gene] = row

In [3]:
# One row per locus tag (the dict keys become the index); columns are the three
# leading fields followed by the note-derived categories.
df = pd.DataFrame.from_dict(gene_dict, orient='index')
df.columns = ['product', 'alt_name', 'PFAM']+feature_types

# Remove every " [Evidence IEA]" occurrence from the GO columns. The pattern is
# a raw string so that \s and \[ reach the regex engine without Python warning
# about invalid string escape sequences.
go_cols = ['GO_function', 'GO_process', 'GO_component']
df[go_cols] = df[go_cols].replace(r"\s\[Evidence IEA\]", '', regex=True)


In [4]:
# Display the (rows, columns) shape of the annotation table as a sanity check.
df.shape

(8237, 15)

## Add expression data and old CPSG names

The expression data come from Table S1 of the paper: https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1009832#sec020

In [6]:
# Read the tab-delimited .cdt table into {first field: remaining fields}.
# The context manager closes the file handle once parsing is finished, and
# newline='' is what the csv module asks for when it is given a file object.
temp = {}
with open("Beyhan_Whiston_Carlin_SH.sorted.cdt", newline='') as cdt_file:
    fp = csv.reader(cdt_file, dialect = csv.excel_tab)
    for i,line in enumerate(fp):
        if i == 0:
            # Header row: everything after the first field names a data column.
            cols = line[1:]
            continue
        elif i < 3:
            # The two rows after the header are skipped and never become data rows
            # (presumably .cdt metadata rows - confirm against the file).
            continue
        temp[line[0]] = line[1:]

# Rows are indexed by the first field of each line; a repeated first field
# keeps only the last line seen for it.
exp = pd.DataFrame.from_dict(temp, orient='index', columns=cols)

In [7]:
# Columns carried forward: two identifier columns plus four numeric ones.
id_columns = ['Cp_new_GenBank', 'CiRS']
numeric_columns = ['WT.S/WT.H', 'p(WT.S/WT.H)', 'Cp_S/Cp_H', 'p(Cp_S/Cp_H)']

exp = (
    exp.loc[:, id_columns + numeric_columns]
    # Blank / whitespace-only cells become NaN so the float cast below succeeds.
    .replace(r'^\s*$', np.nan, regex=True)
    .astype({column: 'float' for column in numeric_columns})
    .sort_index()
)

def classify_expression(val, p, p_cutoff=0.05, min_abs_val=1):
    """Label one gene from an expression value and its p-value.

    Parameters
    ----------
    val : float or None
        Expression value; negative values map to the hyphae label and
        positive values to the spherule label.
    p : float or None
        p-value associated with ``val``.
    p_cutoff : float, optional
        Largest p-value still treated as significant (default 0.05).
    min_abs_val : float, optional
        Smallest absolute ``val`` treated as a change (default 1).

    Returns
    -------
    str or None
        ``None`` when ``val`` is missing (NaN or None); otherwise
        ``'hyphae_upreg'``, ``'spherule_upreg'`` or ``'no upreg'``.
        A missing ``p`` is treated as not significant, giving ``'no upreg'``.
    """
    # pd.isna accepts None as well as NaN, unlike np.isnan which raises on None.
    if pd.isna(val):
        return None

    significant = (not pd.isna(p)) and p <= p_cutoff
    if significant and val <= -min_abs_val:
        return 'hyphae_upreg'
    elif significant and val >= min_abs_val:
        return 'spherule_upreg'
    else:
        return 'no upreg'
    
# Output label column -> (value column, p-value column) it is derived from.
label_sources = {
    'Beyhan_exp': ('WT.S/WT.H', 'p(WT.S/WT.H)'),
    'Whiston_exp': ('Cp_S/Cp_H', 'p(Cp_S/Cp_H)'),
}
for label_col, (value_col, p_col) in label_sources.items():
    exp[label_col] = exp.apply(
        lambda row, value_col=value_col, p_col=p_col: classify_expression(row[value_col], row[p_col]),
        axis=1,
    )

# Drop rows with no 'Cp_new_GenBank' value, then make that column the index;
# the previous index is kept as a regular column named 'CPSG'.
has_new_id = exp['Cp_new_GenBank'].notna()
exp = exp[has_new_id]
exp = exp.reset_index()
exp = exp.set_index('Cp_new_GenBank')
exp = exp.rename(columns={'index':'CPSG'})

In [8]:
# Left-join the expression table onto the annotation table by index, so every
# row of df is kept whether or not exp has a matching entry.
merged = df.merge(exp, left_index=True, right_index=True, how='left')

# A repeated id in exp's index would silently duplicate the matching gene rows
# and inflate every count computed from `merged`; fail loudly instead.
assert len(merged) == len(df), 'merge changed the row count: exp index contains repeated gene ids'

In [9]:
# Display the (rows, columns) shape of the merged table as a sanity check.
merged.shape

(8237, 23)

In [10]:
# Count rows per label in the 'Beyhan_exp' column and print one line per label.
beyhan_calls = merged['Beyhan_exp']
class_reports = [
    ('# spherule-upregulated genes: {}', 'spherule_upreg'),
    ('# hyphae-upregulated genes: {}', 'hyphae_upreg'),
    ('# non differentially upregulated genes: {}', 'no upreg'),
]
for template, label in class_reports:
    print(template.format(int((beyhan_calls == label).sum())))

# Rows whose 'Beyhan_exp' value is missing.
print('# genes without expression data: {}'.format(int(beyhan_calls.isna().sum())))


# spherule-upregulated genes: 1082
# hyphae-upregulated genes: 1200
# non differentially upregulated genes: 5019
# genes without expression data: 936


In [11]:
# Write the merged table to CSV in the working directory; with pandas' default
# settings the index is written as the first column.
merged.to_csv('CpSilv_gb_annotations_and_expression_data.csv')