Dan Shea  
2017.07.27  

`analyze_genblast` parses `genblasta` results to obtain the alignment length and percent coverage of the best hit for each lncRNA transcript.

In [1]:
import pandas as pd

In [2]:
# genblasta result files, one per (species, transcript class) pair.
# Names follow '<species>_<class>.gblast.out.txt'; the order is species-major,
# with the four transcript classes listed for each species in turn.
blast_result_files = [
    '{}_{}.gblast.out.txt'.format(species_name, class_name)
    for species_name in ('Bjuncea', 'Bnapus', 'Bnigra', 'Boleracea')
    for class_name in ('incRNA', 'lincRNA', 'NAT', 'putative_mRNA')
]

In [3]:
# Name of the master Excel workbook holding the lncRNA data.
# It is a bare file name, so it is resolved relative to the notebook's
# working directory when the sheets are read.
master_excel_file = 'Master_Sheet_20170719.xlsx'

In [4]:
# Read the four transcript-class sheets from the master Excel workbook.
# Each sheet is later merged on its 'major_iso_id' column.
# NOTE: the keyword is `sheet_name`; the older `sheetname` spelling was
# deprecated in pandas 0.21 and subsequently removed, so it raises a
# TypeError on current pandas releases.
incRNA  = pd.read_excel(master_excel_file, sheet_name='putative incRNA')
lincRNA = pd.read_excel(master_excel_file, sheet_name='putative lincRNA')
NAT     = pd.read_excel(master_excel_file, sheet_name='putative NAT')
p_mRNA  = pd.read_excel(master_excel_file, sheet_name='putative mRNA')

In [5]:
# Load every result file into a two-level dictionary:
#   blast_results[<species>][<transcript class>] -> DataFrame of raw hits
blast_results = dict()
for filename in blast_result_files:
    # The species is the text before the first '_'; the class is the text
    # between the first and second '_' (or up to the first '.'), so
    # 'putative_mRNA' files are stored under the key 'putative'.
    name_parts = filename.split('_')
    species_key = name_parts[0]
    class_key = name_parts[1].split('.')[0]
    # The result files have no header row and use '|' as field separator.
    hits = pd.read_table(filename, header=None, sep='|')
    blast_results.setdefault(species_key, dict())[class_key] = hits

In [6]:
# Column 3 of each result table is parsed as '<label>:<aligned length>(<coverage>%)'.
# Split it into separate alignment-length and %-coverage columns and tag every
# row with the species and transcript class it came from. The parsed values
# are kept as strings; no numeric conversion is done here.
for species, tables_by_class in blast_results.items():
    for lncRNA_class, hits in tables_by_class.items():
        aligned_lengths = []
        coverages = []
        for field in hits.iloc[:, 3]:
            # Text after the first ':' is '<length>(<coverage>%)'
            length_text, coverage_text = field.split(':')[1].split('(')
            aligned_lengths.append(length_text)
            coverages.append(coverage_text.strip(')').strip('%'))
        n_rows = hits.shape[0]
        extra_columns = pd.DataFrame({'aln_len': aligned_lengths,
                                      'cov': coverages,
                                      'species': [species] * n_rows,
                                      'lncRNA': [lncRNA_class] * n_rows})
        # join() aligns the new columns with the hits on the row index.
        tables_by_class[lncRNA_class] = hits.join(extra_columns)

In [7]:
# Trim every per-species, per-class table down to the columns needed for the
# tsv output: the transcript id (column 0) plus the columns derived from it.
wanted_columns = [0, 'aln_len', 'cov', 'lncRNA', 'species']
for tables_by_class in blast_results.values():
    for lncRNA_class, hits in tables_by_class.items():
        tables_by_class[lncRNA_class] = hits.loc[:, wanted_columns]

In [8]:
# Attach the master-sheet columns to each result table by matching the
# transcript id in column 0 against the sheet's 'major_iso_id'.
# pd.merge defaults to an inner join, so hits without a matching
# 'major_iso_id' are dropped. Classes with no sheet listed here are left as-is.
master_sheets = {'incRNA': incRNA,
                 'lincRNA': lincRNA,
                 'NAT': NAT,
                 'putative': p_mRNA}
for tables_by_class in blast_results.values():
    for lncRNA_class, hits in tables_by_class.items():
        if lncRNA_class in master_sheets:
            tables_by_class[lncRNA_class] = pd.merge(hits,
                                                     master_sheets[lncRNA_class],
                                                     left_on=0,
                                                     right_on='major_iso_id')

In [9]:
# Stack all per-species, per-class tables into one DataFrame.
# The frames are collected first and concatenated once, in the same
# species-then-class order, instead of growing the result with a concat
# per iteration (which re-copies all accumulated rows every time).
outputDF = pd.concat([hits
                      for tables_by_class in blast_results.values()
                      for hits in tables_by_class.values()])

In [10]:
# Restrict the combined table to the columns written to the tsv output.
# 'cov_x' is the coverage parsed from the result files; the '_x' suffix is
# what pd.merge adds to a left-hand column whose name also occurs on the right
# (presumably the master sheets carry their own 'cov' column -- confirm).
tsv_columns = [0, 'aln_len', 'cov_x', 'lncRNA', 'species', 'len']
outputDF = outputDF.loc[:, tsv_columns]

In [11]:
# Give the output columns their final names; columns not listed keep theirs.
# index=str converts the row labels to strings, as before.
final_column_names = {0: 'major_iso_id',
                      'cov_x': 'coverage',
                      'len': 'transcript_len'}
outputDF = outputDF.rename(index=str, columns=final_column_names)

In [12]:
# Write the combined table as a tab-separated file with a header row and
# without the row index.
outputDF.to_csv('gblast_master_20170728.txt', sep='\t', index=False)

In [13]:
# Preview the first rows only; displaying the whole concatenated table
# bloats the notebook output.
outputDF.head(10)

Unnamed: 0,major_iso_id,aln_len,coverage,lncRNA,species,transcript_len
0,MSTRG.10025.1,361,73.8241,incRNA,Bjuncea,489
1,MSTRG.10220.1,208,34.7826,incRNA,Bjuncea,598
2,MSTRG.10492.1,211,8.95966,incRNA,Bjuncea,2355
3,MSTRG.1135.1,468,31.0139,incRNA,Bjuncea,1509
4,MSTRG.11900.1,370,39.6996,incRNA,Bjuncea,932
5,MSTRG.11984.1,137,29.2735,incRNA,Bjuncea,468
6,MSTRG.12059.1,199,92.9907,incRNA,Bjuncea,214
7,MSTRG.12903.1,745,98.0263,incRNA,Bjuncea,760
8,MSTRG.13446.1,1006,33.4441,incRNA,Bjuncea,3008
9,MSTRG.13811.1,792,68.3348,incRNA,Bjuncea,1159
