# Oskar tracker
- Author: Savandara BESSE & Leo Blondel
- Creation: 10-05-2019
- Last modification: 04-16-2020

In [1]:
import numpy as np
import pandas as pd
import os, progressbar
from Bio import SeqIO
from Bio import SearchIO



## Step 1.A : Create protein sequences from TSA data

### Required inputs
- TSA RAW folder: `/PATH/TO/TSA_INPUTS`
- TSA Protein folder : `PATH/TO/PROCESSED/TSA`

### Main steps:
1. Unzip TSA files
2. Change the extension from `fsa_nt` to `fasta`
3. Translation step (with clean translation end)
4. Move translated TSA to TSA output folder

In [2]:
def build_TSA_inputs(TSA_path, protein_path):
    """Translate every file of a TSA folder into proteins with ``transeq``.

    Each entry of ``TSA_path`` is passed to ``transeq`` (six frames, ``-trim``
    and ``-clean``) and the result is requested as
    ``<protein_path>/translated_<entry name>``.

    Parameters
    ----------
    TSA_path : str
        Folder holding the nucleotide TSA files.
    protein_path : str
        Output folder; created together with missing parents when absent.

    Returns
    -------
    list of str
        The output path requested for each input file, in ``os.listdir``
        order. The exit status of ``transeq`` is not checked, so a listed
        path is not a guarantee that the file was produced.
    """
    import shlex

    # makedirs also copes with nested output folders and an existing folder.
    os.makedirs(protein_path, exist_ok=True)

    # Steps 1 and 2 (unzip, rename fsa_nt -> fasta) are not run from here;
    # the original commands are kept for reference:
#     os.system('gunzip {}/*'.format(TSA_path))
#     os.system('rename \'s/fsa_nt$/fasta/\' {}/*.fsa_nt'.format(TSA_path))

    protein_list = []
    bar = progressbar.ProgressBar()
    for TSA in bar(os.listdir(TSA_path)):
        INPUT = os.path.join(TSA_path, TSA)
        OUTPUT = os.path.join(protein_path, 'translated_{}'.format(TSA))
        # Quote both paths so spaces or shell metacharacters cannot split
        # or alter the command line.
        os.system('transeq -sequence {} -frame 6 -trim -clean -outseq {}'.format(
            shlex.quote(INPUT), shlex.quote(OUTPUT)))
        protein_list.append(OUTPUT)
    return protein_list

#### To run these steps you need to provide the folder path of your raw TSA

In [None]:
## Insect analysis 2019 
# %%time 
# TSA_path = '/media/savvy/DATA2/savvy/EXTAVOUR/SOURCES/TSA/TSA_INPUTS'
# protein_path = '/media/savvy/DATA2/savvy/EXTAVOUR/SOURCES/TSA/TSA_PROTEIN'
# translated_TSA = build_TSA_inputs(TSA_path, protein_path)

In [None]:
## Crustacean analysis 2017
# %%time 
# TSA_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/TSA_INPUTS'
# protein_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/TSA_PROTEIN'
# translated_TSA = build_TSA_inputs(TSA_path, protein_path)

## Step 1.B: Collect protein sequences from GCF data

### Required inputs
- Genome folder: `/PATH/TO/GENOMES`
- GCF Protein folder: `PATH/TO/PROCESSED/GCF`


### Main steps:
1. Copy GCF protein fasta files to GCF output folder
2. Unzip them

In [None]:
def reach_Proteins(currentFolder):
    """Return the name of the first ``*_protein.faa.gz`` entry of a folder.

    Returns ``None`` when the folder holds no such entry.
    """
    for gFile in os.listdir(currentFolder):
        if '_protein.faa.gz' in gFile:
            return gFile
    return None


def build_GCF_inputs(genome_path, res_path):
    """Copy the protein archive of every ``GCF`` assembly folder to ``res_path``.

    Parameters
    ----------
    genome_path : str
        Folder whose sub-folders are assemblies; only entries whose name
        contains ``'GCF'`` are considered.
    res_path : str
        Destination handed to ``shutil.copy``.

    Assembly folders without a protein archive are reported and skipped
    instead of aborting the whole run.
    """
    import shutil

    for folder in os.listdir(genome_path):
        if 'GCF' not in folder:
            continue
        currentFolder = os.path.join(genome_path, folder)
        if not os.path.isdir(currentFolder):
            continue
        proteome = reach_Proteins(currentFolder)
        if proteome is None:
            print('No protein file found in {}'.format(currentFolder))
            continue
        # currentFolder already starts with genome_path: joining genome_path
        # a second time duplicated it whenever the path was relative.
        shutil.copy(os.path.join(currentFolder, proteome), res_path)

#### To run these steps you need to provide the folder path of your Genome folder

In [None]:
# %%time 
# genome_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/Genomes'
# protein_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/GCF_PROTEIN'
# build_GCF_inputs(genome_path, protein_path)
# os.system('gunzip {}/*'.format(protein_path))

## Step 1.C: Collect protein sequences from GCA data

### Required inputs
- Genome folder: '/PATH/TO/GENOMES' 
- GCA Protein folder: 'PATH/TO/PROCESSED/GCA'


### Main steps:
/!\ Need to complete Augustus annotations of non annotated genomes by running 1.3_run_augustus_training.py
1. `reach_Genes()` will collect specifically genome fasta file for a given genome assembly
2. `build_GCA_inputs()` will generate the input folders needed for 1.4_run_augustus.py 
/!\ Run 1.4_run_augustus.py to generate sbatch scripts, wait until completion of all the annotations
3. Extraction of protein sequences from annotated genomes (GFF files)
    - Run getAnnoFasta.pl available from the `scripts` folder of the Augustus source code
4. Copy GCA protein fasta files to GCA output folder
5. Add the Augustus model used for each non-annotated genome registered in `../Data/01_Oskar_identification/2019/genome_insect_database.csv`

In [28]:
def reach_Genes(currentFolder):
    """Return the name of the genome archive (``*_genomic.fna.gz``) of a folder.

    When several entries match, the first one whose name contains neither
    ``'cds'`` nor ``'rna'`` is returned. A single matching entry is returned
    as is. ``None`` is returned when nothing suitable is found.
    """
    all_genomic_files = [gFile for gFile in os.listdir(currentFolder)
                         if '_genomic.fna.gz' in gFile]
    if not all_genomic_files:
        # Previously an IndexError on all_genomic_files[0].
        return None
    if len(all_genomic_files) == 1:
        return all_genomic_files[0]
    for gFile in all_genomic_files:
        if 'cds' not in gFile and 'rna' not in gFile:
            return gFile
    return None


def build_GCA_inputs(genome_path, res_path):
    """Copy the genome archive of every ``GCA`` assembly folder to ``res_path``.

    Parameters
    ----------
    genome_path : str
        Folder whose sub-folders are assemblies; only entries whose name
        contains ``'GCA'`` are considered.
    res_path : str
        Destination handed to ``shutil.copy``.

    Assembly folders without a genome archive are reported and skipped.
    """
    import shutil

    for folder in os.listdir(genome_path):
        if 'GCA' not in folder:
            continue
        currentFolder = os.path.join(genome_path, folder)
        if not os.path.isdir(currentFolder):
            continue
        genome_file = reach_Genes(currentFolder)
        if genome_file is None:
            print('No genome file found in {}'.format(currentFolder))
            continue
        # currentFolder already contains genome_path; do not join it twice.
        shutil.copy(os.path.join(currentFolder, genome_file), res_path)

def set_augustus_model(order, table):
    """Return the name of the Augustus model used for an insect order.

    Parameters
    ----------
    order : str
        Order name (for instance ``'Coleoptera'``).
    table : pandas.DataFrame
        Table with ``order_name`` and ``family_name`` columns; only read for
        ``'Diptera'``.

    Returns
    -------
    str
        Model name associated with ``order``.

    Raises
    ------
    ValueError
        If no model is registered for ``order`` (this used to surface as an
        ``UnboundLocalError`` on the return statement).
    """
    models = {
        'Diplura': 'frankliniella_occidentalis',
        'Archaeognatha': 'frankliniella_occidentalis',
        'Odonata': 'zootermopsis_nevadensis',
        'Ephemeroptera': 'frankliniella_occidentalis',
        'Plecoptera': 'zootermopsis_nevadensis',
        'Orthoptera': 'zootermopsis_nevadensis',
        'Phasmatodea': 'zootermopsis_nevadensis',
        'Blattodea': 'zootermopsis_nevadensis',
        'Thysanoptera': 'frankliniella_occidentalis',
        'Hemiptera': 'bemisia_tabaci',
        'Phthiraptera': 'pediculus_humanus',
        'Hymenoptera': 'apis_mellifera',
        'Strepsiptera': 'tribolium_castaneum',
        'Coleoptera': 'tribolium_castaneum',
        'Trichoptera': 'papilio_xuthus',
        'Lepidoptera': 'papilio_xuthus',
        'Siphonaptera': 'ctenophalides_felis',
    }

    if order == 'Diptera':
        family = table[table['order_name'] == order]['family_name']
        # NOTE(review): `in` on a pandas Series tests the index labels, not
        # the values, so with a default integer index both tests below are
        # False and every Diptera entry gets 'fly'. Testing the values would
        # not be correct either: `family` holds the families of ALL Diptera
        # rows, not the family of the genome being annotated. A per-row
        # family is needed to make this distinction; behaviour is left
        # unchanged until the caller can provide it.
        if 'Culicidae' in family:
            return 'aedes'
        if 'Pteromalidae' in family:
            return 'nasonia'
        return 'fly'

    try:
        return models[order]
    except KeyError:
        raise ValueError(
            'No Augustus model registered for order {!r}'.format(order)) from None

In [None]:
# %%time 
# genome_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/Genomes'
# res_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/GCA_INPUTS'
# build_GCA_inputs(genome_path, res_path)
# os.system('gunzip {}/*'.format(res_path))

In [None]:
# Load the genome database and keep the rows whose genome_id contains 'GCA'.
GCA = pd.read_csv('../Data/01_Oskar_identification/2019/genome_insect_database.csv')
GCA = GCA[GCA['genome_id'].str.contains('GCA')]
# NOTE: set_germ_cell_formation() is defined in a later cell (Step 3); that
# cell has to be executed before this one.
GCA['pgc_mode'] = GCA['order_name'].apply(set_germ_cell_formation)
# The whole table is forwarded as the second argument of set_augustus_model().
GCA['augustus_model'] = GCA['order_name'].apply(set_augustus_model, args=(GCA,))

## Step 2 : Track Oskar

### Required inputs:
- LOTUS pHMM (version 3)
- OSK pHMM (version 3)
- TSA / GCF / GCA protein folders

### Main steps: 
- Run `execute_hmmsearch.py` for each type of data
- Copy the generated results in `../Data/01_Oskar_identification/hmmsearch_raw_results/XXX_V3`

In [17]:
# Root folder joined with the *_PROTEIN / *_RESULTS folder names in the cells below.
storage_path = "/mnt/storage/Oskar_Evolution"

#### TSA

In [None]:
%%time 
## Crustacean analysis
# Values handed to execute_hmmsearch.py: -p protein_path, -l lotus_model,
# -o osk_model, -r result_path (and the literal 'TSA' for -a).
lotus_model = '../Data/Oskar_hmm/V3/LOTUS_CONSENSUS.hmm'
osk_model = '../Data/Oskar_hmm/V3/OSK_CONSENSUS.hmm'
protein_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/TSA_PROTEIN'
result_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/TSA_RESULT'
# os.system returns the exit status of the command; it is not checked here.
os.system('python3 execute_hmmsearch.py -a TSA -p {} -l {} -o {} -r {}'.format(protein_path, lotus_model, osk_model, result_path))

In [None]:
%%time 
## Insect analysis
# Same call as the crustacean cell, with the folders located under storage_path.
lotus_model = '../Data/Oskar_hmm/V3/LOTUS_CONSENSUS.hmm'
osk_model = '../Data/Oskar_hmm/V3/OSK_CONSENSUS.hmm'
protein_path = os.path.join(storage_path, 'TSA_PROTEIN')
result_path =  os.path.join(storage_path, 'TSA_RESULTS')
# os.system returns the exit status of the command; it is not checked here.
os.system('python3 execute_hmmsearch.py -a TSA -p {} -l {} -o {} -r {}'.format(protein_path, lotus_model, osk_model, result_path))

In [None]:
!cp {os.path.join(storage_path, 'TSA_RESULTS', '*')} ../Data/01_Oskar_identification/hmmsearch_raw_results/TSA_V3

#### GCF

In [None]:
%%time 
# LOTUS / OSK searches on the GCF proteins stored under storage_path.
lotus_model = '../Data/Oskar_hmm/V3/LOTUS_CONSENSUS.hmm'
osk_model = '../Data/Oskar_hmm/V3/OSK_CONSENSUS.hmm'
protein_path =  os.path.join(storage_path, 'GCF_PROTEIN')
result_path = os.path.join(storage_path, 'GCF_RESULTS')
# os.system returns the exit status of the command; it is not checked here.
os.system('python3 execute_hmmsearch.py -a GCF -p {} -l {} -o {} -r {}'.format(protein_path, lotus_model, osk_model, result_path))

In [None]:
!cp {os.path.join(storage_path, 'GCF_RESULTS', '*')} ../Data/01_Oskar_identification/hmmsearch_raw_results/GCF_V3

#### GCA

In [18]:
%%time 
# LOTUS / OSK searches on the GCA proteins stored under storage_path.
lotus_model = '../Data/Oskar_hmm/V3/LOTUS_CONSENSUS.hmm'
osk_model = '../Data/Oskar_hmm/V3/OSK_CONSENSUS.hmm'
protein_path =  os.path.join(storage_path, 'GCA_PROTEIN')
result_path = os.path.join(storage_path, 'GCA_RESULTS')
# The value displayed under the cell is the exit status returned by os.system.
os.system('python3 execute_hmmsearch.py -a GCA -p {} -l {} -o {} -r {}'.format(protein_path, lotus_model, osk_model, result_path))

CPU times: user 5.23 ms, sys: 4.08 ms, total: 9.31 ms
Wall time: 55.4 s


0

In [20]:
!cp {os.path.join(storage_path, 'GCA_RESULTS', '*')} ../Data/01_Oskar_identification/hmmsearch_raw_results/GCA_V3

## STEP 3 : Collect Oskar candidates

### Required inputs
- TSA / GCF / GCA Oskar result folders  `../Data/01_Oskar_identification/hmmsearch_raw_results/XXX_V3`

### Main steps
1. Identification of Oskar candidates by getting LOTUS and OSK hits
2. Add information of PGC mode
3. Save Oskar candidates in tables per data type available in `../Data/01_Oskar_identification/oskar_tracker_results/XXX_V3/xxx_oskar_results.csv`

In [25]:
def collect_Table(TSA, HMM):
    """Return the first name of ``HMM`` that contains ``TSA``.

    ``HMM`` is scanned in order; ``None`` is returned when no name matches.
    """
    return next((table_name for table_name in HMM if TSA in table_name), None)
        
def collect_Hits(hmmerTable):
    """Return the ids of the hits of an hmmsearch table with e-value <= 0.05.

    Parameters
    ----------
    hmmerTable : str
        Path of a table readable by ``SearchIO.read(..., 'hmmer3-tab')``.

    Returns
    -------
    list of str
        Hit ids passing the e-value cut-off; an empty list when the file
        holds no data line (only comment lines and/or blank lines), in which
        case the file is not handed to ``SearchIO`` at all.
    """
    with open(hmmerTable) as handle:
        # A data line is a non-blank line that does not START with '#'; a '#'
        # further along the line (e.g. in a description) does not make it a
        # comment.
        has_hit = any(line.strip() and not line.startswith('#') for line in handle)
    if not has_hit:
        return []
    return [hit.id for hit in SearchIO.read(hmmerTable, 'hmmer3-tab') if hit.evalue <= 0.05]
    

def oskar_analysis(ID):
    """Collect the LOTUS, OSK and shared (Oskar) hits of one dataset.

    Relies on three module-level names set by the calling cell:
    ``result_path`` (folder of the hmmsearch tables) and ``LOTUS`` / ``OSK``
    (lists of table file names).

    Parameters
    ----------
    ID : str
        Dataset identifier, looked up in the table names with
        ``collect_Table``.

    Returns
    -------
    list
        ``[ID, lotus_hits, osk_hits, oskar_hits]`` where each hit field is a
        comma-separated string, or the string ``'None'`` when empty.
        ``oskar_hits`` lists the ids found in both tables, in the order of
        the LOTUS table, so the output is identical from one run to the next
        (joining a ``set`` gave an arbitrary order).
    """
    lotus_res = os.path.join(result_path, collect_Table(ID, LOTUS))
    osk_res = os.path.join(result_path, collect_Table(ID, OSK))

    lotus_hits = collect_Hits(lotus_res)
    osk_hits = collect_Hits(osk_res)

    osk_ids = set(osk_hits)
    # dict.fromkeys drops duplicates while keeping the first-seen order.
    oskar_hits = list(dict.fromkeys(hit for hit in lotus_hits if hit in osk_ids))

    def _as_field(hits):
        # Empty hit lists are stored as the literal string 'None'.
        return ','.join(hits) if hits else 'None'

    return [ID, _as_field(lotus_hits), _as_field(osk_hits), _as_field(oskar_hits)]
        
# Germ cell specification mode associated with each insect order name.
PGC_mode = {
    **dict.fromkeys(
        [
            'Collembola', 'Diplura', 'Archaeognatha', 'Zygentoma', 'Odonata',
            'Ephemeroptera', 'Zoraptera', 'Dermaptera', 'Plecoptera',
            'Orthoptera', 'Mantophasmatodea', 'Grylloblattodea', 'Embioptera',
            'Phasmatodea', 'Mantodea', 'Blattodea', 'Isoptera', 'Thysanoptera',
            'Hemiptera', 'Phthiraptera', 'Psocoptera',
        ],
        'Induction',
    ),
    **dict.fromkeys(
        [
            'Hymenoptera', 'Raphidioptera', 'Megaloptera', 'Neuroptera',
            'Strepsiptera', 'Coleoptera', 'Trichoptera', 'Lepidoptera',
            'Siphonaptera', 'Mecoptera', 'Diptera',
        ],
        'Inheritance',
    ),
}


def set_germ_cell_formation(x):
    """Return the ``PGC_mode`` entry of order ``x`` (``KeyError`` if unknown)."""
    mode = PGC_mode[x]
    return mode

def formatTSA(x):
    """Return ``x`` with every occurrence of ``'.1'`` removed."""
    return ''.join(x.split('.1'))

#### TSA

In [None]:
%%time
## Crustacean analysis
tsa_path = '../Data/01_Oskar_identification/hmmsearch_raw_results/TSA_crustacea'
# oskar_analysis() builds its table paths from the module-level `result_path`.
# It must point at the folder listed here, otherwise the value left behind by
# a previously executed cell is silently used.
result_path = tsa_path
IDs = list(set([ TSA.split('_')[0] for TSA in os.listdir(tsa_path) ]))
LOTUS = [ domain for domain in sorted(os.listdir(tsa_path)) if 'lotus' in domain ]
OSK = [ domain for domain in sorted(os.listdir(tsa_path)) if 'osk' in domain ]

In [None]:
# One [ID, lotus_hits, osk_hits, oskar_hits] row per crustacean dataset.
TSA_OSKAR = list(map(oskar_analysis, IDs))
TMP = pd.DataFrame(TSA_OSKAR, columns=['TSA', 'lotus_hits', 'osk_hits', 'oskar_hits'])
# Remove the '.1' suffix (presumably so the ids match the 'TSA' column of the
# database used for the merge below).
TMP['TSA'] = TMP['TSA'].apply(formatTSA)
TSA = pd.read_csv('../Data/01_Oskar_identification/2017/transcriptome_crustacean_database.csv')
# merge() uses an inner join by default: database rows without a result are dropped.
TSA = TSA.merge(TMP, on='TSA')
TSA.to_csv('../Data/01_Oskar_identification/oskar_tracker_results/TSA_crustacea/tsa_oskar_results.csv', index=False, na_rep='None')

In [6]:
%%time
# result_path, LOTUS and OSK are module-level names read by oskar_analysis().
result_path = '../Data/01_Oskar_identification/hmmsearch_raw_results/TSA_V3'
# Dataset id = text before the first '_' of each result file name.
IDs = list(set([ TSA.split('_')[0] for TSA in os.listdir(result_path) ]))
LOTUS = [ domain for domain in sorted(os.listdir(result_path)) if 'lotus' in domain ]
OSK = [ domain for domain in sorted(os.listdir(result_path)) if 'osk' in domain ]
TSA_OSKAR = list(map(oskar_analysis, IDs))

CPU times: user 1.19 s, sys: 0 ns, total: 1.19 s
Wall time: 1.2 s


In [7]:
TMP = pd.DataFrame(TSA_OSKAR, columns=['tsa_abrv', 'lotus_hits', 'osk_hits', 'oskar_hits'])
TSA = pd.read_csv('../Data/01_Oskar_identification/2019/transcriptome_insect_database.csv')

TSA['pgc_mode'] = TSA['order_name'].apply(set_germ_cell_formation)
# Merge key: first five characters of tsa_id followed by the text after its first '.'.
# The positional loop relies on the default 0..n-1 index produced by read_csv.
TSA['tsa_abrv'] = [ '{}{}'.format(TSA['tsa_id'][i][:5], TSA['tsa_id'][i].split('.')[1]) for i in range(len(TSA)) ]

# merge() uses an inner join by default: database rows without a result are dropped.
TSA = TSA.merge(TMP, on='tsa_abrv')
TSA.to_csv('../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_results.csv', index=False, na_rep='None')

#### GCF 

In [9]:
# result_path, LOTUS and OSK are module-level names read by oskar_analysis().
result_path = '../Data/01_Oskar_identification/hmmsearch_raw_results/GCF_V3'
# Genome id = first two '_'-separated fields of each result file name.
IDs = list(set([ '{}_{}'.format(GCF.split('_')[0], GCF.split('_')[1]) for GCF in os.listdir(result_path) ]))
LOTUS = [ domain for domain in sorted(os.listdir(result_path)) if 'lotus' in domain ]
OSK = [ domain for domain in sorted(os.listdir(result_path)) if 'osk' in domain ]
GCF_OSKAR = list(map(oskar_analysis, IDs))

In [11]:
TMP = pd.DataFrame(GCF_OSKAR, columns=['genome_id', 'lotus_hits', 'osk_hits', 'oskar_hits'])

GCF = pd.read_csv('../Data/01_Oskar_identification/2019/genome_insect_database.csv')
GCF['pgc_mode'] = GCF['order_name'].apply(set_germ_cell_formation)

# The default inner join keeps only the genomes that have a row in TMP.
GCF = GCF.merge(TMP, on='genome_id')
GCF.to_csv('../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_results.csv', index=False, na_rep='None')

#### GCA

In [26]:
# result_path, LOTUS and OSK are module-level names read by oskar_analysis().
result_path = '../Data/01_Oskar_identification/hmmsearch_raw_results/GCA_V3'
# Genome id = first two '_'-separated fields of each result file name.
IDs = list(set([ '{}_{}'.format(GCA.split('_')[0], GCA.split('_')[1]) for GCA in os.listdir(result_path) ]))
LOTUS = [ domain for domain in sorted(os.listdir(result_path)) if 'lotus' in domain ]
OSK = [ domain for domain in sorted(os.listdir(result_path)) if 'osk' in domain ]
GCA_OSKAR = list(map(oskar_analysis, IDs))

In [30]:
TMP = pd.DataFrame(GCA_OSKAR, columns=['genome_id', 'lotus_hits', 'osk_hits', 'oskar_hits'])

GCA = pd.read_csv('../Data/01_Oskar_identification/2019/genome_insect_database.csv')
GCA['pgc_mode'] = GCA['order_name'].apply(set_germ_cell_formation)
# Applied to every row of the database, before the merge restricts the table
# to the genomes present in TMP.
GCA['augustus_model'] = GCA['order_name'].apply(set_augustus_model, args=(GCA,))

# The default inner join keeps only the genomes that have a row in TMP.
GCA = GCA.merge(TMP, on='genome_id')
GCA.to_csv('../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_results.csv', index=False, na_rep='None')

## Step 4: Save Oskar candidates
### Required inputs
- TSA / GCF / GCA protein folders
- Names of FASTA output files `../Data/01_Oskar_identification/oskar_tracker_results/XXX_V3/xxx_oskar_candidates.fasta`

### Main steps
1. Save Oskar candidates in separate fasta files, one per data source

In [31]:
def collect_genome(x, table):
    """Return the ``genome_id`` of the first row listing ``x`` as an Oskar hit.

    ``oskar_hits`` cells are comma-separated id lists; ``x`` must be equal to
    one of the listed ids. The previous ``str.contains`` lookup treated ``x``
    as a regular expression and accepted substrings, so ``'g10.t1'`` could be
    attributed to the genome holding ``'g10.t10'``.

    NOTE(review): when the same id is listed for several genomes, only the
    first matching row is returned.

    Raises
    ------
    IndexError
        If no row lists ``x``.
    """
    hits = table['oskar_hits'].fillna('').astype(str)
    listed = hits.apply(lambda cell: x in {hit.strip() for hit in cell.split(',')})
    genomes = table.loc[listed, 'genome_id'].values
    if len(genomes) == 0:
        raise IndexError('{!r} is not listed in any oskar_hits cell'.format(x))
    return genomes[0]

def retrieveSeq(seqID, dataType, fastaFile):
    """Return the record of ``fastaFile`` that corresponds to ``seqID``.

    A record whose id is exactly ``seqID`` is preferred. When there is none,
    the first record whose id merely contains ``seqID`` is used, which is the
    historical behaviour (on its own it could return ``g10.t10`` when
    ``g10.t1`` was requested).

    For ``'TSA'`` data the sequence is passed through ``format_Seq`` before
    being returned.

    Returns
    -------
    SeqRecord or None
        ``None`` when no record id contains ``seqID``.
    """
    exact = None
    partial = None
    for seqRecord in SeqIO.parse(fastaFile, 'fasta'):
        if seqRecord.id == seqID:
            exact = seqRecord
            break
        if partial is None and seqID in seqRecord.id:
            partial = seqRecord

    record = exact if exact is not None else partial
    if record is None:
        return None
    if 'TSA' in dataType:
        record.seq = format_Seq(record.seq)
    return record
        
def first_AA(seq):
    """Return the index of the first ``'M'`` of ``seq``, or ``None`` if absent."""
    return next((idx for idx, residue in enumerate(seq) if 'M' in residue), None)


def format_Seq(seq):
    """Return ``seq`` starting at its first ``'M'`` (unchanged when there is none)."""
    start = first_AA(seq)
    return seq[start:]

def save_oskar(TABLE, protein_path, dataType, result_file):
    """Write the sequences of every Oskar hit of ``TABLE`` to a fasta file.

    Parameters
    ----------
    TABLE : pandas.DataFrame
        Result table with an ``oskar_hits`` column (comma-separated ids, or
        ``'None'``); ``genome_id`` is also read for non-TSA data.
    protein_path : str
        Folder of the proteome fasta files.
    dataType : str
        ``'TSA'`` selects proteomes with the first four characters of the hit
        id; anything else selects them with ``collect_genome``.
    result_file : str
        Output fasta path.

    Hits that cannot be found in a selected proteome are reported and
    skipped: ``retrieveSeq`` returns ``None`` for them and a ``None`` entry
    would make ``SeqIO.write`` fail.
    """
    hits_column = TABLE['oskar_hits']
    hits_column = hits_column[hits_column.notna() & (hits_column != 'None')]
    OSKAR_HITS = [ID for LIST in hits_column for ID in LIST.split(',')]

    # The folder content does not change during the loop: list it once.
    proteomes = os.listdir(protein_path)

    OSKAR_SEQ = []
    bar = progressbar.ProgressBar()
    for SEQ in bar(OSKAR_HITS):
        if 'TSA' in dataType:
            ID = SEQ[:4]
        else:
            ID = collect_genome(SEQ, TABLE)
        found = False
        for PROTEOME in proteomes:
            if ID not in PROTEOME:
                continue
            FASTA = os.path.join(protein_path, PROTEOME)
            record = retrieveSeq(SEQ, dataType, FASTA)
            if record is None:
                # The id prefix matched this file name but the hit is not in it.
                continue
            OSKAR_SEQ.append(record)
            found = True
        if not found:
            print('Sequence {} not found in {}'.format(SEQ, protein_path))
    SeqIO.write(OSKAR_SEQ, result_file, 'fasta')

In [10]:
# `TSA` is the merged result table built in Step 3.
protein_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/TSA_PROTEIN'
result_file = '../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_candidates.fasta'
save_oskar(TSA, protein_path, 'TSA', result_file)

100% |########################################################################|


In [12]:
# `GCF` is the merged result table built in Step 3.
protein_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/GCF_PROTEIN'
result_file = '../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_candidates.fasta'
save_oskar(GCF, protein_path, 'GCF', result_file)

100% (98 of 98) |########################| Elapsed Time: 0:00:09 Time:  0:00:09


In [33]:
# `GCA` is the merged result table built in Step 3.
protein_path = '/media/lblondel/5D1FA0DA2BE76E76/savy/SOURCES/GCA_PROTEIN'
result_file = '../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_candidates.fasta'
save_oskar(GCA, protein_path, 'GCA', result_file)

100% |########################################################################|


## Step 5: Create Oskar hmmsearch files for cross-validation

### Required inputs
- OSKAR pHMM (version 3)

In [11]:
# Search the TSA candidates with the full-length OSKAR model; the tabular
# output (--tblout) is the file read back with SearchIO in Step 6.
tsa_candidates = '../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_candidates.fasta'
oskar_model = '../Data/Oskar_hmm/V3/OSKAR_CONSENSUS.hmm'
oskar_validation_results = '../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/final_tsa_oskar_search.txt'
# The value displayed under the cell is the exit status returned by os.system.
os.system('hmmsearch --cpu 8 --tblout {} {} {}'.format(oskar_validation_results, oskar_model, tsa_candidates))

0

In [13]:
# The GCF candidates are written by save_oskar() to the GCF_V3 folder and read
# back from that folder in Step 6; search that same file rather than a copy
# under 'GCF/'.
gcf_candidates = '../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_candidates.fasta'
oskar_model = '../Data/Oskar_hmm/V3/OSKAR_CONSENSUS.hmm'
oskar_validation_results = '../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/final_gcf_oskar_search.txt'
# The value displayed under the cell is the exit status returned by os.system.
os.system('hmmsearch --cpu 8 --tblout {} {} {}'.format(oskar_validation_results, oskar_model, gcf_candidates))

0

In [34]:
# The GCA candidates are written by save_oskar() to the GCA_V3 folder and read
# back from that folder in Step 6; search that same file rather than a copy
# under 'GCA/'.
gca_candidates = '../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_candidates.fasta'
oskar_model = '../Data/Oskar_hmm/V3/OSKAR_CONSENSUS.hmm'
oskar_validation_results = '../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/final_gca_oskar_search.txt'
# The value displayed under the cell is the exit status returned by os.system.
os.system('hmmsearch --cpu 8 --tblout {} {} {}'.format(oskar_validation_results, oskar_model, gca_candidates))

0

## Step 6: Oskar duplicate and isoform detection and filtering

In [2]:
import networkx as nx
from Bio import SeqIO
from Bio import SearchIO
from Bio import Align
from Bio import AlignIO
from Bio import Seq
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook as tqdm

In [3]:
# TSA inputs of Step 6: result table, candidate sequences, OSKAR search table
# (parsed here) and the path reserved for the filtered sequences.
TSA_results = pd.read_csv('../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_results.csv')
TSA_sequences = '../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_candidates.fasta'
TSA_hmmer = SearchIO.read('../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/final_tsa_oskar_search.txt', 'hmmer3-tab')
TSA_filtered_outpath = '../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_filtered.fasta'

In [4]:
# GCF inputs of Step 6: result table, candidate sequences, OSKAR search table
# (parsed here) and the path reserved for the filtered sequences.
GCF_results = pd.read_csv('../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_results.csv')
GCF_sequences = '../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_candidates.fasta'
GCF_hmmer = SearchIO.read('../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/final_gcf_oskar_search.txt', 'hmmer3-tab')
GCF_filtered_outpath = '../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_filtered.fasta'

In [5]:
# GCA inputs of Step 6: result table, candidate sequences, OSKAR search table
# (parsed here) and the path reserved for the filtered sequences.
GCA_results = pd.read_csv('../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_results.csv')
GCA_sequences = '../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_candidates.fasta'
GCA_hmmer = SearchIO.read('../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/final_gca_oskar_search.txt', 'hmmer3-tab')
GCA_filtered_outpath = '../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_filtered.fasta'

In [6]:
# Path reserved for the filtered sequences of all sources together.
filtered_outpath = '../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.fasta'

In [7]:
# Path reserved for the unfiltered sequences of all sources together.
all_seq_outpath = '../Data/01_Oskar_identification/oskar_tracker_results/oskar_all.fasta'

#### Helper functions

In [8]:
def hamming_distance(seq1, seq2):
    """Return the fraction of identical positions between two sequences.

    Despite its name the function returns a *similarity*:
    ``1 - mismatches / length``, i.e. ``1.0`` for identical sequences and
    ``0.0`` when every position differs.

    Parameters
    ----------
    seq1, seq2 : sequence
        Two indexable sequences of the same, non-zero length.

    Raises
    ------
    ValueError
        If the lengths differ or the sequences are empty. An explicit check
        is used because ``assert`` statements are removed under ``python -O``.
    """
    L = len(seq1)
    if L != len(seq2):
        raise ValueError(
            'Sequences must have the same length ({} != {})'.format(L, len(seq2)))
    if L == 0:
        raise ValueError('Cannot compare empty sequences')
    mismatches = sum(1 for a, b in zip(seq1, seq2) if a != b)
    return 1 - mismatches / L

def get_long_osk(alignements_path, threshold):
    """Collect the sequences of the alignments whose leading block is long.

    Parameters
    ----------
    alignements_path : iterable of str
        Paths of fasta files; the text of the file name before its first '.'
        is used as group key.
    threshold : int
        A group is kept when more than ``threshold`` columns were scanned
        before the stop condition described below was met.

    Returns
    -------
    list
        The records of every kept group, concatenated.
    """
    sequences = {}
    for alignement in alignements_path:
        taxid = alignement.split('/')[-1].split('.')[0]
        handle = SeqIO.parse(alignement, 'fasta')
        sequences[taxid] = [s for s in handle]
    long_oskars = []
    for taxid in sequences:
        # Groups holding a single sequence are ignored.
        if len(sequences[taxid]) > 1:
            len_first_block = 0
            # `met` is a running total of the 'M' seen so far, over all
            # sequences and all columns already scanned.
            met = 0
            # Columns are scanned over the length of the first sequence.
            for i in range(len(sequences[taxid][0])):
                # NOTE(review): the scan stops only when the running total is
                # exactly equal to the number of sequences. If one column
                # makes the total jump past that value the loop runs to the
                # end and len_first_block becomes the full length - confirm
                # whether `>=` was intended.
                if met == len(sequences[taxid]):
                    break
                for j in range(len(sequences[taxid])):
                    if sequences[taxid][j][i] == 'M':
                        met += 1
                len_first_block += 1
            if len_first_block > threshold:
                long_oskars += sequences[taxid]
    return long_oskars

def group_sequences(sources):
    """Group the Oskar hit ids of several result tables by ``tax_id``.

    Parameters
    ----------
    sources : list of tuple
        ``[('ID', DataFrame), ...]``; each table needs ``tax_id`` and
        ``oskar_hits`` columns, the latter holding comma-separated ids.

    Returns
    -------
    dict
        ``{tax_id: [(source ID, hit id), ...]}`` in the order of ``sources``.

    Rows without hits are ignored whether they are stored as the string
    ``"None"`` or as a missing value: depending on the pandas version,
    ``read_csv`` turns the ``None`` written by ``to_csv(na_rep='None')`` back
    into NaN, which has no ``split`` method.
    """
    sequences = {}
    for source in sources:
        origin, table = source[0], source[1]
        hits = table['oskar_hits']
        with_hits = table[hits.notna() & (hits != "None")][['tax_id', 'oskar_hits']]
        for taxid, oskar_ids in with_hits.values:
            sequences.setdefault(taxid, []).extend(
                (origin, i.strip()) for i in oskar_ids.split(','))
    return sequences

def get_all_sequences(sequences, sources):
    """Return ``[origin, record]`` pairs for every grouped sequence.

    ``sources`` entries carry a fasta path in position 1. All records are
    indexed by name first; a name seen twice prints an error and makes the
    function return ``False``. Each selected record gets ``'|' + origin``
    appended to its description.
    """
    records_by_name = {}
    for entry in sources:
        for record in list(SeqIO.parse(entry[1], 'fasta')):
            if record.name in records_by_name:
                print("ERROR SEQUENCE ALREADY EXISTS !")
                return False
            records_by_name[record.name] = record

    collected = []
    for members in sequences.values():
        for origin, seq_name in members:
            record = records_by_name[seq_name]
            record.description += '|' + origin
            collected.append([origin, record])
    return collected

def make_tmp_seq_groups(sequences, sources):
    """Write one fasta file per taxon that has more than one sequence.

    Parameters
    ----------
    sequences : dict
        ``{tax_id: [(origin, sequence name), ...]}``.
    sources : list of tuple
        ``[('ID', fasta_path), ...]``; the records of all files are indexed
        by name.

    Returns
    -------
    list of str or bool
        Paths of the files written under ``./tmp/oskars`` (``<tax_id>.fasta``).
        ``False`` is returned, after printing an error, when a record name
        occurs twice in the sources.

    Each written record gets ``'|' + origin`` appended to its description.
    """
    name2seq = {}
    for source in sources:
        for seq in SeqIO.parse(source[1], 'fasta'):
            if seq.name in name2seq:
                print("ERROR SEQUENCE ALREADY EXISTS !")
                return False
            name2seq[seq.name] = seq

    tmp = './tmp/oskars'
    os.makedirs(tmp, exist_ok=True)

    sequences_groups = []
    for taxid in sequences:
        if len(sequences[taxid]) > 1:
            tmpseqs = []
            for origin, seq in sequences[taxid]:
                s = name2seq[seq]
                s.description += '|' + origin
                tmpseqs.append(s)
            outpath = os.path.join(tmp, '{}.fasta'.format(taxid))
            # Close (and therefore flush) every file before returning: the
            # files are meant to be read by an external aligner afterwards.
            with open(outpath, 'w') as f:
                SeqIO.write(tmpseqs, f, 'fasta')
            sequences_groups.append(outpath)
    return sequences_groups
            
def muscle_align(inpath):
    """Align a fasta file with ``muscle`` and return the output path.

    Command line wrapper for muscle; muscle needs to be in your PATH. The
    exit status is not checked.

    The output path is ``inpath`` with ``.aligned`` inserted before the file
    extension (``12.fasta`` -> ``12.aligned.fasta``). Only the extension is
    touched: a ``str.replace`` on the whole path also rewrote any folder
    whose name contains ``fasta``.
    """
    import shlex

    root, ext = os.path.splitext(inpath)
    outpath = '{}.aligned{}'.format(root, ext)
    # Quote the paths so that spaces cannot split the command line.
    cmd = 'muscle -in {} -out {}'.format(shlex.quote(inpath), shlex.quote(outpath))
    os.system(cmd)
    return outpath
    
def align_sequences(sequences_groups):
    """Run ``muscle_align`` on each path, with a progress bar, and return the output paths."""
    return [muscle_align(fasta_path) for fasta_path in tqdm(sequences_groups)]

def trim_alignement(alignement):
    """Keep only the alignment columns that contain no gap character.

    The records of ``alignement`` are modified in place and the same object
    is returned.
    """
    res = []
    for i in range(alignement.get_alignment_length()):
        # Column i across all the sequences.
        col = alignement[:, i]
        if '-' not in col:
            res.append([s for s in col])
    # Rows of `res` are columns of the alignment; transpose to get one row
    # per sequence.
    # NOTE(review): when no column is gap-free `res` is empty and the
    # indexing below fails - confirm that this cannot happen with the inputs.
    res = np.array(res).T
    for i in range(len(alignement)):
        # The trimmed sequence is stored as a plain string.
        seq = ''.join(list(res[i]))
        alignement[i].seq = seq
    return alignement
    
def make_network(alignement_path):
    """Build the pairwise-similarity graph of a fasta alignment.

    The alignment is read, reduced to its gap-free columns with
    ``trim_alignement`` and every pair of sequences is scored with
    ``hamming_distance`` (fraction of identical positions).

    Returns
    -------
    (networkx.Graph, list of str)
        Graph with one node per sequence (integer label = position in the
        alignment, ``seq_id`` attribute = record name) and one edge per pair
        with a non-zero score, stored as ``weight``; and the list of record
        names in alignment order.
    """
    alignement = AlignIO.read(alignement_path, "fasta")
    trimed = trim_alignement(alignement)
    size = len(trimed)
    hammings = np.zeros((size, size))
    nodes = []
    for i in range(size):
        nodes.append(trimed[i].name)
        for j in range(i + 1, size):
            hammings[i][j] = hamming_distance(trimed[i].seq, trimed[j].seq)
    # Only the upper triangle was filled: mirror it to get the symmetric
    # matrix of an undirected graph. (This matrix used to be computed and
    # then left unused.)
    M = np.maximum(hammings, hammings.transpose())
    # from_numpy_array replaces from_numpy_matrix, which no longer exists in
    # networkx 3.
    G = nx.from_numpy_array(M)
    for i in G.nodes():
        G.nodes[i]['seq_id'] = nodes[i]
    return G, nodes
   
def find_clusters(alignements_path, threshold=0.9):
    """Cluster the sequences of each alignment by pairwise similarity.

    For every alignment the graph of ``make_network`` is built, the edges
    whose weight is below ``threshold`` are removed and each remaining
    connected component becomes one cluster of sequence names.

    Returns
    -------
    dict
        ``{tax_id: [[sequence name, ...], ...]}`` where ``tax_id`` is the
        integer found before the first '.' of the file name.
    """
    clusters = {}
    for alignement_path in alignements_path:
        file_name = alignement_path.split('/')[-1]
        taxid = int(file_name.split('.')[0])
        G, nodes = make_network(alignement_path)
        weak_edges = [(u, v) for u, v in G.edges()
                      if G.edges[u, v]['weight'] < threshold]
        for u, v in weak_edges:
            G.remove_edge(u, v)
        clusters[taxid] = [[nodes[i] for i in component]
                           for component in nx.connected_components(G)]
    return clusters

def filter_sequences(clusters, sources, uniq_sequences):
    """Keep one sequence per cluster: the one with the lowest e-value.

    Parameters
    ----------
    clusters : dict
        ``{tax_id: [[sequence name, ...], ...]}``.
    sources : list of tuple
        ``[('ID', sequence_path, hmmer_hit_object), ...]``; the hit object is
        iterated to read ``hit.id`` and ``hit.evalue``.
    uniq_sequences : list
        Items whose element 1 is the name of a sequence kept unconditionally.

    Returns
    -------
    (list, list)
        ``result``: ``(source ID, record)`` tuples of the kept sequences;
        ``removed``: ``[e-value, (source ID, record)]`` entries of the
        discarded cluster members.
    """
    records = {}
    evalues = {}
    for entry in sources:
        tag = entry[0]
        for record in SeqIO.parse(entry[1], 'fasta'):
            records[record.name] = (tag, record)
        for hit in entry[2]:
            evalues[hit.id] = hit.evalue

    result = [records[item[1]] for item in uniq_sequences]
    removed = []

    for groups in clusters.values():
        for group in groups:
            ranked = sorted(
                ([evalues[name], records[name]] for name in group),
                key=lambda pair: pair[0],
            )
            result.append(ranked[0][1])
            removed += ranked[1:]
    return result, removed

def clean_sequences(sequences):
    """Truncate each sequence at its first ``'X'`` located at index 550 or later.

    ``'X'`` residues before index 550 are kept. Every record gets a new
    ``Seq`` object and the (modified) input records are returned as a list.
    """
    cleaned = []
    for record in sequences:
        letters = ''.join(residue for residue in record.seq)
        cut = letters.find('X', 550)
        if cut != -1:
            letters = letters[:cut]
        record.seq = Seq.Seq(letters)
        cleaned.append(record)
    return cleaned

def decorate_sequences(sequences, TSA_metadatas, GCF_metadatas, GCA_metadatas=None):
    """Rewrite record descriptions as ``species|family|order|tag`` and sort them.

    Parameters
    ----------
    sequences : iterable
        Items whose element 0 is the source tag (``'TSA'``, ``'GCF'`` or
        ``'GCA'``) and element 1 the record.
    TSA_metadatas, GCF_metadatas, GCA_metadatas : pandas.DataFrame
        Tables with ``species``, ``family_name`` and ``order_name`` columns,
        looked up through ``tsa_abrv`` (first six characters of the record
        name), ``species`` (text between the last ``[...]`` of the
        description) and ``oskar_hits`` respectively.

    Returns
    -------
    list
        Records sorted by order name, then family name.

    GCF / GCA descriptions that already hold four ``|``-separated fields are
    reused as they are. A GCA record whose metadata cannot be found is
    reported and left out: previously the error was swallowed and the record
    was given the metadata of the preceding sequence.

    Raises
    ------
    ValueError
        If a tag is not one of the three known values.
    """
    columns = ['species', 'family_name', 'order_name']
    results = []
    for s in sequences:
        tag = s[0]
        seq = s[1]
        fields = seq.description.split('|')
        if tag == 'TSA':
            TSA_ID = seq.name[0:6]
            metadata = TSA_metadatas[TSA_metadatas['tsa_abrv'] == TSA_ID][columns].values[0]
        elif tag == 'GCF':
            if len(fields) != 4:
                GCF_specie = seq.description.split('[')[-1].split(']')[0]
                GCF_specie = " ".join(GCF_specie.split(' ')[:2])
                metadata = GCF_metadatas[GCF_metadatas['species'] == GCF_specie][columns].values[0]
            else:
                metadata = fields
        elif tag == 'GCA':
            if len(fields) != 4:
                GCA_specie = seq.description.split('[')[-1].split(']')[0]
                try:
                    # Plain-text search: ids contain '.', which is a wildcard
                    # in a regular expression.
                    mask = GCA_metadatas['oskar_hits'].str.contains(
                        GCA_specie, regex=False, na=False)
                    metadata = GCA_metadatas[mask][columns].values[0]
                except (IndexError, TypeError):
                    # IndexError: no row matches; TypeError: no GCA table given.
                    print('No GCA metadata found for {}: sequence skipped'.format(GCA_specie))
                    continue
            else:
                metadata = fields
        else:
            raise ValueError('Unknown source tag {!r}'.format(tag))
        seq.description = "{}|{}|{}|{}".format(metadata[0], metadata[1], metadata[2], tag)
        results.append((metadata[1], metadata[2], seq))
    results = sorted(results, key=lambda x: (x[1], x[0]))
    return [x[2] for x in results]

def remove_sequences(fasta_file, sequences, source=None):
    """Rewrite a FASTA file in place without the listed sequences.

    Args:
        fasta_file: path of the FASTA file to filter; it is overwritten.
        sequences: iterable of ``(source_tag, sequence_name)`` pairs.
        source: when truthy, only pairs whose tag equals ``source`` are
            removed; otherwise every listed name is removed.

    Returns:
        None. Prints "Sequences saved !" once the file has been rewritten.
    """
    if source:
        excluded = {i[1] for i in sequences if i[0] == source}
    else:
        excluded = {i[1] for i in sequences}

    # Read every kept record into memory first: the same path is truncated
    # and rewritten just below.
    results = [seq for seq in SeqIO.parse(fasta_file, 'fasta') if seq.name not in excluded]

    # The context manager closes the file even if the write fails.
    with open(fasta_file, 'w') as f:
        SeqIO.write(results, f, 'fasta')
    print("Sequences saved !")

#### List of sequences to be removed. These sequences did not align with the rest of the oskar sequences despite matching the LOTUS + OSK exclusion criteria. They might be degenerate oskar sequences, but they cannot be used for further processing because alignment is impossible.

In [9]:
# (source tag, sequence name) pairs to drop from the filtered FASTA outputs.
# remove_sequences() compares the second element with each record name and,
# when given a `source`, only considers pairs whose first element matches it.
sequences_removed = [
    ('TSA', 'GCCL01027819.1_4'),
    ('TSA', 'GGKM01011220.1_4'),
    ('TSA', 'GFTU01001347.1_3'),
    ('TSA', 'GAEO01004319.1_2'),
    ('GCA', 'g35383.t1'),
    ('GCA', 'g4008.t1'),
    ('GCA', 'g1846.t1'),
    ('TSA', 'GBYD01074583.1_1'),
    ('TSA', 'GBYD01074580.1_3'),
    ('TSA', 'GHJE01032141.1_5')
]

#### /!\ Now processing all oskar sequences

In [10]:
# We first group the sequences by unique taxon, pooling the three sources.
# `sequences` is indexed by group key; each entry holds the members of that group.
sequences = group_sequences([
    ('TSA', TSA_results),
    ('GCF', GCF_results),
    ('GCA', GCA_results),
])

# Groups with a single member need no duplicate/isoform filtering: keep that
# member aside so it can be handed to filter_sequences later.
uniq_sequences = [sequences[k][0] for k in sequences if len(sequences[k]) == 1]

In [11]:
# Then we create a fasta file for each taxon, only if this taxon has more than one sequence.
# The second argument pairs every source tag with its sequence file.
sequences_groups = make_tmp_seq_groups(
    sequences,
    [('TSA', TSA_sequences),
     ('GCF', GCF_sequences),
     ('GCA', GCA_sequences)
    ])

In [12]:
# We use Muscle to align those sequences (from the fasta files we just created).
# The returned value is passed to get_long_osk and find_clusters in the cells below.
alignements_path = align_sequences(sequences_groups)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))




#### Create Long Oskar alignment

In [13]:
# Collect the "long" Oskar sequences from the per-group alignments.
# NOTE(review): the meaning of the second argument (80) is defined by
# get_long_osk, which is outside this section; confirm before changing it.
long_oskars = get_long_osk(alignements_path, 80)

In [14]:
# Save the long Oskar sequences as FASTA; the cell output is the value returned by SeqIO.write.
SeqIO.write(long_oskars, '../Data/01_Oskar_identification/oskar_tracker_results/long_oskar.fasta', 'fasta')

99

In [15]:
!muscle -in ../Data/01_Oskar_identification/oskar_tracker_results/long_oskar.fasta -out ../Data/01_Oskar_identification/oskar_tracker_results/long_oskar.aligned.fasta


MUSCLE v3.8.31 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

long_oskar 99 seqs, max length 1641, avg  length 716
00:00:00     24 MB(2%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00     24 MB(2%)  Iter   1  100.00%  K-mer dist pass 2
00:00:01     84 MB(6%)  Iter   1  100.00%  Align node
00:00:01     84 MB(6%)  Iter   1  100.00%  Root alignment
00:00:03     84 MB(6%)  Iter   2  100.00%  Refine tree
00:00:03     84 MB(6%)  Iter   2  100.00%  Root alignment
00:00:03     84 MB(6%)  Iter   2  100.00%  Root alignment
00:00:06     84 MB(6%)  Iter   3  100.00%  Refine biparts
00:00:09     84 MB(6%)  Iter   4  100.00%  Refine biparts
00:00:09     84 MB(6%)  Iter   5  100.00%  Refine biparts
00:00:09     84 MB(6%)  Iter   5  100.00%  Refine biparts
00:00:10     84 MB(6%)  Iter   6  100.00%  Refine biparts
00:00:10     84 MB(6%)  Iter   7  100.00%  Refine biparts
00:00:10     84 MB(6%)

#### Create Oskar alignment with no duplicates or isoforms

In [16]:
# We use graph theory (simple edge removal by threshold) to find sequence clusters within each sequence group.
# The same 0.80 threshold is used for the per-source runs further down.
clusters = find_clusters(alignements_path, threshold=0.80)

In [17]:
# We filter the sequences, keeping the best E-value sequence for each cluster group,
# and add back the sequences that were already unique.
# Each source is described by (tag, sequence file, hmmer results).
# filter_sequences returns two values: the kept sequences and the removed ones.
filtered_sequences, removed_sequences = filter_sequences(clusters, [
                                                ('TSA', TSA_sequences, TSA_hmmer),
                                                ('GCF', GCF_sequences, GCF_hmmer),
                                                ('GCA', GCA_sequences, GCA_hmmer)
                                                ], 
                                      uniq_sequences)

In [18]:
# We decorate the sequences for further processing: each description becomes
# "species|family|order|tag", where tag is the data source ("TSA", "GCF" or "GCA").
# The returned list is sorted by order name, then family name.
decorated_sequences = decorate_sequences(filtered_sequences, TSA_results, GCF_results ,GCA_results)


In [19]:
# Truncate each sequence at the first 'X' found at position 550 or later (see clean_sequences).
cleaned_sequences = clean_sequences(decorated_sequences)

In [20]:
# We save our sequence group; the context manager closes the file even if
# the write fails part-way through.
with open(filtered_outpath, 'w') as f:
    SeqIO.write(cleaned_sequences, f, 'fasta')

In [21]:
# Drop every entry of sequences_removed from the saved file (it is rewritten in place).
remove_sequences(filtered_outpath, sequences_removed)

Sequences saved !


In [22]:
# Number of records in each input FASTA, before any filtering.
tsa_rem = sum(1 for _ in SeqIO.parse(TSA_sequences, 'fasta'))
gcf_rem = sum(1 for _ in SeqIO.parse(GCF_sequences, 'fasta'))
gca_rem = sum(1 for _ in SeqIO.parse(GCA_sequences, 'fasta'))
total_seq = tsa_rem + gcf_rem + gca_rem

print("Starting sequences")
print("TSA:", tsa_rem)
print("GCF:", gcf_rem)
print("GCA:", gca_rem)
print("Total:", total_seq)

# Tally the records of the filtered file per source, using the tag stored as
# the last '|'-separated field of each description.
kept_per_source = {'TSA': 0, 'GCF': 0, 'GCA': 0}
filtered_seq = 0
for record in SeqIO.parse(filtered_outpath, 'fasta'):
    filtered_seq += 1
    tag = record.description.split('|')[-1]
    if tag in kept_per_source:
        kept_per_source[tag] += 1
tsa_keep = kept_per_source['TSA']
gcf_keep = kept_per_source['GCF']
gca_keep = kept_per_source['GCA']

print("The algorithm filtered out {} sequences".format(total_seq-filtered_seq))
print("Final sequence count is {}. TSA:{} | GCF:{} | GCA:{}".format(filtered_seq, tsa_keep, gcf_keep, gca_keep))

Starting sequences
TSA: 325
GCF: 98
GCA: 99
Total: 522
The algorithm filtered out 143 sequences
Final sequence count is 379. TSA:217 | GCF:72 | GCA:90


#### Create Oskar alignment with no duplicates or isoforms for TSA

In [23]:
# Redoing the same process but with only one source type at a time for count purposes

# Doing TSA: group, align, cluster, filter, decorate and clean the TSA hits only.
sequences = group_sequences([
    ('TSA', TSA_results),
])
# Single-member groups skip clustering and are handed to filter_sequences as-is.
uniq_sequences = [sequences[k][0] for k in sequences if len(sequences[k]) == 1]
sequences_groups = make_tmp_seq_groups(
    sequences,
    [('TSA', TSA_sequences),
    ])
alignements_path = align_sequences(sequences_groups)
clusters = find_clusters(alignements_path, threshold=0.80)
filtered_sequences, removed_sequences = filter_sequences(
    clusters,
    [('TSA', TSA_sequences, TSA_hmmer)],
    uniq_sequences)
decorated_sequences = decorate_sequences(filtered_sequences, TSA_results, GCF_results, GCA_results)
cleaned_sequences = clean_sequences(decorated_sequences)

# We save our sequence group; the context manager closes the file even if
# the write fails part-way through.
with open(TSA_filtered_outpath, 'w') as f:
    SeqIO.write(cleaned_sequences, f, 'fasta')
print("Sequences saved !")
# Drop the manually excluded TSA sequences from the file just written.
remove_sequences(TSA_filtered_outpath, sequences_removed, source='TSA')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=58), HTML(value='')))


Sequences saved !
Sequences saved !


#### Create Oskar alignment with no duplicates or isoforms for GCF

In [24]:
# Doing GCF: group, align, cluster, filter, decorate and clean the GCF hits only.
sequences = group_sequences([
    ('GCF', GCF_results),
])
# Single-member groups skip clustering and are handed to filter_sequences as-is.
uniq_sequences = [sequences[k][0] for k in sequences if len(sequences[k]) == 1]
sequences_groups = make_tmp_seq_groups(
    sequences,
    [
     ('GCF', GCF_sequences),
    ])
alignements_path = align_sequences(sequences_groups)
clusters = find_clusters(alignements_path, threshold=0.80)
filtered_sequences, removed_sequences = filter_sequences(
    clusters,
    [('GCF', GCF_sequences, GCF_hmmer)],
    uniq_sequences)
decorated_sequences = decorate_sequences(filtered_sequences, TSA_results, GCF_results, GCA_results)
cleaned_sequences = clean_sequences(decorated_sequences)

# We save our sequence group; the context manager closes the file even if
# the write fails part-way through.
with open(GCF_filtered_outpath, 'w') as f:
    SeqIO.write(cleaned_sequences, f, 'fasta')
print("Sequences saved !")
# Drop the manually excluded GCF sequences from the file just written.
remove_sequences(GCF_filtered_outpath, sequences_removed, source='GCF')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=15), HTML(value='')))


Sequences saved !
Sequences saved !


#### Create Oskar alignment with no duplicates or isoforms for GCA

In [25]:
# Doing GCA: group, align, cluster, filter, decorate and clean the GCA hits only.
sequences = group_sequences([
    ('GCA', GCA_results),
])
# Single-member groups skip clustering and are handed to filter_sequences as-is.
uniq_sequences = [sequences[k][0] for k in sequences if len(sequences[k]) == 1]
sequences_groups = make_tmp_seq_groups(
    sequences,
    [
     ('GCA', GCA_sequences)
    ])
alignements_path = align_sequences(sequences_groups)
clusters = find_clusters(alignements_path, threshold=0.80)
filtered_sequences, removed_sequences = filter_sequences(
    clusters,
    [('GCA', GCA_sequences, GCA_hmmer)],
    uniq_sequences)
decorated_sequences = decorate_sequences(filtered_sequences, TSA_results, GCF_results, GCA_results)
cleaned_sequences = clean_sequences(decorated_sequences)

# We save our sequence group; the context manager closes the file even if
# the write fails part-way through.
with open(GCA_filtered_outpath, 'w') as f:
    SeqIO.write(cleaned_sequences, f, 'fasta')
# Drop the manually excluded GCA sequences from the file just written.
remove_sequences(GCA_filtered_outpath, sequences_removed, source='GCA')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Sequences saved !


#### Save all Oskar sequences

In [26]:
# Making a file of all the sequences (no duplicate/isoform filtering here).
sequences = group_sequences([
    ('TSA', TSA_results),
    ('GCF', GCF_results),
    ('GCA', GCA_results),
])

all_sequences = get_all_sequences(
    sequences,
    [('TSA', TSA_sequences),
     ('GCF', GCF_sequences),
     ('GCA', GCA_sequences)
    ])

decorated_sequences = decorate_sequences(all_sequences, TSA_results, GCF_results, GCA_results)
cleaned_sequences = clean_sequences(decorated_sequences)

# We save our sequence group; the context manager closes the file even if
# the write fails part-way through.
with open(all_seq_outpath, 'w') as f:
    SeqIO.write(cleaned_sequences, f, 'fasta')

## Step 7: Align the final result with hmmalign and refine with muscle

In [28]:
!hmmalign ../Data/Oskar_hmm/V3/OSKAR_CONSENSUS.hmm ../Data/01_Oskar_identification/oskar_tracker_results/oskar_all.fasta > ../Data/01_Oskar_identification/oskar_tracker_results/oskar_all.hmmaligned.sto 

In [59]:
# Convert the hmmalign Stockholm output to FASTA, the file the next cell feeds to muscle.
records = SeqIO.parse('../Data/01_Oskar_identification/oskar_tracker_results/oskar_all.hmmaligned.sto', 'stockholm')
seqs = list(records)
SeqIO.write(seqs, '../Data/01_Oskar_identification/oskar_tracker_results/oskar_all.hmmaligned.fasta', 'fasta')

522

In [60]:
!muscle -in ../Data/01_Oskar_identification/oskar_tracker_results/oskar_all.hmmaligned.fasta -out ../Data/01_Oskar_identification/oskar_tracker_results/oskar_all.aligned.fasta -refine


MUSCLE v3.8.31 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

00:07:45     67 MB(5%)  Iter   1  100.00%  Refine biparts
00:13:16     69 MB(5%)  Iter   2  100.00%  Refine biparts
00:17:40     73 MB(5%)  Iter   3  100.00%  Refine biparts
00:19:33     75 MB(5%)  Iter   4  100.00%  Refine biparts
00:19:33     75 MB(5%)  Iter   4  100.00%  Refine biparts


In [38]:
!hmmalign ../Data/Oskar_hmm/V3/OSKAR_CONSENSUS.hmm ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.fasta > ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.hmmaligned.sto 

In [57]:
# Convert the hmmalign Stockholm output to FASTA, the file the next cell feeds to muscle.
records = SeqIO.parse('../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.hmmaligned.sto', 'stockholm')
seqs = list(records)
SeqIO.write(seqs, '../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.hmmaligned.fasta', 'fasta')

379

In [58]:
!muscle -in ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.hmmaligned.fasta -out ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta -refine


MUSCLE v3.8.31 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

00:02:18     49 MB(3%)  Iter   1  100.00%  Refine biparts
00:03:55     51 MB(4%)  Iter   2  100.00%  Refine biparts
00:04:52     52 MB(4%)  Iter   3  100.00%  Refine biparts
00:04:52     52 MB(4%)  Iter   3  100.00%  Refine biparts


#### Clean the alignment if need be

## Step 8
### Save OSK and LOTUS domains as separate alignments
- OSK: oskar_filtered.aligned.OSK_domain.fasta
- LOTUS: oskar_filtered.aligned.LOTUS_domain.fasta

### Regenerate the OSK and LOTUS hmm and repeat the process

In [48]:
!hmmbuild ../Data/01_Oskar_identification/oskar_tracker_results/OSK_CONSENSUS.hmm ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.OSK_domain.fasta

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.2.1 (June 2018); http://hmmer.org/
# Copyright (C) 2018 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.OSK_domain.fasta
# output HMM file:                  ../Data/01_Oskar_identification/oskar_tracker_results/OSK_CONSENSUS.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     oskar_filtered.aligned.OSK_domain   378   663   198     4.05  0.590 

# CPU time: 0.30u 0.00s 00:00:00.30 Elapsed: 00:00:00.30


In [49]:
!hmmbuild ../Data/01_Oskar_identification/oskar_tracker_results/LOTUS_CONSENSUS.hmm ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.LOTUS_domain.fasta

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.2.1 (June 2018); http://hmmer.org/
# Copyright (C) 2018 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.LOTUS_domain.fasta
# output HMM file:                  ../Data/01_Oskar_identification/oskar_tracker_results/LOTUS_CONSENSUS.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     oskar_filtered.aligned.LOTUS_domain   378   167   100     7.37  0.590 

# CPU time: 0.17u 0.00s 00:00:00.17 Elapsed: 00:00:00.17


In [50]:
!hmmbuild ../Data/01_Oskar_identification/oskar_tracker_results/OSKAR_CONSENSUS.hmm ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.2.1 (June 2018); http://hmmer.org/
# Copyright (C) 2018 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             ../Data/01_Oskar_identification/oskar_tracker_results/oskar_filtered.aligned.fasta
# output HMM file:                  ../Data/01_Oskar_identification/oskar_tracker_results/OSKAR_CONSENSUS.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     oskar_filtered.aligned   378  2483   999    10.15  0.590 

# CPU time: 1.29u 0.00s 00:00:01.29 Elapsed: 00:00:01.29


In [51]:
# CHANGE THE V to the version you are working on
!mkdir ../Data/Oskar_hmm/V4

!cp ../Data/01_Oskar_identification/oskar_tracker_results/*.hmm ../Data/Oskar_hmm/V4/

In [52]:
# Set the current version as search files
!cp ../Data/Oskar_hmm/V4/*.hmm ../Data/Oskar_hmm/

## Step 9: Generate Metadata table of the results

In [38]:
def genome_type(x):
    """Return 'GCA' or 'GCF' when that marker occurs in *x*, else ''.

    'GCA' is tested first, so it wins when both markers are present.
    """
    for marker in ("GCA", "GCF"):
        if marker in x:
            return marker
    return ""
    
def TSA_code(x):
    """Return the first four characters of *x* (fewer if *x* is shorter)."""
    code_length = 4
    return x[0:code_length]

def count_hits(x):
    """Count the comma-separated entries of *x*; the string "None" counts as 0."""
    if x == "None":
        return 0
    # One more entry than there are separators.
    return x.count(',') + 1

def make_matcher(match, results):
    """Map every oskar hit name to the identifier of the row listing it.

    Args:
        match: name of the identifier column of *results*.
        results: DataFrame with the *match* column and an ``oskar_hits``
            column holding comma-separated hit names, or the string "None".

    Returns:
        dict: hit name -> identifier. When a hit name is listed by several
        rows, the last row wins.
    """
    mapping = {}
    for source_id, hits in results[[match, 'oskar_hits']].values:
        if hits == "None":
            continue
        mapping.update(dict.fromkeys(hits.split(','), source_id))
    return mapping

def get_filtered_hits(fasta_path, mapping):
    """Tabulate the records of a decorated FASTA file.

    Each description is expected to look like
    ``"<sequence id> <species>|<family>|<order>|<source>"``.

    Args:
        fasta_path: path of the FASTA file to read.
        mapping: dict from sequence id to its source identifier (see
            ``make_matcher``); a missing id raises ``KeyError``.

    Returns:
        DataFrame with columns ``id``, ``Sequence_id``, ``Specie``,
        ``Family``, ``order_name`` and ``Source``, one row per record.
    """
    columns = ['id', 'Sequence_id', 'Specie', 'Family', 'order_name', 'Source']
    rows = []
    for record in SeqIO.parse(fasta_path, 'fasta'):
        fields = record.description.split('|')
        # The first field is "<sequence id> <species name>".
        seq_id, _, specie = fields[0].partition(' ')
        family, order, source = fields[1], fields[2], fields[3]
        rows.append([mapping[seq_id], seq_id, specie, family, order, source])
    return pd.DataFrame(rows, columns=columns)

#### Generate Table S1, which provides general information on the genomes and transcriptomes used for the exhaustive Oskar collection

In [39]:
# Load the transcriptome and genome source tables and give them the same columns.
source_TSA = pd.read_csv("../Data/01_Oskar_identification/2019/transcriptome_insect_database.csv")
source_genome = pd.read_csv("../Data/01_Oskar_identification/2019/genome_insect_database.csv")
# Genomes are tagged "GCA"/"GCF" from their id; transcriptomes are always "TSA".
source_genome['source'] = source_genome['genome_id'].map(genome_type)
source_TSA['id'] = source_TSA['tsa_id']
source_TSA['source'] = "TSA"
# source_TSA['tsa_code'] = source_TSA['tsa_id'].map(TSA_code)
source_TSA = source_TSA[['id', 'tax_id', 'species', 'family_name', 'order_name', 'source']]
source_genome['id'] = source_genome['genome_id']
source_genome = source_genome[['id', 'tax_id', 'species', 'family_name', 'order_name', 'source']]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# the stacked rows and the reset_index() result are the same.
ncbi_metadata = pd.concat([source_TSA, source_genome]).reset_index()
ncbi_metadata.to_csv('../Data/Tables/Table_S1.csv')

#### Create Table `search_results.csv` that summarizes the oskar hits per genome and transcriptome

In [40]:
# Per-transcriptome summary: raw hit count plus the number of hits kept after filtering.
TSA_search_results = pd.read_csv('../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_results.csv')
TSA_mapping = make_matcher('tsa_id', TSA_search_results)
# Rows without any hit are kept on purpose; count_hits gives them 0.
TSA_search_results = TSA_search_results.assign(
    id=TSA_search_results['tsa_id'],
    source='TSA',
    hits=TSA_search_results['oskar_hits'].map(count_hits),
)[['id', 'species', 'family_name', 'order_name', 'hits', 'source']]
# Count the filtered sequences per source id.
TSA_hits = (
    get_filtered_hits('../Data/01_Oskar_identification/oskar_tracker_results/TSA_V3/tsa_oskar_filtered.fasta', TSA_mapping)
    .groupby('id', as_index=False)
    .count()[['id', 'Specie']]
    .rename(columns={'Specie': 'filtered_hits'})
)
# Left merge keeps every searched transcriptome; ids without filtered hits get 0.
TSA_search_results = TSA_search_results.merge(TSA_hits, on='id', how='left').fillna(0)

In [41]:
# Per-genome (GCF) summary: raw hit count plus the number of hits kept after filtering.
GCF_search_results = pd.read_csv('../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_results.csv')
GCF_mapping = make_matcher('genome_id', GCF_search_results)
# Rows without any hit are kept on purpose; count_hits gives them 0.
GCF_search_results = GCF_search_results.assign(
    id=GCF_search_results['genome_id'],
    source='GCF',
    hits=GCF_search_results['oskar_hits'].map(count_hits),
)[['id', 'species', 'family_name', 'order_name', 'hits', 'source']]
# Count the filtered sequences per source id.
GCF_hits = (
    get_filtered_hits('../Data/01_Oskar_identification/oskar_tracker_results/GCF_V3/gcf_oskar_filtered.fasta', GCF_mapping)
    .groupby('id', as_index=False)
    .count()[['id', 'Specie']]
    .rename(columns={'Specie': 'filtered_hits'})
)
# Left merge keeps every searched genome; ids without filtered hits get 0.
GCF_search_results = GCF_search_results.merge(GCF_hits, on='id', how='left').fillna(0)

In [42]:
# Per-genome (GCA) summary: raw hit count plus the number of hits kept after filtering.
GCA_search_results = pd.read_csv('../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_results.csv')
GCA_mapping = make_matcher('genome_id', GCA_search_results)
# Rows without any hit are kept on purpose; count_hits gives them 0.
GCA_search_results = GCA_search_results.assign(
    id=GCA_search_results['genome_id'],
    source='GCA',
    hits=GCA_search_results['oskar_hits'].map(count_hits),
)[['id', 'species', 'family_name', 'order_name', 'hits', 'source']]
# Count the filtered sequences per source id.
GCA_hits = (
    get_filtered_hits('../Data/01_Oskar_identification/oskar_tracker_results/GCA_V3/gca_oskar_filtered.fasta', GCA_mapping)
    .groupby('id', as_index=False)
    .count()[['id', 'Specie']]
    .rename(columns={'Specie': 'filtered_hits'})
)
# Left merge keeps every searched genome; ids without filtered hits get 0.
GCA_search_results = GCA_search_results.merge(GCA_hits, on='id', how='left').fillna(0)

In [43]:
# Stack the three per-source summaries. pd.concat replaces the chained
# DataFrame.append calls (append was removed in pandas 2.0); the row order
# and the reset_index() result are the same.
search_metadata = pd.concat([TSA_search_results, GCF_search_results, GCA_search_results]).reset_index()

In [44]:
# Write the combined summary to disk without the DataFrame index.
# NOTE(review): search_metadata was built with reset_index(), so it presumably
# carries an 'index' column that is still written as regular data; confirm this is intended.
search_metadata.to_csv('../Data/01_Oskar_identification/oskar_tracker_results/search_results.csv', index=False)