# Genbank phylogeny from COI sequences

Complete mitogenome genbank records were downloaded for 4 ant species, each from a different subfamily:
  - ***Pseudomyrmex gracilis*** (Pseudomyrmecinae)
  - ***Formica fusca*** (Formicinae)
  - ***Linepthema humile*** (Dolichoderinae)
  - ***Solenopsis invicta*** (Myrmicinae)

In [82]:
# importing everything we'll need
from Bio import SeqIO
import os, glob, itertools
import pandas as pd
import skbio.io

Now, we'll be using SeqIO to obtain the COI nucleotide sequence for each species and save it into separate files:

In [83]:
def create_dir(dir_name):
    os.makedirs(os.path.dirname(dir_name), exist_ok=True)

def extract_COI(gb_file):
    for record in SeqIO.parse(gb_file, "genbank"):
        species_name = record.annotations.get('organism').replace(" ", "_")
        filename = "./coi_seqs/{}_coi.fa".format(species_name)
        create_dir(filename)
        with open(filename, "w") as coi_file:
            for gene in record.features:
                if gene.type in ["CDS"] and gene.qualifiers.get('gene')[0] in ['COX1', 'COI']:
                    header = "{}-{}".format(species_name, gene.qualifiers.get('gene')[0])
                    sequence = gene.location.extract(record.seq) # Mas seq pode ter stop codon truncado
                    if len(sequence) % 3 == 1:
                        sequence += "AA" #Resto 1 - Precisa adicionar 'AA'
                    elif len(sequence) % 3 == 2:
                        sequence += "A" #Resto 2 - Precisa adicionar 'A'
                    coi_file.write(">{}\n{}\n".format(header, sequence))

In [41]:
for gb_file in glob.glob("./ant_mitogenomes/*.gb"):
    extract_COI(gb_file)

Running blastn with the COI sequences against NCBI's formicidae sequences:

**OBS:** Need to install [taxdb database](ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz).

In [27]:
%%bash

# Extracting taxdb database (if necessary)
taxdb=$(pwd)/taxdb
if [[ ! -d "$taxdb" ]]; then 
    echo "taxdb dir not found. Extracting taxdb database..."
    mkdir $taxdb
    tar -C $taxdb -xaf taxdb.tar.gz
else
    echo "taxdb dir found"
fi

# Setting BLASTDB variable
echo "Setting BLASTDB variable to $taxdb"
export BLASTDB=$taxdb

# Creating directory for blast results (if necessary)
if [[ ! -d "blast_results" ]]; then 
    echo "Creating dirrectory for blast results"
    mkdir blast_results
else
    echo "blast_results directory already created"
fi

# Performing blast searches 
for coi in ./coi_seqs/*; do
    echo "Running blast search for $coi..." && 
    #blastn -query $coi -db ./blast_teste/ant_mito -out ./blast_results/$(basename $coi .fa).blast -outfmt "6 qseqid sseqid staxids sscinames stitle sacc saccver slen sstart send qseq";
    blastn -query $coi -db nr -max_target_seqs 100 -remote -entrez_query "Formicidae [Organism]" -outfmt "7 qseqid sseqid staxids sscinames stitle sacc saccver slen sstart send qseq" -out ./blast_results/$(basename $coi .fa).blast; # set max_target_seqs to higher value when running this for real
done 

taxdb dir found
Setting BLASTDB variable to /home/gabriel/Dropbox/repos/genbank_phylogeny/taxdb
blast_results directory already created
Running blast search for ./coi_seqs/Formica_fusca_coi.fa...
Running blast search for ./coi_seqs/Linepithema_humile_coi.fa...
Running blast search for ./coi_seqs/Pseudomyrmex_gracilis_coi.fa...
Running blast search for ./coi_seqs/Solenopsis_invicta_coi.fa...


**NOTE:** Blast with the `-remote` flag can take quite some time to run and is not compatible with `-taxidlist`... If hard disk space is not a problem, maybe it would be better to just download the entire nt database and run it locally...

Saving the blast results into dataframes:

In [7]:
def create_dataframe(blast_result):
    with open(blast_result) as blast7:
        df = skbio.io.read(blast7, format='blast+7', into=pd.DataFrame)
        return df

def extract_columns_dataframe(df):
    df = df[['qseqid', 'staxids', 'sscinames', 'sacc', 'sseq']]
    df['sseq'] = df['sseq'].apply(lambda x: x.replace('-', ''))
    #df['sseqlen'] = len(df['sseq'])
    df['sseqlen'] = df.apply(lambda row: len(row.sseq), axis = 1) 
    df = df[['qseqid', 'staxids', 'sscinames', 'sacc', 'sseqlen', 'sseq']]
    return df
    
#df = extract_columns_dataframe(create_dataframe('blast_results/Formica_fusca_coi_old.blast'))
#print(df)
#df.to_excel("blast.xlsx")

In [13]:
#Testing methods of dataframe
#df[df["staxids"].str.contains(";")].index
#df.index
#df[";" in df.staxids].index

Int64Index([8, 306, 314, 499], dtype='int64')

Now that we have the blast results in a dataframe, we can clean it in order to:

-  Remove rows with more than one taxid;
-  Sort dataframe (descending) for both taxid and sseqlen;
-  Keep only one record by taxid (the one with the longest sseqlen)

In [8]:
def clean_dataframe(df):
    clean_df = df.drop(df[df.staxids.str.contains(";")].index) # Removing rows with hybrid sequences (more than one taxid value)
    #clean_df["staxids"] = pd.to_numeric(clean_df["staxids"])
    clean_df = clean_df.sort_values(by=["staxids", "sseqlen"], ascending=False) # Sorting dataframe by taxid and sseqlen (descending) - Guarantees that highest sseqlen will always be the first row for that taxid
    # Printing all rows to check output
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    #    print(clean_df[["sscinames", "sacc", "staxids", "sseqlen"]])
    clean_df = clean_df.drop_duplicates(subset="staxids", keep='first') # Keeps only one record per txid. The one that has the highest sseqlen
    return clean_df

#print(clean_dataframe(df).dtypes)
#
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
#    print(clean_dataframe(df)[["sscinames", "sacc", "staxids", "sseqlen"]])

Now that we have the functions to extract and clean the data, we have to concatenate the blast results into a single, final dataframe:

In [9]:
blast_data = []
for blast_result in glob.glob("./blast_results/*.blast"):
    blast_data.append(clean_dataframe(extract_columns_dataframe(create_dataframe(blast_result))))
blast_alldata = pd.concat(blast_data)

  warn("%r does not look like a %s file"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sseq'] = df['sseq'].apply(lambda x: x.replace('-', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sseqlen'] = df.apply(lambda row: len(row.sseq), axis = 1)
  warn("%r does not look like a %s file"
  warn("%r does not look like a %s file"
  warn("%r does not look like a %s file"


Despite the warnings, the resulting dataframe is correctly formatted and henceforth suitable for downstream analyses:

In [10]:
blast_alldata

Unnamed: 0,qseqid,staxids,sscinames,sacc,sseqlen,sseq
164,Formica_fusca-COX1,88063,Messor bouvieri,DQ074325,1248,GGATCATCTATAAGAATGATTATTCGACTTGAATTAGGATCATGTA...
109,Formica_fusca-COX1,84561,Oecophylla smaragdina,AB185475,1042,CCTTTAATATTAGGATCGCCTGATATAGCATATCCCCGTATAAATA...
4,Formica_fusca-COX1,84560,Formica lemani,AB019425,974,ATTCCCTTAATACTAGGATCTCCAGACATAGCTTATCCTCGTATAA...
40,Formica_fusca-COX1,84555,Polyrhachis dives,KT266831,1530,ATGAAAAAATGACTCTATTCAACTAACCATAAAGATATTGGAATGT...
168,Formica_fusca-COX1,81629,Messor structor,KT184578,1367,AGAATAATTATCCGACTTGAACTAGGGTCCTGTAACTCATTAATTA...
...,...,...,...,...,...,...
439,Solenopsis_invicta-COX1,144042,Pogonomyrmex rugosus,FJ824455,1371,ATAATTATTCGACTTGAACTTGGTTCATGTAATAGCTTAATTAATA...
0,Solenopsis_invicta-COX1,13686,Solenopsis invicta,HQ215538,1529,ATGAATAAATGACTTTTTTCAACAAATCACAAAGACATTGGAATTT...
4,Solenopsis_invicta-COX1,121131,Solenopsis geminata,HQ215537,1529,ATGAACAAATGATTTTTTTCAACTAATCACAAAGATATTGGAATTT...
327,Solenopsis_invicta-COX1,1031672,Messor minor x Messor cf. wasmanni BCSS-2011,EU441274,1212,TGTAATTCATTAATTAACAATGATCAAATTTATAATACTTTAGTGA...


Let's create a matrix with the percentage of hits shared between all sequences:

In [11]:
# First, let's create a dict with query headers as keys and a list of subject accession as values:
blast_dict = {k: f.tolist()
     for k, f in blast_alldata.groupby('qseqid')['sacc']}

print(blast_dict)

{'Formica_fusca-COX1': ['DQ074325', 'AB185475', 'AB019425', 'KT266831', 'KT184578', 'GQ255194', 'GQ255162', 'GQ255191', 'GQ255164', 'GQ255205', 'GQ255201', 'GQ255199', 'AB010934', 'DQ353343', 'AB010936', 'AB010933', 'AB010930', 'AB103360', 'AB103357', 'AB103364', 'AB010926', 'LN607805', 'FJ982472', 'FJ982453', 'FJ982452', 'FJ982451', 'FJ982446', 'FJ982462', 'FJ982456', 'GQ255168', 'NC_049861', 'KX146469', 'JN562438', 'KU504914', 'KJ141815', 'FJ982465', 'KY770018', 'FJ982469', 'FJ982442', 'KU504892', 'GQ255208', 'GQ255193', 'GQ255189', 'GQ255192', 'GQ255176', 'GQ255173', 'GQ255159', 'GQ255157', 'GQ255147', 'GQ255145', 'GQ255137', 'GQ255126', 'GQ255187', 'KT184523', 'FJ824422', 'MK037284', 'LT977413', 'LT977411', 'KR895865', 'KX665068', 'MG757150', 'JF863557', 'LT978303', 'FJ982439', 'KU504865', 'KU504864', 'KU504857', 'KU504849', 'KU504848', 'KU504847', 'KU504831', 'NC_046399', 'NC_046398', 'NC_046426', 'NC_046424', 'NC_046423', 'NC_046422', 'NC_046420', 'NC_046421', 'BK010385', 'GQ2551

In [72]:
#Calculating the number of subject accessions shared between two species

def matrix_absolute_matches(blast_dict):
    matrix_dict = {k: [] for k in blast_dict.keys()} #Empty 
    for pair in itertools.product(blast_dict.keys(), repeat=2): #All possible combinations
        match = 0
        for acc in blast_dict[pair[0]]:
            if acc in blast_dict[pair[1]]:
                match += 1
        matrix_dict[pair[0]].append(match)
    return pd.DataFrame(matrix_dict, index=matrix_dict.keys())

# Testing function:

#for pair in itertools.product(blast_dict.keys(), repeat=2): #All possible combinations
#    match = 0
#    for acc in blast_dict[pair[0]]:
#        if acc in blast_dict[pair[1]]:
#            match += 1
#    print("{} x {} = {} identical matches".format(pair[0], pair[1], match))
#
#matrix_absolute_matches(blast_dict)

In [73]:
#Calculating the percentage of subject accessions in species 1 (column) found in species 2 (row)

def matrix_percent_matches(blast_dict):
    matrix_dict = {k: [] for k in blast_dict.keys()} #Empty 
    for pair in itertools.product(blast_dict.keys(), repeat=2): #All possible combinations
        match = 0
        for acc in blast_dict[pair[0]]:
            if acc in blast_dict[pair[1]]:
                match += 1
        matrix_dict[pair[0]].append("{} / {} = {}".format(match, 
                                                          len(blast_dict[pair[0]]), 
                                                              round(match/len(blast_dict[pair[0]]), 2)))
    return pd.DataFrame(matrix_dict, index=matrix_dict.keys())

# Testing function:

#for pair in itertools.product(blast_dict.keys(), repeat=2): #All possible combinations
#    match = 0
#    for acc in blast_dict[pair[0]]:
#        if acc in blast_dict[pair[1]]:
#            match += 1
#    print("{2} percent of the blast hits from {0} are found in {1} ".format(pair[0], pair[1], round(match/len(blast_dict[pair[0]]), 2)))
#
#matrix_percent_matches(blast_dict)

In [75]:
# Printing dataframes (matrices)

matrix_absolute_matches(blast_dict)

Unnamed: 0,Formica_fusca-COX1,Linepithema_humile-COX1,Pseudomyrmex_gracilis-COX1,Solenopsis_invicta-COX1
Formica_fusca-COX1,194,68,23,51
Linepithema_humile-COX1,68,192,34,66
Pseudomyrmex_gracilis-COX1,23,34,119,35
Solenopsis_invicta-COX1,51,66,35,180


In [76]:
matrix_percent_matches(blast_dict)

Unnamed: 0,Formica_fusca-COX1,Linepithema_humile-COX1,Pseudomyrmex_gracilis-COX1,Solenopsis_invicta-COX1
Formica_fusca-COX1,194 / 194 = 1.0,68 / 192 = 0.35,23 / 119 = 0.19,51 / 180 = 0.28
Linepithema_humile-COX1,68 / 194 = 0.35,192 / 192 = 1.0,34 / 119 = 0.29,66 / 180 = 0.37
Pseudomyrmex_gracilis-COX1,23 / 194 = 0.12,34 / 192 = 0.18,119 / 119 = 1.0,35 / 180 = 0.19
Solenopsis_invicta-COX1,51 / 194 = 0.26,66 / 192 = 0.34,35 / 119 = 0.29,180 / 180 = 1.0


Lastly, let's create 3 final fasta files:

1. With all query sequences
2. With all unique subject sequences from the blast searches
1. With all query + subject sequences

In [126]:
def queries_to_multifasta(directory):
    #create_dir("final_seqs/")
    with open("./final_seqs/queries.fa", "w") as queries:
        for fasta in glob.glob("{}/*fa".format(directory)):
            for record in SeqIO.parse(fasta, "fasta"):
                queries.write(record.format("fasta"))

def subjects_to_multifasta(blast_alldataframe):
    #create_dir("final_seqs/")
    unique_blast_subjects = blast_alldata.sort_values(by=["sseqlen"], ascending=False).drop_duplicates(subset="staxids", keep='first') # Keep only largest sequence per taxid
    with open("./final_seqs/subjects.fa", "w") as subjects:
        unique_blast_subjects.apply(lambda x: subjects.write(">{}_{}\n{}".format(
            x['sscinames'].replace(" ", "_"), x["sacc"], x['sseq'], axis=1)))

In [132]:
def queries_to_multifasta(directory):
    #create_dir("final_seqs/")
    with open("./final_seqs/queries.fa", "w") as queries:
        for fasta in glob.glob("{}/*fa".format(directory)):
            for record in SeqIO.parse(fasta, "fasta"):
                queries.write(record.format("fasta"))

def subjects_to_multifasta(blast_alldataframe):
    #create_dir("final_seqs/")
    unique_blast_subjects = blast_alldata.sort_values(by=["sseqlen"], ascending=False).drop_duplicates(subset="staxids", keep='first') # Keep only largest sequence per taxid
    with open("./final_seqs/subjects.fa", "w") as subjects:
        for row in unique_blast_subjects.itertuples(): # Awfully slow. Need to optimize this later
            subjects.write(">{}_{}\n{}\n".format(row.sscinames.replace(" ", "_"), row.sacc, row.sseq))
        #unique_blast_subjects.apply(lambda x: subjects.write(">{}_{}\n{}".format(x['sscinames'].replace(" ", "_"), x["sacc"], x['sseq'])))

In [133]:
create_dir("final_seqs/")
queries_to_multifasta("./coi_seqs")
subjects_to_multifasta(blast_alldata)

In [135]:
# Concatenating both multifastas:
!cat final_seqs/queries.fa final_seqs/subjects.fa > final_seqs/all_seqs.fa

In [94]:
blast_alldata

Unnamed: 0,qseqid,staxids,sscinames,sacc,sseqlen,sseq
164,Formica_fusca-COX1,88063,Messor bouvieri,DQ074325,1248,GGATCATCTATAAGAATGATTATTCGACTTGAATTAGGATCATGTA...
109,Formica_fusca-COX1,84561,Oecophylla smaragdina,AB185475,1042,CCTTTAATATTAGGATCGCCTGATATAGCATATCCCCGTATAAATA...
4,Formica_fusca-COX1,84560,Formica lemani,AB019425,974,ATTCCCTTAATACTAGGATCTCCAGACATAGCTTATCCTCGTATAA...
40,Formica_fusca-COX1,84555,Polyrhachis dives,KT266831,1530,ATGAAAAAATGACTCTATTCAACTAACCATAAAGATATTGGAATGT...
168,Formica_fusca-COX1,81629,Messor structor,KT184578,1367,AGAATAATTATCCGACTTGAACTAGGGTCCTGTAACTCATTAATTA...
...,...,...,...,...,...,...
439,Solenopsis_invicta-COX1,144042,Pogonomyrmex rugosus,FJ824455,1371,ATAATTATTCGACTTGAACTTGGTTCATGTAATAGCTTAATTAATA...
0,Solenopsis_invicta-COX1,13686,Solenopsis invicta,HQ215538,1529,ATGAATAAATGACTTTTTTCAACAAATCACAAAGACATTGGAATTT...
4,Solenopsis_invicta-COX1,121131,Solenopsis geminata,HQ215537,1529,ATGAACAAATGATTTTTTTCAACTAATCACAAAGATATTGGAATTT...
327,Solenopsis_invicta-COX1,1031672,Messor minor x Messor cf. wasmanni BCSS-2011,EU441274,1212,TGTAATTCATTAATTAACAATGATCAAATTTATAATACTTTAGTGA...


In [120]:
#Using 'staxids' when removing duplicates generates more unique values than 'sacc'

def check_df_len_uniqcol(dataframe):
    print("Length: {}".format(len(dataframe)))
    for column in dataframe.columns:
        print("{} is unique: {}".format(column, dataframe[column].is_unique))
    print()
        
unique_blast_subjects_staxids = blast_alldata.sort_values(by=["sseqlen"], ascending=False).drop_duplicates(subset="staxids", keep='first')
unique_blast_subjects_sacc = blast_alldata.sort_values(by=["sseqlen"], ascending=False).drop_duplicates(subset="sacc", keep='first')
check_df_len_uniqcol(unique_blast_subjects_staxids)
check_df_len_uniqcol(unique_blast_subjects_sacc)

Length: 441
qseqid is unique: False
staxids is unique: True
sscinames is unique: True
sacc is unique: True
sseqlen is unique: False
sseq is unique: True

Length: 473
qseqid is unique: False
staxids is unique: False
sscinames is unique: False
sacc is unique: True
sseqlen is unique: False
sseq is unique: True

