# Genbank phylogeny from COI sequences

Complete mitogenome genbank records were downloaded for 4 ant species, each from a different subfamily:
  - ***Pseudomyrmex gracilis*** (Pseudomyrmecinae)
  - ***Formica fusca*** (Formicinae)
  - ***Linepthema humile*** (Dolichoderinae)
  - ***Solenopsis invicta*** (Myrmicinae)

In [1]:
# importing everything we'll need
from Bio import SeqIO
import os, glob
import pandas as pd
import skbio.io

Now, we'll be using SeqIO to obtain the COI nucleotide sequence for each species and save it into separate files:

In [41]:
def create_dir(dir_name):
    os.makedirs(os.path.dirname(dir_name), exist_ok=True)

def extract_COI(gb_file):
    for record in SeqIO.parse(gb_file, "genbank"):
        species_name = record.annotations.get('organism').replace(" ", "_")
        filename = "./coi_seqs/{}_coi.fa".format(species_name)
        create_dir(filename)
        with open(filename, "w") as coi_file:
            for gene in record.features:
                if gene.type in ["CDS"] and gene.qualifiers.get('gene')[0] in ['COX1', 'COI']:
                    header = "{}-{}".format(species_name, gene.qualifiers.get('gene')[0])
                    sequence = gene.location.extract(record.seq) # Mas seq pode ter stop codon truncado
                    if len(sequence) % 3 == 1:
                        sequence += "AA" #Resto 1 - Precisa adicionar 'AA'
                    elif len(sequence) % 3 == 2:
                        sequence += "A" #Resto 2 - Precisa adicionar 'A'
                    coi_file.write(">{}\n{}\n".format(header, sequence))

for gb_file in glob.glob("./ant_mitogenomes/*.gb"):
    extract_COI(gb_file)

Running blastn with the COI sequences against NCBI's formicidae sequences:

**OBS:** Need to install [taxdb database](ftp://ftp.ncbi.nlm.nih.gov/blast/db/taxdb.tar.gz).

In [27]:
%%bash

# Extracting taxdb database (if necessary)
taxdb=$(pwd)/taxdb
if [[ ! -d "$taxdb" ]]; then 
    echo "taxdb dir not found. Extracting taxdb database..."
    mkdir $taxdb
    tar -C $taxdb -xaf taxdb.tar.gz
else
    echo "taxdb dir found"
fi

# Setting BLASTDB variable
echo "Setting BLASTDB variable to $taxdb"
export BLASTDB=$taxdb

# Creating directory for blast results (if necessary)
if [[ ! -d "blast_results" ]]; then 
    echo "Creating dirrectory for blast results"
    mkdir blast_results
else
    echo "blast_results directory already created"
fi

# Performing blast searches 
for coi in ./coi_seqs/*; do
    echo "Running blast search for $coi..." && 
    #blastn -query $coi -db ./blast_teste/ant_mito -out ./blast_results/$(basename $coi .fa).blast -outfmt "6 qseqid sseqid staxids sscinames stitle sacc saccver slen sstart send qseq";
    blastn -query $coi -db nr -max_target_seqs 100 -remote -entrez_query "Formicidae [Organism]" -outfmt "7 qseqid sseqid staxids sscinames stitle sacc saccver slen sstart send qseq" -out ./blast_results/$(basename $coi .fa).blast; # set max_target_seqs to higher value when running this for real
done 

taxdb dir found
Setting BLASTDB variable to /home/gabriel/Dropbox/repos/genbank_phylogeny/taxdb
blast_results directory already created
Running blast search for ./coi_seqs/Formica_fusca_coi.fa...
Running blast search for ./coi_seqs/Linepithema_humile_coi.fa...
Running blast search for ./coi_seqs/Pseudomyrmex_gracilis_coi.fa...
Running blast search for ./coi_seqs/Solenopsis_invicta_coi.fa...


**NOTE:** Blast with the `-remote` flag can take quite some time to run and is not compatible with `-taxidlist`... If hard disk space is not a problem, maybe it would be better to just download the entire nt database and run it locally...

Saving the blast results into dataframes:

In [2]:
def create_dataframe(blast_result):
    with open(blast_result) as blast7:
        df = skbio.io.read(blast7, format='blast+7', into=pd.DataFrame)
        return df

def filter_dataframe(df):
    df = df[['qseqid', 'staxids', 'sscinames', 'sacc', 'sseq']]
    df['sseq'] = df['sseq'].apply(lambda x: x.replace('-', ''))
    #df['sseqlen'] = len(df['sseq'])
    df['sseqlen'] = df.apply(lambda row: len(row.sseq), axis = 1) 
    df = df[['qseqid', 'staxids', 'sscinames', 'sacc', 'sseqlen', 'sseq']]
    return df
    
df = filter_dataframe(create_dataframe('blast_results/Formica_fusca_coi_old.blast'))
print(df)
df.to_excel("blast.xlsx")

  warn("%r does not look like a %s file"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sseq'] = df['sseq'].apply(lambda x: x.replace('-', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sseqlen'] = df.apply(lambda row: len(row.sseq), axis = 1)


                 qseqid                staxids  \
0    Formica_fusca-COX1                  72779   
1    Formica_fusca-COX1                 208979   
2    Formica_fusca-COX1                  72779   
3    Formica_fusca-COX1                  72779   
4    Formica_fusca-COX1                  84560   
..                  ...                    ...   
495  Formica_fusca-COX1                 609739   
496  Formica_fusca-COX1                 609832   
497  Formica_fusca-COX1                 609832   
498  Formica_fusca-COX1                  72779   
499  Formica_fusca-COX1  609832;609890;2328943   

                                             sscinames      sacc  sseqlen  \
0                                        Formica fusca  LN607805     1530   
1                                       Formica selysi  KP670862     1530   
2                                        Formica fusca  FJ824419     1377   
3                                        Formica fusca  AY334398     1191   
4             

In [13]:
#Testing methods of dataframe
df[df["staxids"].str.contains(";")].index
#df.index
#df[";" in df.staxids].index

Int64Index([8, 306, 314, 499], dtype='int64')

In [28]:
def clean_dataframe(df):
    clean_df = df.drop(df[df.staxids.str.contains(";")].index) # Removing rows with hybrid sequences (more than one taxid value)
    clean_df = clean_df.sort_values(by=["staxids", "sseqlen"], ascending=False) # Sorting dataframe by taxid and sseqlen (descending) - Guarantees that highest sseqlen will always be the first row for that taxid
    # Printing all rows to check output
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    #    print(clean_df[["sscinames", "sacc", "staxids", "sseqlen"]])
    clean_df = clean_df.drop_duplicates(subset="staxids", keep='first') # Keeps only one record per txid. The one that has the highest sseqlen
    return clean_df

with pd.option_context('display.max_rows', None, 'display.max_columns', None): 
    print(clean_dataframe(df)[["sscinames", "sacc", "staxids", "sseqlen"]])

                                        sscinames       sacc  staxids  sseqlen
164                               Messor bouvieri   DQ074325    88063     1248
109                         Oecophylla smaragdina   AB185475    84561     1042
4                                  Formica lemani   AB019425    84560      974
40                              Polyrhachis dives   KT266831    84555     1530
168                               Messor structor   KT184578    81629     1367
93                                Myrmica vandeli   GQ255194   758485     1371
269                      Myrmica n. sp. 6 GJ-2010   GQ255162   758483     1376
103                      Myrmica n. sp. 3 GJ-2010   GQ255191   758480     1371
50                       Myrmica n. sp. 2 GJ-2010   GQ255164   758479     1379
388                           Myrmica n. sp. M350   GQ255205   758476     1377
419                           Myrmica n. sp. M295   GQ255201   758472     1377
372                           Myrmica n. sp. M273   