# Genbank phylogeny from multiple genes

In [1]:
# importing everything we'll need
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline
import os, glob, itertools, tarfile, subprocess
import pandas as pd

The sequences were obtained using the code in the `download_accessions.ipynb` notebook.

In [2]:
def create_dir(dir_name):
    """Create the directory part of ``dir_name``, including missing parents.

    For a path ending in a separator (e.g. ``"final_seqs/"``) that directory
    is created; for a file path such as ``"a/b/file.txt"`` the parent ``a/b``
    is created. When the path has no directory component at all (e.g.
    ``"final_seqs"``), ``dir_name`` itself is created, because
    ``os.path.dirname`` would return an empty string and ``os.makedirs("")``
    raises.

    Directories that already exist are left untouched.
    """
    target = os.path.dirname(dir_name) or dir_name
    os.makedirs(target, exist_ok=True)

In [79]:
def setup_taxdb():
    """Make the taxdb files available and point ``BLASTDB`` at them.

    Looks for ``taxdb.btd`` and ``taxdb.bti`` inside ``<cwd>/taxdb/``. If
    either file is missing, ``<cwd>/taxdb.tar.gz`` is extracted into that
    directory. The ``BLASTDB`` environment variable is then set to the
    directory.

    Raises:
        FileNotFoundError: if extraction is needed and ``taxdb.tar.gz`` is
            not present in the current working directory.
    """
    taxdb_dir = "{}/taxdb/".format(os.getcwd())
    required_files = [os.path.join(taxdb_dir, name) for name in ("taxdb.btd", "taxdb.bti")]
    if not all(os.path.exists(path) for path in required_files):
        print("taxdb files (taxdb/taxdb.btd e taxdb/taxdb.bti) not found. Extracting taxdb database...")
        create_dir(taxdb_dir)
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use a taxdb.tar.gz obtained from a trusted source.
        with tarfile.open("{}/taxdb.tar.gz".format(os.getcwd()), "r:*") as archive:
            archive.extractall(taxdb_dir)
    # Assigning through os.environ (instead of calling os.putenv) keeps the
    # os.environ mapping in sync while still exporting the variable to
    # child processes such as blastn.
    os.environ["BLASTDB"] = taxdb_dir
# Run once when this cell executes: extracts the taxdb archive if its files are missing and sets BLASTDB.
setup_taxdb()

In [80]:
def run_blastn(fasta):
    """Run a remote ``blastn`` search for one FASTA file.

    The search goes against the ``nr`` database, restricted with the Entrez
    query ``Formicidae [Organism]``, and writes tabular output (columns
    ``qseqid staxids sscinames sacc sseq``) next to the input file, using
    the same base name with a ``.blast`` extension.

    Args:
        fasta: path to the query FASTA file.

    Raises:
        FileNotFoundError: if the ``blastn`` executable is not on ``PATH``.
    """
    blast_output = "{}.blast".format(os.path.splitext(fasta)[0])  # Same path, ".blast" extension
    # The command is passed as an argument list without a shell, so paths
    # containing spaces or shell metacharacters are handed to blastn verbatim.
    # (An earlier draft used Bio.Blast.Applications.NcbiblastnCommandline
    # with the same options.)
    command = [
        "blastn",
        "-query", fasta,
        "-db", "nr",
        "-max_target_seqs", "5000",
        "-remote",
        "-entrez_query", "Formicidae [Organism]",
        "-outfmt", "6 qseqid staxids sscinames sacc sseq",
        "-out", blast_output,
    ]
    subprocess.run(command)

In [84]:
def run_all_blast_searches(directory):
    """Run ``run_blastn`` on every ``.fa`` and ``.fasta`` file matching ``directory``.

    ``directory`` is used as a plain prefix of the glob pattern, so it should
    end with a path separator. ``setup_taxdb`` is called first. Each matched
    path is printed before its search starts; ``.fa`` files are processed
    before ``.fasta`` files.
    """
    setup_taxdb()
    patterns = ("{}*{}".format(directory, suffix) for suffix in (".fa", ".fasta"))
    # map() and chain() are lazy, so each pattern is only globbed once the
    # files of the previous pattern have been processed.
    for seed_path in itertools.chain.from_iterable(map(glob.glob, patterns)):
        print(seed_path)
        run_blastn(seed_path)

In [70]:
# Finally, running blast for all seeds.
# NOTE: BLAST does not run on sequences that contain Ns.

seed_directory_pattern = "{}/seed_fasta/*/".format(os.getcwd())
for directory in glob.glob(seed_directory_pattern):
    print(directory)
#    run_all_blast_searches(directory)

/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/EF1/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/EF2/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/18S/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/COX1/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/28S/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/abdA/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/LWR/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/WNT/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/CYTB/
/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/12S/


Saving the blast results into dataframes:

In [64]:
def create_dataframe(blast_result):
    """Read a tab-separated BLAST result file into a DataFrame.

    The file is read without a header row; its five columns are named
    ``qseqid``, ``staxids``, ``sscinames``, ``sacc`` and ``sseq``.
    """
    column_names = ['qseqid', 'staxids', 'sscinames', 'sacc', 'sseq']
    blast_table = pd.read_csv(blast_result, sep='\t', names=column_names)
    return blast_table
    
def extract_columns_dataframe(df):
    """Return the BLAST columns with gaps removed and a sequence-length column.

    Args:
        df: DataFrame with at least the columns ``qseqid``, ``staxids``,
            ``sscinames``, ``sacc`` and ``sseq``. It is not modified.

    Returns:
        A new DataFrame with columns ``qseqid``, ``staxids``, ``sscinames``,
        ``sacc``, ``sseqlen``, ``sseq`` where ``sseq`` has every ``-``
        removed and ``sseqlen`` is the length of that ungapped sequence.
    """
    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's frame (avoids SettingWithCopyWarning).
    subset = df[['qseqid', 'staxids', 'sscinames', 'sacc', 'sseq']].copy()
    # Vectorised string methods replace the per-row apply() calls.
    subset['sseq'] = subset['sseq'].str.replace('-', '', regex=False)
    subset['sseqlen'] = subset['sseq'].str.len()
    return subset[['qseqid', 'staxids', 'sscinames', 'sacc', 'sseqlen', 'sseq']]
    
# Load one BLAST result as a worked example.
df = extract_columns_dataframe(create_dataframe('seed_fasta/COX1/Formica_fusca_coi.blast'))
# Exploratory: list the string methods available on the species-name column.
print(dir(df["sscinames"].str))
# Raw string: "\." is a regex escape for a literal dot, not a valid Python string escape.
print(df.sscinames.str.contains(r"\."))

#df.head(n=100)
#df.to_excel("blast.xlsx")

['__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__frozen', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_doc_args', '_freeze', '_get_series_list', '_inferred_dtype', '_is_categorical', '_is_string', '_make_accessor', '_orig', '_parent', '_validate', '_wrap_result', 'capitalize', 'casefold', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'r

Now that we have the blast results in a dataframe, we can clean it in order to:

-  Remove rows with more than one taxid;
-  Sort the dataframe (descending) by both taxid and sseqlen;
-  Keep only one record by taxid (the one with the longest sseqlen);
-  Remove species with unwanted characters (".", ":", and "-") 

In [69]:
def clean_dataframe(df):
    """Filter BLAST hits down to one usable record per taxid.

    Steps, in order:
      1. Drop rows whose ``staxids`` holds more than one taxid (contains ``;``).
      2. Convert ``staxids`` to a numeric column.
      3. Sort by ``staxids`` and ``sseqlen`` (both descending) and keep the
         first row per taxid, i.e. the one with the longest sequence.
      4. Drop rows whose ``sscinames`` contains ``.``, ``:`` or ``-``.

    Args:
        df: DataFrame with at least ``staxids``, ``sseqlen`` and
            ``sscinames`` columns. It is not modified.

    Returns:
        A new, filtered DataFrame with the same columns.
    """
    # astype(str) makes the check work when pandas parsed staxids as integers
    # (which happens when a file has no multi-taxid rows at all).
    is_hybrid = (
        df["staxids"].astype(str).str.contains(";", regex=False, na=False).astype(bool)
    )
    # Boolean masking (instead of dropping by index label) removes exactly the
    # flagged rows even when index labels repeat; copy() so the assignment
    # below does not write into a view of the caller's frame.
    clean_df = df[~is_hybrid].copy()
    clean_df["staxids"] = pd.to_numeric(clean_df["staxids"])
    # Descending sort guarantees the longest sequence is the first row of each taxid.
    clean_df = clean_df.sort_values(by=["staxids", "sseqlen"], ascending=False)
    clean_df = clean_df.drop_duplicates(subset="staxids", keep='first')
    # One character class covers ".", ":" and "-". na=True also removes rows
    # with a missing name, since such a name cannot be validated and would
    # otherwise make the mask negation fail.
    has_invalid_name = clean_df["sscinames"].str.contains(r"[.:-]", na=True).astype(bool)
    return clean_df[~has_invalid_name]

#clean_dataframe(df).to_csv("clean.csv")

Now that we have the functions to extract and clean the data, we have to concatenate the blast results into a single, final dataframe:

In [159]:
def merge_blast_dataframes(directory):
    """Load, clean and concatenate every ``.blast`` file matching ``directory``.

    ``directory`` is used as a plain prefix of the glob pattern, so it should
    end with a path separator. Files are processed in sorted path order so
    the row order of the result is reproducible between runs.

    Returns:
        The concatenation of the cleaned per-file DataFrames (original row
        indexes are kept). When no ``.blast`` file is found, an empty
        DataFrame with the expected columns is returned instead of letting
        ``pd.concat`` raise on an empty list.
    """
    blast_data = [
        clean_dataframe(extract_columns_dataframe(create_dataframe(blast_result)))
        for blast_result in sorted(glob.glob("{}*.blast".format(directory)))
    ]
    if not blast_data:
        return pd.DataFrame(columns=['qseqid', 'staxids', 'sscinames', 'sacc', 'sseqlen', 'sseq'])
    return pd.concat(blast_data)

Let's create matrices with the number and the percentage of hits shared between each pair of query sequences:

In [162]:
#Calculating the number of subject accessions shared between two species

def matrix_absolute_matches(blast_full_dataframe):
    """Count the subject accessions shared between every pair of queries.

    Args:
        blast_full_dataframe: DataFrame with ``qseqid`` and ``sacc`` columns.

    Returns:
        A square DataFrame with one row and one column per ``qseqid``. The
        cell at column ``a`` and row ``b`` is the number of entries in the
        ``sacc`` list of ``a`` that also occur in the ``sacc`` list of ``b``.
    """
    accessions_by_query = {
        query: accessions.tolist()
        for query, accessions in blast_full_dataframe.groupby('qseqid')['sacc']
    }
    # Sets make each membership test constant-time; the counts are unchanged
    # because the loop still walks the full (possibly repeated) list of the
    # first query.
    accession_sets = {query: set(accessions) for query, accessions in accessions_by_query.items()}
    queries = list(accessions_by_query)
    matrix_dict = {
        column: [
            sum(acc in accession_sets[row] for acc in accessions_by_query[column])
            for row in queries
        ]
        for column in queries
    }
    return pd.DataFrame(matrix_dict, index=queries)

In [174]:
# Printing dataframes (matrices)

# NOTE(review): `blast_dict` is not assigned in the visible part of this
# notebook, and matrix_absolute_matches() calls .groupby() on its argument,
# so it presumably should be the merged BLAST DataFrame — TODO confirm.
os.makedirs("./final_results", exist_ok=True)  # to_excel() does not create missing directories
absolute_matrix = matrix_absolute_matches(blast_dict)  # computed once, then saved and displayed
absolute_matrix.to_excel("./final_results/matrix_absolute.xlsx")
absolute_matrix

Unnamed: 0,Formica_fusca-COX1,Linepithema_humile-COX1,Pseudomyrmex_gracilis-COX1,Solenopsis_invicta-COX1
Formica_fusca-COX1,2718,151,77,130
Linepithema_humile-COX1,151,1556,243,338
Pseudomyrmex_gracilis-COX1,77,243,1176,154
Solenopsis_invicta-COX1,130,338,154,1517


In [175]:
# NOTE(review): neither `matrix_percent_matches` nor `blast_dict` is defined
# in the visible part of this notebook — TODO confirm where they come from.
os.makedirs("./final_results", exist_ok=True)  # to_excel() does not create missing directories
percent_matrix = matrix_percent_matches(blast_dict)  # computed once, then saved and displayed
percent_matrix.to_excel("./final_results/matrix_percent.xlsx")
percent_matrix

Unnamed: 0,Formica_fusca-COX1,Linepithema_humile-COX1,Pseudomyrmex_gracilis-COX1,Solenopsis_invicta-COX1
Formica_fusca-COX1,2718 / 2718 = 1.0,151 / 1556 = 0.1,77 / 1176 = 0.07,130 / 1517 = 0.09
Linepithema_humile-COX1,151 / 2718 = 0.06,1556 / 1556 = 1.0,243 / 1176 = 0.21,338 / 1517 = 0.22
Pseudomyrmex_gracilis-COX1,77 / 2718 = 0.03,243 / 1556 = 0.16,1176 / 1176 = 1.0,154 / 1517 = 0.1
Solenopsis_invicta-COX1,130 / 2718 = 0.05,338 / 1556 = 0.22,154 / 1176 = 0.13,1517 / 1517 = 1.0


Lastly, let's create 3 final fasta files:

1. With all query sequences
2. With all unique subject sequences from the blast searches
3. With all query + subject sequences

In [167]:
def queries_to_multifasta(directory):
    """Write every query sequence found in ``directory`` to ``./final_seqs/queries.fa``.

    Args:
        directory: folder holding the query FASTA files (no trailing
            separator needed).

    Files matching ``*fa`` are read first, then files matching ``*.fasta``,
    so seeds saved with either extension accepted by
    ``run_all_blast_searches`` are included. The output directory is created
    if it does not exist, and the output file is overwritten.
    """
    os.makedirs("./final_seqs", exist_ok=True)
    with open("./final_seqs/queries.fa", "w") as queries:
        # "*fa" is the original pattern; it cannot match names ending in
        # ".fasta", so that extension needs its own pattern.
        for pattern in ("*fa", "*.fasta"):
            for fasta in glob.glob("{}/{}".format(directory, pattern)):
                for record in SeqIO.parse(fasta, "fasta"):
                    queries.write(record.format("fasta"))

def subjects_to_multifasta(blast_alldataframe):
    """Write one subject sequence per taxid to ``./final_seqs/subjects.fa``.

    Args:
        blast_alldataframe: DataFrame with ``sseqlen``, ``staxids``,
            ``sscinames``, ``sacc`` and ``sseq`` columns.

    For every taxid only the row with the largest ``sseqlen`` is kept. Each
    record is written as ``>{species name with spaces as underscores}_{sacc}``
    followed by the sequence on the next line. The output directory is
    created if it does not exist, and the output file is overwritten.
    """
    os.makedirs("./final_seqs", exist_ok=True)
    # Use the argument, not a notebook-level global, so the function works on
    # whatever frame the caller passes in.
    unique_blast_subjects = (
        blast_alldataframe
        .sort_values(by=["sseqlen"], ascending=False)
        .drop_duplicates(subset="staxids", keep='first')  # Keep only largest sequence per taxid
    )
    with open("./final_seqs/subjects.fa", "w") as subjects:
        # Zipping the three needed columns avoids building a tuple of every
        # column for each row.
        for name, accession, sequence in zip(
            unique_blast_subjects["sscinames"],
            unique_blast_subjects["sacc"],
            unique_blast_subjects["sseq"],
        ):
            subjects.write(">{}_{}\n{}\n".format(name.replace(" ", "_"), accession, sequence))

In [168]:
# Make sure the output directory exists, then write both multifasta files.
create_dir("final_seqs/")
# NOTE(review): queries are read from ./coi_seqs, not from the seed_fasta/ tree
# used for the BLAST searches — confirm this is the intended folder.
queries_to_multifasta("./coi_seqs")
# NOTE(review): `blast_alldata` is not assigned in the visible part of this
# notebook; presumably the result of merge_blast_dataframes() — TODO confirm.
subjects_to_multifasta(blast_alldata)

In [169]:
# Concatenating both multifastas (queries first, then subjects) into all_seqs.fa.
# Done in pure Python instead of `!cat` so the cell also works where `cat` is unavailable;
# files are copied as bytes so the content is reproduced unchanged.
with open("final_seqs/all_seqs.fa", "wb") as all_seqs:
    for multifasta in ("final_seqs/queries.fa", "final_seqs/subjects.fa"):
        with open(multifasta, "rb") as source:
            all_seqs.write(source.read())

In [69]:
#run_blastn("/home/gabriel/Dropbox/repos/genbank_phylogeny/seed_fasta/EF1/Formica_fusca-EF1.fasta")

KeyboardInterrupt: 

That's pretty much it!!! :)

## Complementary functions

In [120]:
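# NOTE: de-duplicating on 'staxids' keeps fewer rows than de-duplicating on 'sacc' (441 vs. 473 in the output below), but leaves more columns unique ('staxids' and 'sscinames' in addition to 'sacc' and 'sseq')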

def check_df_len_uniqcol(dataframe):
    """Print the row count of ``dataframe`` and, per column, whether its values are unique.

    Output is one ``Length: N`` line, one ``<column> is unique: <bool>`` line
    per column, and a trailing blank line.
    """
    report = ["Length: {}".format(len(dataframe))]
    report.extend(
        "{} is unique: {}".format(name, dataframe[name].is_unique)
        for name in dataframe.columns
    )
    print("\n".join(report))
    print()
        
# Sort once by sequence length (longest first), then de-duplicate on each key
# so the longest record per key is the one that survives.
blast_by_length = blast_alldata.sort_values(by=["sseqlen"], ascending=False)
unique_blast_subjects_staxids = blast_by_length.drop_duplicates(subset="staxids", keep='first')
unique_blast_subjects_sacc = blast_by_length.drop_duplicates(subset="sacc", keep='first')
for deduplicated in (unique_blast_subjects_staxids, unique_blast_subjects_sacc):
    check_df_len_uniqcol(deduplicated)

Length: 441
qseqid is unique: False
staxids is unique: True
sscinames is unique: True
sacc is unique: True
sseqlen is unique: False
sseq is unique: True

Length: 473
qseqid is unique: False
staxids is unique: False
sscinames is unique: False
sacc is unique: True
sseqlen is unique: False
sseq is unique: True



In [145]:
# Counting number of columns of blast files
def count_tab_columns_for_each_line(filename, delimiter):
    """Print, for every line of ``filename``, how many ``delimiter``-separated fields it has.

    Lines are numbered from 1 and reported as ``Line N: M columns``.
    """
    with open(filename) as handle:
        # Lazy generator: each line is split only when its report is printed.
        field_counts = (len(text.split(delimiter)) for text in handle)
        for position, n_fields in enumerate(field_counts, start=1):
            print("Line {}: {} columns".format(position, n_fields))

In [171]:
# Prints one "Line N: M columns" entry for every line of the file, so the output can be very long.
# NOTE(review): this reads from ./blast_results/, while the other cells use seed_fasta/ — confirm the path.
count_tab_columns_for_each_line("./blast_results/Pseudomyrmex_gracilis_coi.blast", "\t")

Line 1: 5 columns
Line 2: 5 columns
Line 3: 5 columns
Line 4: 5 columns
Line 5: 5 columns
Line 6: 5 columns
Line 7: 5 columns
Line 8: 5 columns
Line 9: 5 columns
Line 10: 5 columns
Line 11: 5 columns
Line 12: 5 columns
Line 13: 5 columns
Line 14: 5 columns
Line 15: 5 columns
Line 16: 5 columns
Line 17: 5 columns
Line 18: 5 columns
Line 19: 5 columns
Line 20: 5 columns
Line 21: 5 columns
Line 22: 5 columns
Line 23: 5 columns
Line 24: 5 columns
Line 25: 5 columns
Line 26: 5 columns
Line 27: 5 columns
Line 28: 5 columns
Line 29: 5 columns
Line 30: 5 columns
Line 31: 5 columns
Line 32: 5 columns
Line 33: 5 columns
Line 34: 5 columns
Line 35: 5 columns
Line 36: 5 columns
Line 37: 5 columns
Line 38: 5 columns
Line 39: 5 columns
Line 40: 5 columns
Line 41: 5 columns
Line 42: 5 columns
Line 43: 5 columns
Line 44: 5 columns
Line 45: 5 columns
Line 46: 5 columns
Line 47: 5 columns
Line 48: 5 columns
Line 49: 5 columns
Line 50: 5 columns
Line 51: 5 columns
Line 52: 5 columns
Line 53: 5 columns
Li