In [8]:
import numpy as np
import pandas as pd 
from pandas.errors import EmptyDataError
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import re
from Bio.Alphabet import IUPAC
import subprocess
from collections import OrderedDict
import os, os.path
import sys
import glob
import shutil
from Bio.SubsMat.MatrixInfo import blosum62
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup 

pd.options.mode.chained_assignment = None
#Chrome Driver imports
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0

In [13]:
"""File and directory management functions"""
def create_directory(directory):
    """Create `directory` (including missing parents) unless the path already exists.

    directory: path of the directory to create
    An OSError raised while creating it is reported with a printed message
    instead of being propagated.
    """
    if os.path.exists(directory):
        return
    try:
        os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

def empty_directory(path):
    """Delete every entry directly inside `path` that the glob pattern path/* matches.

    Matched directories are removed recursively, anything else is removed as a file.
    The directory `path` itself is left in place.
    """
    pattern = os.path.join(path, '*')
    for entry in glob.glob(pattern):
        remove_entry = shutil.rmtree if os.path.isdir(entry) else os.remove
        remove_entry(entry)

def create_run_directory(run_name):
    """Make the directory tree for a run.

    run_name: name (path) of the run's root directory. The root plus its input,
    input/ODB, input/NCBI, output, summary and run_params subdirectories are
    created through create_directory.
    """
    templates = (
        "{0}",
        "{0}/input",
        "{0}/input/ODB",
        "{0}/input/NCBI",
        "{0}/output",
        "{0}/summary",
        "{0}/run_params",
    )
    for template in templates:
        create_directory(template.format(run_name))

def write_run_params_file(config, spec_path, spec_hc):
    """Documents some run specific parameters in <RunName>/run_params/params.txt.

    config: config file object storing some run specifications (directory names, file paths);
            must provide the keys "RunName", "GenesFile" and "odb_level"
    spec_path: file path for the input species list being used for analysis
    spec_hc: hashcode generated from species list.

    The file holds one "key: value" line per config key, followed by the
    species_list and species_hashcode lines. An existing file is overwritten.
    """
    config_keys = ["RunName","GenesFile","odb_level"]
    run_name = config["RunName"]
    fpath = run_name+"/run_params/params.txt"
    # Context manager guarantees the handle is closed even if a config key is missing
    with open(fpath, 'wt') as params_f:
        for key in config_keys:
            params_f.write("{0}: {1}\n".format(key,config[key]))
        params_f.write("species_list: {0}\n".format(spec_path))
        params_f.write("species_hashcode: {0}\n".format(spec_hc))

#tmp directory is used to store any run intermediate files (sequence filtering results).
#These two calls run whenever the cell executes: tmp is created if it does not exist,
#then every entry matched by tmp/* is deleted (see empty_directory).
create_directory("tmp")
empty_directory('tmp')

In [10]:
#Error classes for OrthoDB query failures, missing sequence data, or GeneCards queries
class Error(Exception):
    """Base class for exceptions in this module.

    code: identifier distinguishing failure modes within one error class
    message: human-readable description of the failure

    Every subclass sets error_type to its own class name; write_errors reads
    error_type, code and message from the raised instance.
    """
    error_type = "Error"
    def __init__(self, code, message):
        # Forward both values so Exception.args stays (code, message)
        super().__init__(code, message)
        self.code = code
        self.message = message
class SequenceDataEx(Error):
    """Error class to be raised if missing GS sequences from OrthoDB/ NCBI data"""
    error_type = "SequenceDataError"
class OrthoDBQueryError(Error):
    """Error class to be raised if OrthoDB query failed to generate input files"""
    error_type = "OrthoDBQueryError"
class GeneCardsError(Error):
    """Error class if aliases for a gene symbol could not be automatically fetched"""
    error_type = "GeneCardsError"
class SequenceAnalysisError(Error):
    """Error class if JSD/ BLOSUM metrics analysis cannot be completed for a gene"""
    error_type = "SequenceAnalysisError"

        
def write_errors(errors_fpath,gene_name,error):
    """Maintains a tsv file of gene symbols and errors generated during the run.

    errors_fpath: path of the tsv file (columns gene, error_type, error_code, error_str);
                  created with a header row if it does not exist or is empty
    gene_name: gene symbol the error belongs to
    error: object exposing error_type, code and message attributes

    If the file already holds a row with the same gene and exactly the same
    message, that stored row is printed and nothing is written. Otherwise a new
    row is appended and printed.
    """
    error_type = error.error_type
    error_code = error.code
    error_msg = error.message
    needs_header = True
    if os.path.exists(errors_fpath):
        try:
            # Read everything as text so gene symbols / messages compare as plain strings
            errors_df = pd.read_csv(errors_fpath,delimiter='\t',dtype=str,keep_default_na=False)
        except pd.errors.EmptyDataError:
            # Zero-byte file: treat it like a missing file and (re)write the header
            errors_df = None
        if errors_df is not None:
            needs_header = False
            # Exact comparison: the message is data, not a regular expression
            is_stored = (errors_df["gene"]==str(gene_name)) & (errors_df["error_str"]==str(error_msg))
            if is_stored.any():
                stored = errors_df.loc[is_stored,:].iloc[0]
                print("{0}\t{1}\t{2}\t{3}".format(stored["gene"],stored["error_type"],
                                                  stored["error_code"],stored["error_str"]))
                return
    fline = "{0}\t{1}\t{2}\t{3}\n".format(gene_name,error_type,error_code,error_msg)
    with open(errors_fpath,'at') as errors_f:
        if needs_header:
            errors_f.write("gene\terror_type\terror_code\terror_str\n")
        errors_f.write(fline)
    print(fline)

In [14]:
#Functions for reading config files, including run parameters, gene symbols list, and species lists.

def parse_config(config_file="config/config.txt"):
    """Load the run parameters from a config text file (INI format).

    config_file: path to the config file ("config/config.txt" by default)

    Returns the "DEFAULT" section of the parsed file.
    """
    from configparser import ConfigParser
    ini_parser = ConfigParser()
    ini_parser.read(config_file)
    return ini_parser["DEFAULT"]
def parse_genes(genes_path="config/genes.txt"):
    """Parses gene file into list of uppercase, whitespace trimmed gene names.

    genes_path: text file with one gene symbol per line
    Lines that are empty after trimming are skipped so they do not become
    empty gene symbols.
    """
    with open(genes_path) as genes_f:
        trimmed = (line.strip() for line in genes_f)
        genes = [gene.upper() for gene in trimmed if gene]
    return genes
def parse_species(species_path="config/v10_0_species.txt"):
    """Reads species list from file in config directory. Also returns a hashcode for the list of species used.

    species_path: text file with one species name per line
    Returns (species, hc): species is the list of whitespace-trimmed lines, hc is a
    non-negative integer derived from the concatenated species names.

    hc is computed from a SHA-1 digest rather than the built-in hash(), because
    string hashes are salted per interpreter process and would give a different
    hashcode for the same species list on every run.
    """
    import hashlib
    with open(species_path) as species_f:
        species = [spec.strip() for spec in species_f]
    concat = "".join(species)
    digest = hashlib.sha1(concat.encode("utf-8")).digest()
    # First 8 digest bytes, shifted so the value fits a non-negative signed 64-bit integer
    hc = int.from_bytes(digest[:8], "big") >> 1
    return species, hc

def odb_tablev9(species_list,table_path="odb9v1_raw/odb9v1_species.tab"):
    """Reads orthodb v9 tsv file into a DataFrame of species names/ tax_ids and other ODB information.
        Mainly used for taxid <-> species name conversions

    species_list: species names to keep; output rows follow the order of this list
    table_path: path to the header-less, tab-separated OrthoDB v9 species table
    Returns a DataFrame with the columns tax_id, odb_id and spec_name. Species
    that are not present in the table contribute no rows.
    """
    odb = pd.read_csv(table_path,delimiter="\t",header=None,names=["tax_id","odb_id","spec_name","clustered_genes","ortho_groups","mapping_type"])
    # Collect one slice per species and concatenate once (DataFrame.append is gone in pandas 2.0)
    matches = [odb[odb["spec_name"]==spec] for spec in species_list]
    filtered = pd.concat(matches) if matches else odb.iloc[0:0]
    return filtered.drop(columns=["clustered_genes","ortho_groups","mapping_type"])
def odb_tablev10(species_list,table_path="config/odb10v0_species.tab"):
    """odb10v0_species.tab:
    1.	NCBI tax id
    2.	Ortho DB individual organism id, based on NCBI tax id
    3.	scientific name inherited from the most relevant NCBI tax id
    4.	genome asssembly id, when available
    5.	total count of clustered genes in this species
    6.	total count of the OGs it participates
    7.	mapping type, clustered(C) or mapped(M)
    Reads above file into a DataFrame used for tax_id/ species name information

    species_list: species names to keep; output rows follow the order of this list
    table_path: path to the header-less, tab-separated OrthoDB v10 species table
    Returns a DataFrame with the columns tax_id, odb_id, spec_name and assembly_id.
    Species that are not present in the table contribute no rows.
    """
    odb = pd.read_csv(table_path,delimiter="\t",header=None,names=["tax_id","odb_id","spec_name","assembly_id","clustered_genes","ortho_groups","mapping_type"])
    # Collect one slice per species and concatenate once (DataFrame.append is gone in pandas 2.0)
    matches = [odb[odb["spec_name"]==spec] for spec in species_list]
    filtered = pd.concat(matches) if matches else odb.iloc[0:0]
    return filtered.drop(columns=["clustered_genes","ortho_groups","mapping_type"])

#Read config files and build the notebook-level globals used by later cells.
#Everything below runs when the cell executes and reads files under config/.
config = parse_config()
test_species = config["TestSpecies"]
#Species list path is fixed here rather than read from the config file
species_path="config/v10_0_species.txt"
spec_list, hc = parse_species(species_path)
gene_list = parse_genes(config["GenesFile"])
#tax_table: species table rows for the species named in spec_list
tax_table = odb_tablev10(spec_list)
run_name = config["RunName"]
#Creates the run directory tree so the summary paths below have a parent directory
create_run_directory(run_name)

#Summary file locations inside the run directory
errors_fpath = '{0}/summary/errors.tsv'.format(run_name)
seq_qc_fpath = '{0}/summary/seq_QC.tsv'.format(run_name)

#Set DISPLAY_PARAMS to True to print/display the loaded parameters for a manual check
DISPLAY_PARAMS = False
if DISPLAY_PARAMS:
    print("Tax table for species list at "+species_path)
    #NOTE(review): display is not imported in this file - presumably the IPython notebook builtin; confirm
    display(tax_table)
    print("Gene list: "+str(gene_list))
    print("Run Name: "+ run_name)
#Verify that species table, gene list, and run_name are correct

In [5]:
def gen_blos_df():
    """
    Builds the amino acid alphabet and BLOSUM62 lookup structures used for scoring.
    Uses BLOSUM background distribution from Capra and Singh (2007)

    Returns a 4-tuple:
      aas: list of accepted characters (20 amino acids followed by the gap character '-')
      blosum62_bg: list of background probabilities, one per non-gap amino acid in aas order
      blos_df: DataFrame of BLOSUM62 values indexed by non-gap amino acid characters on rows/ cols
      sim_matrix: blos_df.values
    Also stores aas and blosum62_bg as module-level globals.
    """
    global aas, blosum62_bg
    aas = list("ARNDCQEGHILKMFPSTWYV") + ['-']
    blosum62_bg = [
        0.078, 0.051, 0.041, 0.052, 0.024, 0.034, 0.059, 0.083, 0.025, 0.062,
        0.092, 0.056, 0.024, 0.044, 0.043, 0.059, 0.055, 0.014, 0.034, 0.072,
    ]
    residues = aas[:-1]
    blos_df = pd.DataFrame(index=residues,columns=residues)
    #blosum62 maps residue pairs to values; fill both orientations of every accepted pair
    for (first, second), val in blosum62.items():
        if first in aas and second in aas:
            blos_df.loc[first,second] = val
            blos_df.loc[second,first] = val
    return aas, blosum62_bg, blos_df, blos_df.values


In [1]:
#Acquire input data via OrthoDB API 
def ODB_query(run_name,gene_name,level_str,spec_str):
    """Queries OrthoDB via the fasta and tab API for gene_name.
    More info: https://www.orthodb.org/orthodb_userguide.html#api

    run_name: run directory; results are written to run_name/input/<gene_name>.fasta and .tsv
    gene_name: gene symbol used as the query string
    level_str: API variable for phylogenetic clade
    spec_str: API variable holding the taxonomy ids for the list of species from the config folder

    Raises OrthoDBQueryError with code 0 when the tab download parses as JSON (no results
    for the query) and with code 1 when the fasta download contains html markup (too many
    clusters to download). Both downloaded files are removed before raising.
    """
    import time, json
    from json import JSONDecodeError
    #Minimum wall-clock seconds between the start of consecutive OrthoDB requests
    MIN_REQUEST_INTERVAL = 1.0

    def throttled_wget(url, out_path):
        #Download url to out_path, then wait out the remainder of the request interval.
        #time.monotonic is used because CPU-time clocks do not count time spent waiting on wget.
        start = time.monotonic()
        proc = subprocess.run(args=['wget',url,'-O',out_path])
        remaining = MIN_REQUEST_INTERVAL - (time.monotonic() - start)
        if remaining > 0:
            time.sleep(remaining)
        return proc

    #File paths and OrthoDB urls for downloads. NOTE BASE_URL might need updating depending on ODB conventions
    BASE_URL = "https://v101.orthodb.org"
    query_str = "query={0}".format(gene_name)
    fasta_url = "{0}/fasta?{1}&{2}&{3}".format(BASE_URL,query_str,level_str,spec_str)
    fasta_path = "{0}/input/{1}.fasta".format(run_name,gene_name)
    tsv_url = "{0}/tab?{1}&{2}&{3}".format(BASE_URL,query_str,level_str,spec_str)
    tsv_path = "{0}/input/{1}.tsv".format(run_name,gene_name)
    #Obey OrthoDB download restrictions (one request per second)
    throttled_wget(fasta_url,fasta_path)
    throttled_wget(tsv_url,tsv_path)

    #JSON format returned if no results for query string - try opening downloaded data as JSON
    try:
        with open(tsv_path) as tsv_f:
            json.load(tsv_f)
        no_results = True
    except JSONDecodeError:
        no_results = False
    if no_results:
        os.remove(fasta_path)
        os.remove(tsv_path)
        raise OrthoDBQueryError(0,"No OrthoDB results for query")

    #Check if html syntax present in the first 10 lines of the fasta download (result of too
    #many clusters returned to be downloaded). The file is closed before it is removed.
    with open(fasta_path,"rt") as fasta_f:
        file_txt = "".join(fasta_f.readline() for _ in range(10))
    if bool(BeautifulSoup(file_txt,"html.parser").find()):
        os.remove(fasta_path)
        os.remove(tsv_path)
        raise OrthoDBQueryError(1,"OrthoDB search yielded too many clusters")
    #If no OrthoDBQueryError is raised, download was successful (no further action needed)
    
def download_input_data(gene_list,tax_table,config):
    """Queries OrthoDB for every gene symbol in gene_list and reports which queries worked.

    gene_list: gene symbols to query
    tax_table: DataFrame whose "tax_id" column supplies the species sent with each query
    config: config section providing "RunName", "odb_level" and the boolean "OverwriteInput"

    A gene is only queried when OverwriteInput is set or <RunName>/input/<gene>.fasta does
    not exist yet. Genes already listed with an OrthoDBQueryError in <RunName>/summary/errors.tsv
    are not queried again: the stored error is printed and the gene counts as failed. New
    query failures are logged through write_errors.

    Note that one OrthoDB input file can be generated with a large number of species whose
    sequences can later be removed from the sequence set used for alignment/ analysis,
    allowing the same input file to serve different downstream analyses with smaller species sets.

    Returns the list of gene symbols from gene_list for which OrthoDB data was successfully downloaded
    and the list of gene symbols for which the OrthoDB queries failed"""
    run_name = config["RunName"]
    errors_fpath = run_name+"/summary/errors.tsv"
    spec_str = "species="+",".join(tax_table["tax_id"].values.astype(str))
    level_str = "level="+str(config["odb_level"])
    #Previously logged OrthoDB failures are only consulted when the errors file exists
    check_error_file = os.path.exists(errors_fpath)
    if check_error_file:
        logged_errors = pd.read_csv(errors_fpath,delimiter='\t')
        ODB_errors_df = logged_errors.loc[logged_errors["error_type"]=="OrthoDBQueryError",:]
    failed_queries = []
    for gene_name in gene_list:
        fasta_path = "{0}/input/{1}.fasta".format(run_name,gene_name)
        needs_download = config.getboolean("OverwriteInput") or not os.path.exists(fasta_path)
        if not needs_download:
            continue
        if check_error_file and gene_name in ODB_errors_df["gene"].unique():
            stored_row = ODB_errors_df.loc[ODB_errors_df["gene"]==gene_name,:].values[0]
            genename,error_type,error_code,error_msg = stored_row
            print("{0}\t{1}\t{2}\t{3}".format(genename,error_type,error_code,error_msg))
            failed_queries.append(gene_name)
            continue
        try:
            ODB_query(run_name,gene_name,level_str,spec_str)
        except OrthoDBQueryError as odb_error:
            failed_queries.append(gene_name)
            write_errors(errors_fpath,gene_name,odb_error)

    print("Input queries downloaded.")
    valid_queries = [gene for gene in gene_list if gene not in failed_queries]
    return valid_queries, failed_queries


In [None]:
def download_AGS_data(input_csv_fpath, config):
    """Fills in AGS Gene IDs for the gene list csv and downloads the matching NCBI protein records.

    input_csv_fpath: csv file listing the genes (see map_AGS_geneIDs for required columns).
                     When empty/None, "config/cDNA_list_AGS_geneIDs_edited.csv" is used.
    config: config section providing "RunName"

    Writes the completed csv to <RunName>/summary/cDNA_list_AGS_geneIDs_complete.csv, logs
    mapping failures to <RunName>/summary/NCBI_errors.tsv and saves fasta files into
    <RunName>/input/NCBI.

    Returns the DataFrame returned by download_NCBI_records for the rows that have an
    AGS Gene ID.
    """
    FIELD_NAME = "AGS Gene ID"
    DEFAULT_ID_CSV_FPATH = "config/cDNA_list_AGS_geneIDs_edited.csv"
    run_name = config["RunName"]
    NCBI_input_dir = "{0}/input/NCBI".format(run_name)
    create_directory(NCBI_input_dir)

    unfilled_ID_csv_fpath = input_csv_fpath or DEFAULT_ID_CSV_FPATH
    filled_outpath = "{0}/summary/cDNA_list_AGS_geneIDs_complete.csv".format(run_name)
    NCBI_errors_fpath = "{0}/summary/NCBI_errors.tsv".format(run_name)

    geneID_df = map_AGS_geneIDs(unfilled_ID_csv_fpath,filled_outpath,NCBI_errors_fpath)
    #Only rows with a Gene ID can be used to fetch protein records
    AGS_gene_id_df = geneID_df.loc[~geneID_df[FIELD_NAME].isnull(),:]
    AGS_gene_id_df = download_NCBI_records(AGS_gene_id_df,NCBI_input_dir)
    return AGS_gene_id_df
    
def map_AGS_geneIDs(csv_inpath, results_outpath, errors_tsv_path):
    """Reads a csv file at specified path, generates a new DataFrame containing AGS Gene IDs when available.

    csv_inpath: csv file containing list of genes and other identifying information. Required columns are
    "Gene Symbol", "Human Gene ID", and FIELD_NAME (AGS Gene ID).
    results_outpath: path the completed table is written to as csv
    errors_tsv_path: tsv file that receives one row per gene that could not be mapped (via write_errors)

    For each row in the csv file that lacks an AGS Gene ID, attempts to fetch NCBI Gene ID for AGS
    using ortholog information. Failures are logged as SequenceDataEx entries with these codes:
      0 - Human Gene ID is missing
      1 - the NCBI ortholog page offers no (usable) AGS ortholog
      2 - the ortholog entry element was not found on the page
    Returns a DataFrame object corresponding to the original csv file with all possible entries of FIELD_NAME
    populated with the ortholog Gene ID numbers.

    To repeat this sequence fetching for another species, adjust AGS_TAX_ID to the corresponding species'
    NCBI Taxonomy ID (can be found on Taxonomy Browser)
    """
    from selenium.webdriver.chrome.options import Options
    from selenium.common.exceptions import NoSuchElementException

    FIELD_NAME = "AGS Gene ID"
    AGS_TAX_ID = "9999"
    field_conv_dict = {"Human Gene ID":str,FIELD_NAME:str}
    csv_df = pd.read_csv(csv_inpath,dtype=field_conv_dict)

    missing_AGS_gid = csv_df.loc[csv_df[FIELD_NAME].isnull(),:]

    def log_error(symbol, code, message):
        #write_errors expects (path, gene symbol, error object)
        write_errors(errors_tsv_path,symbol,SequenceDataEx(code,message))

    if len(missing_AGS_gid) > 0:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        WINDOW_SIZE = "1920,1080"
        chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)
        driver = webdriver.Chrome(options=chrome_options)
        try:
            for idx,row in missing_AGS_gid.iterrows():
                symbol = row["Gene Symbol"]
                hgid = row["Human Gene ID"]
                if pd.isnull(hgid):
                    log_error(symbol,0,"No Human GeneID present in data")
                    continue
                try:
                    req_url = "https://www.ncbi.nlm.nih.gov/gene/{0}/ortholog/?scope={1}".format(hgid,AGS_TAX_ID)
                    driver.get(req_url)
                    result_url = driver.current_url
                    #The scope parameter only survives in the final url when AGS orthologs are listed
                    if re.search("scope={0}".format(AGS_TAX_ID),result_url):
                        entry_xpath = "//tbody/tr/td[@class=' fld-gene']/a"
                        entry = driver.find_element(By.XPATH,entry_xpath)
                        entry_href = entry.get_attribute('href')
                        gid_match = re.search(r"/gene/(\d*)",entry_href or "")
                        if gid_match and gid_match.group(1):
                            csv_df.loc[idx, FIELD_NAME] = gid_match.group(1)
                        else:
                            log_error(symbol,1,"No AGS ortholog present for GeneID {0}".format(hgid))
                    else:
                        log_error(symbol,1,"No AGS ortholog present for GeneID {0}".format(hgid))
                except NoSuchElementException:
                    log_error(symbol,2,"No Orthologs present for GeneID {0}".format(hgid))
        finally:
            #Always shut the browser down, even if a page load raises
            driver.quit()

    csv_df.to_csv(results_outpath)
    return csv_df

def download_NCBI_records(AGS_gene_id_df, NCBI_records_dirpath):
    """Downloads NCBI protein records for each NCBI Gene ID listed in AGS_gene_id_df.

    AGS_gene_id_df: DataFrame object with required columns 'Gene Symbol' and 'AGS Gene ID.' Gene symbol
    entries are used to name fasta files downloaded; AGS Gene IDs are queried using Entrez elink
    to 1) match Gene ID to all corresponding Protein IDs and 2) download those Protein IDs into one fasta
    file per gene symbol, saved into NCBI_records_dirpath
    NCBI_records_dirpath: directory receiving the <Gene Symbol>_AGS.fasta files

    A gene is skipped when its fasta file already exists, unless its symbol is listed in
    OVERWRITE_FASTAS. When elink returns no protein IDs, no fasta file is downloaded.

    Adds the comma-separated list of NCBI Protein record IDs to AGS_gene_id_df in the column "Prot_UIDs"
    and returns AGS_gene_id_df
    """
    import urllib.request
    import xml.etree.ElementTree as ET

    FIELD_NAME = "AGS Gene ID"
    ENTREZ_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    #NOTE(review): an API key is hard-coded in source. Prefer supplying it through the
    #NCBI_API_KEY environment variable; the literal is only kept as a fallback so existing
    #setups keep working, and should be removed (and the key rotated) once that is in place.
    NCBI_API_KEY = os.environ.get("NCBI_API_KEY","24a8e1dd4d64cf50b37f0bdd369af8274309")
    OVERWRITE_FASTAS = ["ATPIF1"]
    #Convert Gene ID to list of Protein IDs corresponding to transcript variant sequences
    for idx,row in AGS_gene_id_df.iterrows():
        symbol = row["Gene Symbol"]
        print("==={0}===".format(symbol))
        fasta_fpath = "{0}/{1}_AGS.fasta".format(NCBI_records_dirpath,symbol)
        if os.path.exists(fasta_fpath) and symbol not in OVERWRITE_FASTAS:
            continue
        AGS_gid = row[FIELD_NAME]
        print("AGS Gene ID: {0}".format(AGS_gid))
        elink_req = "elink.fcgi?dbfrom=gene&db=protein&id={0}&api_key={1}".format(AGS_gid,NCBI_API_KEY)
        gp_elink_url = ENTREZ_BASE_URL+elink_req

        with urllib.request.urlopen(gp_elink_url) as response:
            xml_data = response.read()

        root = ET.fromstring(xml_data)
        #Check XML formatting of elink pages - update xpath accordingly if functionality breaks
        #Pulls Record IDs for Protein specifically; use gene_protein_refseq for Protein RefSeqs
        protein_IDs = [link.text for link in root.findall(".//LinkSetDb[LinkName='gene_protein']/Link/Id")]
        id_str = ','.join(protein_IDs)
        AGS_gene_id_df.loc[idx,"Prot_UIDs"] = id_str
        if not protein_IDs:
            #An efetch request without ids cannot return sequences; skip the download
            print("No protein records linked to Gene ID {0}".format(AGS_gid))
            continue

        efetch_req = "efetch.fcgi?db=protein&id={0}&rettype=fasta&retmode=text&api_key={1}".format(id_str,NCBI_API_KEY)
        efetch_url = ENTREZ_BASE_URL + efetch_req
        subprocess.run(args=['wget',efetch_url,'-O',fasta_fpath])
    return AGS_gene_id_df

In [7]:

def format_odb_field(field):
    """Remove spaces, commas, new line characters and capitalization from alias/ odb fields
    to search for string matches.

    field: string to normalize, or a float NaN marking an empty field
    Returns the normalized lowercase string for string input and an empty string for
    NaN (Python or numpy float). Any other input returns None.
    """
    if isinstance(field, str):
        return field.replace(" ","").replace(",","").replace("\n","").lower()
    #isinstance also accepts numpy floating scalars, which an exact type() comparison rejects
    if isinstance(field, (float, np.floating)) and np.isnan(field):
        return ""
    return None
    
def write_aliases_f(aliases,aliases_fpath):
    """Write aliases data from GeneCards to a txt file list at aliases_fpath.

    aliases: iterable of alias strings; each is written whitespace-trimmed on its own line
    aliases_fpath: output path; its parent directory is created if missing and an
                   existing file is overwritten
    """
    parent_dir = os.path.dirname(aliases_fpath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(aliases_fpath,'wt') as aliases_f:
        for a in aliases:
            aliases_f.write(a.strip()+'\n')
    
def download_alias_data(gene_name):
    """Queries GeneCards for alias data for gene_name. Should only be called if aliases_fpath doesn't exist
    (ie if query has not been previously run and written to file). Attempts GeneCards query - if gene_name
    leads to a single entry page, pulls aliases from page html. Otherwise loads the GeneCards keyword
    search for gene_name and checks all linked entries to see if any contain gene_name. If none do
    (or a WebDriver error occurs while loading or reading pages), raises a GeneCardsError
    (code 0: no matching entry found, code 1: WebDriver failure).

    If query was successful (either single result page or successfully chose linked result from query results),
    return aliases and gc_name (the gene identifier used by GeneCards). gc_name stored separately since alias html
    extraction will miss it otherwise. Also writes alias data to aliases_fpath

    Updated 01/10/2020. If function is consistently failing, check xpath class names against the GeneCards website"""
    from selenium.webdriver.chrome.options import Options

    aliases_fpath = "aliases_data/"+gene_name+"_aliases.txt"
    gene_cards_url = "https://www.genecards.org/cgi-bin/carddisp.pl?gene={0}".format(gene_name.upper())
    query_url = "https://www.genecards.org/Search/Keyword?queryString={0}".format(gene_name)
    list_xpath = "//ul[@class='list-unstyled list-spacious']/li"
    elem_xpaths = [list_xpath]
    links_xpath = "//td[@class='gc-gene-symbol gc-highlight symbol-col']/a"
    gc_name_pat = "gene=([A-Z0-9]+)"

    def scrape_aliases(driver):
        #Trimmed, non-empty alias strings listed on the page currently loaded in driver
        page_aliases = []
        for xpath in elem_xpaths:
            for elem in driver.find_elements(By.XPATH,xpath):
                markup = elem.get_attribute("innerHTML")
                text_node = BeautifulSoup(markup,"html.parser").find(text=True)
                if text_node is not None and text_node.strip():
                    page_aliases.append(text_node.strip())
        return page_aliases

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    WINDOW_SIZE = "1920,1080"
    chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)
    driver = webdriver.Chrome(options=chrome_options)
    found = None
    try:
        driver.get(gene_cards_url)
        aliases = scrape_aliases(driver)
        if len(aliases) > 0:
            #Means gene_name query to GeneCards autoredirected to a single page - normal aliases scraping
            gc_re = re.search(gc_name_pat,gene_cards_url)
            gc_name = gc_re.groups()[0].strip() if gc_re else gene_name.upper()
            if gc_name not in aliases:
                aliases.insert(0,gc_name)
            found = (aliases, gc_name)
        else:
            #Try search results page for gene_name and check each linked entry for an alias
            #matching gene_name. hrefs are collected first because navigating away from the
            #results page invalidates its link elements.
            driver.get(query_url)
            hrefs = [link.get_attribute("href") for link in driver.find_elements(By.XPATH,links_xpath)]
            for elem_href in hrefs:
                if not elem_href:
                    continue
                driver.get(elem_href)
                gc_re = re.search(gc_name_pat,driver.current_url)
                if gc_re is None:
                    continue
                elem_gc_name = gc_re.groups()[0].strip()
                elem_aliases = scrape_aliases(driver)
                if gene_name in elem_aliases or gene_name == elem_gc_name:
                    #Found query result with gene_name
                    if elem_gc_name not in elem_aliases:
                        elem_aliases.insert(0,elem_gc_name)
                    found = (elem_aliases, elem_gc_name)
                    break
    except WebDriverException as wd_error:
        raise GeneCardsError(1,"WebDriver failure while fetching alias data from GeneCards") from wd_error
    finally:
        #Always shut the browser down, whichever path was taken
        driver.quit()

    if found is None:
        #Either empty search results page, or none of the results correspond to gene_name
        raise GeneCardsError(0,"Could not automatically fetch alias data from GeneCards - consider searching manually")
    #Cache aliases to aliases_fpath
    write_aliases_f(found[0],aliases_fpath)
    return found

def find_ref_seqs(gene_name, tsv_df,errors_fpath):
    """Returns a list of the orthodb ids of the reference sequences from an OrthoDB tsv_df and a set containing
    gene_name and the GeneCards primary alias for gene_name (if it differs from gene_name)
    These reference sequences are defined as records with pub_gene_id, og_name, or description containing
    (after format_odb_field normalization) either gene_name or one of the GeneCards listed aliases for gene_name.
    Alias data is fetched from GeneCards automatically and stored in the aliases_data directory as text file lists.

    gene_name: gene symbol being analysed
    tsv_df: OrthoDB table with the columns pub_gene_id, og_name and description; its index
            values are what is returned in ref_ids
    errors_fpath: errors tsv that receives a GeneCardsError entry if aliases cannot be fetched

    Aliases are matched as literal substrings (not regular expressions) and empty aliases are
    ignored, since an empty alias would match every record.
    """
    ref_ids = []
    aliases_fpath = "aliases_data/"+gene_name+"_aliases.txt"

    cached_aliases = []
    if os.path.exists(aliases_fpath):
        with open(aliases_fpath,'r') as aliases_f:
            cached_aliases = aliases_f.readlines()

    #If file doesn't exist or was improperly downloaded (empty or only one line), repeat
    #fetching alias names
    if len(cached_aliases) <= 1:
        try:
            aliases, gc_name = download_alias_data(gene_name)
            matches = set((gene_name.upper(),gc_name.upper()))
        except GeneCardsError as gc_error:
            aliases = [gene_name]
            matches = set((gene_name.upper(),))
            write_errors(errors_fpath,gene_name,gc_error)
    else:
        #Use aliases information previously downloaded from GeneCards; first line is the GeneCards name
        aliases = cached_aliases
        gc_name = cached_aliases[0].strip()
        matches = set((gene_name.upper(),gc_name.upper()))
    #Remove spaces, commas, new line chars, and capitalization from alias strings
    formatted_aliases = [alias for alias in (format_odb_field(a) for a in aliases) if alias]
    #Iterate tsv_df rows, save all reference ids which have an alias inside one of the search fields
    search_fields = ["pub_gene_id","og_name","description"]
    seen_ids = set()
    for idx,row in tsv_df.iterrows():
        if idx in seen_ids:
            continue
        #Normalize each field once per row instead of once per alias
        formatted_fields = [format_odb_field(str(row[field])) for field in search_fields]
        if any(alias in field for alias in formatted_aliases for field in formatted_fields):
            seen_ids.add(idx)
            ref_ids.append(idx)
    #matches is a set of strings used to filter reference sequences in pg_id_df; either
    #one entry or gene_name and then the entry used by the GeneCards page
    return ref_ids, matches


In [8]:
#Fasta file reading functions: 
#filter_fasta_infile reads input files and outputs all records corresponding to filtered_ids to a new file
#Remaining functions provide conversions between fasta files, pandas Series, and pandas dataframes 
#having alignment positions as columns 

def filter_fasta_infile(filtered_ids, infile_path, outfile_path=None,ordered=False):
    """Generates new fasta file to outfile_path using the subset of sequences in infile_path
    which have ids in filtered_ids

    filtered_ids: record ids to keep
    infile_path: fasta file to read
    outfile_path: if provided, the filtered records are written to this path in fasta format
    ordered: if true, sequences will be written in order of filtered_ids (first record found per id)
             if false, uses sequence order of sequences in infile_path

    Returns a Series indexed by filtered_ids holding the sequence strings; ids that were not
    found in infile_path keep a NaN value.
    """
    #Parse the input once and close it promptly
    with open(infile_path) as fasta_f:
        if ordered:
            first_by_id = {}
            for record in SeqIO.parse(fasta_f,'fasta'):
                first_by_id.setdefault(record.id, record)
            filtered = [first_by_id[id_] for id_ in filtered_ids if id_ in first_by_id]
        else:
            filtered = [record for record in SeqIO.parse(fasta_f,'fasta') if record.id in filtered_ids]
    if outfile_path:
        SeqIO.write(filtered,outfile_path,"fasta")
    #object dtype: the Series stores sequence strings, not numbers
    filtered_srs = pd.Series(index=filtered_ids,dtype=object)
    for record in filtered:
        filtered_srs[record.id] = str(record.seq)
    return filtered_srs


def srs_to_fasta(seq_srs, outfile_path):
    """Write records in seq_srs to outfile_path in fasta format.

    seq_srs: Series of sequence strings; each index label becomes the record id
    outfile_path: path of the fasta file to write
    """
    #Series.items is available in every pandas version (iteritems was removed in pandas 2.0)
    records = (SeqRecord(Seq(seq,IUPAC.protein),id=idx) for idx, seq in seq_srs.items())
    SeqIO.write(records,outfile_path,"fasta")

def fasta_to_srs(fasta_path):
    """Read a fasta file into a Series named "seq" mapping record id -> sequence string.

    Records keep their file order; if an id occurs more than once, the last
    sequence with that id is the one stored.
    """
    id_seq_map = OrderedDict()
    with open(fasta_path) as fasta_f:
        for fasta in SeqIO.parse(fasta_f,'fasta'):
            id_seq_map[fasta.id] = str(fasta.seq)
    return pd.Series(name="seq",data=id_seq_map,dtype=object)

def align_srs_to_df(align_srs):
    """Returns DataFrame object from series of aligned sequences; columns are 1-indexed positions.

    align_srs: Series of aligned sequence strings, all of the same length
    Values are characters in alignment, index is the index of align_srs.
    An empty Series gives an empty DataFrame. Raises ValueError if the sequences
    do not all have the length of the first one.
    """
    seq_len = len(align_srs.iloc[0]) if len(align_srs) else 0
    rows = []
    for idx, seq in align_srs.items():
        if len(seq) != seq_len:
            raise ValueError("Aligned sequence {0} has length {1}, expected {2}".format(idx,len(seq),seq_len))
        rows.append(list(seq))
    if seq_len == 0:
        return pd.DataFrame(index=align_srs.index,columns=range(1,1))
    return pd.DataFrame(rows,index=align_srs.index,columns=range(1,seq_len+1),dtype=object)

def seq_srs_to_align_df(seq_srs,align_in_fpath,align_out_fpath):
    """Align the sequences in seq_srs and return them as an alignment DataFrame.

    seq_srs: pandas Series containing sequence texts
    align_in_fpath: path the unaligned sequences are written to as fasta (alignment input)
    align_out_fpath: path passed to construct_id_dm for the aligned fasta output

    Returns (align_df, dist_srs): align_df has one column per alignment position (built by
    align_srs_to_df) and dist_srs is the result of avg_dist_srs for the aligned records
    (average non-diagonal identity distances).
    """
    srs_to_fasta(seq_srs,align_in_fpath)
    #Only the distance matrix and the aligned sequences of the returned 4-tuple are used here
    _n, _ordered_ids, id_dm, align_srs = construct_id_dm(seq_srs,align_in_fpath,align_out_fpath)
    return align_srs_to_df(align_srs), avg_dist_srs(align_srs.index,id_dm)
    
def align_srs_to_seq_srs(align_srs,outfile_path=None):
    """Strip gap characters from a Series of aligned sequences.

    align_srs: pandas Series of aligned sequence strings indexed by record id
    outfile_path: optional; if provided, the un-gapped sequences are also written to this path
    in fasta format via srs_to_fasta

    Returns a new Series (same index as align_srs) holding each sequence with every "-" removed.
    """
    # Built positionally with an explicit object dtype: no float placeholder Series and no
    # overwriting between rows that share an index label.
    seq_srs = pd.Series([align_seq.replace("-","") for align_seq in align_srs],
                        index=align_srs.index,dtype=object)
    if outfile_path:
        srs_to_fasta(seq_srs,outfile_path)
    # Return the whole Series (previously only the last un-gapped string was returned)
    return seq_srs

def align_df_to_srs(align_df):
    """Collapse a table of alignment positions back into aligned sequence strings.

    align_df: DataFrame with one row per record and one single-character value per alignment
    position (the layout produced by align_srs_to_df)

    Returns a Series (same index as align_df, object dtype) in which each value is the row's
    characters joined with no delimiter, i.e. the aligned sequence including gaps.
    """
    # Rows are joined positionally so records sharing an index label are not overwritten, and
    # the explicit dtype avoids creating a float placeholder Series first.
    joined = [''.join(row) for row in align_df.itertuples(index=False,name=None)]
    return pd.Series(joined,index=align_df.index,dtype=object)


In [9]:
def load_ka_distmat(fasta_infile,align_outfile="tmp/ka_distmat_align.fasta",distmat_file="tmp/ka_distmat.tsv"):
    """REQUIRES: Modified KAlign source code to include distance matrix output, written to tmp/ka_distmat.tsv
    Uses KAlign's modified distance metric (Wu-Manber, partial scoring for 3aa patterns with one error 
    tolerated), outputs to tsv and reads and stores as an ndarray (n x n) with n = number of sequences in fasta_infile
    
    fasta_infile: fasta file of unaligned sequences passed to kalign with -i
    align_outfile: path kalign writes the fasta alignment to (-o)
    distmat_file: whitespace-delimited distance matrix file that is read back after kalign runs
    
    Returns n (number of non-blank rows in distmat_file), ordered_ids (record ids in the order they 
    appear in align_outfile), distmat (n x n float ndarray) and align_outfile.
    """
    # NOTE(review): kalign's exit status is not inspected, so a failed run would fall through to
    # whatever align_outfile/distmat_file already contain - confirm whether check=True is safe to add.
    subprocess.run(args=["kalign",'-i',fasta_infile,"-o",align_outfile,"-f","fasta"])
    with open(distmat_file) as distmat_f:
        #Blank lines (e.g. a trailing newline) are not matrix rows
        distmat_rows = [line.split() for line in distmat_f if line.strip()]
    n = len(distmat_rows)
    distmat = np.empty((n,n))
    for i,row in enumerate(distmat_rows):
        #builtin float: the np.float alias was removed in NumPy 1.24
        distmat[i] = np.array(row).astype(float)
    with open(align_outfile) as align_f:
        ordered_ids = [record.id for record in SeqIO.parse(align_f,'fasta')]
    return n, ordered_ids, distmat, align_outfile
def construct_id_dm(seq_df,seq_fpath,align_outpath="tmp/iddm_align.fasta",ordered=False):
    """
    seq_df: DataFrame of OrthoDB/ NCBI sequence records; should only contain records for which identity 
    distance matrix will be computed (only seq_df.index is read)
    seq_fpath: Path of fasta file containing at least all of the records in seq_df. Can contain more records -
    a temporary file (tmp/iddm.fasta) containing only the records in seq_df.index will be generated 
    align_outpath: Optional filepath. If provided, the resulting alignment will be stored there. Otherwise,
    written to a temporary file (tmp/iddm_align.fasta)
    ordered: forwarded unchanged to filter_fasta_infile
    
    Returns n (number of aligned sequences), ordered_ids (list of record ids in alignment), 
    id_dm (2-D np.ndarray corresponding to the identity distance matrix computed by DistanceCalculator), 
    and align_srs (pandas Series object containing aligned sequences)
    """
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio import AlignIO
    #Filter records in seq_fpath to new fasta only containing records in seq_df.index
    filtered_outpath = "tmp/iddm.fasta"
    filter_fasta_infile(seq_df.index,seq_fpath,outfile_path=filtered_outpath,ordered=ordered)
    #KAlign sequences in filtered_outpath, write to align_outpath; the KAlign distance matrix itself is not used
    n, ordered_ids, _ka_dm, _align_outfile = load_ka_distmat(filtered_outpath,align_outfile=align_outpath)
    align_srs = fasta_to_srs(align_outpath)
    #Close the alignment handle as soon as it has been parsed
    with open(align_outpath) as align_f:
        aln = AlignIO.read(align_f, 'fasta')
    id_dm_obj = DistanceCalculator('identity').get_distance(aln)
    #Convert the distance matrix object to np.ndarray in a single pass; indexing by row number gives
    #the same full rows that iterating the object yields, and the result is 2-D even for one record
    id_dm = np.array([id_dm_obj[i] for i in range(len(id_dm_obj))])
    return n, ordered_ids, id_dm, align_srs
def avg_dist_srs(index,distmat):
    """Average each record's distance to all other records in distmat.

    index: pandas Index whose entries correspond to the rows of distmat (lengths should be equal)
    distmat: square matrix of pairwise distances

    Each row is summed and divided by (number of rows - 1); since self distances on the diagonal
    are generally force-set to 0, this gives the mean of the non-self distances.
    Returns a Series named "dist" indexed by index.
    """
    n_other = len(distmat) - 1
    mean_dists = np.sum(distmat, axis=1) / n_other
    return pd.Series(data=mean_dists,index=index,name="dist")



In [1]:
def filter_ref_seqs4(gene_name,matches,ref_fasta_path,ref_df,seq_qc_fpath,known_spec_list=["10090_0","9606_0","43179_0"],length_filter=True,max_len_diff=0.1):
    """
    Returns a DataFrame of records (at most one per species) corresponding to best internal matches for gene_name
    
    gene_name: gene symbol identifier 
    matches: set containing gene_name and GeneCards accepted alias for gene_name if different
    ref_fasta_path: file path for fasta file containing all reference sequences 
    ref_df: DataFrame of reference records; the "organism_taxid", "pub_gene_id" and "length" columns are read here
    seq_qc_fpath: pseudo error log for records containing significant length differences or lacking supporting
                  OrthoDB field information for matches with gene_name
    known_spec_list: optional, contains tax_ids for species considered core to analysis 
    (generally species of interest and then well-annotated species i.e. human/mouse)
    length_filter: optional, if True (default) drop records whose length differs too much from the median
    max_len_diff: optional, maximum allowed relative difference from the median length (default 0.1, i.e. 10%)
    
    Briefly: first records for human, mouse, and test species (13LGS) are filtered independently 
    with select_known_species_records, ultimately selecting a set of one record per species 
    that most consistently match the records (measured by percent identity in alignment) 
    present for the given gene symbol. 
    For other species, sequences with the highest identity to the established human/mouse/13LGS sequences
    are selected but discarded if they do not meet the identity threshold (see select_species_records) or
    if the length of the sequence is > 10% different from the median length. 
    
    """
    ksr_ref_df = ref_df.loc[ref_df["organism_taxid"].isin(known_spec_list)]
    #Create new tmp fasta file with known species annotated records only
    ks_refseqs_fpath = "tmp/ks_refseqs.fasta"
    filter_fasta_infile(ksr_ref_df.index,ref_fasta_path,outfile_path=ks_refseqs_fpath)
    
    #Search pub_gene_id field for matches to main aliases (matches): alias at the end of the field or 
    #followed by ';'. Aliases are escaped so symbols containing regex metacharacters (parentheses, dots) 
    #are matched literally.
    upper_matches = ["{0}$|{0}[;]".format(re.escape(match.upper())) for match in matches]
    matches_pat = "|".join(upper_matches)
    #na=False: records without a pub_gene_id count as non-matches instead of breaking the boolean mask
    ksr_pgid_df = ksr_ref_df.loc[ksr_ref_df["pub_gene_id"].str.upper().str.contains(matches_pat,na=False)]
    final_ksr_df = select_known_species_records(ksr_pgid_df,ksr_ref_df,ks_refseqs_fpath)
    #Quality checking of final_ksr_df:
    final_ksr_df_QC(gene_name,matches,seq_qc_fpath,final_ksr_df,known_spec_list)
    ref_pgid_df = ref_df.loc[ref_df["pub_gene_id"].str.upper().str.contains(matches_pat,na=False)]
    final_df = select_species_records(ref_pgid_df,ref_df,final_ksr_df,ref_fasta_path)
    if length_filter:
        #Remove records whose length differs by more than max_len_diff from the median (but keep 
        #representative seqs for human, mouse, GS)
        med_len = final_df["length"].median()
        within_tolerance = np.abs((final_df["length"]-med_len)/med_len) < max_len_diff
        is_known_species = final_df["organism_taxid"].isin(known_spec_list)
        final_df = final_df.loc[within_tolerance | is_known_species,:]
    return final_df

def select_known_species_records(ksr_pgid_df,ksr_ref_df,ks_refseqs_fpath):
    """Returns a DataFrame of best records available for mouse, 13LGS, and human.
    
    ksr_pgid_df: DataFrame of known species (mouse, 13LGS, human) records for which pub_gene_id matched gene_name
    ksr_ref_df: DataFrame of known species records meeting the match criteria established in find_ref_seqs
    ks_refseqs_fpath: File path to a fasta containing the sequences corresponding to ksr_ref_df
    
    Return a dataframe of at most one record per species in KS_TAXIDS, selecting first if there is only 
    one pubgene_id match record for that species and otherwise selecting the record with maximum identity
    with other single pgid matched records, preferably from pgid_df but from ksr_ref_df if no pgid match exists
    for that species.
    If no species has only one record with a pubgene_id_match, takes manual user input to select a 
    representative record to base other record selection off of.
    
    Raises SequenceDataError (code 3) if the only known species with records is the test species, and 
    SequenceDataError (code 2) if manual selection is needed but both input DataFrames are empty.
    
    NOTE(review): rows are accumulated with DataFrame.append, which was removed in pandas 2.0; this 
    function needs pandas < 2.0 until that is migrated to pd.concat."""
    TEST_ID = "43179_0"
    KS_TAXIDS = ["10090_0","43179_0","9606_0"]
    ksr_taxid_uniques = ksr_ref_df["organism_taxid"].unique()
    pgid_taxid_uniques = ksr_pgid_df["organism_taxid"].unique()
    
    #Case Handling: No human or mouse reference sequences for gene
    if len(ksr_taxid_uniques) == 1 and TEST_ID in ksr_taxid_uniques:
        raise SequenceDataError(3,"No Human or Mouse reference sequences")
            
    #Case Handling - if ksr_pgid_df is not composed of one sequence each for human, mouse, 13LGS
    if len(ksr_pgid_df) > 3 or len(pgid_taxid_uniques) <3:
        #Species with exactly one pgid matched record serve as the trusted references
        single_match_pgid_records = pd.DataFrame(columns=ksr_pgid_df.columns)
        for ks_id in KS_TAXIDS:
            ks_pgid_df = ksr_pgid_df.loc[ksr_pgid_df["organism_taxid"]==ks_id,:]
            if len(ks_pgid_df) == 1:
                single_match_pgid_records = single_match_pgid_records.append(ks_pgid_df)
                
        if single_match_pgid_records.empty:
            #Case handling if no single_match_records (ie CALM1): 
            #Allow manual input selection from either ksr_pgid_df or ksr_ref_df, raise SequenceDataError if both empty
            if not ksr_pgid_df.empty: 
                print("pubgeneID matched records")
                display(ksr_pgid_df)
                selection_df = ksr_pgid_df
            elif not ksr_ref_df.empty:
                print("GeneCards alias matched records")
                display(ksr_ref_df)
                selection_df = ksr_ref_df
            else:
                raise SequenceDataError(2,"No GeneCards alias matched sequence records for human/mouse/GS")
            try: 
                input_idx = input("Enter 0-indexed position of representative sequence for analysis")
                int_idx = int(input_idx)
                selection_row = selection_df.iloc[int_idx,:]
                
            except (IndexError, ValueError):
                #Non-numeric or out-of-range answer: keep asking until a valid row position is given
                print("Bad Input")
                while (not re.search(r"^\d+$",input_idx)) or (int(input_idx)>=len(selection_df) or int(input_idx)<0):
                    input_idx = input("Enter a number between 0 and {0}".format(len(selection_df)-1))
                int_idx = int(input_idx)
                selection_row = selection_df.iloc[int_idx,:]
            single_match_pgid_records = single_match_pgid_records.append(selection_row)

        final_ksr_df = pd.DataFrame(columns=single_match_pgid_records.columns)
        sm_record_ids = single_match_pgid_records.index                   
            
        for ks_id in KS_TAXIDS:
            if ks_id not in single_match_pgid_records["organism_taxid"].unique():
            #For known_species taxids with 0 or 2+ pgid records: first check pgid matches (2+ pgid), then ref seqs (0 pgid)
                if ks_id in pgid_taxid_uniques:
                    #2+ pgid records: Construct id_dm of pgid matched records; select best ks_id 
                    #based on max identity with single_match_records
                    n, ordered_ids, id_dm, align_srs = construct_id_dm(ksr_pgid_df,ks_refseqs_fpath)
                elif ks_id in ksr_taxid_uniques:
                    #0 pgid records; construct id_dm from ksr_ref_df records, select max identity record
                    n, ordered_ids, id_dm, align_srs = construct_id_dm(ksr_ref_df,ks_refseqs_fpath)
                else:
                    #No records at all for this species
                    continue
                #Maximum identity = minimum id_dm value based on AlignIO implementation
                md_row, min_dist = min_dist_spec_record(ks_id,id_dm,ordered_ids,sm_record_ids,ksr_ref_df)
                final_ksr_df = final_ksr_df.append(md_row)
            else:
                sm_row = single_match_pgid_records.loc[single_match_pgid_records["organism_taxid"]==ks_id,:]
                final_ksr_df = final_ksr_df.append(sm_row)
        return final_ksr_df
                
    else:
        #ksr_pgid_df has only one record each for mouse, 13LGS, human
        #Sort ksr_pgid_df to order of taxids in KS_TAXIDS
        final_ksr_df = pd.DataFrame(columns=ksr_pgid_df.columns)
        for tax_id in KS_TAXIDS:
            row = ksr_pgid_df.loc[ksr_pgid_df["organism_taxid"]==tax_id,:]
            final_ksr_df = final_ksr_df.append(row)
        return final_ksr_df
    
def select_species_records(ref_pgid_df,ref_df,final_ksr_df,refseqs_fpath):
    """
    Selects species records for secondary species (non human/ mouse/ 13LGS) from OrthoDB query input.
    
    ref_pgid_df: DataFrame of sequence records from OrthoDB for which pub_gene_id matched specified gene symbol
    ref_df: DataFrame of sequence records from OrthoDB input which match specifications in find_ref_seqs
    final_ksr_df: DataFrame of human/13LGS/mouse records selected by select_known_species_records
    refseqs_fpath: Fasta file path for fasta containing all records in ref_df.
    
    First constructs alignment of final_ksr_df records, sets acceptable identity threshold to be the average of
    non-diagonal entries in distance_matrix * 1.5. For each other species represented in ref_df, select minimum 
    distance record to HS/GS/MM sequences and add to final sequence set if it is under the identity threshold
    
    Returns a DataFrame starting as a copy of final_ksr_df with one accepted row appended per other species.
    Raises SequenceDataError (code 5) if a ValueError occurs while processing a species.
    """
    # NOTE(review): TEST_ID and pgid_taxids (and therefore ref_pgid_df) are assigned but never read below.
    TEST_ID = "43179_0"
    KS_TAXIDS = ["10090_0","43179_0","9606_0"]
    pgid_taxids = [tax_id for tax_id in ref_pgid_df["organism_taxid"].unique() if tax_id not in KS_TAXIDS]
    ref_taxids = [tax_id for tax_id in ref_df["organism_taxid"].unique() if tax_id not in KS_TAXIDS]
    
    #Distance calculations for final set of known species records - check internal identity values
    #Set identity threshold - other species sequences above this value will not be included
    ksr_dm_fpath = "tmp/ksr_dm_ka.fasta"
    n, ksr_ordered_ids, ksr_id_dm, ksr_align_srs = construct_id_dm(final_ksr_df,refseqs_fpath,ksr_dm_fpath,ordered=True)
    #Column sums divided by (n-1): average distance of each known species record to the others
    # NOTE(review): n == 1 (a single known species record) makes this a division by zero - confirm
    # whether that case can reach this function.
    non_diagonal_avg = ksr_id_dm.sum(axis=0)/(n-1)
    # NOTE(review): max_dist_idx / max_dist_id / max_dist are computed but not used afterwards.
    max_dist_idx = non_diagonal_avg.argmax()
    max_dist_id = ksr_ordered_ids[max_dist_idx]
    max_dist = non_diagonal_avg[max_dist_idx]
    identity_threshold = np.mean(non_diagonal_avg)*1.5
    
    final_df = final_ksr_df.copy()
    # NOTE(review): unfiltered_df is never read after this assignment.
    unfiltered_df = final_ksr_df.copy()
    for tax_id in ref_taxids:
        try:
            tax_df = ref_df.loc[ref_df["organism_taxid"]==tax_id]
            tax_records = tax_df.index
            #tax_dm_filtered_ids: list of record ids in final_ksr_df followed by all records corresponding to tax_id
            tax_dm_filtered_ids = list(final_ksr_df.index)
            tax_dm_filtered_ids.extend(tax_records) 
            tax_dm_df = ref_df.loc[tax_dm_filtered_ids,:]
            
            #Keep only the first row for any record id that appears more than once
            tax_dm_df = tax_dm_df.loc[~tax_dm_df.index.duplicated(keep='first')]
            #The alignment for this species is written to its own tmp file
            tax_dm_fpath = "tmp/{0}_dm.fasta".format(tax_id)
            n, tax_ordered_ids, tax_id_dm, tax_align_srs = construct_id_dm(tax_dm_df,refseqs_fpath,tax_dm_fpath,ordered=True)
            #Best record for this species = minimum average distance to the known species records
            md_row, min_dist = min_dist_spec_record(tax_id,tax_id_dm,tax_ordered_ids,final_ksr_df.index,tax_dm_df)
            if min_dist <= identity_threshold:
                final_df = final_df.append(md_row)
        except ValueError as e:
            #Debugging edge case for OrthoDB data error with duplicate entries (Irf2bp2)
            #The de-duplication above should handle duplicate entries so that this block is never run
            # NOTE(review): any ValueError raised in the try block (not only duplicate entries) is
            # reported as "Duplicate Sequence Entry"; tax_dm_df may also be stale or unbound here if
            # the error occurred before it was assigned in this iteration.
            print(e)
            display(tax_dm_df)
            raise SequenceDataError(5,"Duplicate Sequence Entry")
    return final_df    
        
def final_ksr_df_QC(gene_name,matches,seq_qc_fpath,final_ksr_df,ks_taxids):
    """Writes warnings about compiled final known species records to specified file path.
    
    gene_name: gene symbol the warnings are filed under
    matches: collection of accepted gene symbols/aliases that pub_gene_id is checked against
    seq_qc_fpath: file path passed to write_ref_seq_QC for every warning
    final_ksr_df: DataFrame of selected known species records ("organism_taxid", "length" and 
    "pub_gene_id" columns are read)
    ks_taxids: tax_ids expected to be represented in final_ksr_df
    
    Current behavior: writes to file if 1) any taxid from ks_taxids is not represented in final dataframe
    2) if length of any individual record differs from the median length by >=10% 
    3) if any record has a pub_gene_id that not represented by matches (ie LOC[XXXX...])"""
    if len(final_ksr_df) < len(ks_taxids):
        present_taxids = set(final_ksr_df["organism_taxid"].unique())
        for tax_id in ks_taxids:
            if tax_id not in present_taxids:
                message_txt = "No reference sequence for tax_id: {0}".format(tax_id)
                write_ref_seq_QC(seq_qc_fpath,gene_name,message_txt)
    length_srs = final_ksr_df["length"]
    median_len = length_srs.median()
    #Iterate (id, length) pairs directly so a repeated record id cannot turn the lookup into a Series
    for record_id,id_len in length_srs.items():
        if (np.abs(id_len-median_len)/median_len) >= 0.1:
            message_txt = "Record_id {0} has length {1} which is greater than 10% different from the median ({2})".format(record_id,id_len,median_len)
            write_ref_seq_QC(seq_qc_fpath,gene_name,message_txt)
    #Alias must end the pub_gene_id or be followed by ';'. Aliases are escaped so symbols containing
    #regex metacharacters (e.g. parentheses) are compared literally.
    pat = "|".join("{0}$|{0}[;]".format(re.escape(match.upper())) for match in matches)
    for record_id,pgid in final_ksr_df["pub_gene_id"].items():
        #str(): a missing (NaN) pub_gene_id is reported as a mismatch instead of raising on .upper()
        if not re.search(pat,str(pgid).upper()):
            message_txt = "Record_id {0} has pub_gene_id {1} which doesn't match gene_name ({2})".format(record_id,pgid,gene_name)
            write_ref_seq_QC(seq_qc_fpath,gene_name,message_txt)
def write_ref_seq_QC(seq_qc_fpath,gene_name,message):
    """Writes warning messages to seq_qc_fpath and prints (will not write duplicate entries)
    
    seq_qc_fpath: path of the tab-delimited QC log (columns: gene, message); created with a header 
    line if it does not exist yet or is empty
    gene_name: value written to the gene column
    message: warning text written to the message column
    
    If the same (gene_name, message) pair is already in the file nothing is written and the printed 
    message is preceded by "Cached QC Warning:".
    """
    needs_header = (not os.path.exists(seq_qc_fpath)) or os.path.getsize(seq_qc_fpath) == 0
    already_logged = False
    if not needs_header:
        seq_qc_df = pd.read_csv(seq_qc_fpath,delimiter='\t')
        gene_messages = seq_qc_df.loc[seq_qc_df["gene"]==gene_name,"message"]
        already_logged = message in gene_messages.unique()
    if already_logged:
        print("Cached QC Warning:")
    else:
        #Append mode creates the file when missing; the handle is closed (and flushed) on exit
        with open(seq_qc_fpath,'at') as seq_qc_f:
            if needs_header:
                seq_qc_f.write("gene\tmessage\n")
            #The first warning of a new file is recorded too, not just the header
            seq_qc_f.write("{0}\t{1}\n".format(gene_name,message))
    print("{0}: {1}".format(gene_name,message))

    
def min_dist_spec_record(spec_taxid,distmat,dm_record_ids,accepted_record_ids, ref_df):
    """Pick the species record closest (on average) to a set of accepted records.

    spec_taxid: pattern searched for (re.search) in each record id to decide whether the record
    belongs to the species of interest. Ex: 43179_, XP_ (if only one species has records from NCBI)
    distmat: np.ndarray of pairwise distance values
    dm_record_ids: record ids in the same order as the rows/columns of distmat
    accepted_record_ids: ids from dm_record_ids that candidate records are compared against
    ref_df: DataFrame indexed by record id; the winning record's row is taken from it

    The sub-matrix of (accepted rows x species columns) is averaged per column when there is more
    than one accepted row, otherwise its single row is used as-is. The species record with the
    smallest resulting value wins.

    Returns (md_row, min_dist): the row of ref_df for the winning record and its average distance.
    """
    #TODO: Can update species record selection to accept a list of species_record_ids rather than taxid
    spec_hits = [(pos,rec_id) for pos,rec_id in enumerate(dm_record_ids) if re.search(spec_taxid,rec_id)]
    spec_cols = [pos for pos,_ in spec_hits]
    accepted_rows = [pos for pos,rec_id in enumerate(dm_record_ids) if rec_id in accepted_record_ids]
    #Columns = candidate species records, rows = accepted records
    sub_dm = distmat[:,spec_cols][accepted_rows,:]
    avg_dist = sub_dm.mean(axis=0) if len(sub_dm) > 1 else sub_dm[0]
    best_pos = np.argmin(avg_dist)
    best_id = spec_hits[best_pos][1]
    best_dist = avg_dist[best_pos]
    return ref_df.loc[best_id,:], best_dist


In [7]:
def process_ODB_input(gene_name, run_name, test_species, errors_fpath, seq_qc_fpath,drop_spec_list=None):
    """Generates final record and final alignment dataframes from raw input files.

    gene_name: gene symbol; names the input files and the per-gene output directory
    run_name: run directory containing input/ and output/
    test_species: organism_name value that must be present among the input records
    errors_fpath: passed through to find_ref_seqs
    seq_qc_fpath: passed through to filter_ref_seqs4
    drop_spec_list: optional collection of organism_taxid values to exclude from the input table

    Returns (final_records_df, final_align_df).
    Raises OrthoDBQueryError if the input tsv is missing or empty, and SequenceDataError if no
    reference sequences are found (code 0), the test species is absent from the input (code 1) or
    the test species is absent from the reference records (code 4).
    """
    #Per-gene input/output locations inside the run directory
    fasta_in_fpath = "{0}/input/{1}.fasta".format(run_name,gene_name)
    tsv_in_fpath = "{0}/input/{1}.tsv".format(run_name,gene_name)
    out_dir = "{0}/output/{1}".format(run_name,gene_name)
    create_directory(out_dir)
    msa_in_fpath = "{0}/output/{1}/{1}.fasta".format(run_name,gene_name)
    msa_out_fpath = "{0}/output/{1}/{1}_MSA.fasta".format(run_name,gene_name)
    ref_fasta_fpath = "tmp/ref_seqs.fasta"

    try:
        odb_df = pd.read_csv(tsv_in_fpath,delimiter='\t').set_index(keys="int_prot_id",drop=True)
    except (EmptyDataError, FileNotFoundError):
        raise OrthoDBQueryError(0,"No OrthoDB results for query")
    if drop_spec_list:
        keep_mask = ~odb_df["organism_taxid"].isin(drop_spec_list)
        odb_df = odb_df.loc[keep_mask,:]

    ref_ids, matches = find_ref_seqs(gene_name,odb_df,errors_fpath)
    if len(ref_ids) == 0:
        raise SequenceDataError(0,"No reference sequences could be found")
    if test_species not in odb_df["organism_name"].values:
        raise SequenceDataError(1,"Test Species has no sequence in input")
    ref_records = odb_df.loc[ref_ids,:]

    #Attach sequence text and length to the reference records
    ref_seq_srs = filter_fasta_infile(ref_ids,fasta_in_fpath,outfile_path=ref_fasta_fpath)
    seq_table = pd.DataFrame(ref_seq_srs,columns=["seq"])
    seq_table["length"] = [len(seq) for seq in seq_table["seq"]]
    ref_df = ref_records.join(seq_table,how="inner")

    if test_species not in ref_records["organism_name"].values:
        raise SequenceDataError(4,"Test Species has no reference sequence in input (but a record is present in unfiltered ODB query)")
    final_records_df = filter_ref_seqs4(gene_name,matches,ref_fasta_fpath, ref_df,seq_qc_fpath)
    #The average distance Series returned alongside the alignment is not needed here
    final_align_df, _ = seq_srs_to_align_df(final_records_df["seq"],msa_in_fpath,msa_out_fpath)

    return final_records_df, final_align_df


In [None]:
#Parse through AGS downloaded record files. Align to 13LGS, HS, MM sequences. If the record sharing
#maximum identity with 13LGS differs from the record sharing maximum identity with HS/MM, display the
#records and take user input to resolve which to use. 

# ss_errors_fpath = "{0}/summary/errors.tsv".format(spec_subs_run_dir)
# if os.path.exists(ss_errors_fpath):
#     errors_df = pd.read_csv(ss_errors_fpath,sep='\t',index_col='gene')
#     data_errors = errors_df.loc[~(errors_df["error_type"]=="SequenceAnalysisError"),:]
#     data_errors_genes = data_errors.index
def select_AGS_record(run_name,gene_name, final_ODB_records_df):
    ""
    KS_TAX_IDS = ["10090_0","43179_0","9606_0"]
    TAX_IDS_13LGS = ["43179_0"]
    TAX_IDS_HSMM = ["10090_0","9606_0"]
    TAX_ID_DICT = {"Urocitellus parryii":9999}
    #Manual handling of inconsistent gene symbols between ODB and NCBI input files
#     MISMATCHED_SYMBOLS = {"TOIP1":"TOR1AIP1","HNRH1":"HNRNPH1","ATP5MC3":"ATP5G3","ATP5G1 (ATP5MC1)":"ATP5G1",\
#                          "ATP5MC1":"ATP5G1"}
    REPEAT_MANUAL_SELECTION = False

    #Generator function to yield SeqIO fasta objects from 1) ss_fasta_fpath corresponding to ksr_ids
    #2) all AGS records in ags_fasta_fpath
    def ksr_ags_generator(ksr_ids,ss_fasta_fpath,ags_fasta_fpath):
        for id_ in ksr_ids:
            ss_fastas = SeqIO.parse(open(ss_fasta_fpath),'fasta')
            for fasta in ss_fastas:
                if fasta.id == id_:
                    yield fasta
                    break
        ags_fastas = SeqIO.parse(open(ags_fasta_fpath),'fasta')
        for fasta in ags_fastas:
            yield fasta

    AGS_fasta_fpath = "{0}/input/NCBI/{1}_AGS.fasta"
    ss_fasta_fpath = "{0}/{1}/{1}.fasta".format(output_dir,gene_name)
    ss_records_fpath = "{0}/{1}/{1}_records.csv".format(output_dir,gene_name)
    #Conditionally needed based on calling this function on gene set provided in AGS Gene ID file and not via SS
#     if not os.path.exists(ss_fasta_fpath):
#         if gene_name in data_errors.index:
#             return
#         elif gene_name in MISMATCHED_SYMBOLS:
#             AGS_dir_gene_name = gene_name
#             gene_name = MISMATCHED_SYMBOLS[gene_name]
#             ss_fasta_fpath = "{0}/{1}/{1}.fasta".format(output_dir,gene_name)
#             ss_records_fpath = "{0}/{1}/{1}_records.csv".format(output_dir,gene_name)
#         else:
#             dir_errors.append(gene_name)
#             return
#     if not os.path.exists(ss_fasta_fpath):
#         
        #cool
        
    ss_records_df = final_ODB_records_df.copy()
    ks_records_df = ss_records_df.loc[ss_records_df["organism_taxid"].isin(KS_TAX_IDS)]
    
    ks_record_ids = ks_records_df.index
    ks_record_srs = filter_fasta_infile(ks_record_ids,ss_fasta_fpath,ordered=True)
    ks_records_df["seq"] = ks_record_srs
    ags_fasta = SeqIO.parse(ags_fasta_fpath,'fasta')
    #Process NCBI fasta file input into DataFrame with fields specified below 
    ags_fasta_df = pd.DataFrame(columns=["NCBI_id","organism_taxid","organism_name","description","length","seq"])
    for f in ags_fasta:
        organism_name = re.search("\[(\w+ \w+)\]",f.description).groups()[0].strip()
        organism_taxid = TAX_ID_DICT[organism_name]
        trunc_desc = re.search("{0} (.*) \[".format(f.id), f.description).groups()[0].strip()
        f_row = pd.Series({"NCBI_id":f.id,"organism_taxid":organism_taxid,"organism_name":organism_name,\
                           "description":trunc_desc,"seq":str(f.seq),"length":len(str(f.seq))},name=f.id)
        ags_fasta_df = ags_fasta_df.append(f_row)
    #Check for multiple AGS records - if more than one, identify AGS record with maximum identity to 
    #1) 13LGS and 2) homo sapiens/ mus musculus records
    if len(ags_fasta_df) > 1:
        ks_ags_idx = ks_records_df.index.copy().append(ags_fasta_df.index)
        ks_ags_fasta_fpath = "tmp/ks_ags_records.fasta"
        ks_ags_msa_fpath = "tmp/ks_ags_records_MSA.fasta"
        fasta_generator = ksr_ags_generator(ks_record_ids,ss_fasta_fpath,ags_fasta_fpath)
        SeqIO.write(fasta_generator,ks_ags_fasta_fpath,"fasta")
        n,dm_record_ids,id_dm,align_srs = construct_id_dm(ks_ags_idx,ks_ags_fasta_fpath,align_outpath=ks_ags_msa_fpath)

        spec_records = [(i,id_) for i,id_ in enumerate(dm_record_ids) if re.search("XP_",id_)]
        record_ids_13LGS, record_ids_hsmm = ks_records_df[ks_records_df["organism_taxid"].isin(TAX_IDS_13LGS)].index, \
                                             ks_records_df[ks_records_df["organism_taxid"].isin(TAX_IDS_HSMM)].index
        md_row_13LGS, md_13LGS = min_dist_spec_record("XP_",id_dm,dm_record_ids,record_ids_13LGS,ags_fasta_df)
        md_row_hsmm, md_hsmm = min_dist_spec_record("XP_",id_dm,dm_record_ids,record_ids_hsmm,ags_fasta_df)
        
        md13LGS_id = md_row_13LGS["NCBI_id"]
        mdhsmm_id = md_row_hsmm["NCBI_id"]
        if not md13LGS_id == mdhsmm_id and REPEAT_MANUAL_SELECTION:
            with pd.option_context('display.max_colwidth', -1):
                print("==={0}===".format(gene_name))
                print("Mouse (10090), 13LGS (43179), Human (9606) Records")
                display(ks_records_df)
                print("AGS isoforms")
                display(ags_fasta_df)
                print("Min Dist Row to 13LGS: ")
                print("{0}\t{1}\t{2}".format(md_row_13LGS["NCBI_id"],md_row_13LGS["length"],md_row_13LGS["seq"]))
                print("Identity to 13LGS: {0}".format(md_13LGS))
                print("Min Dist Row to HS, MM: ")
                print("{0}\t{1}\t{2}".format(md_row_hsmm["NCBI_id"],md_row_hsmm["length"],md_row_hsmm["seq"]))
                print("Identity to human, mouse: {0}".format(md_hsmm))

                row_idx = input("Enter 0-indexed position of record")
                while not re.search("^\d+$",row_idx) or (int(row_idx)<0 or int(row_idx)>=len(ags_fasta_df)):
                    row_idx = input("Enter 0-indexed position of record")
                selection = ags_fasta_df.iloc[int(row_idx),:]
                try: 
                    input_idx = input("Enter 0-indexed position of representative sequence for analysis")
                    int_idx = int(input_idx)
                    selection_row = ags_fasta_df.iloc[int_idx,:]
                except (IndexError, ValueError) as e:
                    print("Bad Input")
                    while (not re.search("^\d+$",input_idx)) or (int(input_idx)>=len(selection_df) or int(input_idx)<0):
                        input_idx = input("Enter a number between 0 and {0}".format(len(selection_df)-1))
                    int_idx = int(input_idx)
                    selection_row = ags_fasta_df.iloc[int_idx,:]

                manual_selections[gene_name] = selection_row
#         elif gene_name in DISPLAY_RECORDS or gene_name in OVERWRITE_FASTAS:
        elif gene_name in OVERWRITE_FASTAS:
            print("==={0}===".format(gene_name))
            print("Mouse (10090), 13LGS (43179), Human (9606) Records")
            display(ks_records_df)
            print("AGS isoforms")
            display(ags_fasta_df)

            print("Min Dist Row to 13LGS: ")
            print("{0}\t{1}\t{2}".format(md_row_13LGS["NCBI_id"],md_row_13LGS["length"],md_row_13LGS["seq"]))
            print("Identity to 13LGS: {0}".format(md_13LGS))
            print("Min Dist Row to HS, MM: ")
            print("{0}\t{1}\t{2}".format(md_row_hsmm["NCBI_id"],md_row_hsmm["length"],md_row_hsmm["seq"]))
            print("Identity to human, mouse: {0}".format(md_hsmm))
            
    
    