**Important notes:** This notebook pipeline accepts only a protein sequence entered without a header, or uploaded zip archives of FASTA files (with headers). To upload zipped FASTA files, click the "File" icon on the left panel, then click the "Upload to session storage" icon. Uploaded files are erased at the end of each session, or once the time limit imposed by Google Colaboratory (~12 hours) is reached.

In [None]:
#@title Set inputs

# Adapted from AlphaFold2
from google.colab import files
import os.path
import re
import hashlib
import random
import glob2

def add_hash(x,y):
  """
  Build a job identifier that is unique to its content.

  Returns x, an underscore, and the first five hexadecimal digits of the
  SHA-1 digest of y (y is encoded with the default str.encode() codec).
  """
  digest = hashlib.sha1(y.encode()).hexdigest()
  short_tag = digest[:5]
  return "_".join((x, short_tag))

# Acquire user input query sequence
# Need to add test to force amino acid letters and - only

# Letters that only trigger a warning when found in the query (the job still proceeds)
non_AAs = {'O', 'U', 'X', 'B', 'Z', 'J'}
is_input = False
query_sequence = 'BANANA' #@param {type:"string"}
# Drop the separators "; ", ", ", "*", newline, "-" and space, then upper-case.
# The pattern is a raw string: "\*" inside a normal literal is an invalid escape sequence.
query_sequence = "".join(re.split(r'; |, |\*|\n|-| ', query_sequence)).upper()
if query_sequence:
  is_input = True
  if non_AAs.intersection(query_sequence):
    print("Warning: Invalid AA(s) in sequence.")
else:
  # Placeholder used for hashing and for the job .csv when no sequence was typed
  query_sequence = "empty"

# Regardless of whether you've input a sequence or not
# Override if true
Upload_zipped_fastas = True #@param {type:"boolean"}
# Always bound, so later cells can test `zipfiles` even when uploads are disabled
zipfiles = []
if Upload_zipped_fastas:
  zipfiles = glob2.glob("*.zip")
  if zipfiles:
    is_input = True
    print("Zipped file(s) found:", *zipfiles)
    print("Unzipping file(s)")
  else:
    print("Zipped file(s) not found.")

if is_input:
  # Acquire user input jobname and add unique identifier
  jobname = 'Test' #@param {type:"string"}
  basejobname = "".join(jobname.split()) # remove whitespaces
  basejobname = re.sub(r'\W+', '', basejobname)
  jobname = add_hash(basejobname, query_sequence)
  while os.path.isfile(f"{jobname}.csv"):
    # Salt the hash input with a random number. Re-shuffling the sequence cannot
    # change the hash when every residue is identical (e.g. "AAAA"), which would
    # make this loop spin forever.
    jobname = add_hash(basejobname, f"{query_sequence}{random.random()}")
  with open(f"{jobname}.csv", "w") as text_file:
      text_file.write(f"id,sequence\n{jobname},{query_sequence}")
  print("Setting up job:", jobname)

  # Placeholder - this will be the final output
  os.mkdir(jobname)
  # queries_path=f"{jobname}/{jobname}.csv"

  motif_options = "custom" #@param ["none", "E75","custom"]
  #@markdown * Warning: Do not select more than one motif upload.
  if motif_options == "E75":
    use_motif = True
    custom_motif_path = None #
  elif motif_options == "custom":
    print("Input MEME-formatted motif .txt file:")
    custom_motif_path = f"{jobname}/motif/"
    os.mkdir(custom_motif_path)
    # files.upload() saves the uploads in the working directory; move them into the job
    uploaded_motif = files.upload()
    use_motif = True
    for fn in uploaded_motif.keys():
      os.rename(fn, f"{jobname}/motif/{fn}")
  else:
    custom_motif_path = None
    use_motif = False
else:
  print("No inputs found.")

In [None]:
#@title <- Install dependencies (~6min)
# Installs conda (through condacolab) and the MEME suite, and reports how long that took.
from time import perf_counter

t1_start = perf_counter()
# install conda package manager
# For more information: https://docs.conda.io/en/latest/
!pip install -q condacolab
import condacolab
# NOTE(review): condacolab.install() presumably restarts the Colab kernel, which would
# discard variables set by earlier cells (jobname, query_sequence, ...) -- confirm.
condacolab.install()
# NOTE(review): no -y flag is passed; confirm conda does not wait for confirmation here.
!conda install -c bioconda meme=5.3.0
# Biopython is left uninstalled here although expand_motif_aln() imports Bio when an
# alignment directory is supplied.
# !conda install -c conda-forge biopython
# import Bio
import pandas as pd

t1_stop = perf_counter()
print("Elapsed time (seconds):", t1_stop-t1_start)

In [None]:
#@title <- Build sequences
# Writes the typed query (if any) and the uploaded archives into <jobname>/seqs
import zipfile

t1_start = perf_counter()

# Make input directory (exist_ok so that re-running the cell does not fail)
custom_seq_path = f"{jobname}/seqs"
os.makedirs(custom_seq_path, exist_ok=True)

# Create .fa for input query and send to directory.
# The "Set inputs" cell stores the placeholder "empty" when no sequence was typed;
# that string is truthy, so it has to be excluded explicitly or it would be
# written out as if it were a protein sequence.
if query_sequence and query_sequence != "empty":
  with open(f"{custom_seq_path}/{jobname}.fa", 'w') as inputfa:
    inputfa.write(f">{jobname}\n")
    inputfa.write(f"{query_sequence}\n")

# Unzip and send .fa files to directory.
# The standard-library zipfile module replaces the `!unzip $zip -qq -d ...` shell call:
# the loop variable no longer shadows the builtin zip(), and the -qq flag is no longer
# placed after the archive name (where unzip appears to expect member names -- TODO confirm).
for zip_path in zipfiles:
  with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(custom_seq_path)

# Path of the FIMO output directory (the directory itself is created by the next step)
custom_FIMO_path = f"{jobname}/FIMO_out"

# The FIMO cell is a bash script that reads $custom_seq_path and $custom_FIMO_path;
# exporting them lets the child shell inherit the values.
os.environ["custom_seq_path"] = custom_seq_path
os.environ["custom_FIMO_path"] = custom_FIMO_path

t1_stop = perf_counter()
print("Elapsed time (seconds):", t1_stop-t1_start)

In [None]:
#@title FIMO Search motif against all inputs (~10min)
#@markdown * Still need pval and motif
#@markdown * This needs to be written in Python with os
%%bash

# Exit immediately if exit status is non-zero (failed operation)
set -e

# NOTE(review): this script reads $custom_seq_path, $custom_FIMO_path, $pval and $motif
# from the shell environment and sets none of them itself -- confirm they are exported
# (e.g. through os.environ) before this cell runs.

# Create FIMO output directory (-p: do not abort when the cell is re-run)
echo "Creating directory: $custom_FIMO_path."
mkdir -p "$custom_FIMO_path"
echo "Writing files to: $custom_FIMO_path."

# Run FIMO on every file in specified directory
echo "Running recursive_FIMO..."
for f in "$custom_seq_path"/*.fa; do
  # When nothing matches, the unexpanded pattern itself is iterated; skip it
  [ -e "$f" ] || continue
  echo "Processing $f file..."
  # get basename of filepath
  baseFname=$(basename "$f" .fa)
  # FIMO on each file; hits go to stdout (--text) and are redirected into one .tsv per input
  fimo --oc "$custom_FIMO_path" --verbosity 1 --text --thresh "$pval" --max-stored-scores 8000000 \
  "$motif" "$f" > "$custom_FIMO_path/${baseFname}_fimo.tsv" 2> /dev/null
done
echo "recursive_FIMO completed."

# Delete empty .tsv files created by FIMO (regular files only, never directories)
echo "Cleaning empty FIMO .tsv files..."
find "$custom_FIMO_path" -type f -size 0 -print -delete
echo "Directory cleaned."

In [None]:
#@title Summarize all FIMO .tsv inputs (~15min)

import argparse
import glob
import os.path
import pandas as pd
import sys
from time import perf_counter

def parse_args():
    """
    Read the concat-hitsum.py options from the command line.

    -fimodir and -o are mandatory; -alndir and -PSGdir are optional and
    default to None. Returns the populated argparse namespace.
    """
    cli = argparse.ArgumentParser(prog='concat-hitsum.py', conflict_handler='resolve')
    # (flag, required, help text) -- every option is a plain string
    option_table = (
        ('-fimodir', True, '=> path/to/fimo_directory'),
        ('-alndir', False,
         '=> path/to/alignments_directory. ONLY if you want to list orgs w/o motif hits'),
        ('-PSGdir', False,
         '=> path/to/directory_with_FUBAR_outfiles. ONLY if you want to position-specific +ve info'),
        ('-o', True, '=> path/to/outfile.csv'),
    )
    for flag, is_required, help_text in option_table:
        cli.add_argument(flag, type=str, required=is_required, help=help_text)
    return cli.parse_args()

def glob_files(path: str) -> list[str]:
    """Return the paths directly under *path* whose names end in .tsv."""
    tsv_pattern = f'{path}/*.tsv'
    matches = glob.glob(tsv_pattern)
    return matches

def expand_motif_aln(aln_dir: str, in_seqID: str, seq_sites: list[int]) -> tuple[int, list[list[str]]]:
    '''Collect the motif window of every aligned sequence at each start site.

    Default shows only the species sequences with motif hits.
    When an aln dir is specified, this function is invoked to
    show the motif alignment across both hits and nonhits.

    Reads '<aln_dir>/<in_seqID>.12taxa.fa'. The species name is the text after
    the last '_' of each FASTA title.

    Returns a pair:
      * the length of the 'hg38' sequence with '-' characters removed
        (0 when no title ends in 'hg38');
      * one list per entry of seq_sites, holding a '<species>: <window>' string
        for every sequence in the file, where <window> is sequence[pos-1:pos+7].
    '''
    from Bio.SeqIO.FastaIO import SimpleFastaParser

    species_regions = []
    seq_length = 0
    # collect species name from title and relevant motif alignment from sequence
    with open(f'{aln_dir}/{in_seqID}.12taxa.fa') as aln_file:
        for title, sequence in SimpleFastaParser(aln_file):
            species_name = title.split('_')[-1]
            if species_name == 'hg38':
                seq_length = len(sequence.replace('-', ''))
            species_regions.append(
                [f'{species_name}: {sequence[pos-1:pos+7]}' for pos in seq_sites]
            )
    # species_regions is indexed [species][site]; transpose it to [site][species]
    per_site_regions = [list(site_column) for site_column in zip(*species_regions)]
    return seq_length, per_site_regions

def map_PSRs(PSG_dir: str, in_seqID: str, seq_sites: list[int]) -> list[str]:
    '''Returns stringmap of Positive Selection at Residues (PSRs) from dir of FUBAR files,
    if relevant to the motif range (pos-1:pos+7). PSRs are recorded as '+',
    and non-PSRs are recorded as '-'.

    Reads '<PSG_dir>/<in_seqID>.12taxa.fa.12taxon.tree.grid_info.posteriors.csv'.
    The first line is skipped; on every following non-blank line the text before
    the first '0.' is converted with int() and taken as a PSR position.

    Returns one 8-character string per entry of seq_sites. When the file cannot
    be opened, or is completely empty, every string is '--------'.

    Raises ValueError when a non-blank data line does not start with an integer.
    '''
    PSRs = set()  # set: membership is tested once per window position below
    try:  # File may or may not exist, but if it does, collect PSR entries
        with open(f'{PSG_dir}/{in_seqID}.12taxa.fa.12taxon.tree.grid_info.posteriors.csv') as PSG_file:
            next(PSG_file)  # Skip first line; StopIteration here means the file is empty
            for line in PSG_file:
                if not line.strip():
                    continue  # tolerate blank lines instead of failing in int('')
                PSRs.add(int(line.split('0.')[0]))
    except (OSError, StopIteration):
        # file missing/unreadable, or present but without any line: default strings
        return ['--------' for _ in range(len(seq_sites))]

    # map sites with presence '+' or absence '-' of PSR
    return [
        ''.join('+' if pos in PSRs else '-' for pos in range(site - 1, site + 7))
        for site in seq_sites
    ]

def exclude_hits(hits_to_exclude: list[str], all_hits: list[str]) -> list[str]:
    """
    Pair each entry of hits_to_exclude with the entry of all_hits at the same
    index and return, per pair, the set of items found in exactly one of them
    (symmetric difference).
    """
    return [
        set(all_hits[position]).symmetric_difference(hit_group)
        for position, hit_group in enumerate(hits_to_exclude)
    ]

# pandas .agg func rename
def Num_Unique(series: pd.Series) -> int:
    """Count the distinct values held in *series*."""
    distinct_values = set(series)
    return len(distinct_values)

def human_hit(series: pd.Series) -> str:
    """Answer 'Yes' when any string in *series* contains 'hg38', otherwise 'No'."""
    has_human_entry = series.str.contains('hg38').any()
    return 'Yes' if has_human_entry else 'No'

def main():
    """
    Takes fimo files and aln files (optional) and generates
    summary dataframe containing one motif hit per line
    with the following info:
    seq ID|AA pos|species hit: seq|min pval|hg38? (yes/no)|hg38 site|hg38 pval|species absent

    Every *.tsv file in the -fimodir directory is summarized separately and the
    result is written to the -o file: the first file creates it (with header),
    later files are appended without header. Exits through sys.exit() when the
    -o file already exists. Prints the elapsed time at the end.
    """
    t1_start = perf_counter()

    args = parse_args()

    # Refuse to overwrite an existing summary
    if os.path.isfile(args.o):
        sys.exit(f"File {args.o} exists")
    else:
        print (f"Building {args.o}...")

    #Set files and columns to extract from
    infimo_files = glob_files(args.fimodir.rstrip('/'))
    # NOTE(review): these positions must match the column layout written by the installed
    # FIMO version -- confirm against an actual output file.
    infile_ind = [1, 2, 6, 8] # 'sequence name', 'start', 'p-value', 'matched sequence'

    # Aggregations applied to each (seqIDs, start) group
    agg_func_text = {'seqIDs': ['first'], # get one representative seqID (first occurrence)
                    'start': ['first', 'count'], # get representative start val, and count of species hits
                    'species_seqs': [tuple], # summarize species seq hits as tuple
                    'matchedseq': [Num_Unique], # num of unique seq hits found
                    'species_pvals': [tuple], # scores for each species hit
                    'pvalue': 'min', # best hit, no matter what species
                    'species': [human_hit]} # Is this a human hit? Yes or No

    for ind, file in enumerate(infimo_files):

        #Create dataframe with selected data from fimo file
        # No `header` argument is given together with `names`, so the first line of the
        # file is loaded as a data row (it is dropped again by .iloc[1:, ...] below).
        # NOTE(review): if that first line is a textual header, 'start' and 'pvalue'
        # presumably load as strings, so the 'min' aggregation and sort_values('start')
        # would compare text rather than numbers -- confirm.
        tsv_data = pd.read_csv(file, sep = '\t', usecols = infile_ind, 
                                names = ['seqname', 'start', 'pvalue', 'matchedseq'])
        # Sequence names are split at '.12taxa.fa_' into sequence ID and species
        tsv_data[['seqIDs', 'species']] = tsv_data.seqname.str.split('.12taxa.fa_', expand=True)
        tsv_data['species_seqs'] = tsv_data['species'].astype(str) + ': ' + tsv_data['matchedseq']
        tsv_data['species_pvals'] = tsv_data['species'].astype(str) + ': ' + tsv_data['pvalue'].astype(str)

        #Retain unmerged data: hg38 matchedseq and pvalue data
        hg_data = tsv_data[tsv_data['species'] == 'hg38'].sort_values('start', axis = 0, ascending = True)

        #collapse tsv_data to one line per motif hit across orgs
        # .iloc[1:, 1:] discards the first row and the raw 'seqname' column
        tsv_data = (tsv_data.iloc[1: , 1:]
                    .groupby(['seqIDs', 'start'], as_index = False)
                    .agg(agg_func_text))
        
        # hard-coded -- this order doesn't need to change
        # (the eight names follow the order of the aggregations in agg_func_text)
        tsv_data.columns = ['sequenceID', 'start', 'count', 'concat_sites', 'Num_Unique', 'org_pvals', 'best_pval', 'human_hit'] # replace w/ readable colnames 

        #merge tsv_data to retained hg38 data and export
        merged_data = pd.merge(tsv_data, hg_data[['start', 'matchedseq', 'pvalue']],on='start', how='left')

        #OPTIONAL: use seqID, start pt, and species hits to scrape sequences of orgs with no detectable motif
        if args.alndir:
            #collect species-relevant motif info
            aln_directory = args.alndir.rstrip('/')
            # the sequence ID of the first row is used for the whole file
            grp_seqID = tsv_data['sequenceID'][0]
            mstarts = tsv_data.start.astype(int) #motif start sites
            sp_hits_to_exclude = tsv_data.concat_sites

            #extract protein (AA) seq length, [aln of each motif across all primates] regardless of score
            AA_length, sp_mregions = expand_motif_aln(aln_directory, grp_seqID, mstarts)

            #exclude hits already examined
            nonhit_mregions = exclude_hits(sp_hits_to_exclude, sp_mregions)
            
            #create nonhit df to merge
            nonhit_df = pd.DataFrame(columns = ['start', 'Non_hits'])
            nonhit_df['Non_hits'] = nonhit_mregions
            nonhit_df['start'] = tsv_data.start
            nonhit_df['AA_seqlength'] = AA_length

            #merge nonhit df to merged_data
            merged_data = pd.merge(merged_data, nonhit_df[['start', 'Non_hits', 'AA_seqlength']],on='start', how='left')

        #OPTIONAL: use seqID and start pt to scrape residues under pos sel (PSRs)
        if args.PSGdir:
            #collect PSG relevant info
            PSG_directory = args.PSGdir.rstrip('/')
            grp_seqID = tsv_data['sequenceID'][0]
            mstarts = tsv_data.start.astype(int) #motif start sites

            #build one '+'/'-' string per motif start from the FUBAR file of this sequence
            PSR_stringmap = map_PSRs(PSG_directory, grp_seqID, mstarts)

            #create PSR df to merge (the declared 'PSRs' column is never filled or exported)
            PSR_df = pd.DataFrame(columns = ['start', 'PSRs'])
            PSR_df['start'] = tsv_data.start
            PSR_df['FUBAR_PSRs'] = PSR_stringmap

            #merge PSR df to merged_data
            merged_data = pd.merge(merged_data, PSR_df[['start', 'FUBAR_PSRs']],on='start', how='left')

        #Create csv file if first glob file initiated, otherwise append to existing csv
        if ind == 0:
            # the rename only matters here: this is the only write that emits the header
            merged_data.rename(columns={'matchedseq': 'human_site', 'pvalue': 'pval_hg38'}, inplace=True)
            merged_data.to_csv(args.o, index = False, mode = 'w', header=True)
        else:
            merged_data.to_csv(args.o, index = False, mode = 'a', header=False)

    t1_stop = perf_counter()
    print("Elapsed time (seconds):", t1_stop-t1_start)

# Run the FIMO summary step as soon as the cell executes.
# NOTE(review): parse_args() declares -fimodir and -o as required, so they presumably
# have to be present in sys.argv at this point -- confirm how they are supplied in a notebook.
main()

In [None]:
#@title Create orthogonal dataset .csv (<1min)

import argparse
import glob
import pandas as pd

def parse_args():
    """
    Read the merge-dbs.py options from the command line.

    All three options (-db, -db_dir, -o) are mandatory strings.
    Returns the populated argparse namespace.
    """
    cli = argparse.ArgumentParser(prog='merge-dbs.py', conflict_handler='resolve')
    # (flag, help text) -- every option is a required string
    required_options = (
        ('-db', '=> path/to/main_db.csv'),
        ('-db_dir', '=> path/to/database_directory'),
        ('-o', '=> path/to/merged_outfile.csv'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    return cli.parse_args()

def glob_files(path: str) -> list[str]:
    """Return every path that glob matches directly under *path*."""
    everything_pattern = f'{path}/*'
    matches = glob.glob(everything_pattern)
    return matches

def auto_merge(in_df: pd.DataFrame, glob_files: list[str]) -> tuple[pd.DataFrame, list[str]]:
    """
    Left-join every CSV listed in glob_files onto in_df.

    For each file, the first column name it shares with the current in_df is
    used as the join key; rows of the file that repeat a key value are dropped
    (first occurrence kept) before merging. A line is printed per file saying
    where it was joined, or that it has no shared column.

    Returns the merged frame and the list of files that had no shared column.
    """
    unmerged = []
    for csv_path in glob_files:
        short_name = csv_path.rsplit('/', 1)[-1]
        candidate = pd.read_csv(csv_path)
        shared = list(in_df.columns.intersection(candidate.columns))  # key col, assume one

        # Without a shared column there is nothing to join on: report and move on
        if not shared:
            print(f'{short_name}: Missing shared key column')
            unmerged.append(csv_path)
            continue

        key = shared[0]
        # key column first, remaining columns in their file order
        ordered_cols = [key] + [name for name in candidate.columns if name != key]
        print(f'{short_name}: joined at {key}')

        deduplicated = candidate.loc[:, ordered_cols].drop_duplicates(subset=key, keep='first')
        in_df = pd.merge(in_df, deduplicated, on=key, how='left')
    return in_df, unmerged


def main():
    """
    Appends column-specific data from other .csv files to a designated .csv file

    The -db file is the starting table; every file in the -db_dir directory is
    merged onto it. Files that could not be merged on the first pass are tried
    once more, since earlier merges may have added the column they join on.
    The result is written to the -o file, and any file still un-merged after
    the retry is reported.
    """
    cli_args = parse_args()

    # First pass: main table + every file found in the database directory
    merged_db, pending = auto_merge(
        pd.read_csv(cli_args.db),
        glob_files(cli_args.db_dir.rstrip('/')),
    )

    # Second pass only for the files the first pass could not place
    still_pending = []
    if pending:
        merged_db, still_pending = auto_merge(merged_db, pending)

    merged_db.to_csv(cli_args.o, index=False, mode='w', header=True)

    if still_pending:
        remaining_filenames = [path.split('/')[-1] for path in still_pending]
        print(f'Remaining un-merged files: {remaining_filenames}')

# Run the database merge as soon as the cell executes.
# NOTE(review): parse_args() declares -db, -db_dir and -o as required, so they presumably
# have to be present in sys.argv at this point -- confirm how they are supplied in a notebook.
main()


In [None]:
#@title Merge orthogonal dataset to .csv summary file (<1min)

import argparse
import pandas as pd

def parse_args():
    """
    Read the merge-to-db.py options from the command line.

    All four options (-i, -db, -keycolumn, -o) are mandatory strings.
    Returns the populated argparse namespace.
    """
    cli = argparse.ArgumentParser(prog='merge-to-db.py', conflict_handler='resolve')
    # (flag, help text) -- every option is a required string
    required_options = (
        ('-i', '=> path/to/infile.csv'),
        ('-db', '=> path/to/main_db'),
        ('-keycolumn', '=> key column to merge on'),
        ('-o', '=> path/to/merged_outfile.csv'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    return cli.parse_args()

def main():
    """
    Appends column-specific data from a db.csv file to a designated .csv file

    Reads the -i and -db files, left-joins the db onto the input on the
    -keycolumn column, keeps the columns of the fixed output layout in that
    order, and writes the result to the -o file.

    Layout columns that are absent after the merge are reported and left out
    instead of aborting the export: 'Non_hits', 'AA_seqlength' and 'FUBAR_PSRs'
    are only produced by the summary step when its optional directories are given.

    Raises KeyError when the key column is missing from the db file.
    """
    args=parse_args()

    # Set main input df to merge file
    in_df=pd.read_csv(args.i)

    # Set main db file and the column to merge on
    db_df=pd.read_csv(args.db)
    keycol=args.keycolumn

    # Move the key column to the front of db_df and merge to input df on it
    db_df.insert(0, keycol, db_df.pop(keycol))
    in_df=pd.merge(in_df, db_df, on=keycol, how='left')

    # Hard-coded, this order doesn't need to change
    final_cols_order = ['sequenceID', 'Gene_Sym', 'description', 'AA_seqlength', 'start', 'count', 'Num_Unique', 'concat_sites', 'org_pvals', 'Non_hits', 'best_pval', 'human_site', 'pval_hg38', 'FUBAR_PSRs', 'Resource_Plate', 'Resource_Position', 'hORF_Length', 'PC1', 'Omega', 'calc_AF', 'log_calc_AF', 'human_hit', 'Ifn_u2', 'Ifn_u5', 'Ifn_d2', 'Ifn_d5']

    # .loc with a list of labels raises KeyError as soon as one label is missing,
    # so select only the layout columns that actually exist and say which ones do not
    missing_cols = [col for col in final_cols_order if col not in in_df.columns]
    if missing_cols:
        print(f'Columns not found, left out of {args.o}: {missing_cols}')
    present_cols = [col for col in final_cols_order if col in in_df.columns]

    in_df=in_df.loc[:, present_cols]
    in_df.to_csv(args.o, index=False, mode='w', header=True)

# Run the final merge as soon as the cell executes.
# NOTE(review): parse_args() declares -i, -db, -keycolumn and -o as required, so they presumably
# have to be present in sys.argv at this point -- confirm how they are supplied in a notebook.
main()

In [None]:
#@title Package and download results
#@markdown If you are having issues downloading the result archive, try disabling your adblocker and run this cell again. If that fails click on the little folder icon to the left, navigate to file: `jobname.result.zip`, right-click and select \"Download\" (see [screenshot](https://pbs.twimg.com/media/E6wRW2lWUAEOuoe?format=jpg&name=small)).

# if msa_mode == "custom":
#   print("Don't forget to cite your custom MSA generation method.")

# !zip -FSr $jobname".result.zip" config.json $jobname*".json" $jobname*".a3m" $jobname*"relaxed_rank_"*".pdb" "cite.bibtex" $jobname*".png"
# files.download(f"{jobname}.result.zip")

# if save_to_google_drive == True and drive:
#   uploaded = drive.CreateFile({'title': f"{jobname}.result.zip"})
#   uploaded.SetContentFile(f"{jobname}.result.zip")
#   uploaded.Upload()
#   print(f"Uploaded {jobname}.result.zip to Google Drive with ID {uploaded.get('id')}")