**Important notes:** This notebook pipeline only works with protein sequences (no headers) or uploads of zipped fasta files (with headers). To upload zipped fasta files, click the "File" icon on the left panel, and click the "Upload to session storage" icon. These files will be erased at the end of each session, or after the time limit imposed by Google Colaboratory (~12 hours).

I had intended to make a dropdown menu of downloadable species. Unfortunately, Google Colab makes this challenging. Instead, search by **species name** in [this notebook](https://colab.research.google.com/drive/168xYMKDXyaDP9jqINjQC38z6R9dfNotn#scrollTo=VFfNMG873Piv) to check whether the genome is available.

In [2]:
#@title <- Click this to install dependencies before reading the info below. (~3-6min)
#@markdown The kernel MUST restart for you to proceed with the pipeline.

# Download the prebuilt motif file used by the default motif option.
# `-O <file>` makes wget write to that fixed name, so re-running this cell
# overwrites the previous copy instead of creating numbered duplicates.
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UBvY0MCEinCZi_VuP1CqXAHnn6nURjpH' -O Tsu-and-Beierschmitt-2021.txt

# Download the RefSeq mammalian assembly summary; it is read later to look up
# a proteome by organism name.
!wget -q 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/assembly_summary.txt' -O mammal_assembly_summary.txt

# Install the conda package manager inside Colab via condacolab.
# For more information: https://docs.conda.io/en/latest/
# NOTE(review): the kernel restart mentioned in the cell header is presumably
# triggered by condacolab.install() -- confirm against the condacolab docs.
!pip install -q condacolab
import condacolab
condacolab.install()
# MEME suite (provides the `fimo` command) pinned to 4.11.2; the trailing
# "#5.3.0" looks like a note about a newer candidate version.
# Output is discarded (&>/dev/null), so install failures are silent here.
!conda install -y -c bioconda meme=4.11.2 &>/dev/null #5.3.0
!conda install -y -c conda-forge biopython &>/dev/null
# NOTE(review): `glob2` is imported by the next cell but is not installed
# here; presumably it is preinstalled in the Colab image -- verify.

✨🍰✨ Everything looks OK!


# How to run this pipeline:
1. Upload any zipped custom fasta-formatted sequences (gaps are ok), zipped FUBAR posterior.csv files, and prebuilt db.csv. If you are unsure about whether these file formats match, please use the input checker notebook [**In Progress**].<br>
2. Then, set inputs (1. Set inputs).<br>
* If you uploaded custom fasta seqs/alns (seqs.zip), fill checkbox "Upload_zipped_fastas"
* If you selected the "custom" motif option, an upload prompt will appear when the "1. Set inputs" cell runs (Step 3 below); use it to upload your MEME-formatted motif .txt file. <br>
* If you uploaded custom FUBAR posterior.csv files (a zip whose name ends in `posteriors.zip`), fill checkbox "enable_PSRs"
* If you uploaded a custom orthogonal dataset to merge to your motif results (a file whose name ends in `db.csv`), fill checkbox "enable_prebuiltdb"
<br>
<br>
3. Click "Runtime" from the top menu and "Run all"

In [4]:
#@title 1. Set inputs

# =====================================================
# import libraries
# =====================================================
import argparse
import glob2
import hashlib
import os.path
import pandas as pd
import random
import re
import sys

from google.colab import files
from time import perf_counter

# =====================================================
# User-define functions
# =====================================================
def add_hash(x,y):
  '''
  Build a unique job identifier.

  Returns `x` followed by an underscore and the first five hexadecimal
  characters of the SHA-1 digest of `y` (encoded with the default codec).
  '''
  digest = hashlib.sha1(y.encode()).hexdigest()
  return "_".join((x, digest[:5]))

# =====================================================
# Set inputs from google form
# =====================================================
# Each `#@param` / `#@markdown` comment below is a Colab form directive:
# it renders a widget and the assigned value is what the user entered.
#@markdown Fill in or select one of the three:
# Organism name, compared for exact equality against the 'organism_name'
# column of the downloaded RefSeq assembly summary.
RefSeq_Species = '' #@param {type:"string"}
#@markdown OR
# Acquire user input query sequence
query_sequence = '' #@param {type:"string"}
#@markdown OR

# Regardless of whether you've input a sequence or not
# Acquire boolean (checkbox) value for zipped fasta
# (archives are discovered later with the glob pattern "*seqs.zip")
Upload_zipped_fastas = True #@param {type:"boolean"}

# Acquire user input jobname
# (sanitised and suffixed with a short hash further down in this cell)
jobname = 'primate_seqs' #@param {type:"string"}

# Acquire user selected drop-down motif option
motif_options = "Tsu-and-Beierschmitt-2021 (Rec pval: 0.00581)" #@param ["none", "Tsu-and-Beierschmitt-2021 (Rec pval: 0.00581)","custom"]
#@markdown * Warning: Do not select more than one motif upload.

# Acquire user input pval number
# (forwarded to FIMO as its --thresh value in step 3)
pval = 0.00581 #@param {type:"number"}
#@markdown * This must be a numerical value between 0 and 1.

# Acquire boolean check for pos. sel. residue (PSR) files
# (archives are discovered later with the glob pattern "*posteriors.zip")
enable_PSRs = True #@param {type:"boolean"}

# Acquire boolean check for prebuilt db .csv
# (files are discovered later with the glob pattern "*db.csv")
enable_prebuiltdb = True #@param {type:"boolean"}

# =====================================================
# Set input sequence source
# =====================================================
# Three sources are probed in turn: a RefSeq organism name, a pasted query
# sequence, and uploaded "*seqs.zip" archives. `is_input` becomes True as soon
# as any of them yields usable input. When zipped fastas are found they take
# priority and the pasted query is replaced by the "empty" sentinel.

is_input = False

is_RefSeq_Species = False
# Refseq proteome
if RefSeq_Species:
  is_input = True
  # Create dataframe from summary file.
  # The first line of the file is skipped so that the second line supplies
  # the column names; only three columns are kept.
  infile_ind = [7, 15, 19] # 'organism_name', 'asm_name', 'ftp_path'
  mammal_df = pd.read_csv(
      "mammal_assembly_summary.txt",
      sep = '\t',
      skiprows=1,
      usecols = infile_ind
  )
  mammal_match = mammal_df[mammal_df['organism_name']==RefSeq_Species]
  # Only an unambiguous (exactly one row) match is accepted.
  if len(mammal_match) == 1:
    print(f"Selecting RefSeq organism: {RefSeq_Species}")
    matched_asm_name = mammal_match['asm_name'].iloc[0]
    matched_ftp = mammal_match['ftp_path'].iloc[0]
    # The last path component of the FTP directory is the file-name stem.
    ftp_base = matched_ftp.split("/")[-1].split('protein')[0]
    proteome_ftp = f"{matched_ftp}/{ftp_base}_protein.faa.gz"
    is_RefSeq_Species = True
  else:
    print("Check for the species name in Check_Inputs.ipynb.")
    is_RefSeq_Species = False

# user-inputted query
# Strip separators ("; ", ", ", "*", newline, "-", space) and upper-case.
# The pattern is a raw string: in a normal string literal "\*" is an invalid
# escape sequence, which newer Python versions warn about at compile time.
query_sequence = "".join(re.split(r'; |, |\*|\n|-| ', query_sequence)).upper() # remove whitespaces
if query_sequence:
  is_input = True
  # Letters that are not one of the 20 standard amino acids.
  non_AAs = {'O', 'U', 'X', 'B', 'Z', 'J'}
  if non_AAs.intersection(query_sequence):
    print("Warning: Invalid AA(s) in sequence.")
else:
  # Sentinel used by later cells to mean "no pasted query".
  query_sequence = "empty"

if Upload_zipped_fastas:
  seq_zipfiles = glob2.glob("*seqs.zip")
  if seq_zipfiles:
    is_input = True
    # Uploaded archives override any pasted query sequence.
    query_sequence = "empty"
    print("Zipped file(s) found:", *seq_zipfiles)
  else:
    print("Zipped file(s) not found. Try reuploading with a seqs.zip suffix.")

# =====================================================
# Create job ID and associated folder
# =====================================================
# The job name is the sanitised user-supplied name plus a short hash of the
# query sequence. "<jobname>.csv" doubles as the marker for "this ID is
# taken": while it exists, the hash is recomputed from a shuffled sequence.

if not is_input:
  print("No input sequences found.")
else:
  # Drop whitespace first, then every remaining non-word character.
  basejobname = re.sub(r'\W+', '', "".join(jobname.split()))
  jobname = add_hash(basejobname, query_sequence)
  while os.path.isfile(f"{jobname}.csv"):
    jobname = add_hash(
        basejobname,
        ''.join(random.sample(query_sequence, len(query_sequence)))
    )

  # Record the job ID together with its sequence.
  with open(f"{jobname}.csv", "w") as text_file:
    text_file.write(f"id,sequence\n{jobname},{query_sequence}")
  print("Setting up job:", jobname)

  # Folder that collects every output of this job.
  os.mkdir(jobname)

# =====================================================
# Check motif input
# =====================================================
# Outcome of this section, in every branch:
#   use_motif         - True only when a motif file path is available
#   custom_motif_path - directory holding the motif file (None if no motif)
#   motif             - path to the MEME-formatted motif file ("" if none)

# Check for motif input
if motif_options == "Tsu-and-Beierschmitt-2021 (Rec pval: 0.00581)":
  use_motif = True
  custom_motif_path = "/content/"
  motif = f"{custom_motif_path}Tsu-and-Beierschmitt-2021.txt"
elif motif_options == "custom":
  print("Input MEME-formatted motif .txt file:")
  custom_motif_path = f"{jobname}/motif/"
  # makedirs(exist_ok=True) tolerates re-running this cell and a job folder
  # that has not been created yet, both of which made os.mkdir raise.
  os.makedirs(custom_motif_path, exist_ok=True)
  uploaded_motif = files.upload()
  motif = ""
  # Move every uploaded file into the job's motif folder; if several files
  # were uploaded, the last one is the motif that will be used.
  for fn in uploaded_motif.keys():
    motif = f"{custom_motif_path}{fn}"
    os.rename(fn, motif)
  # Only claim a usable motif when something was actually uploaded.
  use_motif = bool(motif)
  if not use_motif:
    print("Please select/upload a motif.")
else:
  print("Please select/upload a motif.")
  custom_motif_path = None
  # Keep `motif` defined so later cells can reference it safely.
  motif = ""
  use_motif = False

# =====================================================
# Check other inputs
# =====================================================
# Every flag and path is initialised up front so it is defined on all paths.
# Previously `is_prebuiltdb` was never assigned when the checkbox was ticked
# but no "*db.csv" file was present, which raised NameError further down.

# check for PSR input
is_PSRs = False
PSRs = []
if enable_PSRs:
  PSRs = glob2.glob("*posteriors.zip")
  if PSRs:
    is_PSRs = True
  else:
    print("PSR file(s) not found. Skipping PSRs for now. Try reuploading with a posteriors.zip suffix.")

# check for uploaded, prebuilt db
is_prebuiltdb = False
use_db = False
prebuiltdb = ""
if enable_prebuiltdb:
  dbs = glob2.glob("*db.csv")
  if dbs:
    # When several files match, the last match is used (as before).
    prebuiltdb = dbs[-1]
    is_prebuiltdb = True
    use_db = True
  else:
    print("Prebuilt db file not found. Skipping db merge for now. Try reuploading with a db.csv suffix.")

# =====================================================
# Check overriding priorities
# =====================================================
# PSR files and the prebuilt db are only merged when the run is driven
# purely by uploaded files: no pasted query and no RefSeq species.
_uploads_only = (query_sequence == "empty") and (not is_RefSeq_Species)

# only include PSRs if user set uploaded files
if not (is_PSRs and _uploads_only):
  is_PSRs = False
else:
  print("Including PSR file(s) to be merged.")

# only include prebuilt db if user set uploaded files
if not (is_prebuiltdb and _uploads_only):
  custom_prebuiltdb_path = None
  use_db = False
else:
  print("Including prebuiltdb file(s) to be merged.")

Zipped file(s) found: whole_seqs_AA_12_seqs.zip
Setting up job: primate_seqs_0b0c1
Including PSR file(s) to be merged.
Including prebuiltdb file(s) to be merged.


In [5]:
#@title 2. Build sequences and related files
t1_start = perf_counter()

# Make input directory
custom_seq_path = f"{jobname}/seqs"
os.mkdir(custom_seq_path)

# Create bash script to split multifasta into singular fastas.
# Each ">" header starts a new output file named after the header with the
# given suffix removed; following lines are appended to that file.
# `read -r` keeps backslashes literal, and every expansion is quoted so that
# characters such as "*" or spaces are never glob-expanded or word-split.
fa_sh = """
#!/bin/zsh

# "$#" = value of the total number of command line args passed
# As long as "$#" is greater than (-gt) 0 args, keep while loop alive
while [[ "$#" -gt 0 ]]; do
    # Check each case (options/flags) until match is found
    case $1 in
        # get input following arg option, then shift to next str
        -i|--inputfa) inputfa="$2"; shift ;;
        -s|--suffix) suffix="$2"; shift ;;
        -o|--outdir) outdir="$2"; shift ;;
        
        # if extra, unmatched options show up, exit
        # Exit code 0 - Success
        # Exit code 1 - General/misc errors, such as "divide by zero" and other impermissible operations
        *) echo "Unknown parameter passed: $1"; exit 1 ;;
    
    # end case search (case spelled backwards)
    esac
    shift # to the next str, if any, then loop
done

while read -r line
do
    if [[ ${line:0:1} == ">" ]]
    then
        outbase_headless=${line#">"}
        outbase_tailless=${outbase_headless%"$suffix"}
        outfile="$outdir/${outbase_tailless}"
        echo "$line" > "$outfile"
    else
        echo "$line" >> "$outfile"
    fi
done < "$inputfa"
"""
with open('fa_filter.sh', 'w') as file:
  file.write(fa_sh)

# Create .fa for RefSeq query and send to directory
if is_RefSeq_Species:
    orig_fapath = f"{jobname}/proteome.faa"
    # Download the gzipped proteome, decompress it and move it into the job folder.
    # NOTE(review): the /content/ prefix assumes the notebook's working
    # directory is /content (wget writes to the current directory) -- confirm.
    !wget -q $proteome_ftp -O proteome.faa.gz
    !gunzip -d /content/proteome.faa.gz
    !mv /content/proteome.faa $orig_fapath

    # import biopython to format seq names
    from Bio import SeqIO
    
    # Referencing the original fasta, create new fasta with new headers.
    # Each record id becomes "<id>.12taxa.fa_<assembly name>" and the
    # description is blanked, so the header matches the naming scheme that
    # the later summary step splits on ('.12taxa.fa_').
    print(f"Building {matched_asm_name} proteome fastas...")
    corrected_fapath = f"{jobname}/corrected_proteome.faa"
    with open(orig_fapath) as original_faa:
        with open(corrected_fapath, 'w') as corrected_faa:
            records = SeqIO.parse(original_faa, 'fasta')
            for record in records:
                record.id = f"{record.id}.12taxa.fa_{matched_asm_name}"
                record.description = ""
                SeqIO.write(record, corrected_faa, 'fasta')
    # Split the multi-fasta into one file per record; the "_<assembly name>"
    # suffix is stripped from each header to form the output file name.
    !bash fa_filter.sh -i $corrected_fapath -o $custom_seq_path -s "_"$matched_asm_name
    

# Create .fa for input query and send to directory
# ("empty" is the sentinel meaning no pasted query sequence)
if query_sequence != "empty":
  with open(f"{custom_seq_path}/{jobname}.12taxa.fa", 'w') as inputfa:
    inputfa.write(f">{jobname}.12taxa.fa_QuerySeq\n")
    inputfa.write(f"{query_sequence}\n")

# Unzip very quietly (qq) and send .fa files to directory
# (-j discards the directory structure stored inside the archive)
if Upload_zipped_fastas:
  for seq_zip in seq_zipfiles:
    !unzip -qq -j $seq_zip -d $custom_seq_path

# Unzip very quietly (qq) and send .posterior.csv files to directory
if is_PSRs:
  custom_PSR_path = f"{jobname}/posteriors"
  os.mkdir(custom_PSR_path)
  for PSR in PSRs:
    !unzip -qq -j $PSR -d $custom_PSR_path
else:
  # Empty string tells the summary step to skip PSR mapping.
  custom_PSR_path = ""

t1_stop = perf_counter()
print("Elapsed time (seconds):", t1_stop-t1_start)

Elapsed time (seconds): 7.9033499110000776


In [6]:
#@title 3. Experimental - FIMO Search motif against all inputs (~10min)
#@markdown parallelized across ~2 processes given by colab
t1_start = perf_counter()

# Create directory for FIMO output
print(f"Creating FIMO output directory.")
custom_FIMO_path = f"{jobname}/FIMO_out"
os.mkdir(custom_FIMO_path)
print(f"Writing files to: {custom_FIMO_path}.")

# Create bash script to run FIMO on every file in specified directory
sh = """
#!/bin/zsh

# "$#" = value of the total number of command line args passed
# As long as "$#" is greater than (-gt) 0 args, keep while loop alive
while [[ "$#" -gt 0 ]]; do
    # Check each case (options/flags) until match is found
    case $1 in
        # get input following arg option, then shift to next str
        -i|--inputdir) inputdir="$2"; shift ;;
        -m|--motif) motif="$2"; shift ;;
        -p|--pval) pval="$2"; shift ;;
        -o|--oc) outputdir="$2"; shift ;;
        
        # if extra, unmatched options show up, exit
        # Exit code 0 - Success
        # Exit code 1 - General/misc errors, such as "divide by zero" and other impermissible operations
        *) echo "Unknown parameter passed: $1"; exit 1 ;;
    
    # end case search (case spelled backwards)
    esac
    shift # to the next str, if any, then loop
done
N=4
(
for f in $inputdir/*.fa
do
    ((i=i%N)); ((i++==0)) && wait
    # get basename of filepath
    baseFname=$(basename $f .fa)
    # FIMO on each file
    fimo --oc "$outputdir" --verbosity 1 --text --thresh $pval --max-stored-scores 8000000 "$motif" "$f" > "$outputdir/$baseFname""_fimo.tsv" &
done
)
"""
with open('parallel_FIMO.sh', 'w') as file:
  file.write(sh)

# Run parallel FIMO script
print(f"Running iterative FIMO...")
!bash parallel_FIMO.sh -i $custom_seq_path -m $motif -p $pval -o $custom_FIMO_path &>/dev/null
print(f"iterative_FIMO completed.")

# Delete empty .tsv files created by FIMO
print(f"Cleaning empty FIMO .tsv files...")
!find $custom_FIMO_path -size 0 -print -delete &>/dev/null
print(f"Directory cleaned.")

t1_stop = perf_counter()
print("Elapsed time (seconds):", t1_stop-t1_start)

Creating FIMO output directory.
Writing files to: primate_seqs_0b0c1/FIMO_out.
Running iterative FIMO...
iterative_FIMO completed.
Cleaning empty FIMO .tsv files...
Directory cleaned.
Elapsed time (seconds): 533.1267852909996


In [7]:
#@title 4. Summarize all FIMO .tsv inputs (~15-25min)

from dataclasses import dataclass
from typing import List

@dataclass
class summary_args:
    # Bundle of paths consumed by the FIMO summary step.
    #   csv_out   - path of the summary .csv to write
    #   fimo_tsvs - location of the FIMO .tsv outputs
    #               NOTE(review): annotated List[str], but the caller in this
    #               notebook passes a single directory path string.
    #   fasta_dir - directory holding the per-sequence fasta/alignment files
    #   PSG_dir   - directory of FUBAR posterior files; any falsy value is
    #               normalised to the sentinel 'skip'
    csv_out: str
    fimo_tsvs: List[str]
    fasta_dir: str
    PSG_dir: str

    def __post_init__(self):
        # Keep a provided directory as-is, otherwise fall back to 'skip'.
        self.PSG_dir = self.PSG_dir or 'skip'

def glob_files(path: str) -> List[str]:
    '''Return the paths of all .tsv files directly inside `path`.'''
    tsv_pattern = f'{path}/*.tsv'
    return glob2.glob(tsv_pattern)

def expand_motif_aln(aln_dir: str, in_seqID: str, seq_sites: List[int], motif_length: int = 8):
    '''Collect the motif-window sequence of every species at every hit site.

    Default shows only the species sequences with motif hits.
    When an aln dir is specified, this function is invoked to
    show the motif alignment across both hits and nonhits.

    Parameters
    ----------
    aln_dir : directory containing "<in_seqID>.12taxa.fa".
    in_seqID : sequence/group identifier used to build the file name.
    seq_sites : 1-based motif start positions.
    motif_length : number of alignment columns to extract per site.
        Defaults to 8, the window width hard-coded in map_PSRs.
        (The previous signature used the type object `int` as the default
        value, so omitting the argument raised TypeError.)

    Returns
    -------
    (seq_length, regions) where
      seq_length : ungapped length of the 'hg38' or 'QuerySeq' record
                   (0 if neither record is present);
      regions    : one list per site, each holding one
                   "<species>: <window>" string per record in the file.

    Raises
    ------
    OSError if the alignment file cannot be opened.
    '''
    from Bio.SeqIO.FastaIO import SimpleFastaParser

    # Same delimiter the summary step uses to split FIMO sequence names into
    # (seqID, species); using it here keeps species labels identical, even
    # when the species/assembly name itself contains underscores.
    name_marker = '.12taxa.fa_'

    # collect species name from title and relevant motif alignment from sequence
    with open(f'{aln_dir}/{in_seqID}.12taxa.fa') as aln_file:
        species_regions = []
        seq_length = 0
        for title, sequence in SimpleFastaParser(aln_file):
            if name_marker in title:
                species_name = title.split(name_marker)[-1]
            else:
                # Fallback for headers without the marker: text after last '_'.
                species_name = title.split('_')[-1]
            if species_name == 'hg38' or species_name == 'QuerySeq':
                # Reference length is reported without alignment gaps.
                seq_length = len(sequence.replace('-',''))
            species_regions.append([f'{species_name}: {sequence[pos-1:pos+motif_length-1]}' for pos in seq_sites])
    # Transpose from per-species rows to per-site rows.
    return(seq_length, list(map(list, zip(*species_regions))))

def map_PSRs(indir: str, in_seqID: str, seq_sites: List[int]) -> List[str]:
    '''Map positively selected residues (PSRs) onto each motif window.

    Reads "<indir>/<in_seqID>.12taxa.fa.12taxon.tree.grid_info.posteriors.csv",
    skips its first line, and takes as a PSR site the integer that precedes
    the first "0." on every remaining line.

    For each start position `pos` in `seq_sites` the 8 positions
    range(pos-1, pos+7) are examined and one string is built per window:
    a position that is a PSR contributes its own number (e.g. "12"),
    every other position contributes '-'. A window can therefore be longer
    than 8 characters when it contains PSRs with multi-digit positions.
    NOTE(review): the window is offset by -1 relative to `pos` while the file's
    site numbers are used as-is -- confirm both use the same indexing base.

    Returns one string per entry of `seq_sites`. If the file cannot be opened,
    or is completely empty, every window is the default '--------'.

    Raises ValueError if a data line does not start with an integer.
    '''
    try: # File may or may not exist, but if it does, collect PSR entries
        with open(f'{indir}/{in_seqID}.12taxa.fa.12taxon.tree.grid_info.posteriors.csv') as PSG_file:
            next(PSG_file) # Skip first line
            # A set gives constant-time membership tests in the mapping below.
            PSR_sites = {int(line.split('0.')[0]) for line in PSG_file}
    except (IOError, StopIteration):
        # IOError: file doesn't exist / unreadable.
        # StopIteration: file exists but has no lines at all.
        return(['--------' for _ in range(len(seq_sites))])

    # map sites with the PSR position number (present) or '-' (absent)
    return([
        ''.join(f'{site_pos}' if site_pos in PSR_sites else '-'
                for site_pos in range(pos-1, pos+7))
        for pos in seq_sites
    ])

def exclude_hits(hits_to_exclude: List[str], all_hits: List[str]) -> List[str]:
    '''Return, per site, the entries not shared between hits and all regions.

    For the i-th group in `hits_to_exclude`, the result holds the symmetric
    difference (as a set) between `all_hits[i]` and that group.
    '''
    return([
        set(all_hits[site_idx]) ^ set(hit_group)
        for site_idx, hit_group in enumerate(hits_to_exclude)
    ])

# pandas .agg func rename
def Num_Unique(series: pd.Series) -> int:
    '''Count the distinct values in `series` (used as a named .agg function).'''
    distinct_values = {value for value in series}
    return(len(distinct_values))

def human_hit(series: pd.Series) -> str:
    '''Return 'Yes' if any entry of `series` contains "hg38", otherwise 'No'.'''
    return('Yes' if series.str.contains('hg38').any() else 'No')

def main():
    """
    Takes fimo files and aln files (optional) and generates
    summary dataframe containing one motif hit per line
    with the following info: 
    seq ID|AA pos|species hit: seq|min pval|hg38? (yes/no)|hg38 site|hg38 pval|species absent

    Inputs are read from notebook globals (jobname, custom_FIMO_path,
    custom_seq_path, custom_PSR_path). Every FIMO .tsv is processed on its
    own and its rows are written to "<jobname>/<jobname>.csv": the first
    file creates the csv (with header), later files are appended.
    Prints the elapsed time; returns nothing.
    """
    t1_start = perf_counter()

    s_args = summary_args(
        f"{jobname}/{jobname}.csv",
        custom_FIMO_path,
        f"{custom_seq_path}",
        custom_PSR_path
    )

    # Check if this file already exists. If so, do nothing.
    # if os.path.isfile(s_args.csv_out):
    #     sys.exit(f"File {s_args.csv_out} exists")
    # else:
    #     print (f"Building {s_args.csv_out}...")

    #Set files and columns to extract from
    infimo_files = glob_files(s_args.fimo_tsvs.rstrip('/'))
    # Positional column indices of the FIMO text output.
    # NOTE(review): assumes the column layout of the pinned FIMO version -- confirm if MEME is upgraded.
    infile_ind = [1, 2, 6, 8] # 'sequence name', 'start', 'p-value', 'matched sequence'
    # One aggregation entry per output column; the resulting column order is
    # relied upon when the columns are renamed below.
    agg_func_text = {'seqIDs': ['first'], # get one representative seqID (first occurrence)
                    'start': ['first', 'count'], # get representative start val, and count of species hits
                    'species_seqs': [tuple], # summarize species seq hits as tuple
                    'matchedseq': [Num_Unique], # num of unique seq hits found
                    'species_pvals': [tuple], # scores for each species hit
                    'pvalue': 'min', # best hit, no matter what species
                    'species': [human_hit]} # Is this a human hit? Yes or No
    mlength = 0
    for ind, file in enumerate(infimo_files):
        # Create dataframe with selected data from fimo file
        tsv_data = pd.read_csv(file, sep = '\t', usecols = infile_ind)
        tsv_data = tsv_data.rename({'sequence name': 'seqname', 'start': 'start', 'p-value': 'pvalue', 'matched sequence': 'matchedseq'}, axis=1)
        # Motif length is taken once, from the first matched sequence of the first file.
        # NOTE(review): .iloc[0] raises IndexError if that file has a header but no hit rows.
        if ind == 0:
            mlength = len(tsv_data['matchedseq'].iloc[0])
        # Temporary hack, not intended to have .12taxa.fa_
        # Sequence names are expected to look like "<seqID>.12taxa.fa_<species>".
        tsv_data[['seqIDs', 'species']] = tsv_data.seqname.str.split('.12taxa.fa_', expand=True)
        tsv_data['species_seqs'] = tsv_data['species'].astype(str) + ': ' + tsv_data['matchedseq']
        tsv_data['species_pvals'] = tsv_data['species'].astype(str) + ': ' + tsv_data['pvalue'].astype(str)
        #Retain unmerged data: hg38 matchedseq and pvalue data
        hg_data = tsv_data[tsv_data['species'] == 'hg38'].sort_values('start', axis = 0, ascending = True)
        
        #collapse tsv_data to one line per motif hit across orgs
        tsv_data = (tsv_data.iloc[0: , 0:]
                    .groupby(['seqIDs', 'start'], as_index = False)
                    .agg(agg_func_text))
        
        # hard-coded -- this order doesn't need to change
        # (must stay in the same order as the entries of agg_func_text)
        tsv_data.columns = ['sequenceID',
                            'start',
                            'count',
                            'concat_sites',
                            'Num_Unique',
                            'org_pvals',
                            'best_pval',
                            'human_hit'] # replace w/ readable colnames 

        #merge tsv_data to retained hg38 data and export
        # Left merge: hits without an hg38 row get NaN for matchedseq/pvalue.
        merged_data = pd.merge(tsv_data, hg_data[['start', 'matchedseq', 'pvalue']],on='start', how='left')
        #use seqID, start pt, and species hits to scrape sequences of orgs with no detectable motif
        #collect species-relevant motif info
        aln_directory = s_args.fasta_dir.rstrip('/')
        # Only the first sequenceID is used to locate the alignment file.
        # NOTE(review): presumably each .tsv holds hits for a single seqID -- verify.
        grp_seqID = tsv_data['sequenceID'].iloc[0]
        mstarts = tsv_data.start.astype(int) #motif start sites
        sp_hits_to_exclude = tsv_data.concat_sites

        #extract protein (AA) seq length, [aln of each motif across all primates] regardless of score
        AA_length, sp_mregions = expand_motif_aln(aln_directory, grp_seqID, mstarts, mlength)

        #exclude hits already examined
        nonhit_mregions = exclude_hits(sp_hits_to_exclude, sp_mregions)
        
        #create nonhit df to merge
        nonhit_df = pd.DataFrame(columns = ['start', 'Non_hits'])
        nonhit_df['Non_hits'] = nonhit_mregions
        nonhit_df['start'] = tsv_data.start
        nonhit_df['AA_seqlength'] = AA_length

        #merge nonhit df to merged_data
        merged_data = pd.merge(merged_data, nonhit_df[['start', 'Non_hits', 'AA_seqlength']],on='start', how='left')

        #OPTIONAL: use seqID and start pt to scrape residues under pos sel (PSRs)
        # ('skip' is the sentinel summary_args stores when no PSR directory was given)
        if s_args.PSG_dir != 'skip':
            #collect PSG relevant info
            PSG_directory = s_args.PSG_dir.rstrip('/')
            grp_seqID = tsv_data['sequenceID'][0]
            mstarts = tsv_data.start.astype(int) #motif start sites

            #build one PSR string per motif start site from the FUBAR posteriors file
            PSR_stringmap = map_PSRs(PSG_directory, grp_seqID, mstarts)

            #create PSR df to merge
            # NOTE(review): the 'PSRs' column declared here is never filled or used;
            # only 'start' and 'FUBAR_PSRs' take part in the merge below.
            PSR_df = pd.DataFrame(columns = ['start', 'PSRs'])
            PSR_df['start'] = tsv_data.start
            PSR_df['FUBAR_PSRs'] = PSR_stringmap

            #merge PSR df to merged_data
            merged_data = pd.merge(merged_data, PSR_df[['start', 'FUBAR_PSRs']],on='start', how='left')

        #Create csv file if first glob file initiated, otherwise append to existing csv
        # The rename only matters for the first file because later files are
        # appended without a header row.
        if ind == 0:
            merged_data.rename(columns={'matchedseq': 'human_site', 'pvalue': 'pval_hg38'}, inplace=True)
            merged_data.to_csv(s_args.csv_out, index = False, mode = 'w', header=True)
        else:
            merged_data.to_csv(s_args.csv_out, index = False, mode = 'a', header=False)

    t1_stop = perf_counter()
    print("Elapsed time (seconds):", t1_stop-t1_start)

main()

Elapsed time (seconds): 1394.0104284500003


In [8]:
#@title 5. Merge orthogonal dataset to .csv summary file (<1min)

@dataclass
class merge_args:
    # Bundle of arguments for merging a prebuilt db into the summary csv.
    # path of the summary .csv to annotate
    csv_in: str
    # path of the prebuilt db .csv supplying the extra columns
    csv_db: str
    # path of the annotated .csv to write
    csv_out: str
    # name of the column present in both files that the merge joins on
    key_column: str

def main():
    """
    Appends column-specific data from a db.csv file to a designated .csv file

    Reads "<jobname>/<jobname>.csv" and the prebuilt db (notebook globals
    `jobname` and `prebuiltdb`), left-joins the db onto the summary on the
    "sequenceID" column, orders the columns and writes
    "<jobname>/annotated_<jobname>.csv".

    Columns named in the preferred order that are absent from the merged
    table (for example FUBAR_PSRs when PSRs were skipped, or db columns a
    custom db does not provide) are reported and left out instead of
    raising KeyError. Columns not named in the preferred order are dropped,
    as before.
    """
    m_args=merge_args(
        f"{jobname}/{jobname}.csv",
        prebuiltdb,
        f"{jobname}/annotated_{jobname}.csv",
        "sequenceID" # currently hard-coded
    )

    # Set main input df to merge file
    in_df=pd.read_csv(m_args.csv_in)

    # Set main db file and merge it onto the input df on the key column.
    # A left join keeps every summary row; rows without a db entry get NaN.
    db_df=pd.read_csv(m_args.csv_db)
    keycol=m_args.key_column
    in_df=pd.merge(in_df, db_df, on=keycol, how='left')

    # Hard-coded, this order doesn't need to change
    final_cols_order = ['sequenceID', 'Gene_Sym', 'description', 'AA_seqlength', 'start', 'count', 'Num_Unique', 'concat_sites', 'org_pvals', 'Non_hits', 'best_pval', 'human_site', 'pval_hg38', 'FUBAR_PSRs', 'Resource_Plate', 'Resource_Position', 'hORF_Length', 'PC1', 'Omega', 'calc_AF', 'log_calc_AF', 'human_hit', 'Ifn_u2', 'Ifn_u5', 'Ifn_d2', 'Ifn_d5']
    # Keep the preferred order, but only for columns that actually exist.
    available_cols = [col for col in final_cols_order if col in in_df.columns]
    missing_cols = [col for col in final_cols_order if col not in in_df.columns]
    if missing_cols:
        print("Columns not found and omitted from output:", ", ".join(missing_cols))
    in_df=in_df.loc[:, available_cols]
    in_df.to_csv(m_args.csv_out, index=False, mode='w', header=True)

# Without a prebuilt db there is nothing to merge: the summary csv is simply
# renamed to the "annotated_" name that the download step expects.
if not use_db:
    concat_csv = f"{jobname}/{jobname}.csv"
    desired_csv = f"{jobname}/annotated_{jobname}.csv"
    os.rename(concat_csv, desired_csv)
else:
    main()

In [9]:
#@title 6. Package and download results
#@markdown If you are having issues downloading the result file, try disabling your adblocker and run this cell again. If that fails click on the little folder icon to the left, open the job folder, navigate to file: `annotated_<jobname>.csv`, right-click and select \"Download\".

# Trigger a browser download of the annotated summary written by step 5.
files.download(f"{jobname}/annotated_{jobname}.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>