In [1]:
import os, shutil, gzip
import csv

# PULpy Helper

This notebook is intended to help streamline working with PULpy. Using it you can:

* rename input files according to PULpy's requirements
* move input files from a provided directory to PULpy's input subdirectories
* clear the PULpy working directory of input and output files
* export input and output files from the PULpy working directory
* run PULpy

### Directories
Provide the notebook with your PULpy working directory and the paths to your input and output directories, if relevant.

In [3]:
# absolute path to PULpy directory
pulpyd = '/home/ecutts/PULpy/'
# absolute path to input directory
idir = '/home/ecutts/Bacteroidetes/MAGs/'
# absolute path to output directory
odir = '/home/ecutts/Bacteroidetes/PULs_MAGs'

# Copy results & inputs to export directory
This cell will copy the results and input files currently in the PULpy working directory to an export directory (`odir`, defined in the 2nd cell). 

### Flat output or subdirectories?
Set `flat = True` to copy all files to a flat directory. Set `flat = False` to create subdirectories in the output directory for each genome represented in the PULpy working directory. If you want to copy your results to a new directory preserving PULpy's default subdirectory organization (i.e. directories for pfam, dbcan, feature_table, puls, etc.), you can simply run the following command in your terminal:

```
$ cp -r genomes proteins pfam dbcan feature_table ncbi_feature_table puls OUTPUT_DIR

```

**Important: Genome subdirectories will be constructed based on the contents of the `puls` directory.** Files associated with genomes not represented by files in `puls` won't be copied if `flat = False`. 

In [9]:
# False -> files are grouped into one subdirectory per genome ID under odir;
# True  -> every file is copied straight into odir.
flat = False

# File-name suffix that follows the genome ID in each PULpy subdirectory.
id_map = {
    'genomes': '_genomic.fna.gz',
    'proteins': '_protein.faa',
    'puls': '.puls',
    'dbcan': '.out',
    'feature_table': '_ft.txt',
    'ncbi_feature_table': '_feature_table.txt',
    'pfam': '.pfam',
}

# The subdirectory names above are used as relative paths, so work from the PULpy directory.
os.chdir(pulpyd)

def get_idd(f, d):
    """Return the genome ID encoded in file name `f` found in PULpy subdirectory `d`.

    The ID is the file name with its trailing suffix removed. The suffixes
    '_prodigal.gff' and '.gff' are recognised in every subdirectory; otherwise
    the suffix registered for `d` in `id_map` is used.

    Only a suffix at the END of the name is stripped, so an ID that happens to
    contain the suffix text (e.g. 'a.outer.out' in 'dbcan') is kept intact.

    Raises:
        KeyError: if `f` is not a .gff file and `d` is not a key of `id_map`.
    """
    # '_prodigal.gff' must be tested before the shorter '.gff'
    for suffix in ('_prodigal.gff', '.gff'):
        if f.endswith(suffix):
            return f[:-len(suffix)]

    suffix = id_map[d]
    if suffix and f.endswith(suffix):
        return f[:-len(suffix)]
    # Name does not end with the expected suffix: keep the historical fallback
    # (everything before the first occurrence of the suffix, or the whole name).
    return f.split(suffix)[0]
        
# Copy every file found in the PULpy subdirectories listed in id_map to odir.
# Paths are relative, so the current working directory must be the PULpy directory.
# NOTE(review): the markdown above says only genomes represented in `puls` get a
# subdirectory; this loop creates one for every ID it encounters - confirm which
# behaviour is intended.
for d in id_map.keys():
    if not os.path.isdir(d):
        # A missing subdirectory used to abort the whole export part-way through,
        # leaving the subdirectories after it uncopied.
        print('Skipping ' + d + ': no such directory in ' + os.getcwd())
        continue
    for f in os.listdir(d):
        src = os.path.join(d, f)
        if not os.path.isfile(src):
            continue  # shutil.copy only handles regular files
        if flat:
            dest = odir
        else:
            dest = os.path.join(odir, get_idd(f, d))
        # also creates odir itself when it does not exist yet
        os.makedirs(dest, exist_ok=True)
        shutil.copy(src, os.path.join(dest, f))

# Prep an input directory

Run the following cells to prepare a directory of input files for analysis by PULpy. The files will be copied to the appropriate PULpy input directories and renamed according to PULpy's naming conventions. You can choose to clear pre-existing input and output files within the PULpy working directory or to simply add your input files. The program identifies files to move by looking for .fna or .fna.gz files, which are required by PULpy even if using annotations.

### Input directory requirements
Your input directory should be either (1) a directory of directories, with each directory containing the files associated with a single genome or (2) a flat directory containing all of the files to be input. 

In [6]:
# Work from the input directory so that file names can be used as relative paths.
os.chdir(idir)

# Is the directory a directory of directories?
# True as soon as any entry of the input directory is not a regular file.
dd = any(not os.path.isfile(entry) for entry in os.listdir())

def find_fnas(d):
    """Map genome IDs to the nucleotide fasta file found for them in directory `d`.

    A file is accepted when it ends in '.fna.gz', or when the second
    dot-separated component of its name is 'fna'. The latter test is used
    because IMG downloads contain '.genes.fna' files as well as '.fna' files;
    '<id>.genes.fna' is therefore not picked up.

    When both '<id>.fna.gz' and '<id>.fna' exist, the compressed file is kept
    regardless of the order in which os.listdir returns them.

    Returns:
        dict: genome ID -> file name (not a path) inside `d`.
    """
    fnas = {}
    for f in os.listdir(d):
        if f.endswith('.fna.gz'): # favor .fna.gz files over .fna
            idd = f.split('.fna.gz')[0]
            fnas[idd] = f
            continue
        parts = f.split('.')
        # names without any dot (README, subdirectories, ...) have a single component
        if len(parts) > 1 and parts[1] == 'fna':
            idd = f.split('.fna')[0]
            # never replace an already registered .fna.gz with an uncompressed file
            if not fnas.get(idd, '').endswith('.fna.gz'):
                fnas[idd] = f
    return fnas
        
def find_files(d, ids):
    """Find the annotation (.gff) and protein (.faa) files in `d` for the given IDs.

    A file belongs to an ID when the part of its name before the first '.'
    equals that ID.

    Args:
        d: directory to scan.
        ids: collection of genome IDs. A single ID passed as a plain string is
            treated as a one-element collection; previously it was searched
            with `in`, i.e. as a substring test that also matched unrelated,
            shorter IDs.

    Returns:
        tuple: (gffs, faas), two dicts mapping genome ID -> file name inside `d`.
        An ID is missing from a dict when no matching file was found.
    """
    if isinstance(ids, str):
        ids = [ids]
    gffs = {}
    faas = {}
    for f in os.listdir(d):
        idd = f.split('.')[0]
        if idd in ids:
            if f.endswith('.gff'):
                gffs[idd] = f
            elif f.endswith('.faa'): #IMG protein fastas end in .genes.faa, this may be changed for others
                faas[idd] = f
    return gffs, faas


# get list of files to move (relative paths to idir)
# files maps genome ID -> [nucleotide fasta, gff, protein fasta]
# Genomes for which one of the three files cannot be found are reported and
# skipped instead of aborting the whole cell with a KeyError/IndexError.
files = {}

if not dd:
    # flat input directory: every genome's files sit directly in idir
    fnas = find_fnas(idir)
    gffs, faas = find_files(idir, list(fnas))
    for idd, fna in fnas.items():
        if idd not in gffs or idd not in faas:
            print('Skipping ' + idd + ': missing .gff or .faa file')
            continue
        files[idd] = [fna, gffs[idd], faas[idd]]

else:
    # directory of directories: one subdirectory per genome
    for d in os.listdir():
        if not os.path.isdir(d):
            continue  # loose files next to the genome directories
        fnas = find_fnas(d)
        if not fnas:
            print('Skipping ' + d + ': no .fna or .fna.gz file found')
            continue
        idd = list(fnas)[0]  # the first fasta found names the genome

        # pass the ID as a list: a bare string would be searched as a substring
        gffs, faas = find_files(d, [idd])
        if idd not in gffs or idd not in faas:
            print('Skipping ' + idd + ': missing .gff or .faa file')
            continue
        # exactly three entries, so positions 1 and 2 are always gff and faa
        files[idd] = [os.path.join(d, fnas[idd]),
                      os.path.join(d, gffs[idd]),
                      os.path.join(d, faas[idd])]
    
# Copy each genome's files into PULpy's input directories under PULpy's file names.
for idd in files.keys():
    # create the two PULpy input directories if they are not there yet
    for subdir in ('genomes/', 'proteins/'):
        target_dir = os.path.join(pulpyd, subdir)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    sources = files[idd]  # index 0: nucleotide fasta, 1: gff, 2: protein fasta
    genome_dest = os.path.join(pulpyd, 'genomes/' + idd + '_genomic.fna.gz')
    if not sources[0].endswith('.fna.gz'):
        # an uncompressed .fna was provided: gzip it on the way in
        with open(sources[0], 'rb') as raw, gzip.open(genome_dest, 'wb') as packed:
            shutil.copyfileobj(raw, packed)
    else:
        shutil.copy(sources[0], genome_dest)
    shutil.copy(sources[2], os.path.join(pulpyd, 'proteins/' + idd + '_protein.faa'))
    shutil.copy(sources[1], os.path.join(pulpyd, 'proteins/' + idd + '.gff'))

# Clear PULpy of inputs and outputs
You can run the cells below if you want to remove your PULpy inputs and outputs. 

In [None]:
# SAFETY SWITCH: the PULpy directory is only cleared when `clear` is set to True.
# Leave it False unless you really mean it - don't accidentally delete your data!

clear = False

In [None]:
def cleardir(d):
    """Delete every file (and symbolic link) directly inside directory `d`.

    Subdirectories are left untouched: os.remove cannot delete a directory, and
    hitting one used to raise and leave `d` only partially cleared.
    """
    for filename in os.listdir(d):
        filepath = os.path.join(d, filename)
        if os.path.isfile(filepath) or os.path.islink(filepath):
            os.remove(filepath)

def clearPULpy(pulpyd):
    """Empty the PULpy input/output subdirectories found in `pulpyd`.

    Changes the current working directory to `pulpyd`, then calls cleardir on
    each entry whose name is one of the known PULpy subdirectories. Names that
    are not present in `pulpyd` are simply not visited.
    """
    # NOTE(review): 'ncbi_feature_table' is not in this list - confirm whether
    # it is meant to survive a clear.
    targets = ['dbcan', 'pfam', 'feature_table', 'genomes', 'proteins', 'puls']
    os.chdir(pulpyd)
    present = [entry for entry in os.listdir() if entry in targets]
    for entry in present:
        cleardir(entry)

# Nothing is deleted unless the safety switch `clear` is truthy.
if clear: #if clear is True, then the PULpy output directories will be cleared
    clearPULpy(pulpyd)