In [1]:
import os
import re
import sys
import subprocess
from datetime import datetime
from collections import defaultdict

import pandas as pd

from util.agave_helper import create_client

### Create agavepy client

In [4]:
# Build the Agave client with the project helper; later cells reach it
# through the module-level name `ag`.
ag = create_client()
# Last expression of the cell, so the fetched profile is shown as the cell output.
# NOTE(review): the rendered output includes a name and e-mail address --
# consider clearing it before sharing the notebook.
ag.profiles.get()

{'first_name': 'Devin',
 'last_name': 'Strickland',
 'full_name': 'strcklnd',
 'email': 'dvn.strcklnd@gmail.com',
 'phone': '',
 'mobile_phone': '',
 'status': '',
 'create_time': '20170926192702Z',
 'uid': 845019,
 'username': 'strcklnd'}

{'first_name': 'Devin',
 'last_name': 'Strickland',
 'full_name': 'strcklnd',
 'email': 'dvn.strcklnd@gmail.com',
 'phone': '',
 'mobile_phone': '',
 'status': '',
 'create_time': '20170926192702Z',
 'uid': 845019,
 'username': 'strcklnd'}

### Method for dealing with agavepy date formatting

In [None]:
def json_serial(obj):
    """Return the ISO-8601 string for a ``datetime`` value.

    Intended as a fallback serializer for values the default JSON
    encoder cannot handle.

    Args:
        obj: The value to serialize.

    Returns:
        ``obj.isoformat()`` when ``obj`` is a ``datetime``.

    Raises:
        TypeError: If ``obj`` is not a ``datetime`` instance.
    """
    # Reject everything that is not a datetime up front
    if not isinstance(obj, datetime):
        raise TypeError("Type not serializable")
    return obj.isoformat()

### Method for getting a long list of files without dotfiles

In [None]:
def list_files(file_path=None, system_id=None, limit=100, offset=0, ignore_dotfiles=True):
    """Collect a complete file listing by paging through ``ag.files.list``.

    Pages of ``limit`` entries are requested, starting at ``offset``, until a
    page comes back with a length different from ``limit``.

    Args:
        file_path: Passed through as ``filePath``.
        system_id: Passed through as ``systemId``.
        limit: Page size requested on every call.
        offset: Offset of the first page.
        ignore_dotfiles: When true, entries whose ``'name'`` starts with
            ``"."`` are left out of the result.

    Returns:
        A list with the retained entries of every page, in listing order.
    """
    collected = []
    cursor = offset
    page_was_full = True

    while page_was_full:
        page = ag.files.list(filePath=file_path, systemId=system_id, limit=limit, offset=cursor)

        # Decide on the raw page size, before any dotfiles are dropped
        page_was_full = len(page) == limit
        cursor += limit

        if ignore_dotfiles:
            collected.extend(entry for entry in page if not entry['name'].startswith("."))
        else:
            collected.extend(page)

    return collected

## Step 0: Change variables for your project

In [None]:
# BaseSpace Project to pull from; the name has to match the project exactly
project_name = 'Protstab'

# Where on TACC the data should go, so that it can be with its friends
tacc_service = 'data-sd2e-projects.sd2e-project-11'

# Subdirectory of tacc_path, named after the Aq plan.
# It is not created automatically.
plan = 'Plan_37976'

# Derived paths -- don't change these.
# 'data-' is removed from the service name and its dots become path separators.
tacc_path = os.path.join(
    '/home/jupyter',
    tacc_service.replace('data-', '').replace('.', '/'),
)
plan_path = os.path.join(tacc_path, plan)

## Step 1: Get all the files in your BaseSpace Project of interest

In [None]:
# System and directory to list; the directory is derived from project_name
system_id = 'data-sd2e-basespace-biofab'
project_dir = 'Projects/%s/Samples/' % project_name  # 'BioSamples/'

# Fetch the listing 250 entries per request, then order it by entry name
sample_list = sorted(
    list_files(file_path=project_dir, system_id=system_id, limit=250, offset=0),
    key=lambda entry: entry['name'],
)

print(len(sample_list))

## Step 2: Find a subset of samples

**Method 1: Arbitrary file attributes**

In [None]:
# Bucket the samples by the first six characters of their name
grouped = defaultdict(list)
for s in sample_list:
    grouped[s['name'][0:6]].append(s)

these_samples = []

for key, samples in grouped.items():
    # Keep the entry with the greatest 'length' in each bucket
    # (ascending sort, then take the last element)
    samples.sort(key=lambda x: x['length'])
    s = samples[-1]

    name = s['name']
    if len(name) == 6:
        # Pad bare six-character names so the printed columns line up
        name += '    '

    date = json_serial(s['lastModified'])
    # NOTE(review): 'length' is scaled by 1024 * 1024 before printing --
    # presumably a unit conversion; confirm the unit of 'length'.
    size = s['length'] * 1024 * 1024

    print("%s  %s  %d" % (name, date, size))
    these_samples.append(s)

print(len(these_samples))

**Method 2: Sample IDs that are close to one another**

In [None]:
# Consecutive sample IDs 359420 through 359426, as strings
ids = [str(sample_id) for sample_id in range(359420, 359427)]

# Keep only the listed samples whose name is one of those IDs
these_samples = list(filter(lambda item: item['name'] in ids, sample_list))

print(len(these_samples))

**Method 3: Use a manifest!**

This assumes that `plan_path` already exists and contains a file named `manifest.csv` with an `aq_item_id` column.

In [None]:
# Load the manifest that sits in the plan directory
manifest_path = os.path.join(plan_path, 'manifest.csv')
manifest = pd.read_csv(manifest_path)

# Sample names are matched as strings, so stringify every aq_item_id
these_sample_names = list(map(str, manifest.aq_item_id))

# Uncomment to match names that carry a " (2)" suffix instead
# these_sample_names = [x + " (2)" for x in these_sample_names]

# Keep only the listed samples that appear in the manifest
these_samples = list(filter(lambda item: item['name'] in these_sample_names, sample_list))

print(len(these_samples))

## Step 3: Move the files

In [None]:
# Destination directory for the sequencing data, relative to the root of
# tacc_service (this is the form the files-* commands receive).
ngs_data_path = os.path.join(plan, 'ngs_data')

# Address handed to files-import through its -W option
notify_address = 'strcklnd@uw.edu'

for item in these_samples:
    # Spaces are percent-encoded because the name is embedded in the agave:// URL.
    # NOTE(review): the encoded name is also used for the destination directory.
    s = item['name'].replace(' ', '%20')

    # Reuse system_id from Step 1 rather than repeating the system name here
    this_from_path = os.path.join('agave://' + system_id, project_dir, s, "Files")
    this_to_path = os.path.join(ngs_data_path, s)

    # this_to_path is relative to the storage system root, so the local
    # existence check has to be anchored at tacc_path; checking the bare
    # relative path would depend on the kernel's working directory.
    # NOTE(review): assumes tacc_path is where tacc_service is mounted locally.
    if not os.path.isdir(os.path.join(tacc_path, this_to_path)):
        cmd = ['files-mkdir', '-S', tacc_service, '-N', s, ngs_data_path]
        # universal_newlines=True yields text, so print shows the message
        # itself instead of a b'...' bytes repr.
        msg = subprocess.check_output(cmd, universal_newlines=True)
        print(msg)

    # TODO: Convert this to agavepy
    cmd = ['files-import', '-W', notify_address, '-S', tacc_service, '-U', this_from_path, this_to_path]

    msg = subprocess.check_output(cmd, universal_newlines=True)
    print(msg)
        