### Upload all data into an S3 bucket

In [None]:
import os
import pandas as pd
import boto3
import botocore
from glob import glob

In [None]:
def list_files(path, ext='npy'):
    '''
    Recursively collect the files under `path` whose names match `*.<ext>`.

    path: directory to walk (it and every directory below it are searched)
    ext:  file extension to match, given without the leading dot

    Returns a list of path strings, each built by joining the directory that
    os.walk visited with the matching file name. The list is empty when
    nothing matches.
    '''
    pattern = '*.{}'.format(ext)
    matches = []
    # os.walk yields (dirpath, dirnames, filenames); only dirpath is needed
    # because glob does the per-directory matching.
    for dirpath, _dirnames, _filenames in os.walk(path):
        matches.extend(glob(os.path.join(dirpath, pattern)))
    return matches

def check_exists(s3, bucket_name, filename):
    '''
    Report whether an object is already stored in an S3 bucket.

    Helper to speed things up by not uploading files if they already exist;
    the caller can override the result (e.g. with an overwrite flag).

    s3:          object exposing `Object(bucket_name, key).load()`; the caller
                 in this notebook passes `boto3.resource('s3')`
    bucket_name: name of the bucket to look in
    filename:    key of the object inside the bucket

    Returns True when `load()` succeeds and False when it fails with a
    ClientError whose error code is "404".

    Raises botocore.exceptions.ClientError for any other error code. The
    details are printed first and the error is then re-raised, so that a
    failed check is never mistaken for "the object already exists".
    '''
    try:
        s3.Object(bucket_name, filename).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('The object does not exist.')
            return False
        print('Something else has gone wrong with {}'.format(filename))
        print('error is {}'.format(e.response['Error']))
        # Previously this branch fell through and returned None, which the
        # caller's `== False` test treated the same as "already uploaded".
        raise
    return True

In [None]:
# Directory layout used by the rest of the notebook. Everything is resolved
# relative to the parent of the current working directory.
analysis_dir = os.getcwd()
project_dir = os.path.abspath('..')

# Top-level folders that sit directly inside the project directory.
results_dir, feature_dir, gallery_dir = (
    os.path.join(project_dir, folder)
    for folder in ('results', 'features', 'gallery')
)

# Sub-folders of the results directory.
plot_dir, csv_dir = (
    os.path.join(results_dir, folder)
    for folder in ('plots', 'csv')
)

Get the files we want from the csv directory

In [None]:
# Recursively collect every .csv file found under csv_dir.
csv_files = list_files(csv_dir, ext = 'csv')
# Split each path on backslashes into one column per path component, then
# discard column 0 (the text before the first backslash).
# NOTE(review): splitting on '\\' only separates Windows-style paths -- confirm
# this notebook is only ever run on Windows.
csv_df    = pd.Series(csv_files).str.split('\\', expand = True).drop(columns=0)
# Overwrite component 1 with '..' so the rebuilt paths start with '..'.
# NOTE(review): presumably component 1 is the project directory itself, which
# only holds at one particular folder depth -- confirm.
csv_df[1] = '..'
# Keep only the rows whose component 4 contains 'photodraw', then drop the row
# labelled 15.
# NOTE(review): 15 is a hard-coded row label; which file it refers to cannot be
# determined from this notebook and depends on the order list_files returned.
csv_df    = csv_df[csv_df[4].str.contains('photodraw')].drop(15)
# Re-join the components of each remaining row into a single path string.
csv_files = [os.path.join(*csv_df.values.tolist()[i]) for i in range(len(csv_df))]

Get the files we want from the feature directory

In [None]:
# Hand-picked .npy feature files to upload, written as Windows-style
# (backslash-separated) paths that start at the parent directory ('..').
feature_files = ['..\\features\\photodraw12\\photodraw_instance_features.npy',
                 '..\\features\\photodraw12\\FEATURES_FC6_photodraw_sketch.npy',
                 '..\\features\\FEATURES_FC6_photodraw2x2_image.npy',
                 '..\\features\\FEATURES_FC6_photodraw2x2_sketch.npy',
                 '..\\features\\photodraw2x2_instance_features.npy']

Get the png (and pdf) files we want

In [None]:
# Recursively list the image files to upload. Each call walks one folder
# (given as a Windows-style path relative to '..') for a single extension.
photodraw2x2_sketches = list_files('..\\sketches\\photodraw2x2', ext='png')
# NOTE(review): the 2x2 stimuli are read from 'photodraw32_stims' -- confirm
# that this folder name is intended.
photodraw2x2_stims    = list_files('..\\stimuli\\photodraw32_stims', ext='png')
photodraw_sketches    = list_files('..\\sketches\\photodraw', ext='png')
photodraw_stims       = list_files('..\\stimuli\\photodraw_stims', ext='png')
# The gallery folder is scanned twice: once for png files, once for pdf files.
participant_gallery   = list_files('..\\gallery', ext='png')
stims_2x2_gallery     = list_files('..\\gallery', ext='pdf')

Consolidate paths into one big list

In [None]:
# Flatten every group of paths into one list, dropping the first two
# characters of each path (the leading '..') along the way.
data_paths = [
    path[2:]
    for group in (
        csv_files,
        feature_files,
        photodraw2x2_sketches,
        photodraw2x2_stims,
        photodraw_sketches,
        photodraw_stims,
        participant_gallery,
        stims_2x2_gallery,
    )
    for path in group
]

Upload data into the `photodraw-public` S3 bucket

In [None]:
bucket_name = 'photodraw-public'

## tell user some useful information
print('Uploading to this bucket: {}'.format(bucket_name))

## establish connection to s3
s3 = boto3.resource('s3')

## create a bucket with the appropriate bucket name
## only errors returned by the S3 API are taken to mean "reuse the existing
## bucket"; anything else (missing credentials, network failure, Ctrl-C) now
## propagates instead of being reported as "Bucket already exists."
try:
    b = s3.create_bucket(Bucket=bucket_name)
    print('Created new bucket.')
except botocore.exceptions.ClientError:
    b = s3.Bucket(bucket_name)
    print('Bucket already exists.')

## do we want to overwrite files on s3?
overwrite = False

## set bucket and objects to public
b.Acl().put(ACL='public-read') ## sets bucket to public

## columns that are stripped from every csv before it is uploaded
bad_colnames = ['workerID', 'prolificID', 'Unnamed: 0', 'Unnamed: 1']

## now let's loop through data paths and actually upload to s3
for i, path_to_file in enumerate(data_paths):
    dirname, filename = os.path.split(path_to_file)
    ## S3 keys use forward slashes and no leading separator
    keyname = os.path.join(dirname, filename).replace('\\', '/')[1:]
    ## data_paths had its leading '..' stripped; put it back to reach the file
    local_path = '..' + path_to_file
    progress = '{} | {} of {}'.format(path_to_file.split('/')[-1], (i+1), len(data_paths))

    ## when overwriting there is no need to ask S3 whether the key exists
    if not overwrite:
        exists = check_exists(s3, bucket_name, keyname)
        if exists is None:
            ## the check itself failed, so do not claim the file is there
            print('Skipping {} because its status on S3 could not be determined.'.format(progress))
            continue
        if exists:
            print('Skipping {} because it already exists.'.format(progress))
            continue

    print('Now uploading {}'.format(progress))

    ## extra insurance that the stuff we don't want public isnt public
    if filename.split('.')[-1] == 'csv':
        df = pd.read_csv(local_path)
        present = {str(col) for col in df.columns.values}
        removed = [colname for colname in bad_colnames if colname in present]
        if removed:
            ## drop everything in one go and rewrite the local file once
            df = df.drop(columns=removed)
            df.to_csv(local_path, index=False)
            for bad_colname in removed:
                print(f'Removed {bad_colname} from {filename}.')

    s3_object = s3.Object(bucket_name, keyname)
    s3_object.upload_file(local_path) ## upload stimuli
    s3_object.Acl().put(ACL='public-read') ## set access controls

In [None]:
# Count the objects in the bucket by iterating over b.objects.all() and adding
# one per item.
sum(1 for _ in b.objects.all())