In [1]:

import numpy as np
import glob
import io
import os
import concurrent.futures
import tempfile

from astropy.table import Table
import boto3
import pandas
import fastparquet

In [2]:
# NOTE(review): this import lives outside the notebook's import cell; consider moving it there.
from pysparkling import Context
# Context whose pool is a ProcessPoolExecutor with 16 worker processes.
# NOTE(review): max_retries=1 presumably caps how often a failed task is re-run — confirm against pysparkling.
sc = Context(max_retries=1, pool=concurrent.futures.ProcessPoolExecutor(16))

In [3]:
# Handle on the bucket that holds the PTF catalogs.
s3 = boto3.resource('s3')
ptf_bucket = s3.Bucket("palomar-transient-factory")

# Every object key under input_catalogs/, then only those ending in "ctlg".
bucket_keys = [obj.key for obj in ptf_bucket.objects.filter(Prefix="input_catalogs/")]
catalog_keys = [key for key in bucket_keys if key.endswith("ctlg")]

In [4]:
def readPTFFile_pandas_s3(key):
    """Fetch one PTF catalog from S3 and return it as a pandas DataFrame.

    The object ``key`` is downloaded from the "palomar-transient-factory"
    bucket into an in-memory buffer, parsed as a FITS table, stripped of a
    fixed set of array columns and converted to pandas.

    Parameters
    ----------
    key : str
        S3 object key of the catalog to read.

    Returns
    -------
    tuple
        ``(key, df)``: the key that was passed in and the resulting DataFrame.
    """
    # These columns hold arrays, which Spark does not support.
    array_columns = ('MAG_APER', 'MAGERR_APER', 'FLUX_APER', 'FLUXERR_APER', 'FLUX_RADIUS')

    # A fresh resource is created on every call rather than reusing a module-level one.
    bucket = boto3.resource('s3').Bucket("palomar-transient-factory")

    with io.BytesIO() as buffer:
        bucket.download_fileobj(key, buffer)
        # Rewind so the FITS reader starts at the beginning of the downloaded bytes.
        buffer.seek(0)
        catalog = Table.read(buffer, format="fits")

    catalog.remove_columns(array_columns)
    frame = catalog.to_pandas()

    # Widen FLAGS from a short integer to 32 bits; Spark does not support the narrower type.
    frame['FLAGS'] = frame['FLAGS'].astype("int32")

    return (key, frame)

def write_parquet_s3(input_tuple):
    """Write one catalog DataFrame to S3 as a Parquet file.

    Parameters
    ----------
    input_tuple : tuple
        ``(key, df)``: the S3 key of the source catalog and its DataFrame,
        in the shape returned by ``readPTFFile_pandas_s3``.

    Returns
    -------
    str
        The S3 key the Parquet file was uploaded to, under ``input_parquet2/``.
    """
    key, df = input_tuple
    s3 = boto3.resource('s3')
    ptf_bucket = s3.Bucket("palomar-transient-factory")

    # Drop the leading path component (e.g. "input_catalogs/"). A key without
    # any "/" is kept whole instead of collapsing to an empty name.
    _, sep, relative_key = key.partition('/')
    if not sep:
        relative_key = key

    # Swap only the trailing "ctlg" suffix; str.replace would also rewrite any
    # "ctlg" that happens to appear earlier in the path.
    if relative_key.endswith('ctlg'):
        relative_key = relative_key[:-len('ctlg')] + 'parquet'

    # S3 keys always use "/" as separator, so build the key with plain string
    # concatenation: os.path.join is platform-dependent and would discard the
    # prefix if the remainder started with "/".
    output_key = 'input_parquet2/' + relative_key

    # The temp file only reserves a unique path; fastparquet writes to that
    # path by name, and the file is removed when the with-block exits.
    with tempfile.NamedTemporaryFile(dir='/tmp2') as tmp_file:
        filename = tmp_file.name
        fastparquet.write(filename, df)
        ptf_bucket.upload_file(filename, output_key)
    return output_key
    

# Unfinished alternative using 500 partitions, kept for reference:
# split_records = sc.parallelize(catalog_keys, 500).map(readPTFFile_pandas_s3)

In [None]:
%%time
# Split the catalog keys into 16 partitions, read each catalog from S3, write it back
# as Parquet, and collect the output keys returned by write_parquet_s3.
# NOTE(review): the saved outputs below show len(output_keys) == 1999 but
# len(catalog_keys) == 67997 — presumably from a partial or different run; confirm.
output_keys = sc.parallelize(catalog_keys, 16).map(readPTFFile_pandas_s3).map(write_parquet_s3).collect()

In [57]:
# Number of output keys returned by the conversion run.
len(output_keys)

1999

In [58]:
# Number of keys ending in "ctlg" that were listed under input_catalogs/.
len(catalog_keys)

67997

In [34]:
# Read every catalog and collect the resulting (key, DataFrame) tuples into one list.
# NOTE(review): with ~68k catalog keys this holds every DataFrame in this process at once —
# confirm that fits in memory, or limit catalog_keys before running.
df_tuples = sc.parallelize(catalog_keys, 16).map(readPTFFile_pandas_s3).collect()

In [20]:
# Try the reader on the first catalog only. catalog_keys is a list, which has no
# __next__ method, so take its first element by index.
keyname, df = readPTFFile_pandas_s3(catalog_keys[0])