In [None]:
import collections
import collections.abc
import glob
import os
import sys

import h5py
import numpy as np
import pandas as pd
import swifter # To parallelise pandas operations
import tensorflow as tf

# The local pyriodogram checkout must be on sys.path before it is imported.
sys.path.append('../../pyriodogram/')
import ndft_features as ndft

# Read augmented dataset

In [5]:
# Both tables are read from the same HDF5 store: key 'df' feeds flux_data
# and key 'meta' feeds meta_data.
flux_data, meta_data = (
    pd.read_hdf('./kyle_final_augment.h5', key) for key in ('df', 'meta')
)

In [6]:
# Pin an explicit dtype for every flux-table column used downstream.
flux_data = flux_data.astype({
    'object_id': float,
    'mjd': float,
    'passband': int,
    'flux': float,
    'flux_err': float,
    'detected': int,
})

In [7]:
# Pin an explicit dtype for every metadata column.
meta_data = meta_data.astype({
    'object_id': float,
    'ra': float,
    'decl': float,
    'gal_l': float,
    'gal_b': float,
    'ddf': int,
    'hostgal_specz': float,
    'hostgal_photoz': float,
    'hostgal_photoz_err': float,
    'distmod': float,
    'mwebv': float,
    'target': int,
    'fold': int,
})

# Group and rename dynamic features

In [8]:
def reduce_arrays(df):
    """Collapse one group of observations into time-ordered arrays.

    Args:
      df: data frame with 'mjd', 'flux', 'flux_err' and 'detected' columns.

    Returns:
      Tuple (mjd, flux, flux_err, detected) of numpy arrays, all ordered by
      ascending 'mjd'.
    """
    ordered = df.sort_values(by='mjd')
    columns = ('mjd', 'flux', 'flux_err', 'detected')
    return tuple(ordered[name].values for name in columns)

In [9]:
# One row per (object_id, passband); the single unnamed column (label 0)
# holds the tuple returned by reduce_arrays.
df_dynfeat = pd.DataFrame(
    flux_data.groupby(['object_id', 'passband']).apply(reduce_arrays)
)

In [10]:
def name_cols(ds):
    """Unpack the grouped tuple of one row into named fields.

    Args:
      ds: row exposing 'object_id', 'passband' and, under label 0, the
        (mjd, flux, flux_err, detected) tuple.

    Returns:
      pd.Series with entries 'object_id', 'passband', 'mjd', 'flux',
      'flux_err' and 'detected', in that order.
    """
    times, fluxes, flux_errors, detections = ds[0]
    record = {'object_id': ds['object_id'], 'passband': ds['passband']}
    record['mjd'] = times
    record['flux'] = fluxes
    record['flux_err'] = flux_errors
    record['detected'] = detections
    return pd.Series(record)
    

In [11]:
# NOTE: this rebinds df_dynfeat, so the cell is not re-runnable on its own;
# re-run the groupby cell first (name_cols needs the column labelled 0).
df_dynfeat = (
    df_dynfeat
    .reset_index()
    .swifter
    .apply(name_cols, axis=1)
)

Pandas Apply: 100%|██████████| 1626846/1626846 [10:27<00:00, 2592.78it/s]


# Obtain Fourier features

In [9]:
def extract_fourier_feats(ds):
    """Attach the outputs of ndft.extract to one row.

    ndft.extract is called on the row's 'mjd' and 'flux' entries with
    oversampling=4 and tolerance=1e-5; its five outputs are stored on the
    row under 'freqs', 'mag', 'phase', 'period' and 'proba'.

    Args:
      ds: row with 'mjd' and 'flux' entries; it is modified in place.

    Returns:
      The same row object, with the five extra entries set.
    """
    freqs, mag, phase, periodogram, proba = ndft.extract(
        ds['mjd'], ds['flux'], oversampling=4, tolerance=1e-5)
    # The fourth output is stored under the 'period' key.
    outputs = (
        ('freqs', freqs),
        ('mag', mag),
        ('phase', phase),
        ('period', periodogram),
        ('proba', proba),
    )
    for key, value in outputs:
        ds[key] = value
    return ds

In [None]:

# Row-wise Fourier feature extraction, parallelised through swifter.
df_dynfeat = (
    df_dynfeat
    .swifter
    .apply(extract_fourier_feats, axis=1)
)

  proba = np.power(Ix, effm)
  Ix = 1. - np.power(1 - 2 * Pn / n, 0.5 * (n - 3))
  df = 1.0 / (oversampling * (t[-1] - t[0]))
  return 2 * k * (t - tmin) / trange - k
  **kwargs)
  ret = ret.dtype.type(ret / rcount)
Pandas Apply:  95%|█████████▍| 1543399/1626846 [3:06:17<08:49, 157.46it/s]   

In [None]:
# Attach the fold and target of every object to its per-band rows.
df = df_dynfeat.merge(
    meta_data[['object_id', 'fold', 'target']],
    how='left',
    on='object_id',
)

In [None]:
# Checkpoint the merged frame to disk.
df.to_pickle(path='todos.pkl')

# Write to tfrecords

In [None]:
def _int64_list_feature(values):
    """Returns a TF-Feature of int64_list.

    Args:
      values: A scalar, a numpy array (flattened to 1-D) or any other
        iterable of values.

    Returns:
      A tf.train.Feature whose int64_list holds the values.
    """
    # Flat numpy array (we actually need a list)
    if isinstance(values, np.ndarray):
        values = np.reshape(values, [-1])

    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    if not isinstance(values, collections.abc.Iterable):
        values = [values]

    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

def _float_list_feature(values):
    """Returns a TF-Feature of FloatList.

    Args:
      values: A scalar, a numpy array (flattened to 1-D) or any other
        iterable of values.

    Returns:
      A tf.train.Feature whose float_list holds the values.
    """
    # Flat numpy array (we actually need a list)
    if isinstance(values, np.ndarray):
        values = np.reshape(values, [-1])

    # collections.Iterable was removed in Python 3.10; the ABC lives in
    # collections.abc.
    if not isinstance(values, collections.abc.Iterable):
        values = [values]

    return tf.train.Feature(float_list=tf.train.FloatList(value=values))


def _bytes_list_feature(values):
    """Returns a TF-Feature of bytes.

    Args:
      values: A single string (text or bytes).

    Returns:
      A tf.train.Feature whose bytes_list holds that one value.
    """
    def norm2bytes(value):
        # Text has to be encoded before it can go into a BytesList. The
        # extra bytes check leaves Python 2 native strings untouched, which
        # is what the former `six.PY3` test did -- without needing `six`,
        # which this notebook never imports.
        if isinstance(value, str) and not isinstance(value, bytes):
            return value.encode()
        return value

    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[norm2bytes(values)]))

def row_to_tfexample(rows):
    """Converts band rows for one object to tf example.

    Args:
      rows: data frame with the rows of ONE object, one row per passband.
        'object_id' may be either a column or the index of the frame.

    Returns:
      tf example.

    Raises:
      ValueError: if one of the NUM_BANDS passbands has no row.
    """
    NUM_BANDS = 6

    # Read from `rows`, not from the notebook-global `df`: the global frame
    # made every example a copy of the first object of the whole dataset.
    first_row = rows.iloc[0]
    if 'object_id' in rows.columns:
        object_id = first_row['object_id']
    else:
        # The caller may have moved object_id into the index (set_index).
        object_id = rows.index[0]

    # Timeless features
    # NOTE(review): FloatList stores float32, so large object ids may lose
    # precision -- confirm this is acceptable for downstream readers.
    features = {'object/id': _float_list_feature(object_id),
                'object/target': _int64_list_feature(
                    CLASSIFIER_CATEGORIES[first_row['target']])}

    for band in range(NUM_BANDS):
        band_rows = rows[rows['passband'] == band]
        if len(band_rows) == 0:
            raise ValueError('object %s has no row for passband %i'
                             % (object_id, band))
        row = band_rows.iloc[0]
        # Time dependent features by band
        features.update({'band_%i/num_samples'%band: _int64_list_feature(len(row['detected'])),
                         'band_%i/detected'%band: _int64_list_feature(row['detected']),
                         'band_%i/flux'%band: _float_list_feature(row['flux']),
                         'band_%i/flux_err'%band: _float_list_feature(row['flux_err']),
                         'band_%i/mjd'%band: _float_list_feature(row['mjd']),
                         'band_%i/dft/freqs'%band: _float_list_feature(row['freqs']),
                         'band_%i/dft/mag'%band: _float_list_feature(row['mag']),
                         'band_%i/dft/phase'%band: _float_list_feature(row['phase']),
                         'band_%i/dft/periodogram'%band: _float_list_feature(row['period']),
                         'band_%i/dft/proba'%band: _float_list_feature(row['proba'])})
    return tf.train.Example(features=tf.train.Features(feature=features))

def convert_subset(df, examples_per_record, output_path, fold):
    """Converts one fold to tf records.

    Objects are serialized in order of first appearance in `df`. With
    sharding enabled each file receives at most `examples_per_record`
    examples and is named '<output_path>-<idx>-of-<num_files>.tfrecord'.

    Args:
        df: pandas dataframe with an 'object_id' column; the rows of each
            object are passed to `row_to_tfexample`.
        examples_per_record: number of examples saved in one tf record, or
            None to write everything to a single '-00-of-00' file.
        output_path: path prefix of the tf record files.
        fold: cross validation fold (only used in log messages).

    Raises:
        ValueError: if `examples_per_record` is given but not positive.
    """
    def _get_output_filename(output_path, idx, num_files):
        # Unsharded output is labelled 00-of-00.
        if idx is None:
            idx, num_files = 0, 0
        return '%s-%02d-of-%02d.tfrecord'%(output_path, idx, num_files)

    if len(df) == 0:
        print('-> %s fold is empty'%fold)
        return
    if examples_per_record is not None and examples_per_record <= 0:
        raise ValueError('examples_per_record must be positive or None, got %r'
                         % (examples_per_record,))
    print('\n-> Processing %s fold...'%fold)

    # sort=False keeps the objects in order of first appearance; every group
    # is a DataFrame (even for one row) that still has its object_id column.
    groups = df.groupby('object_id', sort=False)
    num_objects = groups.ngroups

    # Only compute the shard count when sharding is requested; dividing by
    # None used to raise a TypeError here.
    if examples_per_record is None:
        tfrecord_idx = None
        num_records = 0
    else:
        tfrecord_idx = 1
        num_records = int(np.ceil(num_objects / float(examples_per_record)))

    filename = _get_output_filename(output_path, tfrecord_idx, num_records)
    print(filename)
    writer = tf.python_io.TFRecordWriter(filename)
    progress = tf.keras.utils.Progbar(num_objects, interval=0.05)

    try:
        for idx, (_, rows) in enumerate(groups, start=1):
            # Roll over lazily, right before the first example of the next
            # shard. Rolling over after a write left an empty extra file
            # whenever the object count was a multiple of the shard size.
            if (examples_per_record is not None and idx > 1
                    and (idx - 1) % examples_per_record == 0):
                tfrecord_idx += 1
                writer.close()
                writer = tf.python_io.TFRecordWriter(
                    _get_output_filename(output_path, tfrecord_idx, num_records))
            example = row_to_tfexample(rows)
            writer.write(example.SerializeToString())
            progress.update(idx)
    finally:
        # Close the current shard even if an example fails to convert.
        writer.close()

In [None]:
# Save dataset descriptors
TFRECORDS_DIR = 'records/total/'
EXAMPLES_PER_RECORD = 5
PLASTICC_CATEGORIES = [6, 15 ,16 ,42 ,52 ,53 ,62 ,64 ,65 ,67 ,88 ,90 ,92 ,95 ,99]
# Maps each PLAsTiCC class label to its position in PLASTICC_CATEGORIES.
CLASSIFIER_CATEGORIES = {cat:idx for idx, cat in enumerate(PLASTICC_CATEGORIES)}
NUM_BANDS = 6
Nfolds = df['fold'].unique()

# Create the output directory once, before the loop. exist_ok=True covers
# the "already exists" race that the old try/except handled through `errno`,
# a module this notebook never imports.
output_dir = os.path.dirname(TFRECORDS_DIR)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

for i in Nfolds:
    convert_subset(df[df['fold'] == i], EXAMPLES_PER_RECORD,
                   TFRECORDS_DIR + 'fold_%02d_of_%02d'%(i,len(Nfolds)),i )


## Test tfrecords

In [61]:
# Sort the matches so the inspected file is the same on every run
# (glob returns paths in arbitrary order).
records = sorted(glob.glob(TFRECORDS_DIR + 'fold_*.tfrecord'))
assert records, 'no tfrecord files found in %s' % TFRECORDS_DIR

# Parse only the first example of the first file.
result = None
for example in tf.python_io.tf_record_iterator(records[0]):
    result = tf.train.Example.FromString(example)
    break
assert result is not None, '%s contains no examples' % records[0]

# Create metadata

In [None]:
 metadatas = []
for i in range(n_folds):
    class_frequency = folds_train[i]['target'].value_counts(normalize=True)
    plasticc_class_weights = (1/(class_frequency)).to_dict()
    classifier_class_weights = {CLASSIFIER_CATEGORIES[k]:v for k, v in plasticc_class_weights.items()}
    classifier_class_weights_sorted_list = [v for k, v in sorted(classifier_class_weights.items())]
    metadatas.append({'train_objects':folds_train[i]['object_id'].tolist(),
            'val_objects':folds_val[i]['object_id'].tolist(),
            'train_class_weights':classifier_class_weights,
            'train_class_weights_sorted_list':classifier_class_weights_sorted_list,
            'train_stats':[]})#folds_train_dft_stats[i]})