In [25]:
%reload_ext autoreload
%autoreload 2

In [1]:
DATA_PATH = '../data/'
LIGHTCURVES_PATH = DATA_PATH + 'lightcurves/'
FEATURES_PATH = DATA_PATH + 'features/'

In [2]:
import numpy as np
import pandas as pd
import measurements, extract
import matplotlib.pyplot as plt

In [3]:
def unique_ids_list(df_lcs):
    return df_lcs.index.get_level_values('ID').unique().format()

def print_num_ids_shape(df_lcs):
    unique_ids = unique_ids_list(df_lcs)
    print('Num IDs: {}  Shape: {}'.format(len(unique_ids), df_lcs.shape))

#### Import

Import __transient__ lightcurves

In [160]:
filename = 'transient_lightcurves_clean.pickle'
indir = LIGHTCURVES_PATH; filepath = indir + filename
df_tra = pd.read_pickle(filepath)
print_num_ids_shape(df_tra)

Num IDs: 4869  Shape: (440469, 3)


Import __non-transient__ light curves

In [161]:
filename = 'nontransient_lightcurves_clean.pickle'
indir = LIGHTCURVES_PATH; filepath = indir + filename
df_nont = pd.read_pickle(filepath)
print_num_ids_shape(df_nont)

Num IDs: 16940  Shape: (1802695, 3)


#### Filter

In [162]:
def filter_light_curves(df_lcs, min_obs):
    df_count = df_lcs.groupby('ID', as_index=True).count()
    df_count['ObsCount'] = df_count['Mag']
    df_count = df_count[['ObsCount']]
    df_lcs_with_counts = df_lcs.join(df_count, how='inner')
    # Remove objects with less than min_obs
    df_filtered = df_lcs_with_counts[df_lcs_with_counts.ObsCount >= min_obs]
    # Remove ObsCount
    df_filtered = df_filtered.drop(['ObsCount'], axis=1)
    return df_filtered

def sample(df_lcs, num_samples):
    # Set random seed
    np.random.seed(42)
    # Sample non-transient subset of same size as transients
    IDs = np.random.choice(unique_ids_list(df_lcs), size=num_samples, replace=False)
#     print(IDs); return
    df_sampled = df_nont.loc[IDs]
    return df_sampled

Filter __transient__ light curves

In [163]:
df_tra_5 = filter_light_curves(df_tra, 5)
print_num_ids_shape(df_tra_5)

df_tra_10 = filter_light_curves(df_tra, 10)
print_num_ids_shape(df_tra_10)

del df_tra

Num IDs: 4269  Shape: (438897, 3)
Num IDs: 3615  Shape: (434513, 3)


Filter __non-transient__ lightcurves

In [164]:
df_nont_5 = filter_light_curves(df_nont, 5)
print_num_ids_shape(df_nont_5)
df_nont_5 = sample(df_nont_5, len(unique_ids_list(df_tra_5)))
print_num_ids_shape(df_nont_5)

print('---')

df_nont_10 = filter_light_curves(df_nont, 10)
print_num_ids_shape(df_nont_10)
df_nont_10 = sample(df_nont_10, len(unique_ids_list(df_tra_10)))
print_num_ids_shape(df_nont_10)

Num IDs: 15193  Shape: (1798465, 3)
Num IDs: 4269  Shape: (513533, 3)
---
Num IDs: 13801  Shape: (1788967, 3)
Num IDs: 3615  Shape: (468588, 3)


#### Oversample

In [165]:
def oversample(df_lcs, copies=0):
    df_oversample = df_lcs.copy()
    df_oversample['copy_num'] = 0
    for i in range(1, copies+1):
        df_temp = df_lcs.copy()
        df_temp['copy_num'] = i
        df_temp['Mag'] = np.random.normal(df_lcs.Mag, df_lcs.Magerr)
        df_oversample = df_oversample.append(df_temp)
        
    df_oversample = df_oversample.set_index(['copy_num'], append=True)
    return df_oversample

Oversample __transient__ light curves

In [166]:
df_tra_5_os = oversample(df_tra_5, 6)
print_num_ids_shape(df_tra_5_os)

df_tra_10_os = oversample(df_tra_10, 6)
print_num_ids_shape(df_tra_10_os)

Num IDs: 4269  Shape: (3072279, 3)
Num IDs: 3615  Shape: (3041591, 3)


In [167]:
df_tra_5 = oversample(df_tra_5, 0)
df_tra_10 = oversample(df_tra_10, 0)

"Oversample" __nontransient__ light curves

In [168]:
df_nont_5_os = oversample(df_nont_5, 0)
print_num_ids_shape(df_nont_5)

df_nont_10_os = oversample(df_nont_10, 0)
print_num_ids_shape(df_nont_10)

Num IDs: 4269  Shape: (513533, 3)
Num IDs: 3615  Shape: (468588, 3)


In [169]:
df_nont_5 = oversample(df_nont_5, 0)
df_nont_10 = oversample(df_nont_10, 0)

#### Feature Extraction

In [196]:
def extract_features(df_lcs):
    # Extract num_copy list
    num_copy_list = df_lcs.index.get_level_values('copy_num').unique()    
    num_copies = len(num_copy_list)
    
    # Extract IDs list
    unique_ids_list = df_lcs.index.get_level_values('ID').unique()
    num_ids = len(unique_ids_list)

    # Create empty feature dict
    feats_dict = extract.feature_dict(31)
    # Add 'ID' and 'copy_num' index lists
    index_id_list = []
    index_copy_num_list = []
    
    for num_copy in num_copy_list:
        for i, obj_id in enumerate(unique_ids_list):
            # Print status
            if(i% int(num_ids*num_copies/10) == 0):
                print(i, '/', num_ids*num_copies)
            # Get current object light curve
            df_object = df_lcs.loc[obj_id,:,num_copy]
            # Get features
            obj_feats = extract.features(df_object, feats_dict)
            # Append features
            for k,v in feats_dict.items():
                feats_dict[k].append(obj_feats[k])
            # Append Indexes
            index_id_list.append(obj_id)
            index_copy_num_list.append(num_copy)
    # Create feature dataframe
    df_feats = pd.DataFrame(feats_dict).set_index([index_id_list,index_copy_num_list])
    df_feats.index.names = ['ID', 'copy_num']
    return df_feats

def save_features(df_feats, obj_type, min_obs, oversample):
    outdir = FEATURES_PATH
    filename_raw = '{}_{}obs_{}feat{}.pickle'
    oversample_text = '_os' if oversample else ''
    
    drop_feats_list = [
        [], ['poly4_a', 'poly4_b', 'poly4_c', 'poly4_d'],
        ['poly1_a','poly2_a','poly2_b','poly3_a','poly3_b','poly3_c']
    ]
    num_features_list = [30, 26, 20]
    
    for i in range(3):
        df_feats = df_feats.drop(drop_feats_list[i], axis=1)
        num_features = df_feats.shape[1]
        filename = filename_raw.format(obj_type, min_obs, num_features, oversample_text)
        assert(num_features==num_features_list[i])
        df_feats.to_pickle(outdir + filename)

#### Generate Features

In [197]:
def generate_features(df_all, transient, min_obs, oversample):
    obj_type = 'T' if transient else 'NT'
    # Generate features based on light curves
    df_feats = extract_features(df_all)
    # Save features
    save_features(df_feats, obj_type, min_obs, oversample)
    # Log Finished
    print('Finished task type={} obs={} is_oversampled={}'.format(
        obj_type, min_obs, oversample)
         )
    return df_feats

Generate features __transient__ light curves

In [198]:
df_tra_feats_5 = generate_features(df_tra_5, transient=True, min_obs=5, oversample=False)
df_tra_feats_5_os = generate_features(df_tra_5_os, transient=True, min_obs=5, oversample=True)

df_tra_feats_10 = generate_features(df_tra_10, transient=True, min_obs=10, oversample=False)
df_tra_feats_10_os = generate_features(df_tra_10_os, transient=True, min_obs=10, oversample=True)

0 / 4269
426 / 4269
852 / 4269
1278 / 4269
1704 / 4269
2130 / 4269




2556 / 4269




2982 / 4269




3408 / 4269




3834 / 4269




4260 / 4269
Finished task type=T obs=5 is_oversampled=False
0 / 29883




2988 / 29883




0 / 29883




2988 / 29883




0 / 29883




2988 / 29883




0 / 29883




2988 / 29883




0 / 29883




2988 / 29883




0 / 29883




2988 / 29883




0 / 29883




2988 / 29883




Finished task type=T obs=5 is_oversampled=True
0 / 3615
361 / 3615
722 / 3615
1083 / 3615
1444 / 3615
1805 / 3615
2166 / 3615
2527 / 3615
2888 / 3615
3249 / 3615
3610 / 3615
Finished task type=T obs=10 is_oversampled=False
0 / 25305
2530 / 25305
0 / 25305
2530 / 25305
0 / 25305
2530 / 25305
0 / 25305
2530 / 25305
0 / 25305
2530 / 25305
0 / 25305
2530 / 25305
0 / 25305
2530 / 25305
Finished task type=T obs=10 is_oversampled=True


In [199]:
df_nont_feats_5 = generate_features(df_nont_5, transient=False, min_obs=5, oversample=False)
df_nont_feats_5_os = generate_features(df_nont_5_os, transient=False, min_obs=5, oversample=True)

df_nont_feats_10 = generate_features(df_nont_10, transient=False, min_obs=10, oversample=False)
df_nont_feats_10_os = generate_features(df_nont_10_os, transient=False, min_obs=10, oversample=True)

0 / 4269
426 / 4269
852 / 4269
1278 / 4269




1704 / 4269
2130 / 4269




2556 / 4269
2982 / 4269
3408 / 4269
3834 / 4269




4260 / 4269
Finished task type=NT obs=5 is_oversampled=False
0 / 4269
426 / 4269
852 / 4269
1278 / 4269




1704 / 4269
2130 / 4269




2556 / 4269
2982 / 4269
3408 / 4269
3834 / 4269




4260 / 4269
Finished task type=NT obs=5 is_oversampled=True
0 / 3615
361 / 3615
722 / 3615
1083 / 3615
1444 / 3615
1805 / 3615
2166 / 3615
2527 / 3615
2888 / 3615
3249 / 3615
3610 / 3615
Finished task type=NT obs=10 is_oversampled=False
0 / 3615
361 / 3615
722 / 3615
1083 / 3615
1444 / 3615
1805 / 3615
2166 / 3615
2527 / 3615
2888 / 3615
3249 / 3615
3610 / 3615
Finished task type=NT obs=10 is_oversampled=True


#### EXTRA

Count number of features

In [None]:
# df_tran_features.count()

Count number of unique features

In [None]:
# df_tran_features.T.apply(lambda x: x.nunique(), axis=1)

Extract nontransient features

In [None]:
# df_nontran_features = extract_features(df_nont)

Count number of features

In [None]:
# df_feat_nontran.count()

Count number of unique features

In [None]:
# df_feat_nontran.T.apply(lambda x: x.nunique(), axis=1)

Save nontransient features