In [1]:
DATA_PATH = '../data/'
LIGHTCURVES_PATH = DATA_PATH + 'lightcurves/'
FEATURES_PATH = DATA_PATH + 'features/'

In [2]:
import numpy as np
import pandas as pd
import measurements, extract
import matplotlib.pyplot as plt

In [3]:
np.random.seed(42)

Import transient lightcurves

In [4]:
filename = 'transient_lightcurves.pickle'
indir = LIGHTCURVES_PATH; filepath = indir + filename
df_tra = pd.read_pickle(filepath)
df_tra.shape

(451474, 4)

Filter transient lightcurves

In [5]:
# Minimum number of observations for each light curve used
min_obs = 10

In [6]:
# Delete rows of blended observations
df_tra = df_tra.drop_duplicates(['TransientID','MJD'], keep='first')
# Add observation count to every transient
df_count = df_tra.groupby('TransientID', as_index=False).count()
df_count['ObsCount'] = df_count['Mag']
df_count = df_count[['TransientID', 'ObsCount']]
df_tra = df_tra.merge(df_count, how='inner')
# Remove objects with less than min_obs
df_tra = df_tra[df_tra.ObsCount >= min_obs]

Import non-transient light curves

In [7]:
filename = 'nontransient_lightcurves.pickle'
indir = LIGHTCURVES_PATH; filepath = indir + filename
df_nont = pd.read_pickle(filepath)
df_nont.shape

(1924409, 4)

Filter non-transient lightcurves

In [8]:
# Delete rows of blended observations
df_nont = df_nont.drop_duplicates(['ID','MJD'], keep='first')
# Add observation count to every nontransient
df_count = df_nont.groupby('ID', as_index=False).count()
df_count['ObsCount'] = df_count['Mag']
df_count = df_count[['ID', 'ObsCount']]
df_nont = df_nont.merge(df_count, how='inner')
# Remove nontransient objects with less than 5 observations
df_nont = df_nont[df_nont.ObsCount >= min_obs]
df_nont.shape

(1788967, 5)

In [9]:
# Sample non-transient subset of same size as transients
sample_size = df_tra.TransientID.unique().shape[0]
IDs = np.random.choice(df_nont.ID.unique(), size=sample_size, replace=False)
df_nont = df_nont[df_nont.ID.isin(IDs)]
df_nont.ID.unique().shape, df_nont.shape

((3727,), (480536, 5))

Feature dict creation method

In [10]:
def feature_dict(num_features=21):
    features = [
        'ID', 'skew', 'std', 'kurtosis', 'beyond1st', 'stetson_j', 'stetson_k', 'max_slope',
        'amplitude', 'median_absolute_deviation', 'median_buffer_range_percentage', 'pair_slope_trend',
        'percent_amplitude', 'percent_difference_flux_percentile', 'flux_percentile_ratio_mid20', 
        'flux_percentile_ratio_mid35', 'flux_percentile_ratio_mid50','flux_percentile_ratio_mid65', 
        'flux_percentile_ratio_mid80', 'linear_trend'
    ]
    if num_features > 19:
        features.extend(['small_kurtosis','pair_slope_trend_last_30'])
    if num_features > 21:
        features.extend(['poly1_a','poly2_a','poly2_b','poly3_a','poly3_b','poly3_c'])
    if num_features > 27:
        features.extend(['poly4_a', 'poly4_b', 'poly4_c', 'poly4_d'])
    return { k:[] for k in features}

Extract transient features

In [11]:
# Create empty feature dict
tran_feats = feature_dict(31)
for trID in df_tra.TransientID.unique():
    # Get current object light curve
    df = df_tra[df_tra.TransientID == trID]
    # Get features
    obj_feats = extract.features(df, feature_dict)
    # Append features
    for k,v in tran_feats.items():
        if k != 'ID': tran_feats[k].append(obj_feats[k])
    tran_feats['ID'].append(trID)
# Create feature dataframe
df_feat_tran = pd.DataFrame(tran_feats)

Count number of features

In [12]:
df_feat_tran.count()

ID                                    3727
amplitude                             3727
beyond1st                             3727
flux_percentile_ratio_mid20           3727
flux_percentile_ratio_mid35           3727
flux_percentile_ratio_mid50           3727
flux_percentile_ratio_mid65           3727
flux_percentile_ratio_mid80           3727
kurtosis                              3727
linear_trend                          3727
max_slope                             3727
median_absolute_deviation             3727
median_buffer_range_percentage        3727
pair_slope_trend                      3727
pair_slope_trend_last_30              3727
percent_amplitude                     3727
percent_difference_flux_percentile    3727
poly1_a                               3727
poly2_a                               3727
poly2_b                               3727
poly3_a                               3727
poly3_b                               3727
poly3_c                               3727
poly4_a    

Count number of unique features

In [13]:
df_feat_tran.T.apply(lambda x: x.nunique(), axis=1)

ID                                    3727
amplitude                             3498
beyond1st                             1749
flux_percentile_ratio_mid20           3615
flux_percentile_ratio_mid35           3615
flux_percentile_ratio_mid50           3615
flux_percentile_ratio_mid65           3615
flux_percentile_ratio_mid80           3615
kurtosis                              3615
linear_trend                          3615
max_slope                             3567
median_absolute_deviation             3615
median_buffer_range_percentage        1569
pair_slope_trend                      1319
pair_slope_trend_last_30                21
percent_amplitude                     3604
percent_difference_flux_percentile    3615
poly1_a                               3615
poly2_a                               3615
poly2_b                               3615
poly3_a                               3615
poly3_b                               3615
poly3_c                               3615
poly4_a    

Save transient features

In [14]:
outdir = FEATURES_PATH

In [15]:
filename_form = 'transient_{}obs_{}feats.pickle'
# Save all 31 features
num_features = df_feat_tran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_tran.to_pickle(outdir + filename)
# Save all 27 features
df_feat_tran = df_feat_tran.drop(['poly4_a', 'poly4_b', 'poly4_c', 'poly4_d'], axis=1)
num_features = df_feat_tran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_tran.to_pickle(outdir + filename)
# Save all 21 features
df_feat_tran = df_feat_tran.drop(['poly1_a','poly2_a','poly2_b','poly3_a','poly3_b','poly3_c'], axis=1)
num_features = df_feat_tran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_tran.to_pickle(outdir + filename)
# Save all 19 features
df_feat_tran = df_feat_tran.drop(['small_kurtosis','pair_slope_trend_last_30'], axis=1)
num_features = df_feat_tran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_tran.to_pickle(outdir + filename)

Extract nontransient features

In [16]:
# Create empty feature dict
nontran_feats = feature_dict(31)
for ID in df_nont.ID.unique():
    # Get current object light curve
    df = df_nont[df_nont.ID == ID]
    # Get features
    obj_feats = extract.features(df, feature_dict)
    # Append features
    for k,v in nontran_feats.items():
        if k != 'ID': nontran_feats[k].append(obj_feats[k])
    nontran_feats['ID'].append(trID)
# Create feature dataframe
df_feat_nontran = pd.DataFrame(nontran_feats)

Count number of features

In [17]:
df_feat_nontran.count()

ID                                    3727
amplitude                             3727
beyond1st                             3727
flux_percentile_ratio_mid20           3727
flux_percentile_ratio_mid35           3727
flux_percentile_ratio_mid50           3727
flux_percentile_ratio_mid65           3727
flux_percentile_ratio_mid80           3727
kurtosis                              3727
linear_trend                          3727
max_slope                             3727
median_absolute_deviation             3727
median_buffer_range_percentage        3727
pair_slope_trend                      3727
pair_slope_trend_last_30              3727
percent_amplitude                     3727
percent_difference_flux_percentile    3727
poly1_a                               3727
poly2_a                               3727
poly2_b                               3727
poly3_a                               3727
poly3_b                               3727
poly3_c                               3727
poly4_a    

Count number of unique features

In [18]:
df_feat_nontran.T.apply(lambda x: x.nunique(), axis=1)

ID                                       1
amplitude                              830
beyond1st                             1818
flux_percentile_ratio_mid20           3700
flux_percentile_ratio_mid35           3721
flux_percentile_ratio_mid50           3722
flux_percentile_ratio_mid65           3721
flux_percentile_ratio_mid80           3716
kurtosis                              3727
linear_trend                          3727
max_slope                             3668
median_absolute_deviation             3725
median_buffer_range_percentage        1999
pair_slope_trend                      1450
pair_slope_trend_last_30                20
percent_amplitude                     2080
percent_difference_flux_percentile    3235
poly1_a                               3727
poly2_a                               3727
poly2_b                               3727
poly3_a                               3727
poly3_b                               3727
poly3_c                               3727
poly4_a    

Save nontransient features

In [19]:
filename_form = 'nontransient_{}obs_{}feats.pickle'
# Save all 31 features
num_features = df_feat_nontran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_nontran.to_pickle(outdir + filename)
# Save all 27 features
df_feat_nontran = df_feat_nontran.drop(['poly4_a', 'poly4_b', 'poly4_c', 'poly4_d'], axis=1)
num_features = df_feat_nontran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_nontran.to_pickle(outdir + filename)
# Save all 21 features
df_feat_nontran = df_feat_nontran.drop(['poly1_a','poly2_a','poly2_b','poly3_a','poly3_b','poly3_c'], axis=1)
num_features = df_feat_nontran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_nontran.to_pickle(outdir + filename)
# Save all 19 features
df_feat_nontran = df_feat_nontran.drop(['small_kurtosis','pair_slope_trend_last_30'], axis=1)
num_features = df_feat_nontran.shape[1]-1
filename = filename_form.format(min_obs, num_features) 
df_feat_nontran.to_pickle(outdir + filename)