In [1]:
DATA_PATH = '../../data/CRTS2/'

In [2]:
import numpy as np
import pandas as pd
import sys
sys.path.append("../..")
import measurements, extract
import matplotlib.pyplot as plt
from scipy import interpolate
from scipy.optimize import curve_fit

In [3]:
np.random.seed(42)

Import transient lightcurves

In [4]:
filename = 'transient_lightcurves.pickle'
indir = DATA_PATH; filepath = indir + filename
df_tra = pd.read_pickle(filepath)
df_tra.shape

(451474, 4)

Filter transient lightcurves

In [5]:
# Minimum number of observations for each light curve used
min_obs = 10

In [6]:
# Delete rows of blended observations
df_tra = df_tra.drop_duplicates(['TransientID','MJD'], keep='first')
# Add observation count to every transient
df_count = df_tra.groupby('TransientID', as_index=False).count()
df_count['ObsCount'] = df_count['Mag']
df_count = df_count[['TransientID', 'ObsCount']]
df_tra = df_tra.merge(df_count, how='inner')
# Remove objects with less than min_obs
df_tra = df_tra[df_tra.ObsCount >= min_obs]

Import non-transient light curves

In [7]:
filename = 'permanent_lightcurves.pickle'
indir = DATA_PATH; filepath = indir + filename
df_per = pd.read_pickle(filepath)
df_per.shape

(1924409, 4)

Filter non-transient lightcurves

In [8]:
# Delete rows of blended observations
df_per = df_per.drop_duplicates(['ID','MJD'], keep='first')
# Add observation count to every permanent
df_count = df_per.groupby('ID', as_index=False).count()
df_count['ObsCount'] = df_count['Mag']
df_count = df_count[['ID', 'ObsCount']]
df_per = df_per.merge(df_count, how='inner')
# Remove objects with less than 5 observations
df_per = df_per[df_per.ObsCount >= min_obs]
df_per.shape

(1788967, 5)

In [9]:
# Sample non-transient subset of same size as transients
sample_size = df_tra.TransientID.unique().shape[0]
IDs = np.random.choice(df_per.ID.unique(), size=sample_size, replace=False)
df_per = df_per[df_per.ID.isin(IDs)]
df_per.ID.unique().shape, df_per.shape

((3727,), (480536, 5))

Feature dict creation method

In [10]:
def feature_dict(num_features=21):
    features = [
        'ID', 'skew', 'std', 'kurtosis', 'beyond1st', 'stetson_j', 'stetson_k', 'max_slope',
        'amplitude', 'median_absolute_deviation', 'median_buffer_range_percentage', 'pair_slope_trend',
         'flux_percentile_ratio_mid20', 'flux_percentile_ratio_mid35', 'flux_percentile_ratio_mid50',
         'flux_percentile_ratio_mid65', 'flux_percentile_ratio_mid80', 'percent_amplitude',
         'percent_difference_flux_percentile', 'linear_trend', 'percent_difference_flux_percentile', 'linear_trend'
    ]
    if num_features > 21:
        features.append(['poly1_a','poly2_a','poly2_b','poly3_a','poly3_b','poly3_c'])
    if num_features > 27:
         features.append(['poly4_a', 'poly4_b', 'poly4_c', 'poly4_d'])
    return { k:[] for k in features}

In [48]:
# Define number of features to be extracted
num_features = 21

Extract transient features

In [35]:
# Create empty feature dict
tran_feats = feature_dict(num_features)
for trID in df_tra.TransientID.unique():
    # Get current object light curve
    df = df_tra[df_tra.TransientID == trID]
    # Get features
    obj_feats = extract.features(df, feature_dict)
    # Append features
    for k,v in tran_feats.items():
        if k != 'ID': tran_feats[k].append(obj_feats[k])
    tran_feats['ID'].append(trID)
# Create feature dataframe
df_feat_tran = pd.DataFrame(tran_feats)

Count number of features

In [36]:
df_feat_tran.count()

ID                                    3727
amplitude                             3727
beyond1st                             3727
flux_percentile_ratio_mid20           3727
flux_percentile_ratio_mid35           3727
flux_percentile_ratio_mid50           3727
flux_percentile_ratio_mid65           3727
flux_percentile_ratio_mid80           3727
kurtosis                              3727
linear_trend                          3727
max_slope                             3727
median_absolute_deviation             3727
median_buffer_range_percentage        3727
pair_slope_trend                      3727
percent_amplitude                     3727
percent_difference_flux_percentile    3727
skew                                  3727
std                                   3727
stetson_j                             3727
stetson_k                             3727
dtype: int64

Count number of unique features

In [37]:
df_feat_tran.T.apply(lambda x: x.nunique(), axis=1)

ID                                    3727
amplitude                             3498
beyond1st                             1749
flux_percentile_ratio_mid20           3615
flux_percentile_ratio_mid35           3615
flux_percentile_ratio_mid50           3615
flux_percentile_ratio_mid65           3615
flux_percentile_ratio_mid80           3615
kurtosis                              3615
linear_trend                          3615
max_slope                             3567
median_absolute_deviation             3615
median_buffer_range_percentage        1569
pair_slope_trend                      1319
percent_amplitude                     3604
percent_difference_flux_percentile    3615
skew                                  3615
std                                   3615
stetson_j                             3615
stetson_k                             3615
dtype: int64

Save transient features

In [38]:
num_features = df_feat_tran.shape[1]-1
outdir = DATA_PATH
filename = 'transient_features_{}obs_{}feats.pickle'.format(min_obs, num_features) 
outpath = outdir + filename
df_feat_tran.to_pickle(outpath)

Extract permanent features

In [44]:
# Create empty feature dict
nontran_feats = feature_dict(num_features)
for ID in df_per.ID.unique():
    # Get current object light curve
    df = df_per[df_per.ID == ID]
    # Get features
    obj_feats = extract.features(df, feature_dict)
    # Append features
    for k,v in nontran_feats.items():
        if k != 'ID': nontran_feats[k].append(obj_feats[k])
    nontran_feats['ID'].append(trID)
# Create feature dataframe
df_feat_nontran = pd.DataFrame(nontran_feats)

Count number of features

In [45]:
df_feat_nontran.count()

ID                                    3727
amplitude                             3727
beyond1st                             3727
flux_percentile_ratio_mid20           3727
flux_percentile_ratio_mid35           3727
flux_percentile_ratio_mid50           3727
flux_percentile_ratio_mid65           3727
flux_percentile_ratio_mid80           3727
kurtosis                              3727
linear_trend                          3727
max_slope                             3727
median_absolute_deviation             3727
median_buffer_range_percentage        3727
pair_slope_trend                      3727
percent_amplitude                     3727
percent_difference_flux_percentile    3727
skew                                  3727
std                                   3727
stetson_j                             3727
stetson_k                             3727
dtype: int64

Count number of unique features

In [46]:
df_feat_nontran.T.apply(lambda x: x.nunique(), axis=1)

ID                                       1
amplitude                              830
beyond1st                             1818
flux_percentile_ratio_mid20           3700
flux_percentile_ratio_mid35           3721
flux_percentile_ratio_mid50           3722
flux_percentile_ratio_mid65           3721
flux_percentile_ratio_mid80           3716
kurtosis                              3727
linear_trend                          3727
max_slope                             3668
median_absolute_deviation             3725
median_buffer_range_percentage        1999
pair_slope_trend                      1450
percent_amplitude                     2080
percent_difference_flux_percentile    3235
skew                                  3727
std                                   3727
stetson_j                             3727
stetson_k                             3727
dtype: int64

Save permanent features

In [47]:
num_features = df_feat_nontran.shape[1]-1
outdir = DATA_PATH
filename = 'permanent_features_{}obs_{}feats.pickle'.format(min_obs, num_features) 
outpath = outdir + filename
df_feat_nontran.to_pickle(outpath)