In [1]:
# Directory holding the CRTS2 pickles this notebook reads and writes.
# The trailing slash matters: file paths are built by plain string concatenation.
DATA_PATH = "../../data/CRTS2/"

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
import sys
sys.path.append("../..")
import measurements
import astropy.time as astime

Import transient lightcurves

In [3]:
# Read the pickled transient light curves; the last expression displays (rows, columns).
indir = DATA_PATH
filename = 'transient_lightcurves.pickle'
filepath = indir + filename
df_tra = pd.read_pickle(filepath)
df_tra.shape

(451474, 4)

Filter transient lightcurves

In [4]:
# Drop blended observations: for every (TransientID, MJD) pair keep only the first row
df_tra = df_tra.drop_duplicates(subset=['TransientID', 'MJD'], keep='first')

In [21]:
# Minimum number of observations an object must have to be kept.
# The value is also written into the names of the saved feature files.
min_obs = 5

In [6]:
# Attach to every row the number of observations recorded for its transient
df_count = (
    df_tra.groupby('TransientID', as_index=False)['Mag']
    .count()
    .rename(columns={'Mag': 'ObsCount'})
)
df_tra = df_tra.merge(df_count, how='inner')

In [7]:
# Keep only the transients that have at least `min_obs` observations
df_tra = df_tra.loc[df_tra['ObsCount'] >= min_obs]

Import permanent lightcurves

In [8]:
# Read the pickled permanent light curves; the last expression displays (rows, columns).
indir = DATA_PATH
filename = 'permanent_lightcurves.pickle'
filepath = indir + filename
df_per = pd.read_pickle(filepath)
df_per.shape

(1924409, 4)

In [9]:
# Drop blended observations: for every (ID, MJD) pair keep only the first row
df_per = df_per.drop_duplicates(subset=['ID', 'MJD'], keep='first')
df_per.shape

(1802695, 4)

In [10]:
# Attach to every row the number of observations recorded for its permanent object
df_count = (
    df_per.groupby('ID', as_index=False)['Mag']
    .count()
    .rename(columns={'Mag': 'ObsCount'})
)
df_per = df_per.merge(df_count, how='inner')

In [11]:
# Remove objects with fewer than `min_obs` observations. The threshold is taken
# from `min_obs` (not a literal) so it always matches the transient filter and
# the value written into the saved feature filename.
df_per = df_per[df_per.ObsCount >= min_obs]
df_per.shape

(1798465, 5)

In [12]:
# Shape of the array of distinct permanent IDs that survived the filter
df_per['ID'].unique().shape

(15193,)

In [13]:
# Draw, without replacement, as many permanent objects as there are transients
np.random.seed(42)  # fixed seed so the same subset is drawn on every run
sample_size = len(df_tra['TransientID'].unique())
IDs = np.random.choice(df_per['ID'].unique(), size=sample_size, replace=False)
df_per = df_per[df_per['ID'].isin(IDs)]
df_per['ID'].unique().shape, df_per.shape

((3727,), (444973, 5))

Define the feature-extraction function

In [14]:
def extract_features(df, feature_dict):
    """Compute the features of one light curve and append them to ``feature_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations of a single object. The columns ``Mag``, ``Magerr`` and
        ``MJD`` are read. The frame is copied first, so the caller's data is
        not modified.
    feature_dict : dict
        Maps feature name to a list. Exactly one value is appended to the list
        of every feature computed here, so each of those keys must already
        exist. The ``'ID'`` key is not touched; the caller appends it.

    Returns
    -------
    None
        The result is delivered by mutating ``feature_dict`` in place.
    """
    df = df.copy()
    df['Flux'] = measurements.__mag_to_flux__(df.Mag)
    df['Date'] = astime.Time(df.MJD, format='mjd').datetime
    df = df.sort_values('Date')

    # Curve fitting params.
    # NOTE(review): `curve_params` is not defined in the cells shown here —
    # confirm it is defined or imported before this function is called.
    p1_params, p2_params, p4_params = curve_params(df.Mag, df.Magerr, df.MJD)
    # Append (not assign): each key holds one value per object. Assigning a
    # scalar would make pd.DataFrame broadcast the last object's value to
    # every row. Only the first 1 / 2 / 4 entries of each parameter vector
    # are stored.
    feature_dict['poly1_a'].append(p1_params[0])
    feature_dict['poly2_a'].append(p2_params[0])
    feature_dict['poly2_b'].append(p2_params[1])
    feature_dict['poly4_a'].append(p4_params[0])
    feature_dict['poly4_b'].append(p4_params[1])
    feature_dict['poly4_c'].append(p4_params[2])
    feature_dict['poly4_d'].append(p4_params[3])

    # Curve statistics measured on magnitudes
    feature_dict['skew'].append(measurements.skew(df.Mag))
    feature_dict['kurtosis'].append(measurements.kurtosis(df.Mag))
    feature_dict['small_kurtosis'].append(measurements.small_kurtosis(df.Mag))
    feature_dict['std'].append(measurements.std(df.Mag))
    feature_dict['beyond1st'].append(measurements.beyond1st(df.Mag, df.Magerr))
    feature_dict['stetson_j'].append(measurements.stetson_j(df.Mag, df.Magerr, df.Date))
    feature_dict['stetson_k'].append(measurements.stetson_k(df.Mag, df.Magerr))
    feature_dict['max_slope'].append(measurements.max_slope(df.Mag, df.Date))
    feature_dict['amplitude'].append(measurements.amplitude(df.Mag))
    feature_dict['median_absolute_deviation'].append(measurements.median_absolute_deviation(df.Mag))
    feature_dict['pair_slope_trend'].append(measurements.pair_slope_trend(df.Mag, df.Date))
    feature_dict['pair_slope_trend_last_30'].append(measurements.pair_slope_trend_last_30(df.Mag, df.Date))

    # Curve statistics measured on fluxes
    feature_dict['median_buffer_range_percentage'].append(measurements.median_buffer_range_percentage(df.Flux))
    feature_dict['flux_percentile_ratio_mid20'].append(measurements.flux_percentile_ratio_mid20(df.Flux))
    feature_dict['flux_percentile_ratio_mid35'].append(measurements.flux_percentile_ratio_mid35(df.Flux))
    feature_dict['flux_percentile_ratio_mid50'].append(measurements.flux_percentile_ratio_mid50(df.Flux))
    feature_dict['flux_percentile_ratio_mid65'].append(measurements.flux_percentile_ratio_mid65(df.Flux))
    feature_dict['flux_percentile_ratio_mid80'].append(measurements.flux_percentile_ratio_mid80(df.Flux))
    feature_dict['percent_amplitude'].append(measurements.percent_amplitude(df.Flux))
    feature_dict['percent_difference_flux_percentile'].append(measurements.percent_difference_flux_percentile(df.Flux))
    feature_dict['linear_trend'].append(measurements.linear_trend(df.Flux, df.Date))

Extract transient features

In [15]:
# Every feature column of the output table, in column order; 'ID' identifies the object.
feature_names = [
    'ID', 'skew', 'std', 'kurtosis', 'small_kurtosis', 'beyond1st',
    'stetson_j', 'stetson_k', 'max_slope', 'amplitude',
    'median_absolute_deviation', 'median_buffer_range_percentage',
    'pair_slope_trend', 'pair_slope_trend_last_30',
    'flux_percentile_ratio_mid20', 'flux_percentile_ratio_mid35',
    'flux_percentile_ratio_mid50', 'flux_percentile_ratio_mid65',
    'flux_percentile_ratio_mid80', 'percent_amplitude',
    'percent_difference_flux_percentile', 'linear_trend',
    'poly1_a', 'poly2_a', 'poly2_b',
    'poly4_a', 'poly4_b', 'poly4_c', 'poly4_d',
]
# One independent, initially empty list per feature
feature_dict = {name: [] for name in feature_names}
# groupby(sort=False) hands over each transient's rows in a single pass, in
# order of first appearance (the order .unique() would give), instead of
# re-scanning the whole frame with a boolean mask for every ID.
for trID, df in df_tra.groupby('TransientID', sort=False):
    feature_dict['ID'].append(trID)
    extract_features(df, feature_dict)
df_feat_tran = pd.DataFrame(feature_dict)

In [16]:
# Non-null value count per column: a quick check that no feature is missing for any transient
df_feat_tran.count()

ID                                    3727
amplitude                             3727
beyond1st                             3727
flux_percentile_ratio_mid20           3727
flux_percentile_ratio_mid35           3727
flux_percentile_ratio_mid50           3727
flux_percentile_ratio_mid65           3727
flux_percentile_ratio_mid80           3727
kurtosis                              3727
linear_trend                          3727
max_slope                             3727
median_absolute_deviation             3727
median_buffer_range_percentage        3727
pair_slope_trend                      3727
pair_slope_trend_last_30              3727
percent_amplitude                     3727
percent_difference_flux_percentile    3727
skew                                  3727
small_kurtosis                        3727
std                                   3727
stetson_j                             3727
stetson_k                             3727
dtype: int64

Save transient features

In [17]:
# Persist the transient feature table. The filename records the observation
# threshold and the feature count (column count minus one, for the 'ID' column).
outdir = DATA_PATH
num_features = df_feat_tran.shape[1] - 1
filename = 'transient_features_{}obs_{}feats.pickle'.format(min_obs, num_features)
outpath = outdir + filename
df_feat_tran.to_pickle(outpath)

Extract permanent features

In [18]:
# Every feature column of the output table, in column order; 'ID' identifies the object.
feature_names = [
    'ID', 'skew', 'std', 'kurtosis', 'small_kurtosis', 'beyond1st',
    'stetson_j', 'stetson_k', 'max_slope', 'amplitude',
    'median_absolute_deviation', 'median_buffer_range_percentage',
    'pair_slope_trend', 'pair_slope_trend_last_30',
    'flux_percentile_ratio_mid20', 'flux_percentile_ratio_mid35',
    'flux_percentile_ratio_mid50', 'flux_percentile_ratio_mid65',
    'flux_percentile_ratio_mid80', 'percent_amplitude',
    'percent_difference_flux_percentile', 'linear_trend',
    'poly1_a', 'poly2_a', 'poly2_b',
    'poly4_a', 'poly4_b', 'poly4_c', 'poly4_d',
]
# One independent, initially empty list per feature
feature_dict = {name: [] for name in feature_names}
# groupby(sort=False) hands over each permanent object's rows in a single pass,
# in order of first appearance (the order .unique() would give), instead of
# re-scanning the whole frame with a boolean mask for every ID.
for ID, df in df_per.groupby('ID', sort=False):
    feature_dict['ID'].append(ID)
    extract_features(df, feature_dict)
df_feat_perm = pd.DataFrame(feature_dict)

In [19]:
# Non-null value count per column: a quick check that no feature is missing for any permanent object
df_feat_perm.count()

ID                                    3727
amplitude                             3727
beyond1st                             3727
flux_percentile_ratio_mid20           3727
flux_percentile_ratio_mid35           3727
flux_percentile_ratio_mid50           3727
flux_percentile_ratio_mid65           3727
flux_percentile_ratio_mid80           3727
kurtosis                              3727
linear_trend                          3727
max_slope                             3727
median_absolute_deviation             3727
median_buffer_range_percentage        3727
pair_slope_trend                      3727
pair_slope_trend_last_30              3727
percent_amplitude                     3727
percent_difference_flux_percentile    3727
skew                                  3727
small_kurtosis                        3727
std                                   3727
stetson_j                             3727
stetson_k                             3727
dtype: int64

Save permanent features

In [20]:
# Persist the permanent feature table. The filename records the observation
# threshold and the feature count (column count minus one, for the 'ID' column).
outdir = DATA_PATH
num_features = df_feat_perm.shape[1] - 1
filename = 'permanent_features_{}obs_{}feats.pickle'.format(min_obs, num_features)
outpath = outdir + filename
df_feat_perm.to_pickle(outpath)