In [None]:
import os

# Set the MKL, NumExpr and OpenMP thread-count environment variables to "1".
# This cell comes before the cell that imports numpy/scipy, so the values are
# already in the environment when those libraries are loaded.
os.environ.update({
    'MKL_NUM_THREADS': "1",
    'NUMEXPR_NUM_THREADS': "1",
    'OMP_NUM_THREADS': "1",
})

In [None]:
import axs
import numpy as np
#import mysql.connector as mariadb
#from astropy.time import Time
import time
import scipy
import pandas as pd

from axs import AxsCatalog, Constants

%matplotlib notebook
import matplotlib.pyplot as plt

#import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import size as spark_size
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.sql.functions import pandas_udf
import pyspark.sql.functions as F

import cesium
from cesium.time_series import TimeSeries
from cesium.featurize import featurize_single_ts, featurize_time_series

from cesium import featurize
import dask

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Raise the pandas 'display.max_columns' option to 200 for wide DataFrame output.
pd.options.display.max_columns = 200

In [None]:
from dirac import DataBase

# Spark setting handed to the DataBase constructor (value is given as a string).
spark_conf = {"spark.executor.instances": "32"}
db = DataBase(spark_conf)

# Keep a direct handle on the session exposed by the DataBase object.
spark = db.spark_session

In [None]:
# NOTE(review): the frame is named `ztfsample`, but the catalog loaded here is
# "gaiadr2"; later cells read 'jd', 'dc_mag' and 'dc_sigmag' from its rows.
# Confirm this is the intended catalog.
catalogs = db.get_catalogs()
loaded_catalog = catalogs.load("gaiadr2")
ztfsample = loaded_catalog.exclude_duplicates()

# Report the row count followed by the column count.
n_rows = ztfsample.count()
n_cols = len(ztfsample.columns)
print(n_rows, n_cols)

In [None]:
# Convert a five-row slice to pandas; as the last expression of the cell,
# the result is what the notebook displays.
preview = ztfsample.limit(5)
preview.toPandas()

In [None]:
# Feature names of the "General" group, in alphabetical order.
# The five flux_percentile_ratio_mid* names differ only in their numeric
# suffix, so they are generated; every other name is spelled out.
features_General = (
    ["amplitude"]
    + ["flux_percentile_ratio_mid{}".format(width) for width in (20, 35, 50, 65, 80)]
    + [
        "max_slope",
        "maximum",
        "median",
        "median_absolute_deviation",
        "minimum",
        "percent_amplitude",
        "percent_beyond_1_std",
        "percent_close_to_median",
        "percent_difference_flux_percentile",
        "period_fast",
        "qso_log_chi2_qsonu",
        "qso_log_chi2nuNULL_chi2nu",
        "skew",
        "std",
        "stetson_j",
        "stetson_k",
        "weighted_average",
    ]
)

# Feature names of the "Cadence" group.
# Name families that differ only by a numeric suffix (peak bins, peak-to-peak
# pairs, cad_probs_*) are generated in the same order the explicit list used;
# the remaining names are spelled out.
features_Cadence = (
    ["all_times_nhist_numpeaks"]
    + ["all_times_nhist_peak{}_bin".format(peak) for peak in range(1, 5)]
    # Pairs (first, second) with first < second: 1-2, 1-3, 1-4, 2-3, 2-4, 3-4.
    + ["all_times_nhist_peak_{}_to_{}".format(first, second)
       for first in range(1, 5)
       for second in range(first + 1, 5)]
    + [
        "all_times_nhist_peak_val",
        "avg_double_to_single_step",
        "avg_err",
        "avgt",
    ]
    + ["cad_probs_{}".format(suffix)
       for suffix in (1, 10, 20, 30, 40, 50, 100, 500, 1000, 5000, 10000,
                      50000, 100000, 500000, 1000000, 5000000, 10000000)]
    + [
        "cads_avg",
        "cads_med",
        "cads_std",
        "mean",
        "med_double_to_single_step",
        "med_err",
        "n_epochs",
        "std_double_to_single_step",
        "std_err",
        "total_time",
    ]
)

# Feature names of the "LS" group.
# The freq1/freq2/freq3 families share a numeric-suffix pattern and are
# generated in the order the explicit list used; freq1 additionally carries
# "freq1_lambda" and "freq1_signif", which freq2 and freq3 do not have.
features_LS = (
    ["fold2P_slope_10percentile", "fold2P_slope_90percentile"]
    # freq1 family
    + ["freq1_amplitude{}".format(idx) for idx in range(1, 5)]
    + ["freq1_freq", "freq1_lambda"]
    + ["freq1_rel_phase{}".format(idx) for idx in range(2, 5)]
    + ["freq1_signif"]
    # freq2 and freq3 families: amplitude1..4, freq, rel_phase2..4
    + [name
       for comp in (2, 3)
       for name in (["freq{}_amplitude{}".format(comp, idx) for idx in range(1, 5)]
                    + ["freq{}_freq".format(comp)]
                    + ["freq{}_rel_phase{}".format(comp, idx) for idx in range(2, 5)])]
    + [
        "freq_amplitude_ratio_21",
        "freq_amplitude_ratio_31",
        "freq_frequency_ratio_21",
        "freq_frequency_ratio_31",
        "freq_model_max_delta_mags",
        "freq_model_min_delta_mags",
        "freq_model_phi1_phi2",
        "freq_n_alias",
        "freq_signif_ratio_21",
        "freq_signif_ratio_31",
        "freq_varrat",
        "freq_y_offset",
        "linear_trend",
        "medperc90_2p_p",
        "p2p_scatter_2praw",
        "p2p_scatter_over_mad",
        "p2p_scatter_pfold_over_mad",
        "p2p_ssqr_diff_over_var",
        "scatter_res_raw",
    ]
)

In [None]:
# Report how many feature names the three lists hold in total.
# The message is kept on one line: a backslash continuation inside the string
# literal would make the next line's indentation part of the printed text.
n_features = len(features_Cadence + features_General + features_LS)
print('total number of time-series features in cesium: {}'.format(n_features))

In [None]:
%%time

# Take the first 100 rows of the catalog frame for the featurization test below.
n_head = 100
lcs = ztfsample.head(n_head)

In [None]:
# Show the container type returned by head() and how many rows it holds.
(type(lcs), len(lcs))

In [None]:
%%time

# Number of rows, taken from the front of `lcs`, to featurize.
N = 100
selected = lcs[:N]

# One array per row for each of the three positional inputs, read from the
# 'jd', 'dc_mag' and 'dc_sigmag' fields respectively.
jd_arrays = [np.array(row['jd']) for row in selected]
mag_arrays = [np.array(row['dc_mag']) for row in selected]
sigmag_arrays = [np.array(row['dc_sigmag']) for row in selected]

# Feature names are requested in General -> LS -> Cadence order.
requested_features = features_General + features_LS + features_Cadence

# dask.local.get_sync is passed as the scheduler argument.
fset_cesium = featurize.featurize_time_series(
    jd_arrays,
    mag_arrays,
    sigmag_arrays,
    features_to_use=requested_features,
    scheduler=dask.local.get_sync,
)

fset_cesium