In [1]:
import sys
import numpy as np
# Raise the summarization threshold to the largest possible value so numpy
# prints arrays in full instead of abbreviating them with "...".
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
# Add the relative 'python' directory to the module search path.
# NOTE(review): presumably this is where keplerml.py lives -- confirm.
sys.path.append('python')
import keplerml as fc # keplerml is a feature calculator, fc for short.

Features can be generated from the terminal using the following command:
```
python keplerml.py data/filelists/ex_filelist.txt data/lightcurves/ data/output/Example_output.p 
```
This requires Python 3+ to be the default version. If this is not the case, replace `python` with an appropriate version, for example `python3.7`.

The above terminal command is equivalent to the following:

In [2]:
# Where to find the file list and the FITS lightcurves, and where to write
# the pickled feature table.
path_to_filelist = './data/filelists/ex_filelist.txt'
path_to_fits = './data/lightcurves/'
output_file = 'data/output/Example_output.p'

# fl_as_array=False: the first argument is a path to a file list, not a list.
# prime_feats=False: skip the built-in priming run for this call.
features = fc.features_from_filelist(
    path_to_filelist,
    path_to_fits,
    output_file,
    fl_as_array=False,
    verbose=True,
    prime_feats=False,
)

Reading ./data/filelists/ex_filelist.txt...
Using 47 cpus to calculate features...
Importing 247 lightcurves...
Lightcurve import took 0:00:01.129264
Processing 247 files...
247/247 completed. Time for chunk: 0:01:06.296283
Features have been calculated, total time to calculate features: 0:01:06.308606
Saving output to data/output/Example_output.p
Cleaning up...
Done.


Features are returned in a Pandas DataFrame and saved to the specified output file as a pickled DataFrame, which can be read back in using the `pickle` module, as shown below.

In [5]:
# Preview the first rows of the DataFrame returned by the feature calculator.
features.head()

Unnamed: 0,longtermtrend,meanmedrat,skews,varss,coeffvar,stds,numoutliers,numnegoutliers,numposoutliers,numout1s,...,percentamp,magratio,sautocorrcoef,autocorrcoef,flatmean,tflatmean,roundmean,troundmean,roundrat,flatrat
kplr001026032-2011271113734_llc.fits,-6.415193e-07,0.998781,-7.109529,6.7e-05,0.008196,0.008186,56,56,0,163,...,0.086309,0.076923,0.742404,0.937333,0.034865,0.108202,-0.723083,0.031835,-22.713754,0.322218
kplr001026957-2011271113734_llc.fits,8.002484e-07,1.000915,0.39655,3.2e-05,0.005688,0.005694,0,0,0,1474,...,0.013276,0.616403,-0.294359,0.999479,0.002733,0.003749,0.047433,-0.012429,-3.816354,0.728774
kplr001433962-2011271113734_llc.fits,2.901455e-07,0.998547,-0.267555,3e-05,0.005488,0.00548,0,0,0,1789,...,0.013524,0.502052,-0.486633,0.978182,0.019154,0.018673,-0.003119,0.165303,-0.018868,1.025801
kplr001571511-2011271113734_llc.fits,-3.231966e-08,0.999617,-6.939159,7e-06,0.002572,0.002571,81,81,0,94,...,0.021482,0.069174,0.51013,0.968878,0.007759,0.006047,-0.089364,0.196277,-0.455296,1.283076
kplr001725193-2011271113734_llc.fits,-1.793919e-06,0.998418,-3.711617,5.2e-05,0.007213,0.007202,117,117,0,259,...,0.042875,0.136161,0.78944,0.957196,0.05588,0.210136,-1.015293,0.573217,-1.771219,0.265923


In [6]:
import pickle

In [7]:
# Read the pickled DataFrame back from disk and preview it; the rows should
# match the DataFrame returned directly by the feature calculator.
output_file = 'data/output/Example_output.p'
with open(output_file, mode='rb') as pickle_file:
    feats = pickle.load(pickle_file)
feats.head()

Unnamed: 0,longtermtrend,meanmedrat,skews,varss,coeffvar,stds,numoutliers,numnegoutliers,numposoutliers,numout1s,...,percentamp,magratio,sautocorrcoef,autocorrcoef,flatmean,tflatmean,roundmean,troundmean,roundrat,flatrat
kplr001026032-2011271113734_llc.fits,-6.415193e-07,0.998781,-7.109529,6.7e-05,0.008196,0.008186,56,56,0,163,...,0.086309,0.076923,0.742404,0.937333,0.034865,0.108202,-0.723083,0.031835,-22.713754,0.322218
kplr001026957-2011271113734_llc.fits,8.002484e-07,1.000915,0.39655,3.2e-05,0.005688,0.005694,0,0,0,1474,...,0.013276,0.616403,-0.294359,0.999479,0.002733,0.003749,0.047433,-0.012429,-3.816354,0.728774
kplr001433962-2011271113734_llc.fits,2.901455e-07,0.998547,-0.267555,3e-05,0.005488,0.00548,0,0,0,1789,...,0.013524,0.502052,-0.486633,0.978182,0.019154,0.018673,-0.003119,0.165303,-0.018868,1.025801
kplr001571511-2011271113734_llc.fits,-3.231966e-08,0.999617,-6.939159,7e-06,0.002572,0.002571,81,81,0,94,...,0.021482,0.069174,0.51013,0.968878,0.007759,0.006047,-0.089364,0.196277,-0.455296,1.283076
kplr001725193-2011271113734_llc.fits,-1.793919e-06,0.998418,-3.711617,5.2e-05,0.007213,0.007202,117,117,0,259,...,0.042875,0.136161,0.78944,0.957196,0.05588,0.210136,-1.015293,0.573217,-1.771219,0.265923


The feature calculations are accelerated using the `@njit` decorator from the `numba` package. Because `numba` compiles a function the first time it is called, the code needs to run once ("priming") before the speed-up takes effect. This can be done manually as follows:

In [8]:
# Prime the feature code by computing features for a single lightcurve.
lc_path = './data/lightcurves/kplr001026032-2011271113734_llc.fits'
lc = fc.import_lcs(lc_path)

# Elements 1-3 of the imported lightcurve are passed on as t, nf and err.
t, nf, err = lc[1], lc[2], lc[3]
lc_feats = fc.feats(t, nf, err)

After priming the feature calculation, the features for a filelist can be calculated in the same way as before.
Note that the `features_from_filelist` method runs a primer by default, using the first lightcurve. Whether priming is done manually as above, by the default primer, or skipped entirely, every run after the first will be optimized by `numba` and be quicker.

Note the drastically improved runtime following the manual priming above:

In [9]:
# Same inputs and options as the first run; only the kernel state differs,
# because the feature code has already been run once by the manual priming.
path_to_filelist = './data/filelists/ex_filelist.txt'
path_to_fits = './data/lightcurves/'
output_file = 'data/output/Example_output.p'

features = fc.features_from_filelist(
    path_to_filelist,
    path_to_fits,
    output_file,
    fl_as_array=False,
    verbose=True,
    prime_feats=False,
)

Reading ./data/filelists/ex_filelist.txt...
Using 47 cpus to calculate features...
Importing 247 lightcurves...
Lightcurve import took 0:00:01.126179
Processing 247 files...
247/247 completed. Time for chunk: 0:00:12.281421
Features have been calculated, total time to calculate features: 0:00:12.294860
Saving output to data/output/Example_output.p
Cleaning up...
Done.


The file list can also be passed to the feature calculator as a Python list of filenames, however that list was produced. Two examples are shown below.

In [10]:
# Example 1: read the file list ourselves and pass the filenames as a list.
path_to_filelist = './data/filelists/ex_filelist.txt'
path_to_fits = './data/lightcurves/'
output_file = 'data/output/Example_output.p'

with open(path_to_filelist, 'r') as filelist_file:
    files = filelist_file.read().splitlines()

# fl_as_array=True: the first argument is the list of filenames itself.
feats = fc.features_from_filelist(
    files,
    path_to_fits,
    output_file,
    fl_as_array=True,
    verbose=True,
)

Using 47 cpus to calculate features...
Importing 247 lightcurves...
Lightcurve import took 0:00:01.606876
Processing 247 files...
247/247 completed. Time for chunk: 0:00:12.205176
Features have been calculated, total time to calculate features: 0:00:12.274446
Saving output to data/output/Example_output.p
Cleaning up...
Done.


In [11]:
import os

In [12]:
# Example 2: build the file list from the contents of the lightcurve directory.
path_to_fits = './data/lightcurves/'

# os.listdir returns *every* directory entry in arbitrary order, so keep only
# the FITS lightcurves (dropping strays such as hidden or checkpoint files)
# and sort them so the file list is reproducible between runs and machines.
files = sorted(
    name for name in os.listdir(path_to_fits)
    if name.endswith('.fits')
)

output_file = 'data/output/Example_output.p'

# fl_as_array=True: the first argument is the list of filenames itself.
feats = fc.features_from_filelist(files,path_to_fits,output_file,fl_as_array=True,verbose=True)

Using 47 cpus to calculate features...
Importing 247 lightcurves...
Lightcurve import took 0:00:01.874902
Processing 247 files...
247/247 completed. Time for chunk: 0:00:14.005037
Features have been calculated, total time to calculate features: 0:00:14.058610
Saving output to data/output/Example_output.p
Cleaning up...
Done.


## Saving as a Cluster Outlier Object

In [13]:
import clusterOutliers as coo

In [None]:
"""
!!! DOES NOT WORK FOR THIS EXAMPLE DATA
!!! CLUSTER OUTLIER OBJECT IS DESIGNED TO WORK ON LARGE DATASETS, EXAMPLE IS TOO SMALL
"""
# Construct a clusterOutliers object from the feature DataFrame `feats`.
# NOTE(review): presumably `fitsDir` is the directory of lightcurve FITS files
# and `output_file` is where the object gets saved -- confirm against the
# clusterOutliers class before relying on this.
example_coo = coo.clusterOutliers(feats=feats,fitsDir=path_to_fits,output_file='example.coo')

## Speed tests 

In [2]:
from datetime import datetime

In [3]:
# Wall-clock time for importing one lightcurve and computing its features.
# When this is the first feature calculation in the kernel, the measurement
# includes the one-time priming cost.
start = datetime.now()

lc_path = './data/lightcurves/kplr001026032-2011271113734_llc.fits'
lc = fc.import_lcs(lc_path)
t, nf, err = lc[1], lc[2], lc[3]
lc_feats = fc.feats(t, nf, err)

elapsed = datetime.now() - start
print(f"Time to prime: {elapsed}")

Time to prime: 0:00:08.198919


In [7]:
%%timeit
# Timed: import one lightcurve from disk and compute its features, i.e. the
# full per-lightcurve cost.
lc_path = './data/lightcurves/kplr001026032-2011271113734_llc.fits'
lc = fc.import_lcs(lc_path)
t = lc[1]
nf = lc[2]
err = lc[3]
lc_feats = fc.feats(t,nf,err)

29.7 ms ± 31.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
# Timed: feature calculation only; t, nf and err must already be defined by
# an earlier cell.
lc_feats = fc.feats(t,nf,err)

8.12 ms ± 9.96 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%%timeit
# Timed: lightcurve import only; lc_path must already be defined by an
# earlier cell.
lc = fc.import_lcs(lc_path)

19.1 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
