In [13]:
!ls ../../Processed_Data/

koi_light_curves_full.npy  koi_light_curves_model_full.npy  README.md


In [12]:
!ls ../Datos/

ICA		    kepler_downloaded.txt  koi_sets.csv  PCA
kepler_dataset.csv  koi_metadata.csv	   OwnFats	 README.md


### Read Data

In [1]:
import numpy as np
import pandas as pd
import csv, warnings,time
warnings.filterwarnings('ignore')

df_sets = pd.read_csv("../Datos/koi_sets_unb.csv")
mask_train = (df_sets["Set"] == "Train").values
mask_test = (df_sets["Set"] == "Test").values

#lc_total = np.load("../../Processed_Data/koi_light_curves_full.npy") #Raw light curve
lc_total = np.load("../../Processed_Data/koi_light_curves_model_full.npy")
lc_train = lc_total[mask_train] 
lc_test = lc_total[mask_test]

file_name_metadata = "koi_metadata_p_error.csv"
df_label = pd.read_csv("../Datos/"+file_name_metadata)
df_label_train = df_label[mask_train] 
df_label_test = df_label[mask_test]
print("Read Done!")

Read Done!


## Automatic Extraction
---

In [2]:
%%time
#fourier
X_train = lc_train
X_test = lc_test
Xtrain_fourier = np.abs(np.fft.fft(X_train))
Xtest_fourier = np.abs(np.fft.fft(X_test))
print("Fourier transformation done")

Fourier transformation done
CPU times: user 6min 43s, sys: 4.81 s, total: 6min 47s
Wall time: 6min 57s


## PCA
---
#### Generate 3 files in 3 dimension: 5, 10, 25

In [3]:
%%time
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA,PCA,FastICA

std = StandardScaler(with_mean=True,with_std=False) #without variance remove --needed
std.fit(Xtrain_fourier)
print("CenterScalar done!")

Xtrain_std = std.transform(Xtrain_fourier)
Xtest_std = std.transform(Xtest_fourier)
print("Data scaled!")

CenterScalar done!
Data scaled!
CPU times: user 4.25 s, sys: 3.57 s, total: 7.81 s
Wall time: 6.53 s


In [4]:
%%time
dims= [5,10,25,50,100]

for d in dims:
    model = PCA(d)
    
    ##################### TRAIN #####################
    model.fit(Xtrain_std) 
    print('PCA training with', d, 'features')

    #################### ARRIBA ENTRENAMIENTO ###########
    df = pd.DataFrame(model.transform(Xtrain_std),columns=["Component "+str(t+1) for t in np.arange(d)])
    df.to_csv("../Datos/PCA_unb/koi_light_curves_FPCA_train"+str(d)+"_model.csv",index=False)
    print('Generate CSV of koi_light_curves_PCA_train', d)
    
    ##################### TEST ########################
    df = pd.DataFrame(model.transform(Xtest_std),columns=["Component "+str(t+1) for t in np.arange(d)])
    df.to_csv("../Datos/PCA_unb/koi_light_curves_FPCA_test"+str(d)+"_model.csv",index=False)
    print('Generate CSV of koi_light_curves_PCA_test', d)
    
print('Process of dimensionality reduction completed')

PCA training with 50 features
Generate CSV of koi_light_curves_PCA_train 50
Generate CSV of koi_light_curves_PCA_test 50
PCA training with 100 features
Generate CSV of koi_light_curves_PCA_train 100
Generate CSV of koi_light_curves_PCA_test 100
PCA training with 250 features
Generate CSV of koi_light_curves_PCA_train 250
Generate CSV of koi_light_curves_PCA_test 250
Process of dimensionality reduction completed
CPU times: user 7min 42s, sys: 24.3 s, total: 8min 6s
Wall time: 41.7 s


## ICA
---
#### Generate 3 files in 3 dimension: 5, 10 and 15

In [22]:
model = FastICA(5,whiten=True)
#model.fit(Xtrain_std)  #with std -- no funciona
#model.fit(Xtrain_std2) #without std -- funciona 
#model.fit(Xtrain_fourier) #fft -- funciona
model.fit(X_train)  #light curve -- funciona

FastICA(algorithm='parallel', fun='logcosh', fun_args=None, max_iter=200,
    n_components=5, random_state=None, tol=0.0001, w_init=None,
    whiten=True)

In [6]:
%%time 
dims= [5,10,15,20,25,50]

for d in dims:
    model = FastICA(d,whiten=True)
    
    ##################### TRAIN #####################
    model.fit(Xtrain_std)
    print('ICA trainig with', d, 'features')

    #################### ARRIBA ENTRENAMIENTO ###########
    df = pd.DataFrame(model.transform(Xtrain_std),columns=["Component "+str(t+1) for t in np.arange(d)])
    df.to_csv("../Datos/ICA_unb/koi_light_curves_FICA_train"+str(d)+"_model.csv",index=False)
    print('Generate CSV Train', d)

    ##################### TEST ########################
    df = pd.DataFrame(model.transform(Xtest_std),columns=["Component "+str(t+1) for t in np.arange(d)])
    df.to_csv("../Datos/ICA_unb/koi_light_curves_FICA_test"+str(d)+"_model.csv",index=False)
    print('Generate CSV Test', d)
    
print('Process of dimensionality reduction completed')

ICA trainig with 20 features
Generate CSV Train 20
Generate CSV Test 20
ICA trainig with 25 features
Generate CSV Train 25
Generate CSV Test 25
ICA trainig with 50 features
Generate CSV Train 50
Generate CSV Test 50
Process of dimensionality reduction completed
CPU times: user 1h 24min 39s, sys: 10min 3s, total: 1h 34min 42s
Wall time: 6min 25s


## OwnFATS (Manual)
---
#### Generate Own FATS features of light curve (manualy)

In [None]:
import FATS

time_ex = np.arange(3000)
magnitude_ex = np.random.rand(3000)

lc_example = np.array([magnitude_ex, time_ex])

time_start = time.time()
#a = FATS.FeatureSpace(Data=['magnitude','time'],excludeList= ['SlottedA_length','PeriodLS','StetsonK_AC',"Period_fit","Psi_CS","Psi_eta"])
#SACAR DE EXCLUDE PERIODLS
a = a.calculateFeature(lc_example)
print("Termino en %f segundos"%(time.time()-time_start))
a.result(method='dict')

#it take too long!

In [17]:
from scipy import stats
from scipy.signal import resample
from sklearn.linear_model import LinearRegression

def amplitude(magnitudes):
    return 0.5 * (np.max(magnitudes) - np.min(magnitudes))

def median_absolute_deviation(magnitudes):
    median = np.median(magnitudes)
    deviations = magnitudes - median
    absolute_deviations = np.absolute(deviations)
    return np.median(absolute_deviations)

def residual_bright_faint_ratio(magnitudes):    # median as a fit
    mean = np.mean(magnitudes)
    brighter = magnitudes[magnitudes > mean]
    fainter = magnitudes[magnitudes < mean]

    resid_brighter = np.mean(np.square(brighter - mean))
    resid_fainter = np.mean(np.square(fainter - mean))

    ratio = resid_fainter / (resid_brighter+1e-14)
    return ratio

def own_fats(sequence):
    time_ex =  np.arange(len(sequence))

    minim=np.min(sequence)
    maxim = np.max(sequence)
    mean = np.mean(sequence)
    std = np.std(sequence)
    iqr = stats.iqr(sequence) #q31
    skew = stats.skew(sequence)
    kurt = stats.kurtosis(sequence)
    q1 = np.percentile(sequence, 25)
    q2 = np.percentile(sequence, 50)
    model = LinearRegression(normalize=True,n_jobs=-1)
    model.fit(time_ex.reshape(-1,1),sequence)
    slope = model.coef_[0]
    #new features
    ampl = amplitude(sequence)
    mad = median_absolute_deviation(sequence)
    br_fa = residual_bright_faint_ratio(sequence)
    median = np.median(sequence)
    return np.array([minim,maxim,mean,std,iqr,skew,kurt,q1,q2,slope,ampl,mad,br_fa,median])

def metadata_columns(match,array):
    df = pd.read_csv("../Datos/"+file_name_metadata)
    metadata = df[(df["KOI Name"] == match)].values[0][3:]
    return np.hstack((array,metadata))

columns_FATS = ["Minimum","Maximum","Mean","Std","IQR","Skew","Kurtosis","Q1","Q2","Slope","Amplitude","MAD",
               "Residual Bright Faint Ratio","Median"]
aux = pd.read_csv("../Datos/"+file_name_metadata)
columns_metadata = list(aux.columns[3:])

def extract_FATS(X,df_label):
    X_fats = []
    for sequence,match in zip(X,df_label["KOI Name"]):
        aux = own_fats(sequence)
        final = metadata_columns(match,aux)
        X_fats.append(final)
    return np.asarray(X_fats)

def save(name_set,features,df_label):
    df2save = pd.DataFrame(features,columns=columns_FATS+columns_metadata)
    df2save["KOI Name"] = df_label["KOI Name"]
    df2save.to_csv("../Datos/OwnFats_unb/koi_light_curves_FATS2_metadata_"+name_set+"_p_error.csv",index=False)

In [18]:
%time features = extract_FATS(lc_train,df_label_train) #train
save("train_model",features,df_label_train)
print("Training already extracted")

%time features = extract_FATS(lc_test,df_label_test) #test
save("test_model",features,df_label_test)
print("Validation already extracted")

CPU times: user 2h 21min 21s, sys: 6min 26s, total: 2h 27min 47s
Wall time: 6min 9s
Training already extracted
CPU times: user 46min 49s, sys: 2min 7s, total: 48min 57s
Wall time: 2min 2s
Validation already extracted
