## Medium post: https://medium.com/@gabogarza/exoplanet-hunting-with-machine-learning-and-kepler-data-recall-100-155e1ddeaa95

## Github repo: https://github.com/gabrielgarza/exoplanet-deep-learning

### Import libraries

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import time
from pathlib import Path
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
# from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC
from scipy import ndimage, fft
from sklearn.preprocessing import normalize
# import os
# print(os.listdir("../input"))

## Data Preprocessor

In [83]:
import pandas as pd
import numpy as np
from scipy import ndimage, fft
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler

class LightFluxProcessor:

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
        self.fourier = fourier
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize

    def fourier_transform(self, X):
        return np.abs(fft(X, n=X.size))

    def process(self, df_train_x, df_dev_x):
        # Apply fourier transform
        if self.fourier:
            print("Applying Fourier...")
            
            df_train_x = df_train_x.apply(self.fourier_transform,axis=1)
            df_dev_x = df_dev_x.apply(self.fourier_transform,axis=1)
#             print(df_train_x)
            print(df_train_x.iloc.shape)
            # Keep first half of data as it is symmetrical after previous steps
            df_train_x = df_train_x.iloc[:,:(df_train_x.shape[1]//2)].values
            df_dev_x = df_dev_x.iloc[:,:(df_dev_x.shape[0]//2)].values

        # Normalize
        if self.normalize:
            print("Normalizing...")
            df_train_x = pd.DataFrame(normalize(df_train_x))
            df_dev_x = pd.DataFrame(normalize(df_dev_x))

        # Gaussian filter to smooth out data
        if self.gaussian:
            print("Applying Gaussian Filter...")
            df_train_x = ndimage.filters.gaussian_filter(df_train_x, sigma=10)
            df_dev_x = ndimage.filters.gaussian_filter(df_dev_x, sigma=10)

        if self.standardize:
            # Standardize X data
            print("Standardizing...")
            std_scaler = StandardScaler()
            df_train_x = std_scaler.fit_transform(df_train_x)
            df_dev_x = std_scaler.transform(df_dev_x)

        print("Finished Processing!")
        return df_train_x, df_dev_x


### Load datasets

In [84]:
train_dataset_path = "./dataset/exoTrain.csv"
dev_dataset_path = "./dataset/exoTest.csv"

print("Loading datasets...")
df_train = pd.read_csv(train_dataset_path, encoding = "ISO-8859-1")
df_dev = pd.read_csv(dev_dataset_path, encoding = "ISO-8859-1")
print("Loaded datasets!")

# Generate X and Y dataframe sets
df_train_x = df_train.drop('LABEL', axis=1)
print(df_train_x)
df_dev_x = df_dev.drop('LABEL', axis=1)
df_train_y = df_train.LABEL
df_dev_y = df_dev.LABEL

Loading datasets...
Loaded datasets!
         FLUX.1     FLUX.2     FLUX.3    FLUX.4    FLUX.5    FLUX.6    FLUX.7  \
0         93.85      83.81      20.10    -26.98    -39.56   -124.71   -135.18   
1        -38.88     -33.83     -58.54    -40.09    -79.31    -72.81    -86.55   
2        532.64     535.92     513.73    496.92    456.45    466.00    464.50   
3        326.52     347.39     302.35    298.13    317.74    312.70    322.33   
4      -1107.21   -1112.59   -1118.95  -1095.10  -1057.55  -1034.48   -998.34   
5        211.10     163.57     179.16    187.82    188.46    168.13    203.46   
6          9.34      49.96      33.30      9.63     37.64     20.85      4.54   
7        238.77     262.16     277.80    190.16    180.98    123.27    103.95   
8       -103.54    -118.97    -108.93    -72.25    -61.46    -50.16    -20.61   
9       -265.91    -318.59    -335.66   -450.47   -453.09   -561.47   -606.03   
10       118.81     110.97      79.53    114.25     48.78      3.12     

### Process data and create numpy matrices

In [85]:
def np_X_Y_from_df(df):
    df = shuffle(df)
    df_X = df.drop(['LABEL'], axis=1)
    X = np.array(df_X)
    Y_raw = np.array(df['LABEL']).reshape((len(df['LABEL']),1))
    Y = Y_raw == 2
    return X, Y

In [86]:
# Process dataset
LFP = LightFluxProcessor(
    fourier=True,
    normalize=True,
    gaussian=True,
    standardize=True)
df_train_x, df_dev_x = LFP.process(df_train_x, df_dev_x)

# Rejoin X and Y
df_train_processed = pd.DataFrame(df_train_x).join(pd.DataFrame(df_train_y))
df_dev_processed = pd.DataFrame(df_dev_x).join(pd.DataFrame(df_dev_y))

# Load X and Y numpy arrays
X_train, Y_train = np_X_Y_from_df(df_train_processed)
X_dev, Y_dev = np_X_Y_from_df(df_dev_processed)

Applying Fourier...
0       [31822.48, 3542.697259888826, 55671.6128069181...
1       [23747.1, 18904.03483095342, 23034.01375376699...
2       [26180.51000000001, 143610.48759073665, 41912....
3       [14855.140000000003, 12371.439884872616, 33309...
4       [45527.38999999998, 310414.0846940789, 140433....
5       [57595.81, 3575.662285766386, 132513.349943679...
6       [4053.12, 8641.10498593109, 11990.45749121767,...
7       [283320.93, 93598.45551310903, 69042.747262033...
8       [189.00000000000045, 1973.170029584989, 16839....
9       [668543.64, 581679.3263434591, 2469493.4489111...
10      [56458.630000000005, 77055.7174891073, 74982.4...
11      [133399.03, 61589.88609404257, 101028.19952430...
12      [14030.41, 1437.8118968806941, 7857.0346368092...
13      [307731.64, 249444.77876781902, 196837.2360530...
14      [285487.5, 653070.567839152, 291589.1597433053...
15      [46660.450000000026, 115920.93164433476, 19808...
16      [1912.0000000000005, 15045.403256425843, 745

IndexError: tuple index out of range

### Describe datasets

In [15]:
(num_examples, n_x) = X_train.shape # (n_x: input size, m : number of examples in the train set)
n_y = Y_train.shape[1] # n_y : output size
print("X_train.shape: ", X_train.shape)
print("Y_train.shape: ", Y_train.shape)
print("X_dev.shape: ", X_dev.shape)
print("Y_dev.shape: ", Y_dev.shape)
print("n_x: ", n_x)
print("num_examples: ", num_examples)
print("n_y: ", n_y)

## Build Model, Train, and Predict

In [16]:
model = LinearSVC()

# sm = SMOTE(ratio = 1.0)
# X_train_sm, Y_train_sm = sm.fit_sample(X_train, Y_train)
X_train_sm, Y_train_sm = X_train, Y_train

# Train
print("Training...")
model.fit(X_train_sm, Y_train_sm)

train_outputs = model.predict(X_train_sm)
dev_outputs = model.predict(X_dev)
print("Finished Training!")

## Calculate and Display Metrics

In [17]:
# Metrics
train_outputs = model.predict(X_train_sm)
dev_outputs = model.predict(X_dev)
train_outputs = np.rint(train_outputs)
dev_outputs = np.rint(dev_outputs)
accuracy_train = accuracy_score(Y_train_sm, train_outputs)
accuracy_dev = accuracy_score(Y_dev, dev_outputs)
precision_train = precision_score(Y_train_sm, train_outputs)
precision_dev = precision_score(Y_dev, dev_outputs)
recall_train = recall_score(Y_train_sm, train_outputs)
recall_dev = recall_score(Y_dev, dev_outputs)
confusion_matrix_train = confusion_matrix(Y_train_sm, train_outputs)
confusion_matrix_dev = confusion_matrix(Y_dev, dev_outputs)
classification_report_train = classification_report(Y_train_sm, train_outputs)
classification_report_dev = classification_report(Y_dev, dev_outputs)

print(" ")
print(" ")
print("Train Set Error", 1.0 - accuracy_train)
print("Dev Set Error", 1.0 - accuracy_dev)
print("------------")
print("Precision - Train Set", precision_train)
print("Precision - Dev Set", precision_dev)
print("------------")
print("Recall - Train Set", recall_train)
print("Recall - Dev Set", recall_dev)
print("------------")
print("Confusion Matrix - Train Set")
print(confusion_matrix_train)
print("Confusion Matrix - Dev Set")
print(confusion_matrix_dev)
print("------------")
print(" ")
print(" ")
print("------------")
print("classification_report_train")
print(classification_report_train)
print("classification_report_dev")
print(classification_report_dev)