In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import re
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
sns.set()
# Pin the three accent colours by hex value instead of slicing the active palette.
# Since seaborn 0.9 the default palette starts blue, orange, green, so
# `sns.color_palette()[:3]` would bind `green` to orange and `red` to green.
# The hex codes are the blue / green / red entries of seaborn's "deep" palette;
# going through color_palette() keeps the values as RGB tuples, as before.
blue, green, red = sns.color_palette(['#4C72B0', '#55A868', '#C44E52'])

%matplotlib inline

In [2]:
# Helper function to update the TCE status database table
def db_exclude_tces(cursor, kepoi_names, reason):
    """Mark the given TCEs as excluded in the ``tce_status`` table.

    Sets ``excluded = 1`` and ``exclusion_reason = reason`` on every row whose
    ``kepoi_name`` is in ``kepoi_names``. The statement is only executed on the
    cursor; committing is left to the caller.

    Parameters
    ----------
    cursor : DB-API cursor using the ``?`` placeholder style.
    kepoi_names : iterable of names (list, tuple, generator, pandas Index, ...).
    reason : value stored in ``exclusion_reason``.
    """
    # Materialise once: the names are needed both to count the placeholders and
    # to build the parameter list, and ``[reason] + x`` only works for a real list.
    names = list(kepoi_names)
    if not names:
        # Nothing to update; also avoids emitting an empty ``IN ()`` clause.
        return
    placeholders = ','.join('?' for _ in names)
    query = "UPDATE tce_status SET excluded = 1, exclusion_reason = ? WHERE kepoi_name IN (%s);" % placeholders
    cursor.execute(query, [reason] + names)
    
# Helper function to get the number of excluded/non excluded TCEs
def db_get_number_tces(cursor, excluded=False):
    """Count the rows of ``tce_status`` in the requested exclusion state.

    Parameters
    ----------
    cursor : DB-API cursor using the ``?`` placeholder style.
    excluded : truthy to count excluded rows, falsy to count the others.

    Returns
    -------
    The row returned by ``cursor.fetchone()`` for the ``COUNT(*)`` query
    (a one-element row such as ``(42,)`` with sqlite3), not a bare integer.
    """
    flag = '1' if excluded else '0'
    query = "SELECT COUNT(*) FROM tce_status WHERE excluded = ?;"
    # Bind through an explicit one-element tuple. Passing the bare string only
    # worked because a 1-character str happens to be a sequence of length one.
    cursor.execute(query, (flag,))
    return cursor.fetchone()

In [3]:
# Function to draw a horizontal bar plot with the quantity written next to each bar
def eda_barh(x, xlabel='', ylabel='', title='', figsize=(12, 6), text_offset=10, height=0.5):
    """Draw a horizontal bar chart of ``x`` with each value written beside its bar.

    Parameters
    ----------
    x : Series-like; its values are the bar lengths and ``x.index`` the tick labels.
    xlabel, ylabel, title : axis labels and figure title.
    figsize : size passed to ``plt.figure``.
    text_offset : horizontal gap, in data units, between a bar's end and its text.
    height : bar thickness.
    """
    positions = np.arange(x.shape[0])

    plt.figure(figsize=figsize)
    axis = plt.gca()
    axis.barh(positions, x, tick_label=x.index, align='center', height=height)

    # Annotate every bar with its value, slightly to the right of the bar end
    for row, quantity in enumerate(x):
        axis.text(x=quantity + text_offset, y=row, s=quantity, va='center', color=blue, fontweight='bold')

    # First entry of x at the top
    axis.invert_yaxis()
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)
    axis.set_title(title)
    plt.show()

In [4]:
# Function that computes the skewness and kurtosis of every variable distribution
# and of their logarithmic (log10) and square root transformations
def eda_skewness_kurtosis(df):
    """Skewness and kurtosis of every column, raw and after log10 / sqrt.

    Parameters
    ----------
    df : pandas DataFrame of numeric columns.

    Returns
    -------
    DataFrame indexed by the columns of ``df`` with the columns
    'Skewness', 'Kurtosis', 'Skewness log10', 'Kurtosis log10',
    'Skewness sqrt' and 'Kurtosis sqrt'.
    """
    transforms = (('', None), (' log10', np.log10), (' sqrt', np.sqrt))
    measures = {}

    # log10 / sqrt emit divide and invalid-value warnings on zero or negative
    # inputs. errstate silences them and, unlike a seterr pair, always restores
    # the previous settings, even when an exception escapes this block.
    with np.errstate(divide='ignore', invalid='ignore'):
        for suffix, transform in transforms:
            # Transform once and reuse the result for both statistics
            frame = df if transform is None else transform(df)
            measures['Skewness' + suffix] = frame.skew(axis=0)
            measures['Kurtosis' + suffix] = frame.kurtosis(axis=0)

    return pd.DataFrame(measures)

In [5]:
def eda_plot_dist(df, var, var_label, var_title, min_threshold=None, max_threshold=None):
    """Three-panel overview of the distribution of one variable.

    Left: histogram of ``df[var]``. Top right: violin plot per
    ``koi_disposition``. Bottom right: box plot per ``fpflag_bitstring`` for
    non-candidate rows whose bitstring has at most one bit set.

    Parameters
    ----------
    df : DataFrame with the columns ``var``, ``koi_disposition`` and ``fpflag_bitstring``.
    var : name of the column to plot.
    var_label : axis label used for the variable.
    var_title : text appended to each panel title.
    min_threshold, max_threshold : optional values drawn as dashed green
        lines on every panel. ``None`` disables a line; 0 is a valid threshold.
    """
    # Non-candidate rows with no flag or exactly one flag set
    selected = (df.fpflag_bitstring.isin(['0000', '1000', '0100', '0010', '0001'])) & (df.koi_disposition != 'CANDIDATE')

    # Compare against None so that a threshold of 0 is still drawn
    thresholds = [t for t in (min_threshold, max_threshold) if t is not None]

    # The panels are laid out on a 20x20 grid managed by subplot2grid itself,
    # so no separate GridSpec object is needed.
    plt.figure(figsize=(20, 10))

    # Histogram of the variable
    plt.subplot2grid((20, 20), (0, 0), rowspan=20, colspan=7)
    sns.distplot(df[var], kde=False)
    plt.xlabel(var_label)
    plt.ylabel('# TCEs')
    plt.title('Distribution of the ' + var_title)

    ylim = plt.ylim()
    for threshold in thresholds:
        plt.plot([threshold, threshold], ylim, 'g--', linewidth=1)

    # Violin plot of the variable per disposition.
    # x / y are passed by keyword: recent seaborn releases reject them positionally.
    plt.subplot2grid((20, 20), (0, 8), rowspan=9, colspan=13)
    sns.violinplot(x=df.koi_disposition, y=df[var])
    plt.xlabel('')
    plt.ylabel(var_label)
    plt.title('Distribution per disposition of the ' + var_title)

    xlim = plt.xlim()
    for threshold in thresholds:
        plt.plot(xlim, [threshold, threshold], 'g--', linewidth=1)

    # Box plot of the variable per false positive flag
    plt.subplot2grid((20, 20), (11, 8), rowspan=9, colspan=13)
    sns.boxplot(x=df.fpflag_bitstring[selected], y=df.loc[selected, var])
    # NOTE(review): these labels are assigned by position, so they assume the boxes
    # come out in exactly this category order with all five present; confirm
    # against the data (or pass an explicit `order=` to boxplot).
    plt.gca().set_xticklabels(['Confirmed Exoplanet', 'Ephemeris Match', 'Centroid Offset', 'Stellar Eclipse', 'Not Transit-Like'])
    plt.xlabel('')
    plt.ylabel(var_label)
    plt.title('Distribution per disposition flags of the ' + var_title)

    xlim = plt.xlim()
    for threshold in thresholds:
        plt.plot(xlim, [threshold, threshold], 'g--', linewidth=1)

    plt.show()

In [6]:
# Function to plot a variable distribution as a histogram and as a violin plot by disposition
# The n_sigma parameter allows drawing the outlier limits at n sigma
def eda_distribution(s, fn, n_sigma=None, violinplot=True, label=''):
    """Histogram of a Series next to its violin (or box) plot per disposition.

    The disposition is read from the notebook-level ``df_fits`` DataFrame
    (``df_fits['identification']['koi_disposition']``), which must already exist.

    Parameters
    ----------
    s : Series to plot.
    fn : transformation applied to ``s`` before plotting (e.g. ``np.log10``);
        a falsy value plots ``s`` unchanged.
    n_sigma : if truthy, dashed green lines are drawn at mean -/+ n_sigma * std
        of the transformed values.
    violinplot : True for a violin plot on the right panel, False for a box plot.
    label : axis label for the plotted variable.
    """
    # Apply the optional transformation
    s_transformed = fn(s) if fn else s

    # Outlier limits at n sigma (mean and std computed once)
    if n_sigma:
        center = np.mean(s_transformed)
        spread = np.std(s_transformed)
        outliers_min = center - n_sigma * spread
        outliers_max = center + n_sigma * spread

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

    # Histogram with the outlier limits
    sns.distplot(s_transformed, kde=False, ax=ax[0])
    y_lims = ax[0].get_ylim()
    if n_sigma:
        ax[0].plot([outliers_min, outliers_min], y_lims, 'g--', linewidth=1)
        ax[0].plot([outliers_max, outliers_max], y_lims, 'g--', linewidth=1)
    ax[0].set_xlabel(label)
    ax[0].set_ylabel('Number of observations')

    # Violin / box plot by disposition with the outlier limits.
    # x / y are passed by keyword: recent seaborn releases reject them positionally.
    draw = sns.violinplot if violinplot else sns.boxplot
    draw(x=df_fits['identification']['koi_disposition'], y=s_transformed, ax=ax[1])
    x_lims = ax[1].get_xlim()
    if n_sigma:
        ax[1].plot(x_lims, [outliers_min, outliers_min], 'g--', linewidth=1)
        ax[1].plot(x_lims, [outliers_max, outliers_max], 'g--', linewidth=1)
    ax[1].set_xlabel('Disposition')
    ax[1].set_ylabel(label)
    plt.show()

In [7]:
# Function to draw a boxplot with only the confirmed and distinct false positives categories observations
def eda_confirmed_and_false_positives_distribution(var_tuple, label=''):
    """Box plot of one ``df_fits`` column per false positive flag value.

    Only non-candidate rows whose ``('identification', 'fpflags_int')`` is one
    of 0, 1, 2, 4 or 8 are plotted. Reads the notebook-level ``df_fits``
    DataFrame, which must already exist.

    Parameters
    ----------
    var_tuple : column key of ``df_fits`` to plot on the y axis.
    label : y axis label.
    """
    flags = df_fits[('identification', 'fpflags_int')]
    disposition = df_fits[('identification', 'koi_disposition')]
    selected = flags.isin([0, 1, 2, 4, 8]) & (disposition != 'CANDIDATE')

    plt.figure(figsize=(12, 6))
    # x / y are passed by keyword: recent seaborn releases reject them positionally.
    sns.boxplot(x=df_fits.loc[selected, ('identification', 'fpflags_int')], y=df_fits.loc[selected, var_tuple])
    # NOTE(review): labels are assigned by position; this presumably relies on the
    # boxes being drawn in ascending flag order (0, 1, 2, 4, 8) with all five
    # values present. Confirm against the data.
    plt.gca().set_xticklabels(['Confirmed Exoplanet', 'Ephemeris Match', 'Centroid Offset', 'Stellar Eclipse', 'Not Transit-Like'])
    plt.xlabel('False Positive Flags')
    plt.ylabel(label)
    plt.show()

In [8]:
# Function to drop outliers at n sigma
def eda_drop_nsigma_outliers(df, var_tuple, n_sigma, drop=True):
    """Report, and optionally drop in place, the rows beyond n sigma of a column.

    Parameters
    ----------
    df : DataFrame; modified in place when ``drop`` is true.
    var_tuple : key of the column to inspect.
    n_sigma : rows whose absolute z-score exceeds this value are outliers.
    drop : True removes the outlier rows from ``df``, False only counts them.

    Prints the accepted value range and the number of outliers; returns None.
    """
    column = df[var_tuple]
    center = np.mean(column)
    spread = np.std(column)

    # Rows lying more than n_sigma standard deviations away from the mean
    zscore = (column - center) / spread
    outlier_idx = zscore.loc[zscore.abs() > n_sigma].index
    n_outliers = outlier_idx.shape[0]

    lower = center - n_sigma * spread
    upper = center + n_sigma * spread
    print('Value range [{:.3f}, {:.3f}]'.format(lower, upper))

    if not drop:
        print('Outliers detected: ', n_outliers)
        return

    df.drop(axis=0, index=outlier_idx, inplace=True)
    print('Outliers dropped:', n_outliers)
    print('DataFrame Shape:', df.shape)

In [9]:
# Function to compute independent t-tests (p-value threshold to reject H0: 1% by default):
# - Confirmed exoplanet VS Candidates
# - Confirmed exoplanet VS False positives (all categories of false positives)
# - Confirmed exoplanet VS Ephemeris Match False Positives Only
# - Confirmed exoplanet VS Centroid Offset False Positives Only
# - Confirmed exoplanet VS Stellar Eclipse False Positives Only
# - Confirmed exoplanet VS Not Transit-Like False Positives Only
def eda_ttest_ind(df, variable, pvalue_max=0.01, label=''):
    """Welch t-tests of ``variable``: confirmed exoplanets against six other groups.

    The confirmed rows (``koi_disposition == 'CONFIRMED'``) are compared with the
    candidates, with all false positives, and with the rows whose
    ``fpflags_int`` equals 1, 2, 4 and 8, using ``ttest_ind(equal_var=False)``.

    Parameters
    ----------
    df : DataFrame holding ``('identification', 'koi_disposition')``,
        ``('identification', 'fpflags_int')`` and ``variable``.
    variable : key of the column under test.
    pvalue_max : p-value below which H0 is rejected.
    label : text placed at the start of the index name of the result.

    Returns
    -------
    DataFrame with one row per comparison and the columns 'statistic'
    (rounded to 3 decimals), 'pvalue' and 'reject $H_0$'.
    """
    disposition = df[('identification', 'koi_disposition')]
    fpflags = df[('identification', 'fpflags_int')]

    # Reference sample shared by every comparison
    confirmed = df.loc[disposition == 'CONFIRMED', variable]

    # (row label, mask selecting the sample compared with the confirmed exoplanets)
    comparisons = [
        ('Candidates', disposition == 'CANDIDATE'),
        ('All False Positives', disposition == 'FALSE POSITIVE'),
        ('Ephemeris Match False Positives Only', fpflags == 1),
        ('Centroid Offset False Positives Only', fpflags == 2),
        ('Stellar Eclipse False Positives Only', fpflags == 4),
        ('Not Transit-Like False Positives Only', fpflags == 8),
    ]

    row_labels, statistics, pvalues, rejections = [], [], [], []
    for row_label, mask in comparisons:
        statistic, pvalue = ttest_ind(confirmed, df.loc[mask, variable], equal_var=False)
        row_labels.append(row_label)
        statistics.append(np.round(statistic, 3))
        pvalues.append(pvalue)
        rejections.append(pvalue < pvalue_max)

    # Assemble the results as a DataFrame
    column_order = ['statistic', 'pvalue', 'reject $H_0$']
    table = pd.DataFrame(
        data={'statistic': statistics, 'pvalue': pvalues, 'reject $H_0$': rejections},
        index=row_labels,
        columns=column_order,
    )
    table.index.name = '{} independant t-test (p-value < {}) | Confirmed Exoplanet VS'.format(label, pvalue_max)
    return table

In [10]:
# Function to draw scatter plot with only the confirmed exoplanet and distinct false positive categories observations
def eda_scatter_confirmed_false_positive(xvar, yvar, xlabel='', ylabel=''):
    """Side-by-side scatter plots of two ``df_fits`` columns.

    Left panel: all false positives and the confirmed exoplanets.
    Right panel: the rows whose ``fpflags_int`` is 2, 4, 8 or 1, one colour per value.
    Reads the notebook-level ``df_fits`` DataFrame and the ``red`` / ``green``
    palette colours, which must already exist.

    Parameters
    ----------
    xvar, yvar : column keys of ``df_fits`` for the x and y axes.
    xlabel, ylabel : axis labels applied to both panels.
    """
    disposition = df_fits[('identification', 'koi_disposition')]
    fpflags = df_fits[('identification', 'fpflags_int')]

    # Each layer is (row mask, colour, legend label), listed in drawing order
    overview_layers = [
        (disposition == 'FALSE POSITIVE', red, 'False Positive'),
        (disposition == 'CONFIRMED', green, 'Confirmed Exoplanet'),
    ]
    flag_layers = [
        (fpflags == 2, 'blue', 'Centroid Offset'),
        (fpflags == 4, 'orange', 'Stellar Eclipse'),
        (fpflags == 8, 'red', 'Not Transit-Like'),
        (fpflags == 1, 'black', 'Ephemeris Match'),
    ]

    # Each panel is (layers, marker size, title)
    panels = [
        (overview_layers, 3, 'Confirmed exoplanets & All False Positives'),
        (flag_layers, 2, 'Main Types of False Positives'),
    ]

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
    for axis, (layers, marker_size, panel_title) in zip(axes, panels):
        for mask, colour, legend_label in layers:
            axis.scatter(df_fits.loc[mask, xvar], df_fits.loc[mask, yvar],
                         s=marker_size, color=colour, label=legend_label)
        axis.set_xlabel(xlabel)
        axis.set_ylabel(ylabel)
        axis.set_title(panel_title)
        axis.legend()
    plt.show()

In [11]:
# Function to draw a 3d scatter plot with the confirmed exoplanets and all false positives observations
def scatter_3d(df, xvar, yvar, zvar, views, xlabel, ylabel, zlabel):
    """Two 3D scatter plots of the false positives and confirmed exoplanets.

    The same points are drawn on both panels; only the camera differs.

    Parameters
    ----------
    df : DataFrame with a ``koi_disposition`` column and the three plotted columns.
    xvar, yvar, zvar : column names for the three axes.
    views : sequence whose first two entries are indexable pairs; element 0
        and element 1 of each pair are passed, in that order, to ``view_init``
        for the left and the right panel.
    xlabel, ylabel, zlabel : axis labels applied to both panels.
    """
    # (colour, legend label, row mask), in drawing order
    groups = [
        ('red', 'False Positive', df.koi_disposition == 'FALSE POSITIVE'),
        ('green', 'Confirmed Exoplanet', df.koi_disposition == 'CONFIRMED'),
    ]

    fig = plt.figure(num=2, figsize=(18, 7))

    for panel in (0, 1):
        ax = fig.add_subplot(1, 2, panel + 1, projection='3d')
        for colour, legend_label, mask in groups:
            ax.scatter(df.loc[mask, [xvar]],
                       df.loc[mask, [yvar]],
                       df.loc[mask, [zvar]], color=colour, label=legend_label, s=0.5)
        # Camera orientation of this panel
        ax.view_init(views[panel][0], views[panel][1])
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_zlabel(zlabel)
        ax.legend()

    plt.show()