## Imports

In [1]:
# Some standard imports for math and data handling
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Imports for processing specific to this workbook
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from datetime import datetime

# Import the custom code developed for this work
sys.path.append('python')
from clusterOutliers import clusterOutliers as coo

%matplotlib inline

RuntimeError: module compiled against API version 0xb but this version of numpy is 0xa

## Data Scaler

In [2]:
def dataScaler(qdf, nfeats=60):
    """
    Standardise the leading feature columns of a dataframe.

    Args:
        qdf (Pandas dataframe) - source data; only its first `nfeats` columns are used
        nfeats (optional, int) - number of leading columns to scale

    Returns:
        Pandas dataframe of the StandardScaler-transformed columns, carrying
        the index of `qdf` and the names of the columns that were scaled.
    """
    features = qdf.iloc[:, :nfeats]

    # Fit the scaler on the selected columns, then apply it to the same columns.
    standardiser = preprocessing.StandardScaler()
    standardiser.fit(features)
    standardised = standardiser.transform(features)

    return pd.DataFrame(data=standardised,
                        index=qdf.index,
                        columns=qdf.columns[:nfeats])

## Importing Data

In [3]:
def import_generator(suffix='_FullSample.csv',
                     filepath="/home/dgiles/Documents/KeplerLCs/output/",
                     fits_files_directory="/home/dgiles/Documents/KeplerLCs/fitsFiles/"):
    """
    Build a loader for files that share a location and a suffix.

    Args:
        suffix (str) - the suffix of the file to be imported
        filepath (optional, str) - path prefix of the file to be imported
        fits_files_directory (optional, str) - path prefix of the directory
            containing the fits files

    Returns:
        A function taking a single str, the file prefix (typically a Q#),
        and returning coo(filepath+prefix+suffix,
        fits_files_directory+prefix+"fitsfiles").

    Use:
        Enables simpler import of multiple quarters of data contained
        in the same location with the same suffixes.

    Requirements:
    import sys
    sys.path.append('python')
    from clusterOutliers import clusterOutliers as coo
    """
    def load_quarter(QN):
        # Paths are built by plain string concatenation, so the path
        # arguments must already end with a separator.
        data_file = filepath + QN + suffix
        fits_dir = fits_files_directory + QN + "fitsfiles"
        return coo(data_file, fits_dir)

    return load_quarter

In [None]:
# Quarter labels used as file prefixes and dictionary keys in this workbook.
qs = ["Q4", "Q8", "Q11", "Q16"]
# Location of the PCA-reduced (and baseline) files read and written below.
PCA_folder = "/home/dgiles/Documents/KeplerLCs/output/PCA_reductions/"

In [12]:
# One loader per family of files; each takes a quarter label such as 'Q4'.
import_quarter = import_generator(suffix="_FullSample.csv")
import_base = import_generator(suffix="_base", filepath=PCA_folder)
import_90 = import_generator(suffix="_PCA90", filepath=PCA_folder)
import_95 = import_generator(suffix="_PCA95", filepath=PCA_folder)
import_99 = import_generator(suffix="_PCA99", filepath=PCA_folder)

# Load every quarter listed in `qs` with each loader, keyed by quarter label.
paper_qs = {Q: import_quarter(Q) for Q in qs}
base_qs = {Q: import_base(Q) for Q in qs}
pca90_d = {Q: import_90(Q) for Q in qs}
pca95_d = {Q: import_95(Q) for Q in qs}
pca99_d = {Q: import_99(Q) for Q in qs}

In [44]:
"""
Have to rearrange Quarter 8 data to match other data
"""

# Move the Quarter 8 column at position 62 in front of the columns at
# positions 60 and 61, for both the baseline and the full-sample data.
# NOTE: this is positional, so running the cell a second time shuffles the
# columns again; run it exactly once per load.
for quarter_set in (base_qs, paper_qs):
    q8 = quarter_set['Q8']
    cols = q8.data.columns.tolist()
    new_cols = cols[:60] + [cols[62]] + cols[60:62] + cols[63:]
    q8.data = q8.data[new_cols]

## Saving dataframes

In [60]:
def save_dfs(PCA_folder = "/home/dgiles/Documents/KeplerLCs/output/PCA_reductions/"):
    """
    Save function specific to this workbook.

    Writes the `.data` dataframe of every entry in the module-level
    dictionaries pca90_d, pca95_d, pca99_d and base_qs to
    PCA_folder + <quarter key> + <suffix>, where the suffix is
    "_PCA90", "_PCA95", "_PCA99" or "_base" respectively.

    !!!THIS WILL OVERWRITE EXISTING FILES!!!

    Args:
        PCA_folder (optional, str) - path prefix of the output files; it is
            concatenated directly with the quarter key, so it must end with
            a path separator.

    Returns:
        None
    """
    targets = (
        (pca90_d, "_PCA90"),
        (pca95_d, "_PCA95"),
        (pca99_d, "_PCA99"),
        (base_qs, "_base"),
    )
    for quarters, suffix in targets:
        for q in quarters:
            # Index with the loop variable; the literal key 'q' used here
            # previously does not exist and raised KeyError for base_qs.
            quarters[q].data.to_csv(PCA_folder + q + suffix)

## Scoring

### Distance Based

In [81]:
def dist_score(data,d2s=None,k=59):
    """
    Score points by the distance to their k-th neighbor within `data`.

    Args:
        data (Numpy array or Pandas dataframe) - Full set of data the neighbors are drawn from
        d2s (optional, Numpy array or Pandas dataframe) - Subset of data to be scored;
            when omitted (None), every row of `data` is scored
        k (integer) - Neighbor to which the distance is considered the score

    Returns:
        scores (list) - MinMax scaled scores, one per row of d2s, in row order.
            When all raw distances are equal, every score is 0.
    """
    # For Kepler data common to quarters 4, 8, 11, and 16, k=59 was determined to be useful.
    # `is None` is required here: `type(d2s)==None` is never true, so the
    # default was previously passed straight through to kneighbors().
    if d2s is None:
        d2s = data
    # k+1 neighbors are requested so that column k of the distance matrix exists.
    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree',n_jobs=-1).fit(data)
    distances, _ = nbrs.kneighbors(d2s)
    scores = distances[:,k]

    lowest, highest = scores.min(), scores.max()
    if highest == lowest:
        # Degenerate range: avoid 0/0 and report a uniform score of 0.
        scores = scores - lowest
    else:
        scores = (scores - lowest)/(highest - lowest) #min max scaled
    # TODO: readjust scaling so that the extreme outliers don't affect scores of the rest.
    # Potentially scale 90th percentile, define all beyond that as having a score of 1.

    return list(scores)

#### Scoring the baseline (unreduced) data

In [73]:
def scoreLoop(qdict):
    """
    Score the rows flagged with db_out == -1 for every entry of `qdict`.

    For each entry, the columns located before the 'db_out' column are
    treated as features: they are scaled with dataScaler, and the flagged
    rows are scored against the full scaled set with dist_score.

    Args:
        qdict (dict) - maps a label (e.g. 'Q4') to an object whose `.data`
            attribute is a Pandas dataframe containing a 'db_out' column

    Returns:
        scores_dict (dict) - maps each label of `qdict` to the list of scores
            returned by dist_score for that entry's flagged rows
    """
    startTime = datetime.now()
    scores_dict = dict()
    # Iterate over the argument itself, not a module-level name, so the
    # function scores whatever dictionary the caller passes in.
    for Q, QN in qdict.items():
        qStartTime = datetime.now()
        print("Starting {}".format(Q))
        data = QN.data
        # Number of feature columns = position of 'db_out' (all columns if absent).
        column_names = list(data.columns)
        if 'db_out' in column_names:
            dims = column_names.index('db_out')
        else:
            dims = len(column_names)
        print("Scoring {} in {} dimensions".format(Q,dims))
        scaled_data = dataScaler(data,dims)
        out_scores = dist_score(scaled_data,scaled_data[data.db_out==-1])
        scores_dict[Q] = out_scores
        print("Time to process {}: {}".format(Q,datetime.now()-qStartTime))

    print("Time to process all quarters: {}".format(datetime.now()-startTime))
    return scores_dict

In [74]:
# Score the flagged rows of the baseline (unreduced) quarters in `base_qs`.
# `adict` is bound at module level as well as being passed as the argument.
# NOTE(review): confirm scoreLoop no longer looks up this global before
# removing the assignment.
adict = base_qs
base_out_scores = scoreLoop(adict)

Starting Q4
Scoring Q4 in 60 dimensions
Time to process Q4: 0:00:12.857038
Starting Q8
Scoring Q8 in 60 dimensions
Time to process Q8: 0:00:16.674572
Starting Q11
Scoring Q11 in 60 dimensions
Time to process Q11: 0:00:14.679183
Starting Q16
Scoring Q16 in 60 dimensions
Time to process Q16: 0:00:16.153303
Time to process all quarters: 0:01:00.365658


In [147]:
# Compare, for one quarter of the baseline data, the scores stored in the
# dataframe's `dist_score` column ("Full") against the scores produced by
# scoreLoop ("Out_only"), restricted to rows with db_out == -1.
Q = 'Q16'
df = base_qs[Q].data
out_scores = base_out_scores[Q]
outliers = df[df.db_out==-1]  # computed once, reused for values and index
out_only_comp = pd.DataFrame({
    # .values replaces Series.as_matrix(), which current pandas no longer provides.
    "Full":outliers.dist_score.values,
    "Out_only":out_scores},
    index=outliers.index)
# Rank 1 is the highest score in each column.
out_only_comp['rank_full'] = out_only_comp.Full.rank(ascending=False)
out_only_comp['rank_out'] = out_only_comp.Out_only.rank(ascending=False)
rank_diffs = out_only_comp.rank_full - out_only_comp.rank_out
print("""
Greatest rank decrease: {}, {}
Greatest rank increase: {}, {}
Median difference in rank: {}
Percent w/in 10: {:04.1f}%
Percent w/in 100: {:05.2f}%
""".format(rank_diffs.min(), rank_diffs[rank_diffs==rank_diffs.min()].index[0][:13],
          rank_diffs.max(), rank_diffs[rank_diffs==rank_diffs.max()].index[0][:13],
          rank_diffs.median(),
          len(rank_diffs[abs(rank_diffs)<10])/len(rank_diffs)*100,
          len(rank_diffs[abs(rank_diffs)<100])/len(rank_diffs)*100))


Greatest rank decrease: -161.0, kplr007742133
Greatest rank increase: 21.0, kplr005952324
Median difference in rank: 2.0
Percent w/in 10: 75.0%
Percent w/in 100: 99.87%



In [91]:
# Score the flagged rows of the quarters in `pca90_d` (the "_PCA90" files).
# `adict` is bound at module level as well as being passed as the argument.
# NOTE(review): confirm scoreLoop no longer looks up this global before
# removing the assignment.
adict = pca90_d
pca90_out_scores = scoreLoop(adict)

Starting Q4
Scoring Q4 in 18 dimensions
Time to process Q4: 0:00:05.606169
Starting Q8
Scoring Q8 in 18 dimensions
Time to process Q8: 0:00:07.081499
Starting Q11
Scoring Q11 in 17 dimensions
Time to process Q11: 0:00:08.167452
Starting Q16
Scoring Q16 in 17 dimensions
Time to process Q16: 0:00:05.555375
Time to process all quarters: 0:00:26.412719


In [148]:
# Compare, for one quarter of the "_PCA90" data, the scores stored in the
# dataframe's `dist_score` column ("Full") against the scores produced by
# scoreLoop ("Out_only"), restricted to rows with db_out == -1.
Q = 'Q16'
df = pca90_d[Q].data
out_scores = pca90_out_scores[Q]
outliers = df[df.db_out==-1]  # computed once, reused for values and index
out_only_comp = pd.DataFrame({
    # .values replaces Series.as_matrix(), which current pandas no longer provides.
    "Full":outliers.dist_score.values,
    "Out_only":out_scores},
    index=outliers.index)
# Rank 1 is the highest score in each column.
out_only_comp['rank_full'] = out_only_comp.Full.rank(ascending=False)
out_only_comp['rank_out'] = out_only_comp.Out_only.rank(ascending=False)
rank_diffs = out_only_comp.rank_full - out_only_comp.rank_out
print("""
Greatest rank decrease: {}, {}
Greatest rank increase: {}, {}
Median difference in rank: {}
Percent w/in 10: {:04.1f}%
Percent w/in 100: {:05.2f}%
""".format(rank_diffs.min(), rank_diffs[rank_diffs==rank_diffs.min()].index[0][:13],
          rank_diffs.max(), rank_diffs[rank_diffs==rank_diffs.max()].index[0][:13],
          rank_diffs.median(),
          len(rank_diffs[abs(rank_diffs)<10])/len(rank_diffs)*100,
          len(rank_diffs[abs(rank_diffs)<100])/len(rank_diffs)*100))


Greatest rank decrease: -181.0, kplr009821923
Greatest rank increase: 18.0, kplr002019352
Median difference in rank: 2.0
Percent w/in 10: 78.8%
Percent w/in 100: 99.91%



In [128]:
# Score the flagged rows of the quarters in `pca95_d` (the "_PCA95" files).
# `adict` is bound at module level as well as being passed as the argument.
# NOTE(review): confirm scoreLoop no longer looks up this global before
# removing the assignment.
adict = pca95_d
pca95_out_scores = scoreLoop(adict)

Starting Q4
Scoring Q4 in 23 dimensions
Time to process Q4: 0:00:09.120561
Starting Q8
Scoring Q8 in 24 dimensions
Time to process Q8: 0:00:08.660059
Starting Q11
Scoring Q11 in 22 dimensions
Time to process Q11: 0:00:06.728444
Starting Q16
Scoring Q16 in 23 dimensions
Time to process Q16: 0:00:06.948755
Time to process all quarters: 0:00:31.459481


In [149]:
# Compare, for one quarter of the "_PCA95" data, the scores stored in the
# dataframe's `dist_score` column ("Full") against the scores produced by
# scoreLoop ("Out_only"), restricted to rows with db_out == -1.
Q = 'Q16'
df = pca95_d[Q].data
out_scores = pca95_out_scores[Q]
outliers = df[df.db_out==-1]  # computed once, reused for values and index
out_only_comp = pd.DataFrame({
    # .values replaces Series.as_matrix(), which current pandas no longer provides.
    "Full":outliers.dist_score.values,
    "Out_only":out_scores},
    index=outliers.index)
# Rank 1 is the highest score in each column.
out_only_comp['rank_full'] = out_only_comp.Full.rank(ascending=False)
out_only_comp['rank_out'] = out_only_comp.Out_only.rank(ascending=False)
rank_diffs = out_only_comp.rank_full - out_only_comp.rank_out
print("""
Greatest rank decrease: {}, {}
Greatest rank increase: {}, {}
Median difference in rank: {}
Percent w/in 10: {:04.1f}%
Percent w/in 100: {:05.2f}%
""".format(rank_diffs.min(), rank_diffs[rank_diffs==rank_diffs.min()].index[0][:13],
          rank_diffs.max(), rank_diffs[rank_diffs==rank_diffs.max()].index[0][:13],
          rank_diffs.median(),
          len(rank_diffs[abs(rank_diffs)<10])/len(rank_diffs)*100,
          len(rank_diffs[abs(rank_diffs)<100])/len(rank_diffs)*100))


Greatest rank decrease: -116.0, kplr001873918
Greatest rank increase: 15.0, kplr005630212
Median difference in rank: 2.0
Percent w/in 10: 84.2%
Percent w/in 100: 99.91%



In [133]:
# Score the flagged rows of the quarters in `pca99_d` (the "_PCA99" files).
# `adict` is bound at module level as well as being passed as the argument.
# NOTE(review): confirm scoreLoop no longer looks up this global before
# removing the assignment.
adict = pca99_d
pca99_out_scores = scoreLoop(adict)

Starting Q4
Scoring Q4 in 35 dimensions
Time to process Q4: 0:00:10.271810
Starting Q8
Scoring Q8 in 36 dimensions
Time to process Q8: 0:00:11.830123
Starting Q11
Scoring Q11 in 34 dimensions
Time to process Q11: 0:00:12.934743
Starting Q16
Scoring Q16 in 34 dimensions
Time to process Q16: 0:00:12.732713
Time to process all quarters: 0:00:47.770770


In [150]:
# Compare, for one quarter of the "_PCA99" data, the scores stored in the
# dataframe's `dist_score` column ("Full") against the scores produced by
# scoreLoop ("Out_only"), restricted to rows with db_out == -1.
Q = 'Q16'
df = pca99_d[Q].data
out_scores = pca99_out_scores[Q]
outliers = df[df.db_out==-1]  # computed once, reused for values and index
out_only_comp = pd.DataFrame({
    # .values replaces Series.as_matrix(), which current pandas no longer provides.
    "Full":outliers.dist_score.values,
    "Out_only":out_scores},
    index=outliers.index)
# Rank 1 is the highest score in each column.
out_only_comp['rank_full'] = out_only_comp.Full.rank(ascending=False)
out_only_comp['rank_out'] = out_only_comp.Out_only.rank(ascending=False)
rank_diffs = out_only_comp.rank_full - out_only_comp.rank_out
print("""
Greatest rank decrease: {}, {}
Greatest rank increase: {}, {}
Median difference in rank: {}
Percent w/in 10: {:04.1f}%
Percent w/in 100: {:05.2f}%
""".format(rank_diffs.min(), rank_diffs[rank_diffs==rank_diffs.min()].index[0][:13],
          rank_diffs.max(), rank_diffs[rank_diffs==rank_diffs.max()].index[0][:13],
          rank_diffs.median(),
          len(rank_diffs[abs(rank_diffs)<10])/len(rank_diffs)*100,
          len(rank_diffs[abs(rank_diffs)<100])/len(rank_diffs)*100))


Greatest rank decrease: -104.0, kplr005451040
Greatest rank increase: 17.0, kplr001849235
Median difference in rank: 2.0
Percent w/in 10: 88.7%
Percent w/in 100: 99.98%

