In [None]:
import csv
import cPickle as pickle
from datetime import datetime
import fnmatch
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg
from matplotlib import colors
import matplotlib.cm as cmx
import matplotlib.gridspec as gridspec
from multiprocessing import Pool,cpu_count
from accelerate import cuda
import numpy as np
np.set_printoptions(threshold='nan')
import os
import pyfits
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN, KMeans
import sys
# Both the Tk module and its file-dialog helpers were renamed between Python 2
# and Python 3, so the two imports have to be chosen together: importing
# tkFileDialog unconditionally makes the Python 3 branch fail.
if sys.version_info[0] < 3:
    import Tkinter as Tk
    from tkFileDialog import askopenfilename,askdirectory,asksaveasfile
else:
    import tkinter as Tk
    from tkinter.filedialog import askopenfilename,askdirectory,asksaveasfile

import keplerml
import km_outliers
import db_outliers

In [None]:
import matplotlib
%matplotlib

In [None]:
"""
============ Start ============
"""

# User defined
# CSV of pre-computed features; feature_importer reads it with the first
# column as the index and uses that index as the light-curve file names.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
featCSV = "/Users/Dan/Documents/KeplerML/out.csv"
# Directory that the file names from the CSV index are joined onto.
fitsDir = "/Users/Dan/Documents/KeplerML/data/Training_set_lightcurves"
# NOTE(review): this module-level value is not passed to randSample in the
# visible cells (they use a literal sample size instead) -- confirm intent.
numLCs = 5000

class feature_importer(object):
    """
    Loads a table of pre-computed light-curve features, reads the matching
    light curves, draws samples from them, and attaches t-SNE coordinates and
    cluster labels to the current sample.

    Attributes:
        data: DataFrame read from the feature CSV, indexed by its first column
            (the index values are used as light-curve file names).
        fitsDir: directory the file names are joined onto.
        files: the index of `data`.
        dataSample: rows of `data` making up the current sample.
        filesSample: index (file names) of the current sample.
        lcs: list returned by keplerml.read_kepler_curve, one entry per
            sampled file.
    """
    def __init__(self,feats,fitsDir):
        """
        Args:
            feats: path of the feature CSV (first column becomes the index).
            fitsDir: directory containing the light-curve files.
        """
        self.data = pd.read_csv(feats,index_col=0)
        self.fitsDir = fitsDir
        self.files = self.data.index
        # Initializing the data and files samples with the first 1000 entries.
        # Copied so that the columns added later (tsne_x, km_cluster, ...) are
        # written to the sample and not to a slice of self.data.
        self.dataSample = self.data.iloc[:1000].copy()
        self.filesSample = self.files[:1000]
        self.lcs = self.poolRKC(self.filesSample)

    def poolRKC(self,files):
        """
        Reads the given light-curve files in parallel, one worker per CPU.

        Args:
            files: iterable of file names relative to self.fitsDir.
        Returns:
            The list of keplerml.read_kepler_curve results, in input order.
        """
        # Build the paths from the argument (not from self.filesSample) so the
        # method reads exactly what the caller asked for.
        paths = [os.path.join(self.fitsDir, f) for f in files]
        p = Pool(cpu_count())
        try:
            lcs = p.map(keplerml.read_kepler_curve,paths)
        finally:
            # Always release the worker processes, even if a read fails.
            p.close()
            p.join()
        print("Done.")
        return lcs

    def randSample(self, numLCs):
        """
        Draws a random sample of numLCs rows and reads their light curves.
        Rerunning this, or randSampleWTabby, will replace the previous random sample.

        Returns:
            The list of light curves read by poolRKC (also stored in self.lcs).
        """
        self.numLCs = numLCs
        print("Creating random file list...")
        self.dataSample = self.data.sample(n=numLCs)
        self.filesSample = self.dataSample.index
        print("Importing lightcurves...")
        self.lcs = self.poolRKC(self.filesSample)
        return self.lcs

    def randSampleWTabby(self, numLCs):
        """
        Like randSample, but guarantees that the rows whose file name contains
        '8462852' (Tabby) are part of the sample when they exist in the data.
        Rerunning this, or randSample, will replace the previous random sample.

        Returns:
            The list of light curves read by poolRKC (also stored in self.lcs).
        """
        self.numLCs = numLCs
        print("Creating random file list...")
        self.dataSample = self.data.sample(n=numLCs)

        print("Checking for Tabby...")
        # any(): one matching row is enough for Tabby to be present.
        if not self.dataSample.index.str.contains('8462852').any():
            tabbyRows = self.data[self.data.index.str.contains('8462852')]
            if len(tabbyRows) > 0:
                print("Adding Tabby...")
                # Swap out as many sampled rows as are added so the sample
                # keeps its size (it only grows if there are more Tabby rows
                # than numLCs). drop/append return new frames, so the result
                # has to be assigned back.
                kept = self.dataSample.iloc[len(tabbyRows):]
                self.dataSample = pd.concat([kept, tabbyRows])
        self.filesSample = self.dataSample.index
        print("Importing lightcurves...")
        self.lcs = self.poolRKC(self.filesSample)
        return self.lcs

    def fullQ(self):
        """
        Uses every row of the feature table as the sample and reads all light curves.

        Returns:
            The list of light curves read by poolRKC (also stored in self.lcs).
        """
        self.filesSample = self.files
        # Copy so columns added to the sample do not leak into self.data and
        # from there into later random samples.
        self.dataSample = self.data.copy()
        self.lcs = self.poolRKC(self.filesSample)
        return self.lcs

    def tsne_fit(self,data,perplexity=50):
        """
        Performs a t-SNE dimensionality reduction on `data` after standard
        scaling. Uses a random initialization and the perplexity given, or
        defaults to 50.

        The first len(self.dataSample) rows of the embedding are stored in the
        dataSample columns 'tsne_x' and 'tsne_y', so `data` must start with the
        rows of the current sample (it may contain more rows after them).

        Returns:
            (tsneX, tsneY): the x and y coordinates of every row of `data`.
        """
        scaler = preprocessing.StandardScaler().fit(data)
        scaledData = scaler.transform(data)
        tsne = TSNE(n_components=2,perplexity=perplexity,init='random',verbose=True)
        embedding = tsne.fit_transform(scaledData)
        # Only the leading rows belong to the sample; the slice length follows
        # the sample instead of a hard-coded count.
        nSample = len(self.dataSample)
        self.dataSample['tsne_x'] = embedding[:nSample,0]
        self.dataSample['tsne_y'] = embedding[:nSample,1]
        # Goal is to minimize the KL-Divergence. Check for the attribute
        # itself rather than for one exact sklearn version string.
        if hasattr(tsne,'kl_divergence_'):
            print("KL-Divergence was %s"%tsne.kl_divergence_ )
        print("Done.")
        return embedding.T[0],embedding.T[1]

    def km_out(self):
        """
        Stores the labels returned by km_outliers.kmeans_w_outliers for the
        sample's t-SNE coordinates in dataSample['km_cluster'].
        Requires tsne_fit to have been run on the current sample.
        """
        tsneData = self.dataSample[['tsne_x','tsne_y']]
        # NOTE(review): the meaning of the third argument (1) is defined in
        # km_outliers and is not visible here.
        clusterLabels = km_outliers.kmeans_w_outliers(
            self.filesSample,tsneData,1)
        self.dataSample['km_cluster']=clusterLabels

    def db_out(self):
        """
        Stores the labels returned by db_outliers.dbscan_w_outliers for the
        sample's t-SNE coordinates in dataSample['db_cluster'].
        Requires tsne_fit to have been run on the current sample.
        """
        clusterLabels = db_outliers.dbscan_w_outliers(
            self.dataSample[['tsne_x','tsne_y']])
        self.dataSample['db_cluster']=clusterLabels

    def save(self,of):
        """
        Writes the current sample, including any t-SNE and cluster columns
        added so far, as CSV to `of` (a path or a writable file object).
        """
        # Write instance state; the previous code referenced a global `data`.
        self.dataSample.to_csv(of)
        
# Build the importer (its constructor reads the feature CSV and the first
# light curves), then replace that initial sample with a random draw.
# NOTE(review): the sample size 102 is hard-coded here; the numLCs setting
# defined earlier in this cell is not used.
sample = feature_importer(featCSV,fitsDir)
randomSample = sample.randSample(102)

# Generate random sampling of data

In [None]:
# Fit t-SNE on the current sample. When the fit raises ValueError a fresh
# random sample is drawn and the fit is attempted again, so that `tsnedata`
# is always defined when this cell finishes (previously the cell only printed
# "retry" and had to be re-run by hand).
MAX_TSNE_ATTEMPTS = 3

for attempt in range(MAX_TSNE_ATTEMPTS):
    # NOTE(review): the sample is stacked 25 times before fitting; tsne_fit
    # stores only the leading rows back on the sample -- confirm this
    # replication is still wanted.
    pieces = [sample.dataSample for i in range(25)]
    data = pd.concat(pieces)
    try:
        tsnedata=sample.tsne_fit(data)
    except ValueError:
        randomSample = sample.randSample(102)
        print("retry")
    else:
        break
else:
    raise ValueError("t-SNE failed on %d consecutive random samples" % MAX_TSNE_ATTEMPTS)

In [None]:
# Adds the 'km_cluster' column to sample.dataSample; reads the tsne_x/tsne_y
# columns, so the t-SNE cell must have run first.
sample.km_out()

In [None]:
# Adds the 'db_cluster' column to sample.dataSample (read later by
# import_for_plot); reads the tsne_x/tsne_y columns, so the t-SNE cell must
# have run first.
sample.db_out()

## Apply t-SNE dimensionality reduction and plot for review

In [None]:
# tsnedata holds coordinates for every row passed to tsne_fit; only the
# leading rows belong to the current sample, so slice by the sample's length
# rather than by a hard-coded count.
nSample = len(sample.dataSample)
x=tsnedata[0][:nSample]
y=tsnedata[1][:nSample]
# Density contours with the individual points drawn on top.
with sns.axes_style("darkgrid"):
    sns.kdeplot(x, y,shade=False,cmap="nipy_spectral")
    plt.scatter(x, y,alpha=.3)

In [None]:
"""Run this before plotting in following cells."""
# Flag that the plotting cell checks to decide whether this cell was run.
# NOTE(review): it is set before the import below actually executes, so it is
# True even if that import raises.
imported = True
def import_for_plot(df):
    """
    Collects everything the plotting cells need from a feature_importer-like
    object.

    Args:
        df: object providing `filesSample`, `lcs` and a `dataSample` frame
            with the columns 'tsne_x', 'tsne_y' and 'db_cluster'.
    Returns:
        A 15-tuple:
        files (file names of the sample), clusterLabels (the db_cluster
        column), data (array of shape (n, 2) with the t-SNE coordinates),
        cNorm, scalarMap (colour normalisation / mapping over the labels),
        tsneX, tsneY (the two columns of data),
        outX, outY, files_out (coordinates and files labelled -1),
        clusterX, clusterY, files_cluster (all other coordinates and files),
        lightcurveData (df.lcs),
        tabbyInd (position of the first file matching '*8462852*', or 0 when
        there is none).
    """
    files = df.filesSample
    clusterLabels = df.dataSample.db_cluster
    # Plain array of the labels so they can be addressed by position; the
    # Series itself is labelled by file name.
    labelValues = np.asarray(clusterLabels)
    # data is an array containing each data point
    data = df.dataSample[['tsne_x','tsne_y']].values

    cNorm  = colors.Normalize(vmin=0, vmax=max(clusterLabels))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap='jet')

    # tsneX has all the x-coordinates
    tsneX = data[:,0]
    # tsneY has all the y-coordinates
    tsneY = data[:,1]
    outX = []
    outY = []
    files_out = []
    clusterX = []
    clusterY = []
    files_cluster = []

    # Points labelled -1 are kept apart from the clustered points.
    for position, point in enumerate(data):
        if labelValues[position] == -1:
            outX.append(point[0])
            outY.append(point[1])
            files_out.append(files[position])
        else:
            clusterX.append(point[0])
            clusterY.append(point[1])
            files_cluster.append(files[position])

    # Take the light curves from the object passed in, not from a global.
    lightcurveData = df.lcs

    """--- Organizing data and Labels ---"""
    tabbyCheck = fnmatch.filter(files,'*8462852*')
    if len(tabbyCheck)!=0:
        # list(...) because a pandas Index has no .index() method.
        tabbyInd = list(files).index(tabbyCheck[0])
    else:
        tabbyInd = 0

    return files,clusterLabels,data,\
cNorm,scalarMap,tsneX,tsneY,outX,\
outY,files_out,clusterX,clusterY,\
files_cluster,lightcurveData,tabbyInd

# Unpack the 15 values returned by import_for_plot into notebook-level names
# used by the plotting cells below.
(files, clusterLabels, data,
 cNorm, scalarMap, tsneX, tsneY, outX,
 outY, files_out, clusterX, clusterY,
 files_cluster, lightcurveData, tabbyInd) = import_for_plot(sample)

# Plot clusters (must be generated above)

In [None]:
%matplotlib tk

# Interactive Tk viewer: t-SNE scatter (top left), density detail (top right)
# and the light curve of the point nearest to the last click (bottom).
#
# The viewer needs the names produced by the import cell. They are checked up
# front instead of wrapping the whole GUI construction in `except NameError`,
# which also hid unrelated NameErrors and then opened an empty window.
requiredNames = ('imported','data','files','files_cluster','lightcurveData',
                 'tabbyInd','outX','outY','clusterX','clusterY')
missingNames = [name for name in requiredNames if name not in globals()]

if missingNames:
    print("Run import cell above.")
else:
    root = Tk.Tk()
    root.wm_title("Scatter")
    fig = Figure(figsize=(20,10))

    # a tk.DrawingArea
    canvas = FigureCanvasTkAgg(fig, master=root)
    canvas.get_tk_widget().pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)
    # Toolbar to help navigate the data (pan, zoom, save image, etc.)
    toolbar = NavigationToolbar2TkAgg(canvas, root)
    toolbar.update()
    canvas._tkcanvas.pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)

    gs = gridspec.GridSpec(2,6)

    with sns.axes_style("white"):
        # empty subplot for scattered data
        ax = fig.add_subplot(gs[0,:4])
        # empty subplot for lightcurves
        ax2 = fig.add_subplot(gs[1,:])
        # empty subplot for center detail
        ax3 = fig.add_subplot(gs[0,4:])

    def distance(point, event):
        """Return distance between mouse position and given data point

        Args:
            point (np.array): np.array of shape (2,), with x,y in data coords
            event (MouseEvent): mouse event (which contains mouse position in .x and .y)
        Returns:
            distance (np.float64): distance (in screen coords) between mouse pos and data point
        """
        assert point.shape == (2,), "distance: point.shape is wrong: %s, must be (2,)" % point.shape
        # Convert the data point to display coordinates of the scatter axes.
        x2,y2 = ax.transData.transform((point[0],point[1]))

        return np.sqrt ((x2 - event.x)**2 + (y2 - event.y)**2)

    def calcClosestDatapoint(XT, event):
        """Calculate which data point is closest to the mouse position.

        Args:
            XT (np.array) - transposed array of points, of shape (2, numPoints)
            event (MouseEvent) - mouse event (containing mouse position)
        Returns:
            smallestIndex (int) - the index (column of XT) of the point closest to the mouse position
        """
        distances = [distance (XT[:,i], event) for i in range(XT.shape[1])]

        return np.argmin(distances)

    def drawData(X, index):
        """Plot the light curve X[index] in the bottom axes and title the figure with its file name.

        Args:
            X - sequence of light curves; X[index][0] is plotted on the x axis and X[index][1] on the y axis
            index (int) - position of the light curve to draw
        """
        ax2.cla()

        x=X[index][0]
        y=X[index][1]

        # Pad the data range by 10% and never show less than 0.95..1.05.
        axrange=0.55*(max(y)-min(y))
        mid=(max(y)+min(y))/2
        yaxmin = mid-axrange
        yaxmax = mid+axrange
        if yaxmin < .95:
            if yaxmax > 1.05:
                ax2.set_ylim(yaxmin,yaxmax)
            else:
                ax2.set_ylim(yaxmin,1.05)
        elif yaxmax > 1.05:
            ax2.set_ylim(.95,yaxmax)
        else:
            ax2.set_ylim(.95,1.05)

        # Clustered points are drawn in blue, all others in red.
        if files[index] in files_cluster:
            color = 'blue'
        else:
            color = 'red'
        ax2.plot(x, y, 'o',markeredgecolor='none', c=color, alpha=0.2)
        ax2.plot(x, y, '-',markeredgecolor='none', c=color, alpha=0.7)
        ax2.set_xlabel('Time (Days)',fontsize=22)
        ax2.set_ylabel(r'$\frac{\Delta F}{F}$',fontsize=30)

        fig.suptitle(files[index][:13],fontsize=30)

        canvas.draw()

    def annotatePt(XT, index):
        """Mark the selected point with an arrow and a red highlight.

        Args:
            XT (np.array) - array of points, of shape (numPoints, 2)
            index (int) - index (row of XT) of the point to highlight
        Returns:
            None
        """
        x2, y2 = XT[index][0], XT[index][1]
        # Remove the markers left by the previous selection; they are kept as
        # attributes on this function between calls.
        if hasattr(annotatePt, 'label'):
            annotatePt.label.remove()
            annotatePt.emph.remove()
        if hasattr(annotatePt, 'emphCD'):
            annotatePt.emphCD.remove()

        # The arrow tail is offset by 10 data units from the point.
        annotatePt.label = ax.annotate( "",
            xy = (x2, y2), xytext = (x2+10, y2+10),
            arrowprops = dict(headlength=20,headwidth=20,width=6,shrink=.1,color='red'))
        annotatePt.emph = ax.scatter(x2,y2,marker='o',s=50,c='red')
        # Clustered points are also highlighted in the density detail axes.
        if files[index] in files_cluster:
            annotatePt.emphCD = ax3.scatter(x2,y2,marker='o',s=150,c='red')
        else:
            annotatePt.emphCD = ax.scatter(x2,y2,marker='o',s=50,c='red')
        canvas.draw()


    def onMouseClick(event, X):
        """Event that is triggered when mouse is clicked. Shows lightcurve for data point closest to mouse."""
        XT = np.array(X.T) # array organized by feature, each in its own array
        closestIndex = calcClosestDatapoint(XT, event)
        drawData(lightcurveData, closestIndex)

    def onMouseRelease(event, X):
        """Event that is triggered when the mouse button is released. Highlights the data point closest to mouse."""
        XT = np.array(X.T)
        closestIndex = calcClosestDatapoint(XT, event)
        annotatePt(X,closestIndex)

    def connect(X):
        """(Re)register the press/release handlers for the point array X."""
        # Drop handlers from an earlier call so events are not handled twice.
        if hasattr(connect,'cidpress'):
            fig.canvas.mpl_disconnect(connect.cidpress)
        if hasattr(connect,'cidrelease'):
            fig.canvas.mpl_disconnect(connect.cidrelease)

        connect.cidpress = fig.canvas.mpl_connect('button_press_event', lambda event: onMouseClick(event,X))
        connect.cidrelease = fig.canvas.mpl_connect('button_release_event', lambda event: onMouseRelease(event, X))

    def redraw():
        """Clear all axes, draw the scatter and density plots and select the Tabby point."""
        # Clear the existing plots
        ax.cla()
        ax2.cla()
        ax3.cla()
        # Set those labels
        ax.set_xlabel("T-SNE X",fontsize=18)
        ax.set_ylabel("T-SNE Y",fontsize=18)
        # Scatter the data
        ax.scatter(outX, outY,c="black",s=30,cmap='jet')
        ax.hexbin(clusterX,clusterY,mincnt=5,bins="log",cmap="inferno",gridsize=35)

        hb = ax3.hexbin(clusterX,clusterY,mincnt=5,bins="log",cmap="inferno",gridsize=35)
        # Attach the colorbar to the detail axes explicitly rather than to
        # whichever axes happens to be current.
        fig.colorbar(hb, ax=ax3)
        ax3.set_title("Center Density Detail")
        ax3.set_xlabel("T-SNE X",fontsize=18)
        ax3.set_ylabel("T-SNE Y",fontsize=18)

        # NOTE(review): nothing in the visible code sets redraw.cidenter, so
        # this branch appears to be unreachable.
        if hasattr(redraw,'cidenter'):
            fig.canvas.mpl_disconnect(redraw.cidenter)
            fig.canvas.mpl_disconnect(redraw.cidexit)
        connect(data)

        annotatePt(data,tabbyInd)
        drawData(lightcurveData,tabbyInd)
        # A single draw suffices; the extra canvas.show() call was dropped as redundant after draw().
        canvas.draw()
    print("Plotting.")

    redraw() # First draw, Tabby plotted


    def quitViewer():
        """Stop the Tk event loop and close the window."""
        print("Exitting.")
        root.quit()
        root.destroy()

    Tk.Button(root, text="Quit", command=quitViewer).pack()
    root.mainloop()

In [None]:

# Split the embedded points again into those labelled -1 and the rest, this
# time also keeping the cluster label of every non-outlier point.
outX, outY, files_out = [], [], []
clusterX, clusterY, files_cluster = [], [], []
clusteredLabels = []

for position, point in enumerate(data):
    label = clusterLabels[position]
    if label == -1:
        outX.append(point[0])
        outY.append(point[1])
        files_out.append(files[position])
        continue
    clusterX.append(point[0])
    clusterY.append(point[1])
    files_cluster.append(files[position])
    clusteredLabels.append(label)

# Frame of the clustered coordinates for the seaborn joint plot.
df = pd.DataFrame()
df['t-SNE_X'] = clusterX
df['t-SNE_Y'] = clusterY

# Colour scale running from 0 to the largest cluster label.
cNorm = colors.Normalize(vmin=0, vmax=max(clusteredLabels))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap='jet')

with sns.axes_style("white"):

    plt.xlabel("t-SNE X", fontsize=18)
    plt.ylabel("t-SNE Y", fontsize=18)
    # Clustered points coloured by label, outliers in red.
    clusterColours = scalarMap.to_rgba(clusteredLabels)
    plt.scatter(clusterX, clusterY, marker='o', c=clusterColours)
    plt.scatter(outX, outY, c="red")

    # Centre and half-width of the clustered points along each axis.
    xmid = (max(clusterX)+min(clusterX))/2
    xrng = (max(clusterX)-min(clusterX))/2
    ymid = (max(clusterY)+min(clusterY))/2
    yrng = (max(clusterY)-min(clusterY))/2

    # Highlight the point at tabbyInd in green.
    tabbyPoint = data[tabbyInd]
    plt.scatter(tabbyPoint[0], tabbyPoint[1], c='green', s=50)

    # Show the clustered range widened by 5%.
    plt.xlim(xmid-xrng*1.05, xmid+xrng*1.05)
    plt.ylim(ymid-yrng*1.05, ymid+yrng*1.05)

    # Joint scatter/histogram plot of the clustered points over the same limits.
    g = sns.jointplot(
        x="t-SNE_X", y="t-SNE_Y", data=df, kind='scatter',
        color="black", stat_func=None, size=10,
        xlim=(xmid-xrng*1.05, xmid+xrng*1.05),
        ylim=(ymid-yrng*1.05, ymid+yrng*1.05),
        marginal_kws=dict(bins=200),
        joint_kws=dict(s=1))

    g.set_axis_labels("t-SNE X", "t-SNE Y", fontsize=18)

plt.show()