In [11]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


from bokeh.plotting import figure, show, output_file

In [12]:
np.random.seed(7)

# Created a neural net without any feature analysis

In [13]:
data = np.loadtxt("pima-indians-diabetes.data.csv", delimiter=",")
data.shape

(768, 9)

In [14]:
X = data[:, 0:8]
y = data[:, 8]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [15]:
seq = Sequential()
seq.add(Dense(12, input_dim = 8, activation='relu'))
seq.add(Dense(8, activation='relu'))
seq.add(Dense(1, activation='sigmoid'))
seq.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
seq.fit(X_train, y_train, epochs = 150, verbose = 0, batch_size=10)
seq.evaluate(X_test,  y_test, batch_size=10)

 10/231 [>.............................] - ETA: 0s

[0.77003011849277458, 0.66233766569203634]

In [16]:
kbest = SelectKBest(k = 'all')
kbest.fit(X, y)

SelectKBest(k='all', score_func=<function f_classif at 0x7f084958fa28>)

# ran PCA

In [17]:
pca = PCA(n_components=3, svd_solver='full')
X2 = pca.fit_transform(X)
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, test_size = 0.30)

In [18]:
seq = Sequential()
seq.add(Dense(12, input_dim = 3, activation='relu'))
seq.add(Dense(8, activation='relu'))
seq.add(Dense(1, activation='sigmoid'))
seq.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
seq.fit(X2_train, y_train, epochs = 150, verbose = 0, batch_size=10)
seq.evaluate(X2_test,  y_test, batch_size=10)

 10/231 [>.............................] - ETA: 1s

[0.54918106048654169, 0.77922077173794502]

# ran t_sne

In [19]:
tsne = TSNE(n_components=3)
X3 = tsne.fit_transform(X)
X3_train, X3_test, y_train, y_test = train_test_split(X3, y, test_size = 0.30)

In [20]:
seq = Sequential()
seq.add(Dense(12, input_dim = 3, activation='relu'))
seq.add(Dense(8, activation='relu'))
seq.add(Dense(1, activation='sigmoid'))
seq.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
seq.fit(X3_train, y_train, epochs = 150, verbose = 0, batch_size=10)
seq.evaluate(X3_test,  y_test, batch_size=10)

 10/231 [>.............................] - ETA: 1s

[0.5933600648657068, 0.69696969542152443]

# evaluation

**TSNE** seemd to perform better than pca and just splitting the data.  I was definitely surprised by this, especially since the data wasn't very long (700+) and had only 8 features.  

# plotting
-----------------

In [21]:
colors = {1 : 'red', 0: 'blue'}

In [22]:
TOOLS="resize,crosshair,pan,wheel_zoom,box_zoom,reset,tap,previewsave,box_select,poly_select,lasso_select"

p = figure(tools = TOOLS )
p.scatter(X3[:, 0], X3[:,1], fill_color = [colors[i] for i in y], fill_alpha = 0.6)
    
show(p)

In [29]:
ones = 0
zers = 0

for i in y:
    if i == 1:
        ones += 1
    else:
        zers += 1

print ones, zers, float(ones)/len(y), float(zers)/len(y)

268 500 0.348958333333 0.651041666667
