In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
import math

filename = "breast-cancer-wisconsin.data.txt"

# Initialize datafrmae
dataframe = pd.read_csv(filename, header=None)
dataframe = dataframe.drop(0, 1)
columns = ['ClTh', 'UnCeSi', 'UnCeSh', 'MaAd', 'SiEpCeSi', 'BaNu', 
           'BlCh', 'NoNu', 'Mitoses', 'Class']
dataframe.columns = columns

# Clean out missing values
dataframe = dataframe[dataframe.BaNu != '?']
dataframe = dataframe.reset_index(drop=True)
dataframe = dataframe.astype(int)

# Clean out outliers
LOF = LocalOutlierFactor(n_neighbors = 200)
Local = LOF.fit_predict(dataframe.drop(['Class'], axis=1))
dataframe['LOF_det'] = Local

# Drop outlier column 
dataframe = dataframe[dataframe.LOF_det != -1]
dataframe = dataframe.drop(['LOF_det'], axis=1)
dataframe = dataframe.reset_index(drop=True)

# Change class values from 4,2 to 1,0
dataframe['Class'] = dataframe['Class'].replace(4, 1)
dataframe['Class'] = dataframe['Class'].replace(2, 0)

# Split data 80-20
len_data = len(dataframe.index)
len_split = math.ceil(len_data * 0.8)
dataTest = dataframe.head(len_data - len_split)
dataTrain = dataframe.head(len_split)

# Get X and Y Data
X_train = dataTrain.drop('Class', 1)
Y_train = dataTrain.Class
X_test = dataTest.drop('Class', 1)
Y_test = dataTest.Class

print(X_test)
print(Y_test)

# print(dataframe)

     ClTh  UnCeSi  UnCeSh  MaAd  SiEpCeSi  BaNu  BlCh  NoNu  Mitoses
0       5       1       1     1         2     1     3     1        1
1       5       4       4     5         7    10     3     2        1
2       3       1       1     1         2     2     3     1        1
3       6       8       8     1         3     4     3     7        1
4       4       1       1     3         2     1     3     1        1
5       8      10      10     8         7    10     9     7        1
6       2       1       2     1         2     1     3     1        1
7       4       2       1     1         2     1     2     1        1
8       1       1       1     1         1     1     3     1        1
9       2       1       1     1         2     1     2     1        1
10      1       1       1     1         2     3     3     1        1
11      8       7       5    10         7     9     5     5        4
12      4       1       1     1         2     1     2     1        1
13      4       1       1     1   



In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

LogReg = LogisticRegression()
LogReg.fit(X_train, Y_train)
# scores = cross_val_score(LogReg, X_train.values, Y_train.values, 5)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

LogReg.score(X_test.values, Y_test.values)

# x_sample = X_data.iloc[3].values
# x_sample = x_sample.reshape(1, -1)
# print(x_sample)
# print(LogReg.predict(x_sample))



0.9836065573770492

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras import metrics
from sklearn.preprocessing import LabelEncoder
from keras.models import model_from_json
import os


model = Sequential()
model.add(Dense(3, input_dim=9, activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


fitModel = model.fit(X_train.values, Y_train.values, batch_size=1, epochs=80, 
                      validation_data=(X_test.values, Y_test.values))



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 3)                 30        
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 12        
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 4         
Total params: 46
Trainable params: 46
Non-trainable params: 0
_________________________________________________________________
Train on 492 samples, validate on 122 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
E

Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [9]:
# Save model
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [20]:
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
loaded_score = loaded_model.evaluate(X_test.values, Y_test.values)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], loaded_score[1]*100))

for i in range(0,10):
    print(loaded_model.predict(X_test.values[i].reshape(1,-1), batch_size=1))


Loaded model from disk
acc: 98.36%
[[1.22107e-06]]
[[0.867795]]
[[0.0005451]]
[[0.84610283]]
[[4.0255668e-07]]
[[0.99848026]]
[[0.00016248]]
[[2.223473e-05]]
[[0.00012745]]
[[3.169132e-05]]


In [8]:
# filename2 = "breast-cancer_csv.csv"

# # Initialize datafrmae
# dataframe2 = pd.read_csv(filename2, header=0)
# # dataframe = dataframe.drop(0, 1)
# # columns = []
# # dataframe.columns = columns

# print(dataframe2)

       age menopause tumor-size inv-nodes node-caps  deg-malig breast  \
0    40-49   premeno      15-19       0-2       yes          3  right   
1    50-59      ge40      15-19       0-2        no          1  right   
2    50-59      ge40      35-39       0-2        no          2   left   
3    40-49   premeno      35-39       0-2       yes          3  right   
4    40-49   premeno      30-34       3-5       yes          2   left   
5    50-59   premeno      25-29       3-5        no          2  right   
6    50-59      ge40      40-44       0-2        no          3   left   
7    40-49   premeno      10-14       0-2        no          2   left   
8    40-49   premeno        0-4       0-2        no          2  right   
9    40-49      ge40      40-44     15-17       yes          2  right   
10   50-59   premeno      25-29       0-2        no          2   left   
11   60-69      ge40      15-19       0-2        no          2  right   
12   50-59      ge40      30-34       0-2        no