In [1]:
import pandas
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
#https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.names

data_source = "http://goo.gl/vhm1eU"
names = ['prg', 'plas', 'pres','skin','test','mass','pedi', 'age','class']

df = pandas.read_csv(data_source, names=names)
df.ix[:5]

Unnamed: 0,prg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0


In [3]:
y_data = df["class"].values.reshape(-1,1)
y_data[:5]

array([[1],
       [0],
       [1],
       [0],
       [1]])

In [4]:
df_norm = (df.ix[:,:-1] - df.ix[:,:-1].mean()) / (df.ix[:,:-1].max() - df.ix[:,:-1].min()) #MinMax Standardzation 
df_norm[:5]

Unnamed: 0,prg,plas,pres,skin,test,mass,pedi,age
0,0.126762,0.136208,0.023726,0.146096,-0.094326,0.023956,0.066236,0.279319
1,-0.167356,-0.180375,-0.025455,0.08549,-0.094326,-0.080366,-0.051612,-0.037348
2,0.244409,0.312088,-0.041848,-0.207439,-0.094326,-0.129547,0.08545,-0.020681
3,-0.167356,-0.160274,-0.025455,0.024884,0.016785,-0.058012,-0.130178,-0.204015
4,-0.22618,0.080932,-0.238569,0.146096,0.104256,0.165535,0.775458,-0.004015


In [5]:
x_data = df_norm[:].values
x_data[:5]

array([[ 0.12676164,  0.13620839,  0.02372567,  0.14609638, -0.09432563,
         0.02395562,  0.06623557,  0.27931858],
       [-0.167356  , -0.18037453, -0.02545466,  0.08549032, -0.09432563,
        -0.08036629, -0.05161243, -0.03734809],
       [ 0.2444087 ,  0.31208778, -0.0418481 , -0.20743897, -0.09432563,
        -0.12954662,  0.08544991, -0.02068142],
       [-0.167356  , -0.16027403, -0.02545466,  0.02488426,  0.01678549,
        -0.0580116 , -0.13017775, -0.20401476],
       [-0.22617953,  0.080932  , -0.23856942,  0.14609638,  0.10425593,
         0.16553535,  0.77545845, -0.00401476]])

In [6]:
import scipy.stats as ss
x_data = np.array(ss.zscore(df.ix[:,:-1].values)) #Z-Score Normalization
x_data[:5]

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575,  0.53090156, -0.69289057,
        -0.68442195, -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, -1.28821221, -0.69289057,
        -1.10325546,  0.60439732, -0.10558415],
       [-0.84488505, -0.99820778, -0.16054575,  0.15453319,  0.12330164,
        -0.49404308, -0.92076261, -1.04154944],
       [-1.14185152,  0.5040552 , -1.50468724,  0.90726993,  0.76583594,
         1.4097456 ,  5.4849091 , -0.0204964 ]])

In [7]:
training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

x_training, x_test = x_data[training_idx,:], x_data[test_idx,:]
y_training, y_test = y_data[training_idx,:], y_data[test_idx,:]

In [8]:
import tensorflow as tf

X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)

W = tf.Variable(tf.random_uniform([len(x_data[0]), 1], -1.0, -1.0))

In [9]:
h = tf.matmul(X, W)
hypothesis = tf.div(1., 1. + tf.exp(-h))
cost = -1 * tf.reduce_mean(Y * tf.log(hypothesis) + (1-Y) * tf.log(1-hypothesis))

a = tf.Variable(0.1)
optimizer = tf.train.GradientDescentOptimizer(a)
train = optimizer.minimize(cost)

init = tf.initialize_all_variables()

In [10]:
sess = tf.Session()
sess.run(init)


cost_history = []

for i in range(1000):
    sess.run(train, feed_dict={X: x_training, Y:y_training})
    if i % 50 == 0:
        print (i, sess.run(cost, feed_dict={X: x_training, Y:y_training}))
        cost_history.append(sess.run(cost, feed_dict={X: x_training, Y:y_training})) 
final_weight = sess.run(W)         
sess.close()

0 2.37277
50 0.610519
100 0.535753
150 0.529018
200 0.527207
250 0.526543
300 0.526263
350 0.526135
400 0.526074
450 0.526044
500 0.526029
550 0.526021
600 0.526017
650 0.526015
700 0.526013
750 0.526013
800 0.526012
850 0.526012
900 0.526012
950 0.526012


In [11]:
def sigmoid(z):
    return 1 / (1 + np.exp(z))

def hypothesis_function(x, theta):
    z = (np.dot(-x,theta))    
    return sigmoid(z)

sum((hypothesis_function(x_test, final_weight) > 0.5) == y_test) / y_test.shape[0]

array([ 0.75816993])

In [12]:
from sklearn import linear_model, datasets

logreg = linear_model.LogisticRegression(fit_intercept=False)
logreg.fit(x_training, y_training.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
sum(logreg.predict(x_test).reshape(-1,1) == y_test)  / y_test.shape[0]

array([ 0.75816993])

In [14]:
prediction_result = []
for _ in range(100):
    training_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.8))
    test_idx = np.random.randint(y_data.shape[0], size=int(y_data.shape[0] * 0.2))

    x_training, x_test = x_data[training_idx,:], x_data[test_idx,:]
    y_training, y_test = y_data[training_idx,:], y_data[test_idx,:]
    
    logreg.fit(x_training, y_training.ravel())
    prediction_result.append(sum(logreg.predict(x_test) == y_test.ravel())  / y_test.shape[0])
    
np.mean(prediction_result)

0.73150326797385601

In [15]:
max, min = np.mean(prediction_result) + 1.96 * (np.std(prediction_result) /10), \
            np.mean(prediction_result) - 1.96 * (np.std(prediction_result) /10)
min, max

(0.72417846389314533, 0.73882807205456669)