## Dependencies

In [1]:
# data-handling dependencies
from matplotlib import pyplot
import numpy as np
import os
import pandas as pd

In [2]:
# Sklearn scaling & splitting
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [3]:
# Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist
from tensorflow.keras.preprocessing import image

## Loading and Preprocessing our Data

### Load the PlentyOfFish image data

In [4]:
# Root of the directory tree that holds the profile pictures.
strImgPath = '../ek_scrape/img/doctordata/MyCity'
lstImg = []
# os.walk also descends into sub-directories, so every file name has to be
# joined with the directory it was found in (`root`), not with the top-level
# path; otherwise images in nested folders resolve to a non-existent file.
# NOTE(review): later cells pair images with CSV labels purely by position,
# and os.walk does not guarantee a listing order — confirm the two line up.
for root, dirs, lstFile in os.walk(strImgPath):
    for strFile in lstFile:
        if strFile.endswith('.png'):
            # NOTE(review): format='jpg' is passed although the files end in
            # '.png'; kept unchanged — TODO confirm this is intended.
            img = pyplot.imread(os.path.join(root, strFile), format='jpg')
            arrImg = image.img_to_array(img)
            lstImg.append(arrImg)

In [5]:
# Stack the list of per-image arrays into one array and show its shape.
arrX = np.asarray(lstImg)
arrX.shape

(226, 110, 110, 3)

In [6]:
# Read the label table and keep the 'interested' column as the target vector.
dfY = pd.read_csv('doctordata_MyCity.csv')
arrY = np.array(dfY['interested'])
# Only the last expression of a cell is displayed, so the shape is shown once.
arrY.shape

(226,)

In [7]:
# Split images and labels together into train and test parts.
# random_state is fixed at 17; stratifying on arrY was left disabled by the author.
lstSplit = train_test_split(arrX, arrY, random_state=17)
arrTrainX, arrTestX, arrTrainY, arrTestY = lstSplit

### For the fully connected network, we flatten our data into rows of 1D image arrays

In [8]:
# Flatten every picture into a single feature row: (n, h, w, c) -> (n, h*w*c).
# The feature count is the product of every axis after the sample axis, so
# re-running this cell on already-flattened arrays leaves them unchanged
# instead of raising an IndexError on shape[2] / shape[3].
fltDimCount = int(np.prod(arrTrainX.shape[1:]))
arrTrainX = arrTrainX.reshape(arrTrainX.shape[0], fltDimCount)
arrTestX = arrTestX.reshape(arrTestX.shape[0], fltDimCount)
print("Training Shape:", arrTrainX.shape)
print("Testing Shape:", arrTestX.shape)

Training Shape: (169, 36300)
Testing Shape: (57, 36300)


## Scaling and Normalization

In [9]:
# Fit the min-max scaler on the training rows only, then apply that same
# fitted scaler to both the training and the test rows.
scaler = MinMaxScaler()
scaler.fit(arrTrainX)
arrTrainX, arrTestX = scaler.transform(arrTrainX), scaler.transform(arrTestX)

## Our first step is to create an empty sequential model

In [10]:
# Start from a Sequential model with no layers; layers are added in the cells below.
model = keras.models.Sequential()

## Create first hidden layer, 100 nodes

In [11]:
# First hidden layer: 100 units with `relu` activation.
# The input width is read from the flattened training matrix (its number of
# columns) instead of being hard-coded.
intInputDim = arrTrainX.shape[1]
layerHidden1 = Dense(100, activation='relu', input_dim=intInputDim)
model.add(layerHidden1)

Instructions for updating:
Colocations handled automatically by placer.


## Create second hidden layer, 100 nodes

In [12]:
# Second hidden layer: another 100 units with `relu` activation.
layerHidden2 = Dense(units=100, activation='relu')
model.add(layerHidden2)

## Our final output layer uses a `sigmoid` activation function for binary classification.

We also need to specify the number of output nodes. In this case a single node is enough: its output is the predicted probability that a profile is labelled "interested" (1) rather than "not interested" (0).

In [13]:
# Output layer: one `sigmoid` unit for the 0/1 'interested' label.
layerOutput = Dense(units=1, activation='sigmoid')
model.add(layerOutput)

## Compile and Train Model

In [14]:
# Binary cross-entropy loss for "interested" (1) vs "not interested" (0),
# optimized with Adam and reporting accuracy.
# The author marked loss and optimizer as values still being experimented with.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Finally, we train our model using our training data

Training consists of updating our weights using our optimizer and loss function. In this example, we choose 5 iterations (loops) of training that are called epochs.

We also choose to shuffle our training data and increase the detail printed out during each training cycle.

In [18]:
# Train on the scaled training rows for 5 epochs, shuffling the data and
# printing one summary line per epoch (verbose=2).
model.fit(
    x=arrTrainX,
    y=arrTrainY,
    epochs=5,
    verbose=2,
    shuffle=True,
)

Epoch 1/5
 - 0s - loss: 2.3843 - acc: 0.8521
Epoch 2/5
 - 0s - loss: 2.3843 - acc: 0.8521
Epoch 3/5
 - 0s - loss: 2.3843 - acc: 0.8521
Epoch 4/5
 - 0s - loss: 2.3843 - acc: 0.8521
Epoch 5/5
 - 0s - loss: 2.3843 - acc: 0.8521


<tensorflow.python.keras.callbacks.History at 0x1bcf2c66d30>

## Save model

In [16]:
# Write the trained model to an .h5 file in the working directory.
strModelFile = 'MNIST1DStyle_doctordata_MyCity.h5'
model.save(strModelFile)

## Evaluating the Model

We use our testing data to validate our model. This is how we determine the validity of our model (i.e. the ability to predict new and previously unseen data points)

In [17]:
# Score the model on the held-out test split (arrTestX / arrTestY) and
# report the resulting loss and accuracy.
lstScore = model.evaluate(arrTestX, arrTestY, verbose=2)
fltLoss, fltAccuracy = lstScore
print(f'Loss: {fltLoss}, Accuracy: {fltAccuracy}')

 - 0s - loss: 2.5450 - acc: 0.8421
Loss: 2.544962445894877, Accuracy: 0.8421052694320679
