In [1]:
import numpy as np
import pandas as pd
import os
import requests 
from multiprocessing.pool import ThreadPool
import time
import random
from shutil import copyfile
import matplotlib.pyplot as plt
import tensorflow as tf
import keras_preprocessing
from keras_preprocessing import image
from keras_preprocessing.image import ImageDataGenerator


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

import warnings; warnings.simplefilter('ignore')
np.set_printoptions(suppress=True)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.CSRRD7HKRKC3T3YXA7VY7TAZGLSWDKW6.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
#path for FIFI cleaned and preprocessed data frame
path_FIFI = 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/generated_datasets/data_final.pkl'
    
#read in pkl file containing 
data = pd.read_pickle(path_FIFI)
data = pd.DataFrame(data = data)
data.reset_index(inplace=True, drop=True)

print('Original data shape:\n', data.shape, '\n')

#clean data, combine 'sign/signal' and 'streetlight' 
data = data[data['FIFI_category'] != 'Community Walk']
data['FIFI_category'] = data['FIFI_category'].replace({'Needles/Dumping': 'Needles_Dumping', 'Sign/Signal': 'Sign_Signal_Light', 'Streetlight': 'Sign_Signal_Light'})

#we are only interested in FIFI requests that have a photo included with the request
print(data.shape)
data_photo = data[~data['Photo'].isna()]
print(data_photo.shape)

print(data_photo['FIFI_category'].value_counts())

Original data shape:
 (248815, 67) 

(248661, 67)
(183278, 67)
Needles_Dumping      40265
Other                34428
Grafitti             31749
Abnd_Vehicle         26656
Parking              25140
Pothole              13154
Sign_Signal_Light    11398
Clogged_Drain        427  
Vegetation           44   
Dead Animal          17   
Name: FIFI_category, dtype: int64


In [3]:
#create a directory for each category
save_path = 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/photos/'

cats_to_dl = ['Abnd_Vehicle', 'Grafitti', 'Needles_Dumping', 'Parking',
       'Pothole', 'Sign_Signal_Light']

for cat in cats_to_dl:
    if os.path.exists(os.path.join(save_path, cat)) == True:
        print(cat, " already exists")
    else:
        os.mkdir(os.path.join(save_path, cat))
        print(cat, " directory created")

Abnd_Vehicle  already exists
Grafitti  already exists
Needles_Dumping  already exists
Parking  already exists
Pothole  already exists
Sign_Signal_Light  already exists


In [4]:
#create function to get photo from url and save
def fetch_and_save(image_url_tupple):
    image_name, image_url = image_url_tupple
    #create file name for photo
    image_name = os.path.join(save_path_cat, image_name)  
    #get photo from url and save
    r = requests.get(image_url, allow_redirects=True)
    open(image_name, 'wb').write(r.content)

#do each category individually
for cat in cats_to_dl:
    print(cat)
    
    if len(os.listdir(os.path.join(save_path, cat))) > 0:
        print(cat, " already downloaded")
    else:
        cat_data = data_photo[data_photo['FIFI_category'] == cat].reset_index() 
        save_path_cat = os.path.join(save_path, cat)
        
        i = 0
        while i < cat_data.shape[0]:
            if i%50 == 0:
                print(i)
            #run on multiple threads for faster performance, save each photo with 'service_request_number' for later indexing
            urls = list(zip([cat+'_'+str(x)+'.jpeg' for x in cat_data['Service_Request_Number'].iloc[i:i+3]], cat_data['Photo'][i:i+3]))
            ThreadPool(3).imap_unordered(fetch_and_save, urls)
            i += 3

Abnd_Vehicle
Abnd_Vehicle  already downloaded
Grafitti
Grafitti  already downloaded
Needles_Dumping
Needles_Dumping  already downloaded
Parking
Parking  already downloaded
Pothole
Pothole  already downloaded
Sign_Signal_Light
Sign_Signal_Light  already downloaded


In [5]:
#check number of photos downloaded for each FIFI category compared to those listed in dataframe
for cat in cats_to_dl:
    print(cat)
    print('count in dataframe: ', data_photo[data_photo['FIFI_category'] == cat].shape[0])
    print('count in directory: ', len(os.listdir(os.path.join(save_path, cat))))
    print('\n')

Abnd_Vehicle
count in dataframe:  26656
count in directory:  26646


Grafitti
count in dataframe:  31749
count in directory:  31736


Needles_Dumping
count in dataframe:  40265
count in directory:  33146


Parking
count in dataframe:  25140
count in directory:  19162


Pothole
count in dataframe:  13154
count in directory:  7273


Sign_Signal_Light
count in dataframe:  11398
count in directory:  11398




In [6]:
#create directories for training and testing 

try:
    os.mkdir(os.path.join(save_path, 'cnn_cat_other'))
except OSError:
    print("directory already created")
try:
    os.mkdir(os.path.join(save_path, 'cnn_cat_other/training'))
except OSError:
    print("directory already created")
try:
    os.mkdir(os.path.join(save_path, 'cnn_cat_other/testing'))
except OSError:
    print("directory already created")


directory already created


In [7]:
#create directories for train/test for each FIFI category 
photo_categories = ['Abnd_Vehicle', 'Grafitti', 'Needles_Dumping', 'Parking',
       'Pothole', 'Sign_Signal_Light']

train_dir = os.path.join(save_path, 'cnn_cat_other/training')
test_dir = os.path.join(save_path, 'cnn_cat_other/testing')

for cat in photo_categories:
    try:
        os.mkdir(os.path.join(train_dir, cat))
    except OSError:
        print('cannot make directory for train: ', cat)
    try:
        os.mkdir(os.path.join(test_dir, cat))
    except OSError:
        print('cannot make directory for test: ', cat)

In [8]:
#function to check file size (throw out empty files) and split data into training and testing based on a specified total number of data entries and split size
def split_data(source_dir, training_dir, testing_dir, size, split_size, seed):
    
    #check for non zero file size
    print('creating file list')
    files = [filename if os.path.getsize(os.path.join(source_dir, filename)) > 0 else print(filename + " is zero length, so ignoring.") for filename in os.listdir(source_dir)]
  
    #create entries of specified size and seed
    random.seed(seed)
    files_final = random.sample(files, size)
    #split into train and test based on specified split size
    training_length = int(len(files_final) * split_size)
    testing_length = int(len(files_final) - training_length)
    shuffled_set = random.sample(files_final, len(files_final))
    training_set = shuffled_set[0:training_length]
    testing_set = shuffled_set[-testing_length:]

    print('copying into train')
    for filename in training_set:
        this_file = os.path.join(source_dir, filename)
        destination = os.path.join(training_dir, filename)
        copyfile(this_file, destination)

    print('copying into test')
    for filename in testing_set:
        this_file = os.path.join(source_dir, filename)
        destination = os.path.join(testing_dir, filename)
        copyfile(this_file, destination)

In [9]:
#separate each FIFI category into train and test using split data function
for cat in cats_to_dl:
    print(cat)
    source_dir = os.path.join(save_path, cat)
    train_dir_cat = os.path.join(train_dir, cat)
    test_dir_cat = os.path.join(test_dir, cat)

    size = 1000
    split_size = .7
    seed = 39
    
    start_time = time.time()
    split_data(source_dir, train_dir_cat, test_dir_cat, size, split_size, seed)
    print("--- %s seconds ---" % (time.time() - start_time))


Abnd_Vehicle
creating file list
copying into train
copying into test
--- 11.690559148788452 seconds ---
Grafitti
creating file list
copying into train
copying into test
--- 12.393068790435791 seconds ---
Needles_Dumping
creating file list
copying into train
copying into test
--- 16.199681282043457 seconds ---
Parking
creating file list
copying into train
copying into test
--- 13.127983331680298 seconds ---
Pothole
creating file list
copying into train
copying into test
--- 8.707978963851929 seconds ---
Sign_Signal_Light
creating file list
copying into train
copying into test
--- 9.50690770149231 seconds ---


In [11]:
#download weights and import model to use for transfer learning

model_weights = 'https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'
image_name = 'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/photos/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'
if os.path.isfile(image_name) == False:
    r = requests.get(model_weights, allow_redirects=True)
    open(image_name, 'wb').write(r.content)
else:
    print('model weights already downloaded')
    
from tensorflow.keras.applications.inception_v3 import InceptionV3

local_weights_file = image_name

pre_trained_model = InceptionV3(input_shape = (150, 150, 3), 
                                include_top = False, 
                                weights = None)

pre_trained_model.load_weights(local_weights_file)

for layer in pre_trained_model.layers:
    layer.trainable = False

pre_trained_model.summary()

model weights already downloaded
Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150, 150, 3)  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 74, 74, 32)   864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_v1 (BatchNo (None, 74, 74, 32)   96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 74, 74, 32)   0           batch_normalization_v1[0][0]     
____

In [None]:
last_layer = pre_trained_model.get_layer('mixed7')
print('last layer output shape: ', last_layer.output_shape)
last_output = last_layer.output

# Flatten the output layer to 1 dimension
x = layers.Flatten()(last_output)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.3)(x)  
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.3)(x)                  
x = layers.Dense  (6, activation='softmax')(x)           

model = Model(pre_trained_model.input, x) 

model.compile(optimizer = RMSprop(lr=0.0001), 
              loss = 'binary_crossentropy', 
              metrics = ['acc'])


In [15]:
#create model using last output 
last_layer_output = pre_trained_model.get_layer(-1).output
last_layer_output = keras.layers.Flatten()(last_layer_output)
pre_trained_model = Model(pre_trained_model.input, output)

input_shape = pre_trained_model.output_shape[1]

model = Sequential()
model.add(InputLayer(input_shape=(input_shape,)))
model.add(Dense(512, activation='relu', input_dim=input_shape))
model.add(Dropout(0.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(6, activation='softmax'))

model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-4),
              metrics=['acc'])

model.summary()

TypeError: must be str, not int

In [None]:
#train CNN using images from each category besides 'other'

TRAINING_DIR = train_dir
VALIDATION_DIR = test_dir

# Add our data-augmentation parameters to ImageDataGenerator
train_datagen = ImageDataGenerator(rescale = 1./255.)
#(rescale=1./255, zoom_range=0.3, rotation_range=50, width_shift_range=0.2, height_shift_range=0.2, shear_range=0.2, horizontal_flip=True, fill_mode='nearest')

test_datagen = ImageDataGenerator(rescale = 1./255.)

# Flow training images in batches of 20 using train_datagen generator
train_generator = train_datagen.flow_from_directory(TRAINING_DIR,
                                                    class_mode = 'categorical', 
                                                    target_size = (150, 150))     

# Flow validation images in batches of 20 using test_datagen generator
validation_generator =  test_datagen.flow_from_directory(VALIDATION_DIR,
                                                          class_mode  = 'categorical', 
                                                          target_size = (150, 150))

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('acc')>.998):
      print("\nReached 99.8% accuracy so cancelling training!")
      self.model.stop_training = True

callbacks = myCallback()

history = model.fit_generator(
            train_generator,
            validation_data = validation_generator,
            epochs = 20,
            callbacks=[callbacks],
            verbose = 1)

In [None]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend(loc=0)
plt.figure()


plt.show()

In [None]:
model.save('cats_dogs_tlearn_basic_cnn.h5')

In [None]:
import numpy as np
from google.colab import files
from keras.preprocessing import image

uploaded = files.upload()

for fn in uploaded.keys():
 
  # predicting images
  path = '/content/' + fn
  img = image.load_img(path, target_size=(300, 300))
  x = image.img_to_array(img)
  x = np.expand_dims(x, axis=0)

  images = np.vstack([x])
  classes = model.predict(images, batch_size=10)
  print(classes[0])
  if classes[0]>0.5:
    print(fn + " is a human")
  else:
    print(fn + " is a horse")
 

In [None]:
# scaling test features
x_test /= 255.

# getting model predictions
test_predictions = model.predict(x_test)
predictions = pd.DataFrame(test_predictions, columns=labels_ohe_names.columns)
predictions = list(predictions.idxmax(axis=1))
test_labels = list(y_test)

# evaluate model performance
import model_evaluation_utils as meu
meu.get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)