In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import cv2

import random
import math
import networkx as nx

import boto3
from PIL import Image

import requests
import json

from tqdm import tqdm
# Empty tqdm's `_instances` registry when that attribute exists; when it does
# not, the fallback is a throw-away dict and this line does nothing.
# NOTE(review): presumably a workaround for stale progress bars left behind by
# an interrupted run — confirm it is still needed.
getattr(tqdm, '_instances', {}).clear()

%matplotlib inline

plt.style.use('ggplot')

pd.set_option('display.max_columns', 50)

In [2]:
import pickle # save images
import time # get time stamp of models trained

## Import text files (image names, labels)

In [3]:
import matplotlib.image as mpimg # show images
from io import BytesIO # reading bytes

#### Create dataframe with images.txt

_Contains name of images and file path_  
- Split into file path, image name, and folder number

In [4]:
bucket = 'cwbirdsimages'

In [5]:
s3 = boto3.client('s3')
img_txt = s3.get_object(Bucket=bucket, Key='images.txt')

img_names = BytesIO(img_txt['Body'].read())

In [6]:
img_data = pd.read_csv(img_names, header=None, low_memory=False, na_values='n/a')

In [7]:
img_data['file_path'] = img_data[0].apply(lambda x: x.split()[1])
img_data['img_name'] = img_data[0].apply(lambda x: x.split()[0])

img_data['class_id'] = img_data['file_path'].apply(lambda x: x.split('/')[0])

In [8]:
img_data.drop(0, axis=1, inplace=True)

In [9]:
img_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48562 entries, 0 to 48561
Data columns (total 3 columns):
file_path    48562 non-null object
img_name     48562 non-null object
class_id     48562 non-null object
dtypes: object(3)
memory usage: 1.1+ MB


In [10]:
# 'class_id' should be int
img_data['class_id'] = img_data['class_id'].apply(lambda x: int(x))

#### Create dataframe of image_class_labels.txt

_Contains name of image file and corresponding folder number_  
- Split into image name, and folder number

In [11]:
s3 = boto3.client('s3')
img_class = s3.get_object(Bucket=bucket, Key='image_class_labels.txt')

img_class_labels = BytesIO(img_class['Body'].read())

In [12]:
labels_df = pd.read_csv(img_class_labels, header=None, low_memory=False, na_values='n/a')

In [13]:
labels_df['img_name'] = labels_df[0].apply(lambda x: x.split()[0])
labels_df['class_id'] = labels_df[0].apply(lambda x: x.split()[1])

In [14]:
labels_df.drop(0, axis=1, inplace=True)

In [15]:
# 'class_id' should be int
labels_df['class_id'] = labels_df['class_id'].apply(lambda x: int(x))

In [16]:
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48562 entries, 0 to 48561
Data columns (total 2 columns):
img_name    48562 non-null object
class_id    48562 non-null int64
dtypes: int64(1), object(1)
memory usage: 758.9+ KB


#### Create dataframe with hierarchy.txt

_Contains folder number and class number_  
- Split into folder number and class number

In [None]:
# s3 = boto3.client('s3')
# hierarchy_txt = s3.get_object(Bucket=bucket, Key='hierarchy.txt')

# hierarchy = BytesIO(hierarchy_txt['Body'].read())

# hier_df = pd.read_csv(hierarchy, header=None, low_memory=False, na_values='n/a')

# hier_df.head()

In [None]:
# hier_df['folder_num'] = hier_df[0].apply(lambda x: x.split()[0])
# hier_df['class_id'] = hier_df[0].apply(lambda x: x.split()[1])

# hier_df.drop(0, axis=1, inplace=True)

# hier_df

#### Create dataframe with classes.txt

_Contains class number and class labels_  
- Split into class number and class labels

In [17]:
s3 = boto3.client('s3')
classes_txt = s3.get_object(Bucket=bucket, Key='classes.txt')

classes = BytesIO(classes_txt['Body'].read())

In [18]:
classes_df = pd.read_csv(classes, sep='\t', header=None, low_memory=False, na_values='n/a')

In [19]:
classes_df.head()

Unnamed: 0,0
0,0 Birds
1,"1 Ducks, Geese, and Swans"
2,"2 Grouse, Quail, and Allies"
3,3 Loons
4,4 Grebes


In [20]:
classes_df['class_id'] = classes_df[0].apply(lambda x: x.split(' ', 1)[0])
classes_df['txt_labels'] = classes_df[0].apply(lambda x: x.split(' ', 1)[1])

In [21]:
classes_df.drop(0, axis=1, inplace=True)

In [22]:
# 'class_id' should be int
classes_df['class_id'] = classes_df['class_id'].apply(lambda x: int(x))

## Merge all dataframes

The folder numbers correspond to the class ids, so we  
merge the **img_data** dataframe (containing file path, image name, and class id) with the **classes_df** dataframe (class id and txt_labels).

In [23]:
master_df = img_data.merge(classes_df, on='class_id')

In [24]:
master_df[master_df['class_id']==565]

Unnamed: 0,file_path,img_name,class_id,txt_labels


In [25]:
master_df.shape

(48562, 4)

In [26]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48562 entries, 0 to 48561
Data columns (total 4 columns):
file_path     48562 non-null object
img_name      48562 non-null object
class_id      48562 non-null int64
txt_labels    48562 non-null object
dtypes: int64(1), object(3)
memory usage: 1.9+ MB


In [27]:
pd.set_option('display.max_rows', 100)

master_df.head(5)

Unnamed: 0,file_path,img_name,class_id,txt_labels
0,0817/0000139e21dc4d0cbfe14cae3c85c829.jpg,0000139e-21dc-4d0c-bfe1-4cae3c85c829,817,Oak Titmouse
1,0817/01a472d8e93047a080aae4f958a2ef47.jpg,01a472d8-e930-47a0-80aa-e4f958a2ef47,817,Oak Titmouse
2,0817/036fba7c96374635853511ead2c1c728.jpg,036fba7c-9637-4635-8535-11ead2c1c728,817,Oak Titmouse
3,0817/07814887f59b44cb9b7f399999634fba.jpg,07814887-f59b-44cb-9b7f-399999634fba,817,Oak Titmouse
4,0817/0822865741de43128a6a6c8897387975.jpg,08228657-41de-4312-8a6a-6c8897387975,817,Oak Titmouse


In [None]:
# master_df.to_csv('data/master_df.csv')

In [28]:
master_df['txt_labels'].nunique()

555

In [29]:
master_df['class_id'].nunique()

555

## Import images

In [30]:
len(master_df['file_path'])

48562

In [69]:
master_df['file_path'][47049:47050]

47049    0982/ff17652d470c4730b57a5133d6b7d0cc.jpg
Name: file_path, dtype: object

In [115]:
# grab and resize image from and to s3 bucket

img_dir = 'images' # folder containing all other folders of images
paths = master_df['file_path']

def resize_images_array(img_dir, file_paths, size=(299, 299), return_paths=False):
    """Download images from the S3 bucket and stack them as resized RGB arrays.

    Parameters
    ----------
    img_dir : str
        Key prefix (folder) inside the module-level ``bucket`` that holds the images.
    file_paths : iterable of str
        Object keys relative to ``img_dir``. Iteration order is preserved so the
        result lines up with the dataframe the paths came from.
    size : tuple of int, optional
        Target ``(width, height)`` passed to ``PIL.Image.resize``. Defaults to
        299x299.
    return_paths : bool, optional
        When True, also return the list of paths that loaded successfully so the
        caller can select exactly the matching label rows.

    Returns
    -------
    numpy.ndarray
        The stacked image arrays, one entry per successfully loaded image.
    (numpy.ndarray, list of str)
        Only when ``return_paths`` is True: the array plus the loaded paths.

    Notes
    -----
    Images that cannot be fetched or decoded are skipped. The returned array is
    then shorter than ``file_paths``, so a summary of the failures is printed to
    make the resulting image/label misalignment visible.
    """
    img_arrays = []
    loaded_paths = []
    failures = []

    # One client for the whole run instead of rebuilding it for every image.
    s3 = boto3.client('s3')

    # Loop in the order of the dataframe so that images stay aligned with labels.
    for path in tqdm(file_paths):
        try:
            obj = s3.get_object(Bucket=bucket, Key=f'{img_dir}/{path}')
            with Image.open(BytesIO(obj['Body'].read())) as open_img:
                # Force three channels: grayscale, palette or RGBA files would
                # otherwise yield arrays of a different shape and break stacking.
                arr = np.array(open_img.convert('RGB').resize(size))
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C still stops a long run.
            failures.append((path, repr(err)))
            continue
        img_arrays.append(arr)
        loaded_paths.append(path)

    if failures:
        total = len(failures) + len(loaded_paths)
        print(f'{len(failures)} of {total} images failed to load and were skipped; '
              f'first failures: {failures[:5]}')

    images = np.array(img_arrays)
    if return_paths:
        return images, loaded_paths
    return images

In [None]:
# s3 = boto3.client('s3')
# obj = s3.get_object(Bucket=bucket, Key=f'images/0817/0000139e21dc4d0cbfe14cae3c85c829.jpg')
# img_bytes = BytesIO(obj['Body'].read())
# open_img = Image.open(img_bytes)
# arr = np.array(open_img.resize((200,200))) # resize to 200,200. possible to play around with better or worse resolution
# s3.upload_file(arr, 'cwbirdsimages', 'resized_images/0817/0000139e21dc4d0cbfe14cae3c85c829.jpg')

##### final data grab amount

In [None]:
X = resize_images_array(img_dir, master_df['file_path'][:47001])

 86%|████████▌ | 40459/47001 [1:18:45<13:16,  8.22it/s]  

##### small sample len and shape

In [None]:
# sm_samp = resize_images_array(img_dir, master_df['file_path'][:3098])

In [None]:
print('length of sample: ', len(X))
X.shape

#### Show the 3 channels of colors

In [None]:
single_img = master_df['file_path'][985]
single_img

In [None]:
obj = s3.get_object(Bucket=bucket, Key=f'images/0776/16398b734cf540e3b0bcc943621e3515.jpg')
img_bytes = BytesIO(obj['Body'].read())
open_img = Image.open(img_bytes)

In [None]:
# By stacking these together into a 3-tensor, we can represent a color image as a single object.

fig, axes = plt.subplots(1, 4, figsize=(16,6))

axes[0].imshow(open_img)
axes[0].set_title('original')
for ax, channel, name in zip(axes[1:], open_img.split(), ['red channel', 'green channel', 'blue channel']):
    ax.imshow(channel)
    ax.set_title(name)

### Create labels and features arrays and normalize features arrays

##### small sample normalize

In [None]:
# Normalize the RGB values from [0, 255] to [0, 1].
# - Dividing a uint8 array by a Python float yields float64; asking the ufunc
#   for float32 directly halves the memory of the result and avoids an extra
#   full-size intermediate copy.
# - The guard makes the cell idempotent: re-running it does not divide data
#   that has already been scaled.
if X.max() > 1.0:
    X = np.divide(X, 255.0, dtype=np.float32)

In [None]:
# test = X.copy()

# from tensorflow.keras.utils import normalize

# backup = pd.HDFStore('backup.h5')

# backup['X'] = pd.Series(X)

# DONT FORGET TO CHANGE RANGE HERE TOO

In [None]:
# grab numeric label
# VALUES MUST BE NP.ARRAYS

label = np.array(master_df['class_id'][:47001].values)

##### y labels need to be one hot encoded

In [None]:
# One-hot encode via broadcasting: `label` as a column vector is compared with
# the row of class ids returned by `.unique()`, producing one column per class
# id with 1.0 where the row's label equals that column's id and 0.0 elsewhere.
# NOTE(review): the slice bound 47001 has to match the one used for `label` and
# for the images loaded into `X`.
y = (label.reshape(-1,1) == master_df['class_id'][:47001].unique()).astype(float)

In [None]:
print('label shape: ', y.shape)
print('features shape: ', X.shape)

In [None]:
master_df['class_id'][:47001].unique()

In [None]:
y[115]

## Machine Learning Libraries Import

In [None]:
# from sklearn.ensemble import RandomForestClassifier

## Random Forest Classifier

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# sample = np.array([x.flatten() for x in xs_samp])
# sample.shape

In [None]:
# clf = RandomForestClassifier(n_jobs=-1)

# X_train, X_test, y_train, y_test = train_test_split(sample, y, test_size=0.2, random_state=42)

# print("Train model")
# clf.fit(X_train, y_train)

# print("Predictions")
# predicted = clf.predict(X_test)

# # there are too many labels

# print("Accuracy: ", accuracy_score(y_test, predicted))

## Train Test Split

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, GlobalAveragePooling2D
from tensorflow.keras.layers import Conv2D, MaxPool2D, BatchNormalization # CNN
from tensorflow.keras.models import Model

from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.applications import Xception
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, RMSprop

from tensorflow.keras.callbacks import TensorBoard # graphical visual of loss and accuracy over the epochs of train and test set
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import datetime

# `tf` itself was never imported (only names from tensorflow.keras), so the
# version check below raised NameError on a fresh kernel.
import tensorflow as tf

tf.__version__

In [None]:
# check to make sure the bird images and labels are aligned
# this is indeed a semipalmated sandpiper

print(master_df.iloc[57, :]['txt_labels'])
plt.imshow(X[57]);

1. X, and y defined
- make sure they are arrays!!

2. normalize X values by dividing by 255
3. check images
4. train test split
5. make model Sequential()
6. add input layer
7. add multiple hidden layers
8. ADD FLATTEN LAYER, MUST BE BEFORE OUTPUT
9. add dense layer, which are fully connected layers
10. add output dense layer, will be the amount of labels there are
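11. model.compile(loss = 'sparse_categorical_crossentropy', optimizer= 'adam', metrics=['accuracy']) — the sparse loss is for integer labels; with the one-hot encoded `y` used in this notebook, use loss = 'categorical_crossentropy' instead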
12. model.fit(xtrain, ytrain, epochs) also has validation_split (out of sample) do about 0.1, batchsize: how many at a time, more data requires bigger (20-200 range)
13. model.evaluate(xtest,ytest) returns val loss and val accuracy  

14. model.save('name') saves the model
- to load: new_model = tf.keras.models.load_model('name')

In [None]:
# Fixed seed so the train/test partition (and every metric computed on it) is
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
plt.imshow(X_train[55]);

In [None]:
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)
print('y_train shape: ', y_train.shape)
print('y_test shape: ', y_test.shape)

## TRANSFER LEARNING MODEL

In [None]:
log_xcept = os.path.join("logs/large_xception", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_xcept, histogram_freq=1)

In [None]:
# Instantiate the complete Xception network with ImageNet weights, including
# its original classification head (include_top=True).
# NOTE(review): no later visible cell reads this `model` before the name is
# rebound by the baseline network — confirm whether this load is still needed.
input_size = (299,299,3)
model = Xception(weights='imagenet',
                          include_top=True,
                          input_shape=input_size)

In [None]:
def print_model_properties(model, indices=0):
    """Print the index, name and trainable flag of a model's layers.

    Parameters
    ----------
    model : object with a ``layers`` sequence
        Each layer must expose ``name`` and ``trainable`` attributes.
    indices : int, optional
        Position of the first layer to report; earlier layers are skipped.
        The printed layer number is the position in ``model.layers``.
    """
    for i, layer in enumerate(model.layers[indices:]):
        print(f"Layer {i+indices} | Name: {layer.name} | Trainable: {layer.trainable}")

# print_model_properties(model)

In [None]:
def create_transfer_model(input_size, n_categories, weights = 'imagenet'):
    """Build an Xception-based classifier with a fresh softmax head.

    The Xception backbone is created without its original top layers; its
    output is global-average-pooled and fed into a dense softmax layer with
    ``n_categories`` units.

    Parameters
    ----------
    input_size : tuple
        Input shape handed to Xception as ``input_shape``.
    n_categories : int
        Number of units in the softmax output layer.
    weights : optional
        Forwarded to Xception's ``weights`` argument (default ``'imagenet'``).

    Returns
    -------
    Model
        Model mapping the backbone's input to the new softmax predictions.
    """
    # Backbone only — the "top" (original classifier) is left out.
    backbone = Xception(include_top=False,
                        weights=weights,
                        input_shape=input_size)

    pooled_features = GlobalAveragePooling2D()(backbone.output)
    class_probabilities = Dense(n_categories, activation='softmax')(pooled_features)

    return Model(inputs=backbone.input, outputs=class_probabilities)

In [None]:
transfer_model = create_transfer_model((299,299,3),555)

In [None]:
def change_trainable_layers(model, trainable_index):
    """Freeze the layers before ``trainable_index`` and unfreeze the rest.

    ``model.layers`` is split with ordinary slicing at ``trainable_index``
    (so negative or out-of-range values behave as slices do); every layer in
    the first part gets ``trainable = False``, every layer in the second part
    gets ``trainable = True``. Nothing is returned.
    """
    layer_groups = (
        (False, model.layers[:trainable_index]),  # frozen part
        (True, model.layers[trainable_index:]),   # trainable part
    )
    for trainable_flag, group in layer_groups:
        for current_layer in group:
            current_layer.trainable = trainable_flag

In [None]:
_ = change_trainable_layers(transfer_model, 132)

In [None]:
print_model_properties(transfer_model, 130)

In [None]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# TF 2.x optimizers and rejected by newer Keras releases.
transfer_model.compile(optimizer=RMSprop(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit on the training split only: fitting on the full `X` would let the model
# see `X_test`, which later cells use for predictions and the confusion matrix.
# The TensorBoard callback created for this run is passed so its logs are written.
xception_final = transfer_model.fit(X_train, y_train, batch_size=1000, epochs=5, validation_split=0.1,
                                    callbacks=[tensorboard_callback])

In [None]:
# Make sure the output folder exists so a long training run is not lost to a
# missing-directory error at save time.
os.makedirs('saved_models', exist_ok=True)
transfer_model.save('saved_models/large_xception.h5')
# load_L_xception = tf.keras.models.load_model('saved_models/large_xception.h5')

In [None]:
# transfer_test = transfer_model.fit(X_train, y_train, batch_size = 32, epochs=5, validation_split=0.1)

In [None]:
# print('Transfer Model1: Loss and Accuracy')
# evaluate = transfer_model.evaluate(X_test, y_test)

In [None]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

pred1 = transfer_model.predict(X_test)

# Position of the hot entry in each one-hot row = index of the true class.
y_true = y_test.argmax(axis=1)

# Take the most probable class directly. Thresholding at 0.5 first turned every
# row without a probability above 0.5 into all zeros, which argmax then
# reported as class index 0.
y_predicted = pred1.argmax(axis=1)

# Pin the label set so the matrix is always n_classes x n_classes, even when
# some classes never occur in the test split or in the predictions.
class_positions = np.arange(y_test.shape[1])
mat = confusion_matrix(y_true, y_predicted, labels=class_positions)

# Class id behind each one-hot column: ids in order of first appearance among
# the rows used to build `y` (the previously referenced `folders` was undefined).
class_names = master_df['class_id'].iloc[:y.shape[0]].unique()

plot_confusion_matrix(conf_mat=mat, figsize=(8,8), class_names=class_names);

## Neural Network Model 0

In [None]:
# Baseline network: flatten each image, one hidden dense layer, softmax output.
model = Sequential([
    Flatten(input_shape=X_train[0].shape),
    Dense(128, activation='relu'),
    # Output width has to equal the number of one-hot columns, y_train.shape[1].
    Dense(y_train.shape[1], activation='softmax'),
])

In [None]:
print('Model 0: Baseline Model NN')
print(f'Number of Training Images: {X_train.shape[0]}/{X_train.shape[0] + X_test.shape[0]}')
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy', optimizer= 'adam', metrics=['accuracy'])

In [None]:
print('Model 0: Baseline Model NN')
print(f'Number of Training Images: {X_train.shape[0]}/{X_train.shape[0] + X_test.shape[0]}')
history = model.fit(X_train, y_train, epochs=5, validation_split=0.1)

In [None]:
test_loss, test_acc = model.evaluate(X_test, y_test)

In [None]:
pred = model.predict(X_test)

In [None]:
print('Weak Model Prediction Check: ')
# `pred[0]` is the prediction for X_test[0], so the matching truth is y_test[0]
# (not y_train[0]). One-hot position i stands for the i-th class id in order of
# first appearance among the rows used to build `y`; translate positions back to
# class ids instead of relying on hard-coded row numbers.
encoded_class_ids = master_df['class_id'].iloc[:y.shape[0]].unique()
true_position = y_test[0].argmax()
predicted_position = pred[0].argmax()
true_class_id = encoded_class_ids[true_position]
predicted_class_id = encoded_class_ids[predicted_position]
print('True label of bird: ', true_position,
      classes_df.loc[classes_df['class_id'] == true_class_id, ['class_id', 'txt_labels']].values)
print('Predicted label of bird: ', predicted_position,
      classes_df.loc[classes_df['class_id'] == predicted_class_id, ['class_id', 'txt_labels']].values)

In [None]:
pred[0].argmax()

In [None]:
master_df.iloc[:20047, :]['class_id'].unique()[201]

## CNN: Convolutional Neural Network Model 1

In [None]:
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)
print('y_train shape: ', y_train.shape)
print('y_test shape: ', y_test.shape)

In [None]:
# Create model: three convolution stages followed by two dense stages and a
# softmax output, declared as one layer list.
model1 = Sequential([
    # Convolution stage 1 — 32 filters scanning with a (3,3) grid
    Conv2D(32, (3,3), activation='relu', input_shape=X_train[0].shape),
    BatchNormalization(),
    MaxPool2D(2,2),  # pools the convolution output over a (2,2) grid
    Dropout(0.3),

    # Convolution stage 2 — 64 filters
    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPool2D(2,2),
    Dropout(0.3),

    # Convolution stage 3 — 128 filters
    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPool2D(2,2),
    Dropout(0.4),

    # Feature maps must be flattened before entering the Dense layers
    Flatten(),

    # Dense stage 1
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    # Dense stage 2
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    # Output layer: one unit per one-hot column, i.e. y_train.shape[1]
    Dense(y_train.shape[1], activation='softmax'),
])

In [None]:
print('Model 1: CNN')
print(f'Number of Training Images: {X_train.shape[0]}/{X_train.shape[0] + X_test.shape[0]}')
model1.summary()

In [None]:
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
log_dir = os.path.join("logs/fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
print('Model 1: CNN')
print(f'Number of Training Images: {X_train.shape[0]}/{X_train.shape[0] + X_test.shape[0]}')
history1 = model1.fit(X_train, y_train, batch_size = 100, epochs=10, validation_split=0.1, callbacks=[tensorboard_callback])

In [None]:
history1.history

In [None]:
fig, axes = plt.subplots(1,2,figsize=(12,6))

# Derive the x-axis from the recorded history so the plot still works when the
# number of epochs changes (or training stops early).
epoch_range = range(1, len(history1.history['loss']) + 1)

# (history key, y-axis label, panel title) for each subplot; the loss panel
# previously carried the 'Accuracy' y-label by mistake.
panels = [
    ('accuracy', 'Accuracy', 'Model2 Accuracy'),
    ('loss', 'Loss', 'Model2 Loss'),
]

for ax, (metric, y_label, title) in zip(axes, panels):
    ax.plot(epoch_range, history1.history[metric])
    ax.plot(epoch_range, history1.history[f'val_{metric}'])
    ax.set_ylabel(y_label)
    ax.set_xlabel('Number of Epochs')
    ax.legend(['Train', 'Val'], loc='upper left')
    ax.set_title(title)

# savefig does not create missing folders.
os.makedirs('graphs', exist_ok=True)
plt.savefig('graphs/model2_acc_loss.png')

In [None]:
# %load_ext tensorboard

# %tensorboard --logdir='logs/'

In [None]:
pred1 = model1.predict(X_test)

In [None]:
print('CNN Model 1 Prediction Check: ')
# Map the one-hot position of the first test sample back to its class id, then
# show the matching classes_df row.
actual_class_id = master_df.iloc[:y.shape[0], :]['class_id'].unique()[y_test[0].argmax()]
actual_row = classes_df[classes_df['class_id'] == actual_class_id]
print('True label of bird: ', actual_row.values)
# Same lookup for the position the CNN considers most probable.
guessed_class_id = master_df.iloc[:y.shape[0], :]['class_id'].unique()[pred1[0].argmax()]
guessed_row = classes_df[classes_df['class_id'] == guessed_class_id]
print('Predicted label of bird: ', guessed_row.values)

In [None]:
# Make sure the output folder exists so the trained model is not lost to a
# missing-directory error at save time.
os.makedirs('saved_models', exist_ok=True)
model1.save('saved_models/conv-3-dense-2-fr32-128.h5')
# keras.models.load_model