In [133]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import cv2

import random
import math
import networkx as nx

import boto3
from PIL import Image

import requests
import json

from tqdm import tqdm
# Empty tqdm's `_instances` registry if it has one (falls back to a throwaway dict otherwise).
# NOTE(review): presumably this resets progress bars left over from earlier/interrupted runs — confirm.
getattr(tqdm, '_instances', {}).clear()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Use the 'ggplot' matplotlib style sheet for plots.
plt.style.use('ggplot')

# Let pandas display up to 50 columns.
pd.set_option('display.max_columns', 50)

In [145]:
# This cell sits above the cell that imports TensorFlow, so `tf` is not yet
# defined when the notebook is executed top to bottom (Restart & Run All).
# Import it here so the version check no longer raises NameError.
import tensorflow as tf

tf.__version__

'2.2.0'

In [2]:
# keras and tensorflow imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D # CNN
from tensorflow.keras.callbacks import TensorBoard # graphical visual of loss and accuracy over the epochs of train and test set

import pickle # save images
import time # get time stamp of models trained

## Import text files (image names, labels)

In [3]:
import matplotlib.image as mpimg # show images
from io import BytesIO # reading bytes

#### Create dataframe with images.txt

_Contains name of images and file path_  
- Split into file path, image name, and folder number

In [4]:
bucket = 'cwbirdsimages'  # S3 bucket name passed as Bucket= to the get_object calls in this notebook

In [5]:
# Download images.txt from the bucket and hold its raw bytes in an in-memory buffer.
s3 = boto3.client('s3')
img_txt = s3.get_object(Key='images.txt', Bucket=bucket)
img_names = BytesIO(initial_bytes=img_txt['Body'].read())

In [6]:
# Parse the buffered listing as a headerless table; the literal 'n/a' counts as missing.
img_data = pd.read_csv(
    img_names,
    header=None,
    na_values='n/a',
    low_memory=False,
)

In [7]:
# Column 0 holds one whitespace-separated record per row: first token is the
# image name, second token is the file path.
img_data['file_path'] = img_data[0].map(lambda record: record.split()[1])
img_data['img_name'] = img_data[0].map(lambda record: record.split()[0])

# The class id is the leading folder component of the file path.
img_data['class_id'] = img_data['file_path'].map(lambda rel_path: rel_path.split('/')[0])

In [8]:
# The raw combined column has been split into named columns; remove it in place.
img_data.drop(columns=[0], inplace=True)

In [9]:
img_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48562 entries, 0 to 48561
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   file_path  48562 non-null  object
 1   img_name   48562 non-null  object
 2   class_id   48562 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [10]:
# Folder prefixes are strings; cast every 'class_id' entry to a Python int.
img_data['class_id'] = img_data['class_id'].map(int)

#### Create dataframe of image_class_labels.txt

_Contains name of image file and corresponding folder number_  
- Split into image name, and folder number

In [11]:
# Download image_class_labels.txt from the bucket into an in-memory byte buffer.
s3 = boto3.client('s3')
img_class = s3.get_object(Key='image_class_labels.txt', Bucket=bucket)
img_class_labels = BytesIO(initial_bytes=img_class['Body'].read())

In [12]:
# Parse the buffered label listing as a headerless table; 'n/a' counts as missing.
labels_df = pd.read_csv(
    img_class_labels,
    header=None,
    na_values='n/a',
    low_memory=False,
)

In [13]:
labels_df

Unnamed: 0,0
0,0000139e-21dc-4d0c-bfe1-4cae3c85c829 817
1,0000d9fc-4e02-4c06-a0af-a55cfb16b12b 860
2,00019306-9d83-4334-b255-a447742edce3 900
3,0001afd4-99a1-4a67-b940-d419413e23b3 645
4,000332b8-997c-4540-9647-2f0a8495aecf 929
...,...
48557,fff86e8b-795f-400a-91e8-565bbb8c453a 891
48558,fff926d7-ccad-4788-839e-97af2dd99372 660
48559,fffa33ef-a765-408d-8d66-6efc7f504c71 492
48560,ffff0d87-bc84-4ef2-a47e-a4bfa48502ce 372


In [14]:
# Each row of column 0 is "<image name> <class id>"; split it into two named columns.
labels_df['img_name'] = labels_df[0].map(lambda record: record.split()[0])
labels_df['class_id'] = labels_df[0].map(lambda record: record.split()[1])

In [15]:
# Remove the raw combined column in place now that it has been split.
labels_df.drop(columns=[0], inplace=True)

In [16]:
labels_df

Unnamed: 0,img_name,class_id
0,0000139e-21dc-4d0c-bfe1-4cae3c85c829,817
1,0000d9fc-4e02-4c06-a0af-a55cfb16b12b,860
2,00019306-9d83-4334-b255-a447742edce3,900
3,0001afd4-99a1-4a67-b940-d419413e23b3,645
4,000332b8-997c-4540-9647-2f0a8495aecf,929
...,...,...
48557,fff86e8b-795f-400a-91e8-565bbb8c453a,891
48558,fff926d7-ccad-4788-839e-97af2dd99372,660
48559,fffa33ef-a765-408d-8d66-6efc7f504c71,492
48560,ffff0d87-bc84-4ef2-a47e-a4bfa48502ce,372


In [17]:
# The split produced strings; cast every 'class_id' entry to a Python int.
labels_df['class_id'] = labels_df['class_id'].map(int)

In [18]:
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48562 entries, 0 to 48561
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   img_name  48562 non-null  object
 1   class_id  48562 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 758.9+ KB


#### Create dataframe with hierarchy.txt

_Contains folder number and class number_  
- Split into folder number and class number

In [19]:
# Download hierarchy.txt from the bucket into an in-memory byte buffer.
s3 = boto3.client('s3')
hierarchy_txt = s3.get_object(Key='hierarchy.txt', Bucket=bucket)
hierarchy = BytesIO(initial_bytes=hierarchy_txt['Body'].read())

In [20]:
# Parse the buffered hierarchy listing as a headerless table; 'n/a' counts as missing.
hier_df = pd.read_csv(
    hierarchy,
    header=None,
    na_values='n/a',
    low_memory=False,
)

In [21]:
hier_df.head()

Unnamed: 0,0
0,1 0
1,2 0
2,3 0
3,4 0
4,5 0


In [22]:
# Each row of column 0 is a whitespace-separated pair; first token -> folder_num,
# second token -> class_id (both are kept as strings here).
hier_df['folder_num'] = hier_df[0].map(lambda record: record.split()[0])
hier_df['class_id'] = hier_df[0].map(lambda record: record.split()[1])

In [23]:
# Remove the raw combined column in place now that it has been split.
hier_df.drop(columns=[0], inplace=True)

In [24]:
hier_df

Unnamed: 0,folder_num,class_id
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
1005,1006,591
1006,1007,259
1007,1008,704
1008,1009,691


#### Create dataframe with classes.txt

_Contains class number and class labels_  
- Split into class number and class labels

In [25]:
# Download classes.txt from the bucket into an in-memory byte buffer.
s3 = boto3.client('s3')
classes_txt = s3.get_object(Key='classes.txt', Bucket=bucket)
classes = BytesIO(initial_bytes=classes_txt['Body'].read())

In [26]:
# Parse the class listing with a tab separator, so commas inside labels
# (e.g. "Ducks, Geese, and Swans") do not split a row; 'n/a' counts as missing.
classes_df = pd.read_csv(
    classes,
    sep='\t',
    header=None,
    na_values='n/a',
    low_memory=False,
)

In [27]:
classes_df.head()

Unnamed: 0,0
0,0 Birds
1,"1 Ducks, Geese, and Swans"
2,"2 Grouse, Quail, and Allies"
3,3 Loons
4,4 Grebes


In [28]:
# Split each row once, at the first space only: the part before it is the class id,
# the remainder (which may itself contain spaces) is the text label.
classes_df['class_id'] = classes_df[0].map(lambda record: record.split(' ', maxsplit=1)[0])
classes_df['txt_labels'] = classes_df[0].map(lambda record: record.split(' ', maxsplit=1)[1])

In [29]:
# Remove the raw combined column in place now that it has been split.
classes_df.drop(columns=[0], inplace=True)

In [30]:
# The split produced strings; cast every 'class_id' entry to a Python int.
classes_df['class_id'] = classes_df['class_id'].map(int)

## Merge all dataframes

The folder numbers correspond to the class ids, so
merge the **img_data** dataframe (containing file path, image name, and class id) with the **classes_df** dataframe (class id and txt_labels) on `class_id`.

In [31]:
# Join each image row to its text label via the shared 'class_id' column (default inner join).
master_df = pd.merge(img_data, classes_df, on='class_id')

In [32]:
master_df

Unnamed: 0,file_path,img_name,class_id,txt_labels
0,0817/0000139e21dc4d0cbfe14cae3c85c829.jpg,0000139e-21dc-4d0c-bfe1-4cae3c85c829,817,Oak Titmouse
1,0817/01a472d8e93047a080aae4f958a2ef47.jpg,01a472d8-e930-47a0-80aa-e4f958a2ef47,817,Oak Titmouse
2,0817/036fba7c96374635853511ead2c1c728.jpg,036fba7c-9637-4635-8535-11ead2c1c728,817,Oak Titmouse
3,0817/07814887f59b44cb9b7f399999634fba.jpg,07814887-f59b-44cb-9b7f-399999634fba,817,Oak Titmouse
4,0817/0822865741de43128a6a6c8897387975.jpg,08228657-41de-4312-8a6a-6c8897387975,817,Oak Titmouse
...,...,...,...,...
48557,0627/e0e4460e044e4f90a2087ee711be9d37.jpg,e0e4460e-044e-4f90-a208-7ee711be9d37,627,Harlequin Duck (Female/juvenile)
48558,0627/e363eeb0b467477c8e35be4ff9186e5d.jpg,e363eeb0-b467-477c-8e35-be4ff9186e5d,627,Harlequin Duck (Female/juvenile)
48559,0627/e546eb5307d34eaf870eccc659d314a1.jpg,e546eb53-07d3-4eaf-870e-ccc659d314a1,627,Harlequin Duck (Female/juvenile)
48560,0627/e6d7531c575642b19e41669d140a0cd0.jpg,e6d7531c-5756-42b1-9e41-669d140a0cd0,627,Harlequin Duck (Female/juvenile)


In [33]:
master_df.shape

(48562, 4)

In [34]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48562 entries, 0 to 48561
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   file_path   48562 non-null  object
 1   img_name    48562 non-null  object
 2   class_id    48562 non-null  int64 
 3   txt_labels  48562 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.9+ MB


In [47]:
master_df['file_path']

0        0817/0000139e21dc4d0cbfe14cae3c85c829.jpg
1        0817/01a472d8e93047a080aae4f958a2ef47.jpg
2        0817/036fba7c96374635853511ead2c1c728.jpg
3        0817/07814887f59b44cb9b7f399999634fba.jpg
4        0817/0822865741de43128a6a6c8897387975.jpg
                           ...                    
48557    0627/e0e4460e044e4f90a2087ee711be9d37.jpg
48558    0627/e363eeb0b467477c8e35be4ff9186e5d.jpg
48559    0627/e546eb5307d34eaf870eccc659d314a1.jpg
48560    0627/e6d7531c575642b19e41669d140a0cd0.jpg
48561    0627/f70e3f79460443aaa9d033ce06f188c2.jpg
Name: file_path, Length: 48562, dtype: object

In [35]:
# Distinct text labels that contain the substring 'Warbler'.
master_df.loc[master_df['txt_labels'].str.contains('Warbler'), 'txt_labels'].unique()

array(['Chestnut-sided Warbler (Breeding male)',
       'Chestnut-sided Warbler (Female/immature male)', 'Canada Warbler',
       'Blackpoll Warbler (Breeding male)',
       'Yellow-rumped Warbler (Winter/juvenile Myrtle)',
       'Black-throated Gray Warbler',
       'Yellow-rumped Warbler (Breeding Myrtle)',
       'Black-throated Green Warbler',
       'Blackpoll Warbler (Female/juvenile)', 'Black-and-white Warbler',
       'Hermit Warbler', 'Yellow-throated Warbler', "Wilson's Warbler",
       'Nashville Warbler', "MacGillivray's Warbler", 'Tennessee Warbler',
       'Palm Warbler', 'Magnolia Warbler (Female/immature male)',
       'Yellow Warbler', 'Hooded Warbler', 'Cape May Warbler',
       'Prairie Warbler',
       'Black-throated Blue Warbler (Female/Immature male)',
       'Bay-breasted Warbler (Breeding male)', 'Prothonotary Warbler',
       'Blue-winged Warbler', 'Black-throated Blue Warbler (Adult Male)',
       'Pine Warbler', 'Blackburnian Warbler',
       'Bay-breasted 

## Import images

In [49]:
len(master_df['file_path'])

48562

In [129]:
# Download images from the S3 bucket and resize them in memory (nothing is written back to S3)

img_dir = 'images' # key prefix (folder) in the bucket that contains all other folders of images
paths = master_df['file_path'] # every image path relative to img_dir, in master_df row order

def resize_images_array(img_dir, file_paths, size=(200, 200)):
    """Download images from S3 and return them as one stacked pixel array.

    Every entry of ``file_paths`` is fetched from the module-level ``bucket``
    under the key ``{img_dir}/{path}``, decoded with PIL, converted to RGB so
    all images share the same channel layout, resized to ``size`` and collected
    in iteration order.

    Parameters
    ----------
    img_dir : str
        Key prefix (folder) inside the bucket that holds the image folders.
    file_paths : iterable of str
        Image keys relative to ``img_dir``.
    size : tuple of int, optional
        Target ``(width, height)`` handed to ``Image.resize``. Defaults to
        ``(200, 200)``, the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The resized images stacked along the first axis. Images that cannot be
        downloaded or decoded are skipped, so the result can be shorter than
        ``file_paths``; a warning listing the skipped paths is emitted in that
        case because the array then no longer lines up with the input order.
    """
    import warnings

    # One client for the whole run instead of a new one per image.
    s3 = boto3.client('s3')

    img_arrays = []    # pixel arrays, in the same order as file_paths
    failed_paths = []  # paths that could not be loaded

    for path in tqdm(file_paths):
        try:
            obj = s3.get_object(Bucket=bucket, Key=f'{img_dir}/{path}')
            with Image.open(BytesIO(obj['Body'].read())) as open_img:
                # Force 3 channels: a grayscale/RGBA/palette image would otherwise
                # yield a differently shaped array and make the final stack ragged.
                arr = np.array(open_img.convert('RGB').resize(size))
        except Exception:
            # Best effort: skip unreadable images, but remember them. Catching
            # Exception (not a bare except) lets Ctrl-C interrupt a long download.
            failed_paths.append(path)
            continue
        img_arrays.append(arr)

    if failed_paths:
        warnings.warn(
            f'{len(failed_paths)} image(s) could not be loaded and were skipped; '
            f'the returned array no longer aligns one-to-one with file_paths. '
            f'Skipped: {failed_paths}'
        )

    return np.array(img_arrays)

In [44]:
# X = resize_images_array(img_dir, master_df['file_path'][::5]) # grabs 9713 images, ~19:30mins

In [55]:
# sm_samp = resize_images_array(img_dir, master_df['file_path'][::100])

100%|██████████| 486/486 [00:51<00:00,  9.40it/s]


In [134]:
# Load an extra-small sample: every 485th file path, taken positionally.
xs_samp = resize_images_array(img_dir, master_df['file_path'].iloc[::485])

100%|██████████| 101/101 [00:07<00:00, 13.56it/s]


In [105]:
# print('length of small sample: ', len(sm_samp))
# np.array(sm_samp).shape

length of small sample:  486


(486, 200, 200, 3)

In [135]:
# Report how many images were loaded, then display the full array shape.
print('length of x-small sample: ', len(xs_samp))
np.asarray(xs_samp).shape

length of x-small sample:  101


(101, 200, 200, 3)

In [84]:
# # normalize the RGB values
# sm_samp = sm_samp/255.0

array([0.18823529, 0.18039216, 0.19215686])

In [136]:
# normalize the RGB values by dividing every pixel by 255
# NOTE(review): re-running this cell divides again — run it once per freshly loaded sample.
xs_samp = np.true_divide(xs_samp, 255.0)

In [137]:
# Take the numeric and text labels at the same stride (every 485th row) used for the image sample.
y = master_df['class_id'].iloc[::485]
y_txt_label = master_df['txt_labels'].iloc[::485]

In [138]:
y

0        817
485      929
970      698
1455     555
1940     985
        ... 
46560    551
47045    982
47530    881
48015    330
48500    360
Name: class_id, Length: 101, dtype: int64

In [139]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [140]:
# Flatten each image into its own 1-D copy, giving one feature vector per image.
sample = list(map(np.ndarray.flatten, xs_samp))

In [141]:
len(sample[0])

120000

In [144]:
xs_samp.shape

(101, 200, 200, 3)

In [101]:
# clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# X_train, X_test, y_train, y_test = train_test_split(sample, y, test_size=0.2, random_state=42)

# print("Train model")
# clf.fit(X_train, y_train)

Train model


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [102]:
# print("Predictions")
# predicted = clf.predict(X_test)

Predictions


In [104]:
# there are too many labels

# print("Accuracy: ", accuracy_score(y_test, predicted))