<a href="https://colab.research.google.com/github/emma-s137/cs344/blob/master/project/seedling_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Run this cell and select the kaggle.json file downloaded
# from the Kaggle account settings page.
from google.colab import files
files.upload()

In [0]:
# Let's make sure the kaggle.json file is present.
!ls -lha kaggle.json

In [0]:
# Next, install the Kaggle API client.
!pip install -q kaggle

In [0]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Download the V2 Plant Seedlings data set from Kaggle into the working directory.
!kaggle datasets download -d vbookshelf/v2-plant-seedlings-dataset
!ls -l 

In [0]:
!unzip -q v2-plant-seedlings-dataset.zip 

In [0]:
import pandas as pd
import numpy as np

import tensorflow

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

import os
import cv2

import imageio
import skimage
import skimage.io
import skimage.transform

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Number of images drawn from each class when the balanced sample is built.
SAMPLE_SIZE = 250

# Side length in pixels: images are resized to IMAGE_SIZE x IMAGE_SIZE, and the
# data generators and the basic model's input shape use the same value.
IMAGE_SIZE = 224

In [0]:
os.listdir('nonsegmentedv2')

In [0]:
# Create a directory that will hold a flat copy of every available image.
# exist_ok=True keeps this cell re-runnable; os.mkdir would raise
# FileExistsError when the directory is already there.
all_images_dir = 'all_images_dir'
os.makedirs(all_images_dir, exist_ok=True)


In [0]:
# Flatten the data set: copy every image out of its per-class folder under
# 'nonsegmentedv2' into the single folder all_images_dir.

folder_list = os.listdir('nonsegmentedv2')

for class_folder in folder_list:
    class_name = str(class_folder)
    class_path = 'nonsegmentedv2/' + class_name

    for original_name in os.listdir(class_path):
        # Many images share a file name across class folders, so prefix each
        # one with its folder name to keep the copies distinct.
        unique_name = class_name + '_' + original_name

        shutil.copyfile(os.path.join(class_path, original_name),
                        os.path.join(all_images_dir, unique_name))

In [0]:
# Check how many images are in all_images_dir.
# Should be 5539.

len(os.listdir('all_images_dir'))


In [0]:
# List every file in the all_images_dir folder and wrap the names in a
# single-column dataframe; 'image_id' holds the file name of each image.
image_list = os.listdir('all_images_dir')
df_data = pd.DataFrame(data=image_list, columns=['image_id'])

df_data.head()

In [0]:
# Each file name has this format:
# Loose Silky-bent_377.png

# This function will extract the class name from the file name of each image.
def extract_target(x):
    """Return the class name encoded at the start of an image file name.

    File names look like 'Loose Silky-bent_377.png'; everything before the
    first underscore is the class name. A name without an underscore is
    returned unchanged.
    """
    return x.partition('_')[0]


# Derive each image's class label from its file name and store it in a new
# 'target' column.
df_data['target'] = df_data['image_id'].map(extract_target)

df_data.head()

In [0]:
df_data.shape

In [0]:
# What is the class distribution?

df_data['target'].value_counts()

In [0]:
# Build a class-balanced sample: draw SAMPLE_SIZE images from every class.
# The class names are the folder names of the original data set.
target_list = os.listdir('nonsegmentedv2')

# Sample each class with a fixed seed and concatenate once at the end.
# Calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration and needs a special case for the first class.
# NOTE(review): .sample() presumably raises if a class has fewer than
# SAMPLE_SIZE images -- confirm against the class distribution shown above.
class_samples = [
    df_data[df_data['target'] == target].sample(SAMPLE_SIZE, random_state=101)
    for target in target_list
]

df_sample = pd.concat(class_samples, axis=0).reset_index(drop=True)

In [0]:
# Display the balanced classes.

df_sample['target'].value_counts()

In [0]:
# Hold out 10% of the balanced sample as a validation set.
# Passing the 'target' column as `stratify` keeps the validation set balanced.
y = df_sample['target']

df_train, df_val = train_test_split(
    df_sample,
    test_size=0.10,
    random_state=101,
    stratify=y,
)

for split in (df_train, df_val):
    print(split.shape)

In [0]:
# Train set class distribution

df_train['target'].value_counts()


In [0]:
# Val set class distribution

df_val['target'].value_counts()

In [0]:
folder_list = os.listdir('nonsegmentedv2')

folder_list

In [0]:
# Build the directory tree that the image generators read from:
#
#   base_dir/
#       train_dir/<class name>/
#       val_dir/<class name>/
#
# os.makedirs(..., exist_ok=True) is a no-op for directories that already
# exist, so this cell can be re-run safely (os.mkdir raises FileExistsError
# on the second run).
base_dir = 'base_dir'
os.makedirs(base_dir, exist_ok=True)

# train_dir
train_dir = os.path.join(base_dir, 'train_dir')
os.makedirs(train_dir, exist_ok=True)

# val_dir
val_dir = os.path.join(base_dir, 'val_dir')
os.makedirs(val_dir, exist_ok=True)

# One sub-folder per class inside both the train and the validation folder.
for split_dir in (train_dir, val_dir):
    for class_name in folder_list:
        os.makedirs(os.path.join(split_dir, str(class_name)), exist_ok=True)

In [0]:
# check that the folders have been created

os.listdir('base_dir/train_dir')


In [0]:
# Index df_data by image_id so a label can be looked up with df_data.loc[...].
# Guarded so the cell is re-runnable: once 'image_id' has become the index it
# is no longer a column, and calling set_index again would raise KeyError.
if df_data.index.name != 'image_id':
    df_data = df_data.set_index('image_id')

In [0]:
df_data.head()

In [0]:
def _resize_and_save(image_ids, dest_dir):
    """Resize the listed images and write them into per-class sub-folders.

    image_ids: file names of images located in all_images_dir.
    dest_dir: directory that contains one sub-folder per class label.

    Raises IOError if an image cannot be read.
    """
    for fname in image_ids:
        # the label of the image is also the name of its destination folder
        folder = df_data.loc[fname, 'target']

        # source path to image
        src = os.path.join(all_images_dir, fname)
        # destination path to image
        dst = os.path.join(dest_dir, folder, fname)

        # cv2.imread returns None instead of raising when a file cannot be
        # read; fail with the offending path rather than an opaque resize error.
        pixels = cv2.imread(src)
        if pixels is None:
            raise IOError('Could not read image: ' + src)

        # resize the image and save it at the new location
        pixels = cv2.resize(pixels, (IMAGE_SIZE, IMAGE_SIZE))
        cv2.imwrite(dst, pixels)


# Get a list of train and val images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

# Transfer the train images
_resize_and_save(train_list, train_dir)

# Transfer the val images
_resize_and_save(val_list, val_dir)


In [0]:
# Report how many images each class folder of the training set contains.
folder_list = os.listdir('base_dir/train_dir')

total_images = 0

for class_name in folder_list:
    # number of files inside this class folder
    class_dir = 'base_dir/train_dir/' + str(class_name)
    class_count = len(os.listdir(class_dir))

    total_images += class_count
    print(str(class_name) + ':' + ' ' + str(class_count))

print('\n')
# overall number of training images
print('Total Images: ', total_images)

In [0]:
# Report how many images each class folder of the validation set contains.
folder_list = os.listdir('base_dir/val_dir')

total_images = 0

for class_name in folder_list:
    # number of files inside this class folder
    class_dir = 'base_dir/val_dir/' + str(class_name)
    class_count = len(os.listdir(class_dir))

    total_images += class_count
    print(str(class_name) + ':' + ' ' + str(class_count))

print('\n')
# overall number of validation images
print('Total Images: ', total_images)

In [0]:
# Locations of the resized training / validation images.
train_path = 'base_dir/train_dir'
valid_path = 'base_dir/val_dir'


num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 10
val_batch_size = 10


# Number of batches needed to cover every sample once (the last batch may be
# partial). np.ceil returns a float, so convert to int: a step count is a
# whole number and is expected as an integer wherever it is passed as
# steps_per_epoch / validation_steps.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

In [0]:
# Every pixel value is multiplied by 1/255 before it reaches the model.
datagen = ImageDataGenerator(rescale=1.0/255)

# Training batches, read from the per-class folders under train_path.
train_gen = datagen.flow_from_directory(train_path,
                                        target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                        batch_size=train_batch_size,
                                        class_mode='categorical')

# Validation batches, read from the per-class folders under valid_path.
val_gen = datagen.flow_from_directory(valid_path,
                                        target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                        batch_size=val_batch_size,
                                        class_mode='categorical')

# Note: shuffle=False causes the test dataset to not be shuffled, so the order
# of the predictions matches the order of test_gen.classes / test_gen.filenames.
# (The previous code passed shuffle=True, contradicting this note.)
test_gen = datagen.flow_from_directory(valid_path,
                                        target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                        batch_size=1,
                                        class_mode='categorical',
                                        shuffle=False)

Basic neural network model, similar to the one developed for homework04.

In [0]:

kernel_size = (3, 3)
pool_size = (2, 2)

# Small CNN: two 32-filter convolutions, then a 128-filter convolution; both
# pooling stages are followed by dropout. A 64-unit dense layer feeds a
# 12-way softmax output.
model_basic = Sequential([
    Conv2D(32, kernel_size, activation='relu',
           input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)),

    Conv2D(32, kernel_size, activation='relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),

    Conv2D(128, kernel_size, activation='relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),

    Flatten(),
    Dense(64, activation='relu'),
    Dense(12, activation='softmax'),
])

model_basic.summary()

In [0]:
# Configure the basic model: categorical cross-entropy loss, Adagrad
# optimizer, accuracy as the reported metric.
model_basic.compile(
    optimizer='adagrad',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 25 epochs on the training generator, evaluating on the
# validation generator.
model_basic.fit(
    train_gen,
    validation_data=val_gen,
    epochs=25,
)

This model uses the AlexNet architecture.
Adapted from https://www.mydatahack.com/building-alexnet-with-keras/

In [0]:
# (1) Importing dependency
# Use the Keras that ships with TensorFlow, like the rest of this notebook:
# the data generators are tensorflow.keras objects, and mixing them with the
# standalone `keras` package is fragile. It also avoids the
# `keras.layers.normalization` import path, which is not stable across
# Keras releases.
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten,\
 Conv2D, MaxPooling2D, BatchNormalization
import numpy as np
np.random.seed(1000)


# (3) Create a sequential model
model = Sequential()

# 1st Convolutional Layer
# The input size follows IMAGE_SIZE so it always matches the target_size used
# by the data generators.
model.add(Conv2D(filters=96, input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
                 kernel_size=(11,11), strides=(4,4), padding='valid'))
model.add(Activation('relu'))
# Pooling 
model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='valid'))
# Batch Normalisation before passing it to the next layer
model.add(BatchNormalization())

# 2nd Convolutional Layer
model.add(Conv2D(filters=256, kernel_size=(11,11), strides=(1,1), padding='valid'))
model.add(Activation('relu'))
# Pooling
model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='valid'))
# Batch Normalisation
model.add(BatchNormalization())

# 3rd Convolutional Layer
model.add(Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), padding='valid'))
model.add(Activation('relu'))
# Batch Normalisation
model.add(BatchNormalization())

# 4th Convolutional Layer
model.add(Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), padding='valid'))
model.add(Activation('relu'))
# Batch Normalisation
model.add(BatchNormalization())

# 5th Convolutional Layer
model.add(Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), padding='valid'))
model.add(Activation('relu'))
# Pooling
model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='valid'))
# Batch Normalisation
model.add(BatchNormalization())

# Passing it to a dense layer
model.add(Flatten())
# 1st Dense Layer
# No input_shape here: this layer takes its input size from the Flatten layer
# directly before it, so the former input_shape=(224*224*3,) was misleading.
model.add(Dense(4096))
model.add(Activation('relu'))
# Add Dropout to prevent overfitting
model.add(Dropout(0.4))
# Batch Normalisation
model.add(BatchNormalization())

# 2nd Dense Layer
model.add(Dense(4096))
model.add(Activation('relu'))
# Add Dropout
model.add(Dropout(0.4))
# Batch Normalisation
model.add(BatchNormalization())

# 3rd Dense Layer
model.add(Dense(1000))
model.add(Activation('relu'))
# Add Dropout
model.add(Dropout(0.4))
# Batch Normalisation
model.add(BatchNormalization())

# Output Layer
model.add(Dense(12, activation = "softmax"))

model.summary()

# (4) Compile 
model.compile(loss='categorical_crossentropy', optimizer='adagrad',\
 metrics=['accuracy'])



In [0]:
# (5) Train
# 25 epochs on the training generator, evaluated on the validation generator.
model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=25,
    verbose=1,
    shuffle=True,
)