In [8]:
# Import the required libraries
import os
from zipfile import ZipFile
import tarfile
import pandas as pd
from PIL import Image
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Machine Learning Pipeline for Image Recognition

## By: ZXS

### Introduction-

Working with images is a unique component of data science that requires additional pre-processing steps involving the manipulation of files to make them "comprehensible" to the computer. 

Essentially what these steps boil down to is the decomposition of the image down to its component R, G, B color pixels. 

As I read into the various sources available on the internet regarding image classification, I realized that while many of them contained comprehensive information on building models- many neglected to mention the meticulous work that goes into preparing the data beforehand. 

In this exercise, we will go over those steps so that we can create a fully-functional pipeline that is able to convert a set of images and class labels into a functional model. 

### Data-

For the purposes of this endeavor, we will use the dog breeds dataset published by [Stanford University](http://vision.stanford.edu/aditya86/ImageNetDogs/).

This dataset contains ~20,000 labelled images of dogs from 120 different bbreed classifications. It has been made available in the form of tarfiles on the University website at the link provided. In any case, lets get started...

---

#### Pre-Processing

*Download the archive-*

In [4]:
# Set the working directory
wd = '/Users/zxs/Documents/code/kaggle/dog_breeds/stanford/data/'
os.chdir(wd)

In [None]:
# Load the tar files
imgs = tarfile.open('images.tar', 'r')
lbls = tarfile.open('annotations.tar', 'r')

# Extract contents into labelled folders
imgs.extractall()
lbls.extractall()

# Define content
imgs = 'Images/'
annot = 'Annotation/'

*Rename the folders and files-*

In [None]:

###############################################################################
#                                RENAMING                                     #
###############################################################################

'''
    Folders
'''

#folders = os.listdir(imgs)

#os.chdir(imgs)

#for folder in os.listdir():
#    
#    folder_name = folder.replace('-', '_')
#    folder_name = folder_name.split('_', 1)
#    folder_name = folder_name[1]
#    folder_name = folder.lower()
#    
#    os.rename(folder, folder_name)
#
#folders1 = os.listdir(imgs)

#folders2 = os.listdir(annot)

#os.chdir(annot)
#folders = os.listdir()
#for folder in folders:
#    print(folder)
#    fn = folder.split('-', 1)
#    fn1 = fn[1]
#    fn2 = fn1.replace('-', '_')
#    os.rename(folder, fn2)
#
#folders1 = os.listdir()
#
#for folder in folders1:
#    
#    fn3 = folder.lower()
#    
#    os.rename(folder, fn3)
#    
#folders2 = os.listdir()

'''
    Files
'''

#img_dict = {}
#
#img_dir = wd + 'images/'
#
#for folder in os.listdir(img_dir):
#    
#    try:
#        
#        n = 1
#    
#        os.chdir(img_dir + folder)
#    
#        for file in os.listdir():
#        
#            original_fn, ext = file.split('.')
#        
#            new_fn = folder + str(n) + '.' + ext
#        
#            os.rename(file, new_fn)
#        
#            n += 1
#            
#    except:
#        
#        pass

#for folder in os.listdir():
#        
#    n = 1
#    
#    os.chdir(folder)
#    
#    for file in os.listdir():
#        
#        file1 = folder + str(n) + '_annot'
#        
#        os.rename(file, file1)
#        
#        n += 1
#        
#    os.chdir(wd)
#    os.chdir(annot)

*Index the folder contents-*

In [None]:
###############################################################################
#                                INDEXING                                     #
###############################################################################

#img_dict = {}
#
#folders = os.listdir(imgs)
#
#for folder in folders:
#
#    try:
#        
#        files = os.listdir(imgs + folder)
#        
#        fn, ext = [file.split('.') for file in files]
#        
#        img_dict[folder] = files
#        
#    except:
#        
#        if folder == '.DS_Store':
#        
#            pass

#annot_dict = {}
#
#folders = os.listdir()
#
#for folder in folders:
#    
#    try:
#        
#        files = os.listdir(folder)
#        
#        annot_dict[folder] = [file for file in files]
#        
#    except:
#        
#        pass

*Reading the actual files-*

In [None]:
###############################################################################
#                                READING                                      #
###############################################################################

## Function to extract features
#def extract_fts(image):
#    
#    try:
#        
#        (r, g, b) = img.split()
#    
#    except:
#        
#        (r,g,b,a) = img.split()
#        
#    features = [np.mean(r), np.mean(g), np.mean(b),
#                np.std(r), np.std(g), np.std(b)]
#    
#    return features
#
## Initialize an empty df to store results            
#features = pd.DataFrame(columns = ['breed', 'image', 'r_mean', 'g_mean', 'b_mean', 'r_std', 'g_std', 'b_std'])
#
## Navigate directory
#os.chdir(imgs)
#
## Iterate
#for folder in os.listdir():
#    
#    if os.path.isdir(folder):
#        
#        os.chdir(folder)
#    
#        for file in os.listdir():
#            
#            if file.endswith('.jpg'):
#                
#                img = Image.open(file)
#
#                r_mean, g_mean, b_mean, r_std, g_std, b_std = [i for i in extract_fts(img)]
#                
#                features = features.append({'breed': folder,
#                                            'image': file,
#                                            'r_mean': r_mean,
#                                            'g_mean': g_mean,
#                                            'b_mean': b_mean,
#                                            'r_std': r_std,
#                                            'g_std': g_std,
#                                            'b_std': b_std}, ignore_index = True)
#    
#            elif file.endswith('.png'):
#                
#                img = Image.open(file)
#                
#                fts = np.array
#
#        os.chdir(wd)
#        os.chdir(imgs)
#
## Pickle the data
#with open('features.pickle', 'wb') as f:
#
#    pickle.dump(features, f)

All of the code above has been commented out because the files are large and take a long time to rename, index, and load. 

A convenient package, `pickle`, provides efficient data structures for saving information in Python for later use. By utilizing this package, we can save ourselves some computational overhead by only ever needing to read-in the source files once. Thereafter, we will be able to load the pickle files instead, to retrieve our data. 

---

### Modeling-

In [7]:
# Load the pickled data
with open('features.pickle', 'rb') as f:
    
    df = pickle.load(f)

In [9]:
# Label Encode the Breeds
le = LabelEncoder()

df['breed'] = le.fit_transform(df['breed'])

df['image'] = le.fit_transform(df['image'])

# Separate the variables
x = [col for col in df.columns if col != 'breed']
x = df[x]

y = df['breed']

# Split for training / testing
train_x, test_x, train_y, test_y = train_test_split(x, y, random_state = 100, test_size = .3)
val_x, test_x, val_y, test_y = train_test_split(test_x, test_y, random_state = 100, test_size = .5)

In [10]:
'''
    Random Forest
'''

rfc = RandomForestClassifier(random_state = 100)
rfc.fit(train_x, train_y)
rfc_preds = rfc.predict(val_x)
rfc_score = accuracy_score(val_y, rfc_preds)

print()
print('The Accuracy of the Random Forest Model on the Dog Breeds validation set is: {}%'.format(rfc_score * 100))



The Accuracy of the Random Forest Model on the Dog Breeds validation set is: 42.01244813278009%


In [12]:
print('Classification Report for the RFC model:')
print()
print(classification_report(val_y, rfc_preds))

Classification Report for the RFC model:

              precision    recall  f1-score   support

           0       0.87      1.00      0.93        27
           1       0.90      1.00      0.95        27
           2       0.79      0.77      0.78        30
           3       0.71      0.90      0.79        30
           4       0.66      0.88      0.75        24
           5       0.57      0.65      0.60        20
           6       0.42      0.38      0.40        13
           7       0.54      0.71      0.62        35
           8       0.68      0.63      0.65        27
           9       0.60      0.78      0.68        27
          10       0.54      0.61      0.58        31
          11       0.67      0.74      0.70        42
          12       0.48      0.52      0.50        23
          13       0.67      0.83      0.74        24
          14       0.49      0.71      0.58        28
          15       0.41      0.50      0.45        26
          16       0.43      0.53      