## Import Modules

In [1]:
import os
from datetime import datetime
from PIL import Image
import pandas
import numpy
import shutil

### Rename all images with flower type 

```python
directories = ["/users/danielcorcoran/desktop/flowers/daisy/",
              "/users/danielcorcoran/desktop/flowers/tulip/",
              "/users/danielcorcoran/desktop/flowers/sunflower/",
              "/users/danielcorcoran/desktop/flowers/dandelion/",
              "/users/danielcorcoran/desktop/flowers/rose/"]
```

```python 

for directory in directories:

    flower_type = directory.split("/")[-2]
    
    for filename in os.listdir(directory):
        
        print(flower_type, filename)
        os.rename(directory+filename, directory+flower_type +"_" + filename)
        
```

### Create Dataframe containing all flower image data

In [2]:
directory_path = "/users/danielcorcoran/desktop/flowers/all_flowers/"

In [3]:
dataset = pandas.DataFrame()

In [4]:
counter = 0

for filename in os.listdir(directory_path):
    
    filepath = directory_path + filename
    if ".jpg" in filename:
        image = Image.open(filepath)
        size = image.size
        width = int(image.size[0])
        height = int(image.size[1])
        size = str(width) + "x" + str(height)
        category = filename.split("_")[0]
        
        dataset.loc[counter, "file_name"] = filename
        dataset.loc[counter, "file_path"] = directory_path
        dataset.loc[counter, "width"] = width
        dataset.loc[counter, "height"] = height
        dataset.loc[counter, "size"] = size
        dataset.loc[counter, "flower_type"] = category
        
        counter = counter + 1

In [5]:
grouped = dataset.groupby("size").count()
grouped.sort_values(by = "file_name", ascending = False).head()

Unnamed: 0_level_0,file_name,file_path,width,height,flower_type
size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
320x240,604,604,604,604,604


### Create subset containing only 320x240 images

In [6]:
data320x240 = dataset[dataset["size"] == "320x240"]

### Reset index

In [7]:
data320x240.reset_index(drop = True, inplace = True)

In [8]:
data320x240.head()

Unnamed: 0,file_name,file_path,width,height,size,flower_type
0,daisy_144099102_bf63a41e4f_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy
1,tulip_4612075317_91eefff68c_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,tulip
2,daisy_2578695910_5ab8ee17c1_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy
3,rose_5721768347_2ec4d2247b_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,rose
4,daisy_8709110478_60d12efcd4_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy


In [9]:
data320x240["full_path"] = data320x240["file_path"] + data320x240["file_name"]

In [10]:
data320x240

Unnamed: 0,file_name,file_path,width,height,size,flower_type,full_path
0,daisy_144099102_bf63a41e4f_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...
1,tulip_4612075317_91eefff68c_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,tulip,/users/danielcorcoran/desktop/flowers/all_flow...
2,daisy_2578695910_5ab8ee17c1_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...
3,rose_5721768347_2ec4d2247b_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,rose,/users/danielcorcoran/desktop/flowers/all_flow...
4,daisy_8709110478_60d12efcd4_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...
5,sunflower_22183521655_56221bf2a4_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,sunflower,/users/danielcorcoran/desktop/flowers/all_flow...
6,daisy_4544110929_a7de65d65f_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...
7,rose_5863698305_04a4277401_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,rose,/users/danielcorcoran/desktop/flowers/all_flow...
8,sunflower_5139969631_743880e01d_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,sunflower,/users/danielcorcoran/desktop/flowers/all_flow...
9,daisy_1396526833_fb867165be_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...


### Feature engineering
Collect RGB features 

In [11]:
def get_image_data(path):
    
    image = Image.open(path)
    pixels = list(image.getdata())
    
    average_pixels = list(map(numpy.average, pixels))

    return average_pixels

In [12]:
start = datetime.now()

data320x240["image_data_mapped"] = data320x240["full_path"].map(get_image_data)

print("Finished in: {}".format(datetime.now() - start))

Finished in: 0:07:55.737988


In [13]:
data320x240.head()

Unnamed: 0,file_name,file_path,width,height,size,flower_type,full_path,image_data_mapped
0,daisy_144099102_bf63a41e4f_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...,"[58.0, 23.333333333333332, 21.0, 7.0, 16.0, 5...."
1,tulip_4612075317_91eefff68c_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,tulip,/users/danielcorcoran/desktop/flowers/all_flow...,"[123.0, 137.66666666666666, 143.0, 157.0, 140...."
2,daisy_2578695910_5ab8ee17c1_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...,"[108.0, 102.33333333333333, 99.66666666666667,..."
3,rose_5721768347_2ec4d2247b_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,rose,/users/danielcorcoran/desktop/flowers/all_flow...,"[163.66666666666666, 160.0, 158.0, 160.3333333..."
4,daisy_8709110478_60d12efcd4_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...,"[21.333333333333332, 23.666666666666668, 23.66..."


### Create new dataframe splitting image_data_mapped lists into multiple columns

In [45]:
expanded_data = data320x240["image_data_mapped"].apply(pandas.Series)

In [47]:
expanded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76790,76791,76792,76793,76794,76795,76796,76797,76798,76799
0,58.0,23.333333,21.0,7.0,16.0,5.0,32.333333,18.666667,12.0,59.666667,...,35.666667,16.666667,23.666667,49.0,40.333333,37.333333,23.666667,39.0,46.666667,11.0
1,123.0,137.666667,143.0,157.0,140.666667,133.0,154.666667,174.666667,178.0,179.333333,...,64.0,59.333333,55.333333,80.666667,33.333333,44.0,50.0,59.333333,82.333333,61.666667
2,108.0,102.333333,99.666667,103.666667,109.333333,111.666667,106.333333,101.666667,97.0,104.333333,...,130.0,131.0,129.0,130.0,130.0,131.0,131.0,132.0,132.0,133.0
3,163.666667,160.0,158.0,160.333333,155.666667,143.333333,130.0,121.666667,119.666667,117.0,...,193.333333,191.0,198.333333,193.666667,193.666667,195.666667,188.666667,175.333333,169.333333,172.0
4,21.333333,23.666667,23.666667,18.666667,9.0,4.333333,9.0,18.666667,14.666667,12.666667,...,23.333333,25.666667,27.333333,28.333333,30.0,31.0,29.0,24.333333,17.0,13.0


### Join split data onto original dataframe

In [48]:
full_data = pandas.concat([data320x240, expanded_data], axis = 1)

In [50]:
full_data.head()

Unnamed: 0,file_name,file_path,width,height,size,flower_type,full_path,image_data_mapped,len_image_data_mapped,0,...,76790,76791,76792,76793,76794,76795,76796,76797,76798,76799
0,daisy_144099102_bf63a41e4f_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...,"[58.0, 23.333333333333332, 21.0, 7.0, 16.0, 5....",604,58.0,...,35.666667,16.666667,23.666667,49.0,40.333333,37.333333,23.666667,39.0,46.666667,11.0
1,tulip_4612075317_91eefff68c_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,tulip,/users/danielcorcoran/desktop/flowers/all_flow...,"[123.0, 137.66666666666666, 143.0, 157.0, 140....",604,123.0,...,64.0,59.333333,55.333333,80.666667,33.333333,44.0,50.0,59.333333,82.333333,61.666667
2,daisy_2578695910_5ab8ee17c1_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...,"[108.0, 102.33333333333333, 99.66666666666667,...",604,108.0,...,130.0,131.0,129.0,130.0,130.0,131.0,131.0,132.0,132.0,133.0
3,rose_5721768347_2ec4d2247b_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,rose,/users/danielcorcoran/desktop/flowers/all_flow...,"[163.66666666666666, 160.0, 158.0, 160.3333333...",604,163.666667,...,193.333333,191.0,198.333333,193.666667,193.666667,195.666667,188.666667,175.333333,169.333333,172.0
4,daisy_8709110478_60d12efcd4_n.jpg,/users/danielcorcoran/desktop/flowers/all_flow...,320.0,240.0,320x240,daisy,/users/danielcorcoran/desktop/flowers/all_flow...,"[21.333333333333332, 23.666666666666668, 23.66...",604,21.333333,...,23.333333,25.666667,27.333333,28.333333,30.0,31.0,29.0,24.333333,17.0,13.0


In [51]:
full_data.to_csv("/users/danielcorcoran/desktop/flowers/flowers_320x240_data_final.csv", 
                 index_label= "row_index")

### Read in data 

In [2]:
data = pandas.read_csv("/users/danielcorcoran/desktop/flowers/flowers_320x240_data_final.csv")

In [3]:
columns = list(data.columns)

In [4]:
columns

['row_index',
 'file_name',
 'file_path',
 'width',
 'height',
 'size',
 'flower_type',
 'full_path',
 'image_data_mapped',
 'len_image_data_mapped',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',


In [5]:
X = data.drop(['row_index',
 'file_name',
 'file_path',
 'width',
 'height',
 'size',
 'flower_type',
 'full_path',
 'image_data_mapped',
 'len_image_data_mapped'], axis = 1)

y = data["flower_type"]

In [6]:
X.shape

(604, 76800)

In [7]:
y.shape

(604,)

In [8]:
y.unique()

array(['daisy', 'tulip', 'rose', 'sunflower', 'dandelion'], dtype=object)

In [18]:
mapping = {
    "daisy":0,
    "tulip":1,
    "rose":2,
    "sunflower":3,
    "dandelion":4
}

In [19]:
y_numeric = y.replace(mapping)

## Modelling

### Import sklearn

In [45]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

### Instantiate the model

In [46]:
pca = PCA(svd_solver='randomized', 
          n_components = 120,
          whiten = True, 
          random_state = 42)

svc = SVC(kernel = "rbf", 
          class_weight = "balanced")

model = make_pipeline(pca, 
                      svc)

### Split target vector and feature matrix into train and test subsets

In [52]:
from sklearn.cross_validation import train_test_split
Xtrain, Xtest, ytrain, ytest  = train_test_split(X, #flowers feature matrix
                                                 y_numeric, #flowers target matrix
                                                 random_state = 42)

### Fit the model

In [53]:
from sklearn.grid_search import GridSearchCV
param_grid = {"svc__C": [1, 5, 10, 50, 75, 100],
             "svc__gamma":[.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]}

grid = GridSearchCV(model, param_grid)
%time grid.fit(Xtrain, ytrain)

print(grid.best_params_)

CPU times: user 16min 35s, sys: 1min 44s, total: 18min 20s
Wall time: 7min 48s
{'svc__C': 50, 'svc__gamma': 0.001}


### Predict using model built with test data

In [54]:
model = grid.best_estimator_
yfit = model.predict(Xtest)

### Check Accuracy

In [55]:
from sklearn.metrics import accuracy_score

accuracy_score(yfit, ytest)

0.3576158940397351

### Generate Classification report

In [56]:
from sklearn.metrics import classification_report
print(classification_report(ytest, yfit))

             precision    recall  f1-score   support

          0       0.57      0.33      0.42        24
          1       0.26      0.14      0.18        36
          2       0.16      0.13      0.14        23
          3       0.49      0.45      0.47        42
          4       0.32      0.73      0.44        26

avg / total       0.37      0.36      0.34       151

