# Importing Necessary Packages

In [28]:
# Importing all the functions I made
from data_creation import data_creation, pickle_me, get_pickle

# Model Building
import pandas as pd
import numpy as np
# Neural Network Building
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras import regularizers
from keras.layers import Dense, Flatten, Dropout, Conv1D, Conv2D

# Classification Metrics
from sklearn.metrics import confusion_matrix

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Data Creation

#### The data-creation cells only need to be run once; they pickle the data so it can be reloaded later.
They are commented out below. Only the `categories` list is still executed, because it is used later in the notebook.

In [30]:
# Root folder of the raw fruit images (only needed when regenerating the pickles).
#data_dir = 'C://Users/Cristian/Documents/flatiron/Capstone/Fruit'

# Fruit class names. Later cells index this list with the integer labels,
# so the order of the entries must not change.
categories = [
    'Apple',
    'Banana',
    'Carambola',
    'Guava',
    'Kiwi',
    'Mango',
    'Muskmelon',
    'Orange',
    'Peach',
    'Pear',
    'Persimmon',
    'Pitaya',
    'Plum',
    'Pomegranate',
    'Tomatoes',
]

##### Creating Grayscale data and pickling it

In [31]:
#X_gray, y_gray = data_creation(50, 0, categories, data_dir)
#pickle_me(X_gray,y_gray, 'X_grayscale', 'y_grayscale')

##### Creating Color data and pickling it

In [32]:
#X_color, y_color = data_creation(50, 1, categories, data_dir)
#pickle_me(X_color, y_color,'X_color', 'y_color')

##### Getting our pickled data for grayscale images and color images

In [6]:
# Reload the cached grayscale features and labels
# (presumably the files written by the commented-out pickle_me call above).
grayscale_pickles = ('X_grayscale.pickle', 'y_grayscale.pickle')
X_grayscale, y_grayscale = get_pickle(*grayscale_pickles)

In [7]:
# Reload the cached colour features and labels
# (presumably the files written by the commented-out pickle_me call above).
color_pickles = ('X_color.pickle', 'y_color.pickle')
X_rgb, y_rgb = get_pickle(*color_pickles)

# Data Manipulation

#### Creating a train/test split so the models can be validated on held-out data.
'gs' will be for grayscale images, 'rgb' will be for color images

In [8]:
# Hold out 20% of the grayscale samples; the fixed seed keeps the split reproducible.
gs_split = train_test_split(X_grayscale, y_grayscale, test_size=0.2, random_state=42)
X_gs_train, X_gs_test, y_gs_train, y_gs_test = gs_split

In [9]:
# Hold out 20% of the colour samples; the fixed seed keeps the split reproducible.
rgb_split = train_test_split(X_rgb, y_rgb, test_size=0.2, random_state=42)
X_rgb_train, X_rgb_test, y_rgb_train, y_rgb_test = rgb_split

#### Scaling our data and reshaping so they are the correct input for our models.

In [10]:
# Per-sample shape expected by the grayscale models.
gs_image_shape = (50, 50)

# Divide by 255 (assumes 8-bit pixel intensities - TODO confirm in data_creation),
# then reshape to (n_samples, 50, 50).
X_gs_train_sc = X_gs_train / 255.0
X_gs_train_reshape = X_gs_train_sc.reshape((X_gs_train.shape[0],) + gs_image_shape)

X_gs_test_sc = X_gs_test / 255.0
X_gs_test_reshape = X_gs_test_sc.reshape((X_gs_test.shape[0],) + gs_image_shape)

In [11]:
# Per-sample shape expected by the colour models (last axis presumably the RGB channels).
rgb_image_shape = (50, 50, 3)

# Divide by 255 (assumes 8-bit pixel intensities - TODO confirm in data_creation),
# then reshape to (n_samples, 50, 50, 3).
X_rgb_train_sc = X_rgb_train / 255.0
X_rgb_train_reshape = X_rgb_train_sc.reshape((X_rgb_train.shape[0],) + rgb_image_shape)

X_rgb_test_sc = X_rgb_test / 255.0
X_rgb_test_reshape = X_rgb_test_sc.reshape((X_rgb_test.shape[0],) + rgb_image_shape)

#### Our models need one-hot encoded targets of shape (15,) (one entry per fruit label) instead of a single integer label of shape (1,)

In [18]:
# One-hot encode the grayscale labels.
# Each split is encoded separately, so the columns are re-indexed against the full
# class range: both arrays then always have one column per entry in `categories`,
# in the same order, even if some class is absent from a split.
# Assumes the labels are integer positions into `categories` - TODO confirm in data_creation.
# The result is a float32 ndarray, matching the ndarray targets used for the colour models.
gs_class_ids = range(len(categories))
y_gs_train_dummies = np.asarray(
    pd.get_dummies(y_gs_train).reindex(columns=gs_class_ids, fill_value=0),
    dtype='float32')
y_gs_test_dummies = np.asarray(
    pd.get_dummies(y_gs_test).reindex(columns=gs_class_ids, fill_value=0),
    dtype='float32')

In [19]:
# One-hot encode the colour labels.
# Each split is encoded separately, so the columns are re-indexed against the full
# class range: both arrays then always have one column per entry in `categories`,
# in the same order, even if some class is absent from a split.
# Labels are integer positions into `categories` (they are used to index it later on).
rgb_class_ids = range(len(categories))
y_rgb_train_dummies = np.asarray(
    pd.get_dummies(y_rgb_train).reindex(columns=rgb_class_ids, fill_value=0),
    dtype='float32')
y_rgb_test_dummies = np.asarray(
    pd.get_dummies(y_rgb_test).reindex(columns=rgb_class_ids, fill_value=0),
    dtype='float32')

# Model Building
As a way to better visualize how models are doing compared to one another, I will create the neural networks, compile them, and fit our data. Then, I will have them predict classes and put our classification metrics into a dataframe.

### Model 1
First Simple Model

In [27]:
# Baseline: flatten each 50x50 grayscale image and classify it with a single softmax layer.
# The Flatten layer is required: Dense acts on the last axis only, so without it the
# output would be (batch, 50, 15) and could not be compared with the (batch, 15) targets.
# The model is bound to `model_1` only; it no longer aliases `fsm`, so re-running this
# cell cannot replace the separately defined (and trained) `fsm` network.
model_1 = Sequential([Flatten(input_shape=(50, 50)),
                      Dense(15, activation='softmax')])
model_1.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['accuracy'])
#model_1_results = model_1.fit(X_gs_train_reshape, y_gs_train_dummies, epochs=5, validation_data =(X_gs_test_reshape, y_gs_test_dummies))

In [24]:
# Grayscale network: a width-1 Conv1D over the 50x50 input, flattened into a small
# sigmoid hidden layer and a 15-way softmax output.
# The input shape is declared once, on the first layer; Keras only uses `input_shape`
# there, so the redundant declaration on Flatten has been dropped.
fsm = Sequential([Conv1D(150, kernel_size=1, activation='relu', input_shape=(50, 50)),
                  Flatten(),
                  Dense(64, activation='sigmoid'),
                  Dense(15, activation='softmax')])

In [25]:
# Configure training: categorical cross-entropy on the one-hot targets, tracking accuracy.
fsm.compile(
    loss='categorical_crossentropy',
    optimizer='adagrad',
    metrics=['accuracy'],
)

In [26]:
# Train for 5 epochs; the held-out grayscale split is used as validation data.
fsm_validation = (X_gs_test_reshape, y_gs_test_dummies)
fsm_results = fsm.fit(
    X_gs_train_reshape,
    y_gs_train_dummies,
    validation_data=fsm_validation,
    epochs=5,
)


Train on 35524 samples, validate on 8882 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Model 2

#### Model 4

In [35]:
# Model 4: a 1x1 Conv2D over the colour images, then dropout and a small dense classifier
# with an L1 penalty on the hidden layer's bias.
# The input shape is declared on the first layer (the only place Keras uses it) and
# includes the channel axis, matching the (n, 50, 50, 3) arrays the model is fit on.
model_4 = Sequential([Conv2D(10, kernel_size=1, activation='relu', input_shape=(50, 50, 3)),
                      Flatten(),
                      Dropout(0.2),
                      Dense(64, activation='sigmoid', bias_regularizer=regularizers.l1(0.02)),
                      Dense(15, activation='softmax')])

In [36]:
# Configure training: categorical cross-entropy on the one-hot targets, tracking accuracy.
model_4.compile(
    loss='categorical_crossentropy',
    optimizer='adagrad',
    metrics=['accuracy'],
)

In [39]:
import numpy as np

In [41]:
# Train for 5 epochs; the held-out colour split is used as validation data.
model_4_validation = (X_rgb_test_reshape, y_rgb_test_dummies)
model_4_results = model_4.fit(
    X_rgb_train_reshape,
    y_rgb_train_dummies,
    validation_data=model_4_validation,
    epochs=5,
)

Train on 35524 samples, validate on 8882 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [49]:
# Predicted class index per test image. `Sequential.predict_classes` is deprecated and
# absent from newer Keras releases; argmax over the softmax output is its equivalent
# for a multi-class model and works on old and new versions alike.
fsm_preds = np.argmax(fsm.predict(X_gs_test_reshape), axis=-1)

# normalize='pred' scales each column by the number of predictions for that class,
# so the diagonal holds the per-class precision.
cm_fsm = confusion_matrix(y_gs_test, fsm_preds, normalize='pred')

In [50]:
# Report the diagonal of the normalized confusion matrix, one line per fruit.
for class_idx, fruit in enumerate(categories):
    print(fruit + ':', cm_fsm[class_idx][class_idx])

Apple: 0.3918385256691531
Banana: 0.8238866396761133
Carambola: 0.9833333333333333
Guava: 0.9901960784313726
Kiwi: 0.9986559139784946
Mango: 0.9150805270863837
Muskmelon: 0.9376854599406528
Orange: 0.9317647058823529
Peach: 0.9831730769230769
Pear: 0.9576837416481069
Persimmon: 0.9851632047477745
Pitaya: 0.9544419134396356
Plum: 0.991578947368421
Pomegranate: 0.748046875
Tomatoes: 0.781578947368421


In [44]:
# Predicted class index per test image. `Sequential.predict_classes` is deprecated and
# absent from newer Keras releases; argmax over the softmax output is its equivalent
# for a multi-class model and works on old and new versions alike.
model_4_preds = np.argmax(model_4.predict(X_rgb_test_reshape), axis=-1)

# normalize='pred' scales each column by the number of predictions for that class,
# so the diagonal holds the per-class precision.
cm_model_4 = confusion_matrix(y_rgb_test, model_4_preds, normalize='pred')

In [45]:
# Report the diagonal of the normalized confusion matrix, one line per fruit.
for class_idx, fruit in enumerate(categories):
    print(fruit + ':', cm_model_4[class_idx][class_idx])

Apple: 0.8051575931232091
Banana: 0.8761220825852782
Carambola: 0.9084507042253521
Guava: 0.9294871794871795
Kiwi: 0.9334112149532711
Mango: 0.864951768488746
Muskmelon: 0.8781609195402299
Orange: 0.9404186795491143
Peach: 0.9552572706935123
Pear: 0.9429590017825312
Persimmon: 0.9622166246851386
Pitaya: 0.9393939393939394
Plum: 0.9933628318584071
Pomegranate: 0.9058823529411765
Tomatoes: 0.92


In [51]:
# Model 5: same layout as model 4 but with a higher dropout rate (0.3).
# The input shape is declared on the first layer (the only place Keras uses it) and
# includes the channel axis, matching the (n, 50, 50, 3) arrays the model is fit on.
model_5 = Sequential([Conv2D(10, kernel_size=1, activation='relu', input_shape=(50, 50, 3)),
                      Flatten(),
                      Dropout(0.3),
                      Dense(64, activation='sigmoid', bias_regularizer=regularizers.l1(0.02)),
                      Dense(15, activation='softmax')])

model_5.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['accuracy'])

In [52]:
# Train for 5 epochs; the held-out colour split is used as validation data.
model_5_validation = (X_rgb_test_reshape, y_rgb_test_dummies)
model_5_results = model_5.fit(
    X_rgb_train_reshape,
    y_rgb_train_dummies,
    validation_data=model_5_validation,
    epochs=5,
)

Train on 35524 samples, validate on 8882 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
len(model_4_preds)

8882

In [56]:
len(y_rgb_test)

8882

In [64]:
# Keep only the test images model 4 got wrong, as (true label, predicted label) pairs.
misclassified = [
    (actual, predicted)
    for actual, predicted in zip(y_rgb_test, model_4_preds)
    if actual != predicted
]

# Translate the integer labels into fruit names, keeping the two lists aligned.
pred_results_actual = [categories[actual] for actual, _ in misclassified]
pred_results_pred = [categories[predicted] for _, predicted in misclassified]

In [67]:
# Pair every true fruit name with the name the model predicted instead.
pred_results = [
    (actual_name, predicted_name)
    for actual_name, predicted_name in zip(pred_results_actual, pred_results_pred)
]

In [73]:
# One row per misclassified image; the columns are named in a later cell.
pred_results_df = pd.DataFrame(data=pred_results)

In [74]:
pred_results_df

Unnamed: 0,0,1
0,Apple,Pomegranate
1,Apple,Pear
2,Mango,Orange
3,Kiwi,Mango
4,Pear,Apple
...,...,...
811,Guava,Carambola
812,Apple,Muskmelon
813,Kiwi,Apple
814,Kiwi,Apple


In [77]:
# Replace the default 0/1 column labels with descriptive names (mutates the frame in place).
misclassified_columns = ['Actual', 'Predicted']
pred_results_df.columns = misclassified_columns

In [80]:
pred_results_df.Predicted.value_counts(normalize=True)

Apple          0.250000
Mango          0.154412
Banana         0.084559
Kiwi           0.069853
Guava          0.067402
Muskmelon      0.064951
Pomegranate    0.049020
Carambola      0.047794
Orange         0.045343
Tomatoes       0.044118
Pear           0.039216
Pitaya         0.036765
Peach          0.024510
Persimmon      0.018382
Plum           0.003676
Name: Predicted, dtype: float64

In [103]:
pred_results_df.Actual.value_counts()

Apple          178
Banana         105
Pear            91
Peach           78
Guava           74
Pomegranate     49
Mango           48
Persimmon       34
Muskmelon       32
Pitaya          31
Kiwi            29
Orange          25
Carambola       24
Tomatoes        17
Plum             1
Name: Actual, dtype: int64

In [104]:
pred_results_df[pred_results_df['Actual']=='Plum']

Unnamed: 0,Actual,Predicted
57,Plum,Muskmelon


In [86]:
# For every true fruit, the share of its misclassifications that went to each predicted fruit.
results = {
    category: dict(
        pred_results_df.loc[pred_results_df['Actual'] == category, 'Predicted']
        .value_counts(normalize=True)
    )
    for category in categories
}

In [89]:
results

{'Apple': {'Muskmelon': 0.21910112359550563,
  'Tomatoes': 0.17415730337078653,
  'Kiwi': 0.10674157303370786,
  'Pear': 0.10112359550561797,
  'Pitaya': 0.10112359550561797,
  'Guava': 0.07303370786516854,
  'Peach': 0.07303370786516854,
  'Pomegranate': 0.05056179775280899,
  'Mango': 0.03932584269662921,
  'Carambola': 0.028089887640449437,
  'Banana': 0.011235955056179775,
  'Orange': 0.011235955056179775,
  'Persimmon': 0.011235955056179775},
 'Banana': {'Mango': 0.47619047619047616,
  'Pomegranate': 0.13333333333333333,
  'Carambola': 0.12380952380952381,
  'Apple': 0.09523809523809523,
  'Kiwi': 0.05714285714285714,
  'Persimmon': 0.0380952380952381,
  'Guava': 0.02857142857142857,
  'Pear': 0.02857142857142857,
  'Orange': 0.009523809523809525,
  'Muskmelon': 0.009523809523809525},
 'Carambola': {'Guava': 0.4583333333333333,
  'Banana': 0.375,
  'Pomegranate': 0.125,
  'Apple': 0.041666666666666664},
 'Guava': {'Mango': 0.2702702702702703,
  'Banana': 0.25675675675675674,
  'Ca

In [92]:
# Tabulate the nested dict: one column per true fruit, one row per predicted fruit,
# with NaN where a pairing never occurred.
wrong_predictions_df = pd.DataFrame(data=results)

In [93]:
wrong_predictions_df

Unnamed: 0,Apple,Banana,Carambola,Guava,Kiwi,Mango,Muskmelon,Orange,Peach,Pear,Persimmon,Pitaya,Plum,Pomegranate,Tomatoes
Muskmelon,0.219101,0.009524,,0.027027,0.034483,0.020833,,,,0.076923,,,1.0,0.020408,
Tomatoes,0.174157,,,,,,,0.08,0.012821,,0.058824,,,,
Kiwi,0.106742,0.057143,,0.135135,,0.0625,,,0.076923,0.032967,0.176471,0.032258,,0.040816,0.058824
Pear,0.101124,0.028571,,0.067568,0.068966,0.020833,0.09375,,,,,,,,
Pitaya,0.101124,,,,,,,,0.128205,,,,,,0.117647
Guava,0.073034,0.028571,0.458333,,,0.0625,0.21875,,,0.186813,,,,0.020408,
Peach,0.073034,,,,,,,,,0.010989,,0.064516,,,0.235294
Pomegranate,0.050562,0.133333,0.125,0.027027,0.034483,0.145833,0.03125,,,0.032967,,,,,
Mango,0.039326,0.47619,,0.27027,0.172414,,,0.52,0.012821,0.098901,,,,0.428571,
Carambola,0.02809,0.12381,,0.175676,0.034483,0.0625,,,,,,,,0.081633,


In [95]:
import pickle

In [96]:
# Persist the misclassification table. The context manager closes the file
# even if pickle.dump raises part-way through.
with open('Wrong_preds_df.pickle', 'wb') as wrong_pickle_out:
    pickle.dump(wrong_predictions_df, wrong_pickle_out)

In [97]:
# Round-trip check: read the table back from disk. The context manager closes the
# file handle, which was previously left open.
# NOTE: pickle.load can execute arbitrary code; only load pickles this notebook wrote.
with open('Wrong_preds_df.pickle', 'rb') as pickle_wrong:
    test = pickle.load(pickle_wrong)

In [98]:
test

Unnamed: 0,Apple,Banana,Carambola,Guava,Kiwi,Mango,Muskmelon,Orange,Peach,Pear,Persimmon,Pitaya,Plum,Pomegranate,Tomatoes
Muskmelon,0.219101,0.009524,,0.027027,0.034483,0.020833,,,,0.076923,,,1.0,0.020408,
Tomatoes,0.174157,,,,,,,0.08,0.012821,,0.058824,,,,
Kiwi,0.106742,0.057143,,0.135135,,0.0625,,,0.076923,0.032967,0.176471,0.032258,,0.040816,0.058824
Pear,0.101124,0.028571,,0.067568,0.068966,0.020833,0.09375,,,,,,,,
Pitaya,0.101124,,,,,,,,0.128205,,,,,,0.117647
Guava,0.073034,0.028571,0.458333,,,0.0625,0.21875,,,0.186813,,,,0.020408,
Peach,0.073034,,,,,,,,,0.010989,,0.064516,,,0.235294
Pomegranate,0.050562,0.133333,0.125,0.027027,0.034483,0.145833,0.03125,,,0.032967,,,,,
Mango,0.039326,0.47619,,0.27027,0.172414,,,0.52,0.012821,0.098901,,,,0.428571,
Carambola,0.02809,0.12381,,0.175676,0.034483,0.0625,,,,,,,,0.081633,


In [None]:
# NOTE(review): this repeats the model_5 definition from an earlier cell. Running it
# rebinds `model_5` to a fresh, untrained network and discards the fitted one.
# The input shape is declared on the first layer (the only place Keras uses it) and
# includes the channel axis, matching the (n, 50, 50, 3) colour arrays.
model_5 = Sequential([Conv2D(10, kernel_size=1, activation='relu', input_shape=(50, 50, 3)),
                      Flatten(),
                      Dropout(0.3),
                      Dense(64, activation='sigmoid', bias_regularizer=regularizers.l1(0.02)),
                      Dense(15, activation='softmax')])

model_5.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['accuracy'])