In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd

In [2]:
# Load the training and test tables from the CSV files under ../data.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Preview the first five training rows as loaded from disk.
df_train.head(n=5)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Replace the values of the 'species' column with integer class labels.
# Fitting and transforming happen in one call; the fitted encoder stays
# available as `le`.
le = LabelEncoder()
df_train['species'] = le.fit_transform(df_train['species'])

In [6]:
# Preview the first five rows again, now that 'species' has been encoded.
df_train.head(n=5)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,3,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,49,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,65,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,94,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,84,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [7]:
# Pick up to 25 distinct training rows at random for the image grid.
# `unique_images` collects row labels of df_train and `unique_ids` the matching
# values of the 'id' column.
# NOTE(review): the plotting cell builds file names from `unique_images`
# (row labels), not from `unique_ids` -- confirm that this is intended.
n_wanted = 25
unique_images = list()
unique_ids = list()

# Walk the row labels in random order instead of drawing random integers with
# replacement. Every candidate is then a label that exists in the frame (the
# earlier randint(1, len + 1) draw could never pick row 0 and could return
# len(df_train), one past the last label of a default index), and the loop
# always terminates, even when the frame holds fewer than `n_wanted` ids.
for index in np.random.permutation(df_train.index.values).tolist():
    img_id = df_train.loc[index, 'id']

    # Skip rows whose id has already been selected.
    if img_id in unique_ids:
        continue

    unique_ids.append(img_id)
    unique_images.append(index)

    if len(unique_images) >= n_wanted:
        break

In [8]:
# Show the sampled values; the plotting cell turns each one into an image file name.
print(unique_images)

[136, 61, 150, 74, 82, 78, 93, 518, 905, 674, 577, 846, 70, 104, 818, 212, 931, 600, 787, 375, 228, 511, 767, 466, 864]


In [9]:
# Draw the sampled leaf images on a 5x5 grid and save the figure.
image_folder = '../data/images/'

# Create the 25 panels explicitly rather than stacking add_subplot() calls on
# top of a dummy axes through the pyplot state machine.
fig, axes = plt.subplots(5, 5, figsize=(10, 6))

# Hide every panel's frame and ticks up front so panels without an image
# simply stay blank.
for grid_ax in axes.ravel():
    grid_ax.axis('off')

# zip() stops at the shorter sequence, so a list with fewer or more than 25
# entries cannot raise here.
for grid_ax, image in zip(axes.ravel(), unique_images):
    image_file = image_folder + str(image) + '.jpg'
    img = mpimg.imread(image_file)
    grid_ax.imshow(img, cmap='Greys')

output_file = 'plots/leaf_grid.jpg'
fig.savefig(output_file)   # SAVE FILE: black and white leaves

Now we will graph some training results from Step 1.

In [10]:
# Step 1 scores, keyed by classifier name. Each row below reads
# (name, training accuracy, validation accuracy, testing score,
#  score with log loss as the scorer); the comprehension turns every row into
# the per-classifier metrics dict.
results = {
    row[0]: dict(zip(
        ('Training_Accuracy', 'Validation_Accuracy', 'Testing_Score', 'LogLoss_As_Scorer'),
        row[1:]
    ))
    for row in [
        ('Decision Tree',       0.984, 0.561, 15.36690, 15.36690),
        ('Random Forest',       1.000, 0.944, 0.97268, 0.95484),
        ('AdaBoost',            0.366, 0.268, 3.09445, 2.45393),
        ('Logistic Regression', 1.000, 0.990, 0.26996, 0.18344),
        ('S. Gradient Descent', 0.995, 0.944, 0.41905, 0.62995),
        ('Support Vectors',     0.997, 0.995, 2.23691, 2.23739),
    ]
}

In [11]:
# Split the results table into parallel lists for plotting. All three lists
# follow the iteration order of `results`, so positions line up.
classifiers = list(results)
training_acc = [metrics['Training_Accuracy'] for metrics in results.values()]
valid_acc = [metrics['Validation_Accuracy'] for metrics in results.values()]

# Print each list on its own line as a quick sanity check.
for series in (classifiers, training_acc, valid_acc):
    print(series)

['Logistic Regression', 'Decision Tree', 'Support Vectors', 'S. Gradient Descent', 'Random Forest', 'AdaBoost']
[1.0, 0.984, 0.997, 0.995, 1.0, 0.366]
[0.99, 0.561, 0.995, 0.944, 0.944, 0.268]


In [12]:
# Chart 1: paired bars of training and validation accuracy per classifier.
N = len(classifiers)
width = 0.35          # width of a single bar, in x-axis units
ind = np.arange(N)    # x position of the training bar of each pair

fig, ax = plt.subplots(figsize=(8, 8))

# The validation bar is shifted right by one bar width so that it sits next
# to the training bar of the same classifier.
training_rects = ax.bar(ind, training_acc, width, color='b', edgecolor='k')
valid_rects = ax.bar(ind + width, valid_acc, width, color='g', edgecolor='k')

ax.set_title('Chart 1: Training and Validation Accuracy of Various Classifiers', y=1.08, fontsize=16)
ax.set_ylabel('Accuracy', fontsize=14)

# Place each tick at the boundary between the two bars of a pair.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(classifiers, rotation=70, fontsize=14)

# One representative bar per series is enough to build the legend entries.
legend_handles = (training_rects[0], valid_rects[0])
ax.legend(legend_handles, ('Training', 'Validation'),
          bbox_to_anchor=(.3, .97, 1., .102), loc=2,
          ncol=2, borderaxespad=0.)

# Extra bottom margin leaves room for the rotated tick labels.
fig.subplots_adjust(bottom=0.28)
output_file = 'plots/step1classifiers.jpg'
fig.savefig(output_file)   # SAVE FILE: step1 classifiers

Now we will graph the log loss test scores obtained when accuracy and when log loss is used as the grid search scorer.

In [13]:
# Chart 2: paired bars of the test score per classifier, one bar for each
# grid search scorer.

# Leave the Decision Tree out of this chart (clip for graph beauty: its score
# of 15.36690 is far above every other value). Filtering into a new dict
# instead of calling results.pop() keeps `results` intact, so this cell can be
# re-run without a KeyError and earlier cells still see every classifier.
chart_results = {clf: scores for clf, scores in results.items() if clf != 'Decision Tree'}

classifiers = list(chart_results)
acc_as_scorer = [chart_results[clf]['Testing_Score'] for clf in classifiers]
ll_as_scorer = [chart_results[clf]['LogLoss_As_Scorer'] for clf in classifiers]

N = len(classifiers)

ind = np.arange(N)    # x position of the first bar of each pair
width = 0.35          # width of a single bar, in x-axis units

fig, ax = plt.subplots(figsize=(8, 8))

# The second bar of each pair is shifted right by one bar width.
acc_rects = ax.bar(ind, acc_as_scorer, width, color='r', edgecolor='k')
ll_rects = ax.bar(ind+width, ll_as_scorer, width, color='k', edgecolor='k')

ax.set_ylabel('Log Loss Test Score', fontsize=14)
ax.set_title('Chart 2: Test Scores for Various Classifiers', y=1.08, fontsize=16)
# Place each tick at the boundary between the two bars of a pair.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(classifiers, rotation=70, fontsize=14)

ax.legend((acc_rects[0], ll_rects[0]), ('Accuracy Scorer', 'Log Loss Scorer'),
          bbox_to_anchor=(.3, .97, 1., .102), loc=2,
          ncol=2, borderaxespad=0.)

# Extra bottom margin leaves room for the rotated tick labels.
fig.subplots_adjust(bottom=0.28)
output_file = 'plots/step1scores.jpg'
fig.savefig(output_file)   # SAVE FILE: step1 test scores

Now for the neural network training graph:

In [4]:
# Epoch numbers 1..40, used as the x axis and as the keys of `nn_data`.
epochs = list(range(1, 41))

# Per-epoch training log of the neural network. Each row below reads
# (loss, acc, val_loss, val_acc); rows are listed in epoch order and are keyed
# by their 1-based position.
nn_data = {
    epoch: dict(zip(('loss', 'acc', 'val_loss', 'val_acc'), row))
    for epoch, row in enumerate([
        (4.5174, 0.0328, 4.3558, 0.0808),
        (3.8304, 0.1932, 3.3135, 0.3586),
        (2.6560, 0.4141, 2.0825, 0.5354),
        (1.6383, 0.6035, 1.2187, 0.7525),
        (1.0849, 0.7399, 0.8377, 0.8283),
        (0.7133, 0.8396, 0.5036, 0.9141),
        (0.5181, 0.8624, 0.3666, 0.9293),
        (0.4198, 0.9003, 0.2521, 0.9596),
        (0.3056, 0.9268, 0.2309, 0.9495),
        (0.2688, 0.9268, 0.1762, 0.9697),
        (0.2447, 0.9394, 0.1518, 0.9798),
        (0.1813, 0.9609, 0.1213, 0.9899),
        (0.1903, 0.9583, 0.1395, 0.9596),
        (0.1406, 0.9684, 0.0913, 0.9848),
        (0.1805, 0.9457, 0.1010, 0.9848),
        (0.1351, 0.9710, 0.0994, 0.9798),
        (0.0884, 0.9836, 0.0740, 0.9798),
        (0.0816, 0.9836, 0.0862, 0.9798),
        (0.0959, 0.9722, 0.0676, 0.9848),
        (0.0837, 0.9836, 0.0730, 0.9899),
        (0.0694, 0.9861, 0.0767, 0.9798),
        (0.0577, 0.9886, 0.0702, 0.9798),
        (0.0705, 0.9798, 0.0636, 0.9747),
        (0.0716, 0.9735, 0.0756, 0.9747),
        (0.0471, 0.9937, 0.0632, 0.9848),
        (0.0496, 0.9899, 0.0728, 0.9798),
        (0.0439, 0.9912, 0.0481, 0.9899),
        (0.0378, 0.9924, 0.1042, 0.9747),
        (0.0614, 0.9861, 0.0691, 0.9848),
        (0.0609, 0.9798, 0.1010, 0.9646),
        (0.0553, 0.9874, 0.0481, 0.9899),
        (0.0351, 0.9949, 0.0689, 0.9747),
        (0.0255, 0.9975, 0.0642, 0.9899),
        (0.0313, 0.9912, 0.0367, 0.9899),
        (0.0259, 0.9937, 0.0549, 0.9798),
        (0.0328, 0.9912, 0.0267, 0.9899),
        (0.0189, 0.9975, 0.0496, 0.9899),
        (0.0184, 0.9975, 0.0364, 0.9949),
        (0.0315, 0.9861, 0.0485, 0.9848),
        (0.0196, 0.9987, 0.0432, 0.9848),
    ], start=1)
}

In [16]:
# Unpack the per-epoch records into one list per metric, ordered by `epochs`.
losses = [nn_data[epoch]['loss'] for epoch in epochs]
accuracies = [nn_data[epoch]['acc'] for epoch in epochs]
val_losses = [nn_data[epoch]['val_loss'] for epoch in epochs]
val_accuracies = [nn_data[epoch]['val_acc'] for epoch in epochs]

In [19]:
# Chart 3: loss (top panel, log scale) and accuracy (bottom panel) per epoch,
# with training points in blue and validation points in red.
fig, (ax1, ax2) = plt.subplots(2, figsize=(8, 8), sharex=True)

# Remove the vertical gap so the two panels share one x axis visually.
fig.subplots_adjust(hspace=0)

# Top panel: losses.
ax1.set_yscale("log")
ax1.scatter(epochs, losses, c='b', edgecolors='b')
ax1.scatter(epochs, val_losses, c='r')
ax1.set_ylabel("Loss (log scale)")

# Bottom panel: accuracies.
ax2.scatter(epochs, accuracies, c='b')
ax2.scatter(epochs, val_accuracies, c='r')
ax2.set_ylabel("Accuracy")
ax2.set_xlabel("Epoch")

# The legend is attached to the bottom panel; its labels are matched to that
# panel's two scatter series in plotting order.
ax2.legend(('Training', 'Validation'),
           bbox_to_anchor=(0, 0.97, 1, 1),
           loc='upper center',
           ncol=2, borderaxespad=0.)

ax1.set_title('Chart 3: Training Loss and Accuracy for Neural Network', fontsize=16)

output_file = 'plots/step2.jpg'
fig.savefig(output_file)

Draw the neural network:

Code from https://gist.github.com/craffel/2d727968c3aaebd10359

In [23]:
def draw_neural_net(ax, left, right, bottom, top, layer_sizes):
    '''
    Draw a neural network cartoon using matplotlib.

    Every node is drawn as a white circle with a black outline, and every
    pair of nodes in adjacent layers is joined by a black line. Layers are
    laid out left to right and each layer is centred vertically between
    ``bottom`` and ``top``.

    :usage:
        >>> fig = plt.figure(figsize=(12, 12))
        >>> draw_neural_net(fig.gca(), .1, .9, .1, .9, [4, 7, 2])

    :parameters:
        - ax : matplotlib.axes.AxesSubplot
            The axes on which to plot the cartoon (get e.g. by plt.gca())
        - left : float
            The center of the leftmost node(s) will be placed here
        - right : float
            The center of the rightmost node(s) will be placed here
        - bottom : float
            The center of the bottommost node(s) will be placed here
        - top : float
            The center of the topmost node(s) will be placed here
        - layer_sizes : list of int
            List of layer sizes, including input and output dimensionality.
            An empty list, or a list whose largest entry is not positive,
            draws nothing. A single-entry list draws one column of nodes
            at ``left``.

    :returns:
        None. The artists are added to ``ax``.
    '''
    # Nothing to draw; returning early also avoids max() on an empty sequence
    # and a division by zero in the vertical spacing below.
    if len(layer_sizes) == 0 or max(layer_sizes) <= 0:
        return

    # The widest layer fixes the vertical spacing used by every layer.
    v_spacing = (top - bottom)/float(max(layer_sizes))
    # A single layer has no gaps between layers; clamping the divisor to 1
    # avoids a ZeroDivisionError. The spacing is then never used to offset a
    # node, because the only layer has index 0.
    h_spacing = (right - left)/float(max(len(layer_sizes) - 1, 1))
    # Nodes
    for n, layer_size in enumerate(layer_sizes):
        # y coordinate of the top node, chosen so the layer is centred vertically.
        layer_top = v_spacing*(layer_size - 1)/2. + (top + bottom)/2.
        for m in range(layer_size):
            # zorder=4 keeps the circles on top of the edge lines.
            circle = plt.Circle((n*h_spacing + left, layer_top - m*v_spacing), v_spacing/4.,
                                color='w', ec='k', zorder=4)
            ax.add_artist(circle)
    # Edges
    for n, (layer_size_a, layer_size_b) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        layer_top_a = v_spacing*(layer_size_a - 1)/2. + (top + bottom)/2.
        layer_top_b = v_spacing*(layer_size_b - 1)/2. + (top + bottom)/2.
        # One line per (node in layer n, node in layer n + 1) pair.
        for m in range(layer_size_a):
            for o in range(layer_size_b):
                line = plt.Line2D([n*h_spacing + left, (n + 1)*h_spacing + left],
                                  [layer_top_a - m*v_spacing, layer_top_b - o*v_spacing], c='k')
                ax.add_artist(line)

In [None]:
# Render the network diagram for layer sizes 512 -> 1024 -> 99 and save it.
# NOTE(review): draw_neural_net adds one line per pair of nodes in adjacent
# layers, i.e. 512*1024 + 1024*99 = 625,664 lines for these sizes -- this cell
# may take a long time to render; confirm that the full-size drawing is wanted.
fig, ax = plt.subplots(figsize=(6, 6))
ax.axis('off')
draw_neural_net(ax, left=.1, right=.9, bottom=.1, top=.9, layer_sizes=[512, 1024, 99])
fig.savefig('plots/nn.png')