In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd

In [2]:
# Read the training and test tables from the shared data directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Preview the first rows of the training table as loaded from CSV.
df_train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Replace the species names with integer class codes, overwriting the column.
# `le` keeps the fitted encoder after the column has been rewritten.
le = LabelEncoder()
df_train['species'] = le.fit_transform(df_train['species'])

In [6]:
# Preview the table again: `species` now holds the encoded integer labels.
df_train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,3,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,49,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,65,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,94,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,84,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [7]:
# Pick up to 25 distinct training rows at random for the image grid.
#
# Row positions are taken from a random permutation, i.e. sampled without
# replacement. Compared with drawing labels from [1, len] in a retry loop this
#   * can never request a label one past the end of the table (KeyError),
#   * makes the first row eligible,
#   * terminates even when the table has fewer than 25 rows.
df_length = len(df_train.index)
sampled_positions = np.random.permutation(df_length)[:25]

# Row labels of the sampled rows; the plotting cell builds file names from these.
# NOTE(review): if the image files are named after the `id` column rather than
# the row label, the plotting cell should iterate `unique_ids` instead -- confirm.
unique_images = df_train.index[sampled_positions].tolist()

# Values of the `id` column for the same rows, in the same order.
unique_ids = df_train['id'].iloc[sampled_positions].tolist()

In [8]:
# Show the sampled row labels that the plotting cell turns into image file names.
print(unique_images)

[17, 188, 7, 827, 849, 804, 516, 161, 711, 151, 958, 256, 575, 835, 236, 147, 461, 572, 583, 622, 769, 52, 726, 963, 209]


In [9]:
# Draw the sampled leaf images in a 5x5 grid and save the figure to disk.
# The grid of axes is created up front, so no placeholder axes has to be
# layered underneath the images and hidden afterwards.
image_folder = '../data/images/'
fig, axes = plt.subplots(5, 5, figsize=(10,6))

# Hide ticks and frames on every grid cell, including any that stay empty.
for ax in axes.flat:
    ax.axis('off')

# One image per grid cell, filled row by row.
for ax, image in zip(axes.flat, unique_images):
    image_file = image_folder+str(image)+'.jpg'
    img = mpimg.imread(image_file)
    ax.imshow(img, cmap='Greys')

output_file = 'plots/leaf_grid.jpg'
fig.savefig(output_file)   # SAVE FILE: black and white leaves

Now we will graph some training results from Step 1.

In [10]:
# Scores recorded for each classifier, keyed by classifier name and then by
# metric name. Each row of the table lists the metrics in `metric_names` order.
metric_names = (
    'Training_Accuracy',
    'Validation_Accuracy',
    'Testing_Score',
    'LogLoss_As_Scorer',
)

score_table = [
    ('Decision Tree',       (0.984, 0.561, 15.36690, 15.36690)),
    ('Random Forest',       (1.000, 0.944, 0.97268, 0.95484)),
    ('AdaBoost',            (0.366, 0.268, 3.09445, 2.45393)),
    ('Logistic Regression', (1.000, 0.990, 0.26996, 0.18344)),
    ('S. Gradient Descent', (0.995, 0.944, 0.41905, 0.62995)),
    ('Support Vectors',     (0.997, 0.995, 2.23691, 2.23739)),
]

results = {
    clf_name: dict(zip(metric_names, scores))
    for clf_name, scores in score_table
}

In [11]:
# Split the results table into parallel lists for plotting: classifier names
# and their training / validation accuracies, all in the dict's iteration order.
classifiers = list(results)
training_acc = [scores['Training_Accuracy'] for scores in results.values()]
valid_acc = [scores['Validation_Accuracy'] for scores in results.values()]

for series in (classifiers, training_acc, valid_acc):
    print(series)

['Support Vectors', 'Random Forest', 'Decision Tree', 'AdaBoost', 'S. Gradient Descent', 'Logistic Regression']
[0.997, 1.0, 0.984, 0.366, 0.995, 1.0]
[0.995, 0.944, 0.561, 0.268, 0.944, 0.99]


In [12]:
# Grouped bar chart of training vs. validation accuracy for each classifier,
# saved to plots/step1classifiers.jpg.
N = len(classifiers)

ind = np.arange(N)   # x position of the first bar in each classifier's pair
width = 0.35         # width of one bar; also the offset of the second bar

fig, ax = plt.subplots(figsize=(8,8))

training_rects = ax.bar(ind, training_acc, width, color='b', edgecolor='k')
valid_rects = ax.bar(ind+width, valid_acc, width, color='g', edgecolor='k')

ax.set_ylabel('Accuracy', fontsize=14)
# The bars plotted are training and validation accuracy, so the title names
# those two series (it previously said "Testing").
ax.set_title('Training and Validation Accuracy of Various Classifiers', y=1.08, fontsize=16)
ax.set_xticks(ind + width/2)
ax.set_xticklabels(classifiers, rotation=70, fontsize=14)

# Two-column legend anchored near the top of the axes.
ax.legend((training_rects[0], valid_rects[0]), ('Training', 'Validation'), 
          bbox_to_anchor=(.3, .97, 1., .102), loc=2,
           ncol=2, borderaxespad=0.)


# Extra bottom margin so the rotated tick labels are not clipped.
fig.subplots_adjust(bottom=0.28)
output_file = 'plots/step1classifiers.jpg'
fig.savefig(output_file)   # SAVE FILE: step1 classifiers

Now we will graph the log loss test scores obtained when accuracy and when log loss is used as the grid search scorer.

In [20]:
# Grouped bar chart of the test scores obtained with each grid search scorer,
# saved to plots/step1scores.jpg.

# Clip for graph beauty. The default argument makes the removal idempotent, so
# re-running this cell no longer raises KeyError once the entry is gone.
results.pop('Decision Tree', None)
classifiers = [clf for clf in results]
acc_as_scorer = [results[clf]['Testing_Score'] for clf in results]
ll_as_scorer = [results[clf]['LogLoss_As_Scorer'] for clf in results]

N = len(classifiers)

ind = np.arange(N)   # x position of the first bar in each classifier's pair
width = 0.35         # width of one bar; also the offset of the second bar

fig, ax = plt.subplots(figsize=(8,8))

acc_rects = ax.bar(ind, acc_as_scorer, width, color='r', edgecolor='k')
ll_rects = ax.bar(ind+width, ll_as_scorer, width, color='k', edgecolor='k')

ax.set_ylabel('Log Loss Test Score', fontsize=14)
ax.set_title('Test Scores for Various Classifiers', y=1.08, fontsize=16)
ax.set_xticks(ind + width/2)
ax.set_xticklabels(classifiers, rotation=70, fontsize=14)

# Two-column legend anchored near the top of the axes.
ax.legend((acc_rects[0], ll_rects[0]), ('Accuracy Scorer', 'Log Loss Scorer'), 
          bbox_to_anchor=(.3, .97, 1., .102), loc=2,
           ncol=2, borderaxespad=0.)


# Extra bottom margin so the rotated tick labels are not clipped.
fig.subplots_adjust(bottom=0.28)
output_file = 'plots/step1scores.jpg'
fig.savefig(output_file)   # SAVE FILE: step1 test scores