In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../content/drive/MyDrive/dataset/leaves/"))

# Any results you write to the current directory are saved as output.

['test', 'train']


In [7]:
train_set = pd.read_csv("../content/drive/MyDrive/dataset/leaves/train.csv")
test_set  = pd.read_csv("../content/drive/MyDrive/dataset/leaves/test.csv")

In [8]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
labels = lb.fit_transform(train_set['label'])

In [9]:
test_ids = test_set.id

train = train_set.drop(['id', 'label'], axis=1)
test = test_set.drop(['id'], axis=1)
sc = StandardScaler().fit(test)
test_scaled = sc.transform(test)

In [10]:
ss_split = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
ss_split.get_n_splits(train, labels)

for train_index, test_index in ss_split.split(train, labels):   
    x_train, x_test = train.values[train_index], train.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]   

In [11]:
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [12]:
print(y_train.shape, y_test.shape)

(519,) (174,)


In [13]:
print(x_train.shape, x_test.shape)

(519, 192) (174, 192)


1. Naive Bayes

In [14]:
#gaussian naive bayes
gnb = GaussianNB().fit(x_train, y_train)

In [15]:
pred_gnb = gnb.predict_proba(x_test)

In [17]:
print("Accuracy: %.3f"
      % (accuracy_score(y_test, gnb.predict(x_test))))
print("Log-loss: %.3f"
      % (log_loss(y_test, pred_gnb)))

Accuracy: 0.034
Log-loss: 33.348


2. Logistic Regression

In [18]:
#logistic regression
lr = LogisticRegression(solver='newton-cg', multi_class='multinomial')

param_grid = {'C': [ 1000, 10000],
              'tol': [0.000001, 0.00001]}
grid_search = GridSearchCV(lr, param_grid, scoring='neg_log_loss', refit='True', n_jobs=1, cv=ss_split)
grid_search.fit(x_train, y_train)

print ('Best parameter: {}'.format(grid_search.best_params_))
print ('Best cross-validation neg_log_loss score: {}'.format(grid_search.best_score_))
print ('\nBest estimator:\n{}'.format(grid_search.best_estimator_))

Best parameter: {'C': 1000, 'tol': 1e-05}
Best cross-validation neg_log_loss score: -0.0888913052443087

Best estimator:
LogisticRegression(C=1000, multi_class='multinomial', solver='newton-cg',
                   tol=1e-05)


In [19]:
pred_lr = grid_search.predict_proba(x_test)

In [20]:
print("Accuracy: %.3f"
      % (accuracy_score(y_test, grid_search.predict(x_test))))
print("Log-loss: %.3f"
      % (log_loss(y_test, pred_lr)))

Accuracy: 0.983
Log-loss: 0.066


In [21]:
param_grid = {'C': [1000],
              'tol': [0.00001]}
log_reg = LogisticRegression(solver='newton-cg', multi_class='multinomial')
grid_search = GridSearchCV(log_reg, param_grid, scoring='neg_log_loss', refit='True', n_jobs=1, cv=ss_split)
grid_search.fit(x_train, y_train)

test_pred = grid_search.predict_proba(test_scaled)

sub = pd.DataFrame(test_pred, index=test_ids, columns=lb.classes_)

In [22]:
sub.head(2)

Unnamed: 0_level_0,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,Acer_Saccharinum,Alnus_Cordata,Alnus_Maximowiczii,Alnus_Rubra,Alnus_Sieboldiana,Alnus_Viridis,Arundinaria_Simonii,Betula_Austrosinensis,Betula_Pendula,Callicarpa_Bodinieri,Castanea_Sativa,Celtis_Koraiensis,Cercis_Siliquastrum,Cornus_Chinensis,Cornus_Controversa,Cornus_Macrophylla,Cotinus_Coggygria,Crataegus_Monogyna,Cytisus_Battandieri,Eucalyptus_Glaucescens,Eucalyptus_Neglecta,Eucalyptus_Urnigera,Fagus_Sylvatica,Ginkgo_Biloba,Ilex_Aquifolium,Ilex_Cornuta,Liquidambar_Styraciflua,Liriodendron_Tulipifera,Lithocarpus_Cleistocarpus,Lithocarpus_Edulis,Magnolia_Heptapeta,...,Quercus_Coccinea,Quercus_Crassifolia,Quercus_Crassipes,Quercus_Dolicholepis,Quercus_Ellipsoidalis,Quercus_Greggii,Quercus_Hartwissiana,Quercus_Ilex,Quercus_Imbricaria,Quercus_Infectoria_sub,Quercus_Kewensis,Quercus_Nigra,Quercus_Palustris,Quercus_Phellos,Quercus_Phillyraeoides,Quercus_Pontica,Quercus_Pubescens,Quercus_Pyrenaica,Quercus_Rhysophylla,Quercus_Rubra,Quercus_Semecarpifolia,Quercus_Shumardii,Quercus_Suber,Quercus_Texana,Quercus_Trojana,Quercus_Variabilis,Quercus_Vulcanica,Quercus_x_Hispanica,Quercus_x_Turneri,Rhododendron_x_Russellianum,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1509,6.536719e-07,4.947098e-09,5.382338e-07,6.113461e-06,1.865319e-09,1.327993e-09,4.747781e-08,6.541224e-10,1.256163e-08,8.155319e-10,2.019871e-07,7.000681e-10,6.933646e-10,5.009953e-12,4.753673e-10,7.07316e-09,2.119825e-10,2.848028e-09,3.910068e-08,5.208802e-07,7.641794e-10,1.384031e-08,1.283528e-09,1.285951e-10,2.437543e-07,9.838221e-10,2.698399e-08,1.831692e-06,1.332753e-10,7.406387e-08,1e-06,6.834956e-09,1.101492e-07,1.105213e-09,2.463362e-06,1.743484e-09,1.017856e-07,2.921882e-08,1.693038e-06,8.867547e-10,...,5.233244e-08,7.431645e-09,5.8e-05,5.020712e-09,2.148408e-08,1.200277e-09,6.89829e-11,7.453443e-10,1.132127e-05,2.9e-05,0.0007929191,7.736317e-09,7.370345e-08,5.885256e-08,3e-06,6.163761e-15,5.314372e-09,8.118181e-06,3.541335e-10,1.89299e-11,1.925755e-07,5.387909e-11,3e-06,7.550146e-07,1.132573e-10,2.713507e-07,2.816114e-07,3.70044e-08,1.12832e-07,4.458283e-12,5.071871e-07,0.999062,3.138835e-11,5.360539e-10,9.953696e-11,1.015628e-07,2.854759e-09,2.189001e-06,1.727421e-10,6.611966e-08
1339,1.422939e-07,1.903676e-08,2.031858e-06,5.418585e-07,1.882825e-06,1.780179e-07,1.342394e-07,1.818578e-05,4.465459e-09,1.288262e-06,0.0002944581,1.300751e-06,1.315526e-06,3.62981e-06,7.86504e-08,8.984331e-08,3.699012e-10,4.211885e-08,2.816907e-06,4.440921e-07,2.240024e-07,8.200852e-06,4.929474e-06,2.325477e-07,2.345494e-06,7.226347e-07,4.357241e-08,5.374155e-09,5.105282e-06,1.738872e-07,8e-06,1.68027e-05,3.52299e-06,5.385982e-06,1.282794e-09,1.691399e-06,2.595568e-08,5.354794e-10,1.002323e-08,1.356359e-10,...,1.624648e-08,4.865267e-07,9e-06,5.282908e-06,9.560893e-10,4.28448e-05,6.830833e-05,4.121967e-07,8.644609e-07,0.996132,2.277644e-07,4.634875e-05,3.586177e-07,2.373187e-08,0.00016,1.061432e-08,6.815806e-06,1.212834e-08,3.939502e-08,1.436675e-08,6.970682e-06,2.48695e-06,3e-06,1.148851e-11,2.66442e-07,5.339665e-09,7.278526e-08,0.0006639681,1.42619e-10,1.759766e-07,2.292816e-09,4e-06,3.994582e-07,0.001958845,2.310203e-06,1.811124e-08,2.674175e-10,4.945565e-08,2.069126e-07,0.0001004035


In [23]:
#svm
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(SVC(probability=True), param_grid=param_grid, cv=ss_split)
grid_search.fit(x_train, y_train)

print ('Best parameter: {}'.format(grid_search.best_params_))
print ('Best cross-validation neg_log_loss score: {}'.format(grid_search.best_score_))
print ('\nBest estimator:\n{}'.format(grid_search.best_estimator_))

Best parameter: {'C': 1, 'gamma': 0.01}
Best cross-validation neg_log_loss score: 0.9700000000000001

Best estimator:
SVC(C=1, gamma=0.01, probability=True)


In [24]:
pred_svm = grid_search.predict_proba(x_test)

In [25]:
print("Accuracy: %.3f"
      % (accuracy_score(y_test, grid_search.predict(x_test))))
print("Log-loss: %.3f"
      % (log_loss(y_test, pred_svm)))

Accuracy: 0.977
Log-loss: 2.720


3. KNN

In [26]:
#knn
knn = KNeighborsClassifier()

knn.fit(x_train, y_train)

KNeighborsClassifier()

In [27]:
pred_knn = knn.predict_proba(x_test)

In [28]:
print("Accuracy: %.3f"
      % (accuracy_score(y_test, knn.predict(x_test))))
print("Log-loss: %.3f"
      % (log_loss(y_test, pred_knn)))

Accuracy: 0.954
Log-loss: 0.419


In [29]:
test_predict =knn.predict_proba(test_scaled)

sub3 = pd.DataFrame(test_predict, index=test_ids, columns=lb.classes_)

In [30]:
sub3.head(2)

Unnamed: 0_level_0,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,Acer_Saccharinum,Alnus_Cordata,Alnus_Maximowiczii,Alnus_Rubra,Alnus_Sieboldiana,Alnus_Viridis,Arundinaria_Simonii,Betula_Austrosinensis,Betula_Pendula,Callicarpa_Bodinieri,Castanea_Sativa,Celtis_Koraiensis,Cercis_Siliquastrum,Cornus_Chinensis,Cornus_Controversa,Cornus_Macrophylla,Cotinus_Coggygria,Crataegus_Monogyna,Cytisus_Battandieri,Eucalyptus_Glaucescens,Eucalyptus_Neglecta,Eucalyptus_Urnigera,Fagus_Sylvatica,Ginkgo_Biloba,Ilex_Aquifolium,Ilex_Cornuta,Liquidambar_Styraciflua,Liriodendron_Tulipifera,Lithocarpus_Cleistocarpus,Lithocarpus_Edulis,Magnolia_Heptapeta,...,Quercus_Coccinea,Quercus_Crassifolia,Quercus_Crassipes,Quercus_Dolicholepis,Quercus_Ellipsoidalis,Quercus_Greggii,Quercus_Hartwissiana,Quercus_Ilex,Quercus_Imbricaria,Quercus_Infectoria_sub,Quercus_Kewensis,Quercus_Nigra,Quercus_Palustris,Quercus_Phellos,Quercus_Phillyraeoides,Quercus_Pontica,Quercus_Pubescens,Quercus_Pyrenaica,Quercus_Rhysophylla,Quercus_Rubra,Quercus_Semecarpifolia,Quercus_Shumardii,Quercus_Suber,Quercus_Texana,Quercus_Trojana,Quercus_Variabilis,Quercus_Vulcanica,Quercus_x_Hispanica,Quercus_x_Turneri,Rhododendron_x_Russellianum,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


4. Decission Tree

In [31]:
#DecisionTree
dt = DecisionTreeClassifier(criterion = "gini", 
            random_state = 100,max_depth=3, min_samples_leaf=5)
dt.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=100)

In [32]:
pred_dt = dt.predict_proba(x_test)

In [33]:
print("Accuracy: %.3f"
      % (accuracy_score(y_test, dt.predict(x_test))))
print("Log-loss: %.3f"
      % (log_loss(y_test, pred_dt)))

Accuracy: 0.040
Log-loss: 5.430


5. Random Forrest

In [34]:
#randomfforest
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [35]:
pred_rf = rf.predict_proba(x_test)

In [36]:
print("Accuracy: %.3f"
      % (accuracy_score(y_test, rf.predict(x_test))))
print("Log-loss: %.3f"
      % (log_loss(y_test, pred_rf)))

Accuracy: 0.351
Log-loss: 3.935


6. Linear Discriminant Analysis

In [37]:
lda = LinearDiscriminantAnalysis().fit(x_train, y_train)

In [38]:
pred_lda = lda.predict_proba(x_test)

print("Accuracy: %.3f"
      % (accuracy_score(y_test, lda.predict(x_test))))
print("Log-loss: %.3f"
      % (log_loss(y_test, pred_lda)))

Accuracy: 0.954
Log-loss: 1.001


In [39]:
sub.to_csv('leaves_classification.csv')

In [40]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

csv = drive.CreateFile({'title': 'leaves_classification.csv'})
csv.SetContentFile('leaves_classification.csv')
csv.Upload()