In [1]:
#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.tabular.all import *
import pandas as pd
import numpy as np
from torch.nn.functional import nll_loss,log_softmax

from sklearn import preprocessing
from functools import partial
import copy

In [2]:
file = "dbs/training/training-data-v0-3.csv"
obs = pd.read_csv(file)
obs = obs.loc[~(obs.imgid > 1)]
data = obs[['kg', 'elu_class1', 'elu_class2', 'elu_class3', 'species', 'normalized_month', 'season']].copy()
data

Unnamed: 0,kg,elu_class1,elu_class2,elu_class3,species,normalized_month,season
0,26,Cool Wet Plains,Carbonate Sedimentary Rock,Mostly Deciduous Forest,Usnea subfloridana,6,summer
1,26,Cold Wet Hills,Carbonate Sedimentary Rock,Mostly Deciduous Forest,Trametes ochracea,1,winter
2,27,Cold Wet Hills,Metamorphic Rock,Mostly Needleleaf/Evergreen Forest,Hertelidea botryosa,6,summer
3,26,Cold Wet Plains,Acidic Plutonics,Mostly Cropland,Phellinidium ferrugineofuscum,4,spring
4,26,Cold Wet Plains,Acidic Plutonics,Mostly Cropland,Lecania naegelii,10,autumn
...,...,...,...,...,...,...,...
12087603,29,Cold Wet Mountains,Mixed Sedimentary Rock,Sparse Vegetation,Cladonia mitis,7,summer
12087604,19,Cold Wet Mountains,Non-Carbonate Sedimentary Rock,Swampy or Often Flooded Vegetation,Cladonia rangiferina,7,summer
12087605,19,Cold Wet Mountains,Non-Carbonate Sedimentary Rock,Swampy or Often Flooded Vegetation,Nephromopsis cucullata,7,summer
12087606,19,Cold Wet Mountains,Non-Carbonate Sedimentary Rock,Swampy or Often Flooded Vegetation,Cladonia stellaris,7,summer


In [3]:
obs

Unnamed: 0,gbifid,_family,genus,species,eventyear,eventmonth,eventday,eventdate,decimallatitude,decimallongitude,kg,elu,elu_class1,elu_class2,elu_class3,normalized_month,imgid,season
0,1431462551,Parmeliaceae,Usnea,Usnea subfloridana,2016,5,31,2016-05-31 00:00:00,56.218360,16.416320,26,810,Cool Wet Plains,Carbonate Sedimentary Rock,Mostly Deciduous Forest,6,,summer
1,1438086806,Polyporaceae,Trametes,Trametes ochracea,2014,12,7,2014-12-07 00:00:00,58.448580,13.701640,26,372,Cold Wet Hills,Carbonate Sedimentary Rock,Mostly Deciduous Forest,1,,winter
2,1669971044,Stereocaulaceae,Hertelidea,Hertelidea botryosa,2012,5,21,2012-05-21 00:00:00,59.551380,12.236730,27,301,Cold Wet Hills,Metamorphic Rock,Mostly Needleleaf/Evergreen Forest,6,,summer
3,2270949504,Hymenochaetaceae,Phellinidium,Phellinidium ferrugineofuscum,2019,3,2,2019-03-02 00:00:00,59.845890,17.567560,26,451,Cold Wet Plains,Acidic Plutonics,Mostly Cropland,4,,spring
4,2271059142,Ramalinaceae,Lecania,Lecania naegelii,2018,9,12,2018-09-12 00:00:00,59.262680,15.548110,26,451,Cold Wet Plains,Acidic Plutonics,Mostly Cropland,10,,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12087603,3750486160,Cladoniaceae,Cladonia,Cladonia mitis,2006,6,27,2006-06-27 00:00:00,59.988337,-150.566099,29,14,Cold Wet Mountains,Mixed Sedimentary Rock,Sparse Vegetation,7,,summer
12087604,3750486210,Cladoniaceae,Cladonia,Cladonia rangiferina,2006,6,26,2006-06-26 00:00:00,60.318215,-150.361411,19,141,Cold Wet Mountains,Non-Carbonate Sedimentary Rock,Swampy or Often Flooded Vegetation,7,,summer
12087605,3750486608,Parmeliaceae,Nephromopsis,Nephromopsis cucullata,2006,6,26,2006-06-26 00:00:00,60.318215,-150.361411,19,141,Cold Wet Mountains,Non-Carbonate Sedimentary Rock,Swampy or Often Flooded Vegetation,7,,summer
12087606,3750486848,Cladoniaceae,Cladonia,Cladonia stellaris,2006,6,26,2006-06-26 00:00:00,60.318215,-150.361411,19,141,Cold Wet Mountains,Non-Carbonate Sedimentary Rock,Swampy or Often Flooded Vegetation,7,,summer


In [4]:
family_map = {species:family for (species,_,family) in obs.groupby(by=["species","genus","_family"]).indices.keys()}
family_list = list(family_map.values())
fle = preprocessing.LabelEncoder()
family_targets = torch.tensor(fle.fit_transform(family_list), device='cuda:0')
family_dims = family_targets.max().int().item() + 1

genus_map = {species:genus for (species,genus,_) in obs.groupby(by=["species","genus","_family"]).indices.keys()}
genus_list = list(genus_map.values())
gle = preprocessing.LabelEncoder()
genus_targets = torch.tensor(gle.fit_transform(genus_list), device='cuda:0')
genus_dims = genus_targets.max().int().item() + 1

In [6]:
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(data))
to = TabularPandas(data, procs=[Categorify, FillMissing,Normalize],
                   cat_names = ['kg', 'elu_class1', 'elu_class2', 'elu_class3', 'normalized_month', 'season'],
                   # cont_names = ['decimallatitude', 'decimallongitude'],
                   y_names='species',
                   splits=splits)
to

         kg  elu_class1  elu_class2  elu_class3  species  normalized_month  \
3729261  15          22           3           2      773                11   
4656730  26          11           5           5     1786                11   
5097181  27           9           1           3     1011                 9   
7794618  15          10           8           2     1219                11   
1518646  27          10           5           5      620                 6   
...      ..         ...         ...         ...      ...               ...   
4523706  26          11           5           5      830                 5   
4333343  26           2           0           0     1827                 1   
4447886  15          11          12           4      807                10   
2619258  27           9           5           5     1603                11   
4252802  27          10           3           1      769                10   

         season  
3729261       1  
4656730       1  
5097181  

In [7]:
def accuracy_species(inp, targ, axis=-1):
    pred,targ = flatten_check(inp.argmax(dim=axis), targ)
    return (pred == targ).float().mean()

def top_5(inp, targ, axis=-1):
    return top_n(5, inp, targ, axis)

def top_10(inp, targ, axis=-1):
    return top_n(10, inp, targ, axis)

def top_n(n, inp, targ, axis=-1):
    _, idx = torch.topk(inp, n)
    return (idx == targ.unsqueeze(axis)).any(axis).float().mean()

def accuracy_tax(tax_targets, inp, targ, axis=-1):
    temp = [torch.argmax(x) for x in inp]
    new_inp = tensor([tax_targets[x.long()] for x in temp])
    new_targ = tensor([tax_targets[x.long()] for x in targ])
    return (new_inp == new_targ).float().mean()    

def accuracy_family(inp, targ, axis=-1):
    return accuracy_tax(family_targets, inp, targ, axis)

def accuracy_genus(inp, targ, axis=-1):
    return accuracy_tax(genus_targets, inp, targ, axis)



In [8]:
batch_size = 16384
dls = to.dataloaders(bs=batch_size)
len(dls.vocab)

2957

In [9]:
learn = tabular_learner(dls)
learn.fine_tune(5)
learn.export("models/v0.3/tabular-f4-no-lat-long.pkl")

epoch,train_loss,valid_loss,accuracy,top_5,top_10,time
0,6.205503,6.178729,0.027301,0.017342,0.030011,00:25


epoch,train_loss,valid_loss,accuracy,top_5,top_10,time
0,6.136428,6.136303,0.028023,0.016973,0.030132,00:25
1,6.111842,6.113156,0.028683,0.017123,0.029955,00:25
2,6.093467,6.096765,0.029024,0.017238,0.03005,00:25
3,6.078681,6.085807,0.029469,0.017157,0.029976,00:25
4,6.070002,6.083148,0.029582,0.017133,0.030087,00:25


FileNotFoundError: [Errno 2] No such file or directory: 'models/tab/v0.3-tabular-f4-no-lat-long.pkl'

In [14]:
# preds = learn.get_preds(dl=dls.valid)
row, clas, probs = learn.predict(data.iloc[200035])

In [15]:
row, clas

(     kg  elu_class1  elu_class2  elu_class3  normalized_month  season  \
 0  16.0        10.0         5.0         2.0               2.0     4.0   
 
    decimallatitude  decimallongitude  species  
 0         0.110528         -0.217228   1299.0  ,
 tensor(1299))

torch.return_types.topk(
values=tensor([0.0135, 0.0108, 0.0105, 0.0105, 0.0104, 0.0100, 0.0099, 0.0094, 0.0094, 0.0094, 0.0092, 0.0089, 0.0089, 0.0084, 0.0084, 0.0083, 0.0081, 0.0080, 0.0080, 0.0079, 0.0077, 0.0077, 0.0077, 0.0075,
        0.0075, 0.0074, 0.0072, 0.0071, 0.0071, 0.0071, 0.0070, 0.0069, 0.0069, 0.0067, 0.0066, 0.0066, 0.0065, 0.0065, 0.0063, 0.0062, 0.0062, 0.0062, 0.0061, 0.0061, 0.0061, 0.0061, 0.0060, 0.0060,
        0.0059, 0.0056, 0.0056, 0.0055, 0.0055, 0.0052, 0.0052, 0.0052, 0.0052, 0.0051, 0.0051, 0.0051, 0.0051, 0.0050, 0.0050, 0.0050, 0.0049, 0.0048, 0.0048, 0.0047, 0.0047, 0.0047, 0.0047, 0.0046,
        0.0046, 0.0045, 0.0045, 0.0045, 0.0044, 0.0044, 0.0044, 0.0043, 0.0042, 0.0042, 0.0041, 0.0041, 0.0041, 0.0041, 0.0040, 0.0040, 0.0040, 0.0039, 0.0038, 0.0038, 0.0038, 0.0037, 0.0037, 0.0036,
        0.0036, 0.0036, 0.0035, 0.0035, 0.0034, 0.0034, 0.0033, 0.0033, 0.0033, 0.0032, 0.0032, 0.0032, 0.0032, 0.0031, 0.0030, 0.0030, 0.0030, 0.0030, 0.0030, 0.0030, 

In [None]:
(np.array(dls.vocab) == 'Arthonia spadicea').argmax()