In [104]:
#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.data.external import *
from ipywidgets import *
import pandas as pd
import numpy as np
import math
import sqlite3
from random import randint

In [2]:
def accuracy_species(inp, targ, axis=-1):
    """Species-level accuracy: share of samples whose arg-max class equals the target.

    The arg-max of `inp` along `axis` and `targ` are both passed through
    `flatten_check` before being compared element-wise.
    """
    best_class = inp.argmax(dim=axis)
    flat_pred, flat_targ = flatten_check(best_class, targ)
    hits = flat_pred == flat_targ
    return hits.float().mean()

def top_5(inp, targ, axis=-1):
    """Top-5 accuracy metric; delegates to `top_n` with n fixed to 5."""
    return top_n(n=5, inp=inp, targ=targ, axis=axis)

def top_10(inp, targ, axis=-1):
    """Top-10 accuracy metric; delegates to `top_n` with n fixed to 10."""
    return top_n(n=10, inp=inp, targ=targ, axis=axis)

def top_n(n, inp, targ, axis=-1):
    """Top-n accuracy: fraction of samples whose target is among the n highest scores.

    Args:
        n: how many of the highest-scoring classes count as a hit.
        inp: score tensor; classes are laid out along `axis`.
        targ: integer class indices, one per sample (i.e. `inp` without the `axis` dimension).
        axis: dimension of `inp` that holds the per-class scores.

    Returns:
        A scalar float tensor with the mean hit rate.
    """
    # topk raises when k exceeds the size of the dimension, so cap k at the class count.
    k = min(n, inp.shape[axis])
    # Rank along the same axis used for unsqueeze/any below; previously topk always
    # used its default (last) dimension regardless of `axis`.
    _, idx = torch.topk(inp, k, dim=axis)
    return (idx == targ.unsqueeze(axis)).any(axis).float().mean()

def accuracy_tax(tax_targets, inp, targ, axis=-1):
    """Accuracy measured at a coarser taxonomy level.

    Each row of `inp` is reduced to its arg-max class, then both the predicted and
    the target classes are looked up in `tax_targets` and compared.
    `axis` is accepted for signature parity with the other metrics but is not used.
    """
    predicted_classes = (torch.argmax(row) for row in inp)
    predicted_taxa = tensor([tax_targets[cls] for cls in predicted_classes])
    actual_taxa = tensor([tax_targets[cls] for cls in targ])
    matches = predicted_taxa == actual_taxa
    return matches.float().mean()

def accuracy_family(inp, targ, axis=-1):
    """Accuracy after mapping classes through the module-level `family_targets` lookup."""
    return accuracy_tax(family_targets, inp, targ, axis=axis)

def accuracy_genus(inp, targ, axis=-1):
    """Accuracy after mapping classes through the module-level `genus_targets` lookup."""
    return accuracy_tax(genus_targets, inp, targ, axis=axis)

def cross_entropy_species(input, target, weight=None, size_average=None, ignore_index=-100,
                          reduce=None, reduction='mean'):
    """Cross-entropy loss over the raw class scores.

    Args:
        input: unnormalised scores with classes on the last dimension.
        target: integer class indices.
        ignore_index: target value that is excluded from the loss.
        reduction: forwarded to `nll_loss` ('mean', 'sum' or 'none').
        weight, size_average, reduce: accepted for signature compatibility only;
            they are not forwarded (None is passed to `nll_loss` in their place).

    Returns:
        The negative log-likelihood of the targets under softmax(input).
    """
    # log_softmax is the numerically stable form of log(softmax(x)): computing the two
    # steps separately lets small probabilities underflow to 0, whose log is -inf.
    log_p = torch.log_softmax(input, dim=-1)
    return nll_loss(log_p, target, None, None, ignore_index, None, reduction)

def cross_entropy_tax(tax_targets, target_dims, input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    """Cross-entropy loss evaluated at a coarser taxonomy level.

    The per-class probabilities are summed into one bucket per taxon, and the
    negative log-likelihood is taken against the taxon of each target class.

    Args:
        tax_targets: 1-D index tensor; entry i is the taxon index of class i.
        target_dims: number of taxa, i.e. the width of the aggregated distribution.
        input: unnormalised scores with classes on dimension 1.
        target: integer class indices.
        ignore_index: target value that is excluded from the loss.
        reduction: forwarded to `nll_loss`.
        weight, size_average, reduce: accepted for signature compatibility only;
            they are not forwarded (None is passed to `nll_loss` in their place).

    Returns:
        The negative log-likelihood of the target taxa under the aggregated distribution.
    """
    # softmax to convert scores to probabilities
    input_p = torch.softmax(input, dim=1)

    # Work on whatever device the scores live on instead of a hard-coded 'cuda:0',
    # so the loss also runs on CPU or on a different GPU.
    tax_targets = tax_targets.to(input_p.device)

    # Sum the probabilities for each taxonomy classification:
    # one copy of the class->taxon lookup per batch row drives the scatter.
    tax_index = tax_targets.repeat(len(input_p), 1)
    new_input = torch.zeros(len(input_p), target_dims, dtype=input_p.dtype, device=input_p.device)
    new_input.scatter_add_(1, tax_index, input_p)

    # Create the new target by mapping each class index to its taxon index.
    new_target = TensorCategory(tax_targets[target].long())
    return nll_loss(torch.log(new_input), new_target, None, None, ignore_index, None, reduction)

def cross_entropy_family(input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    """Family-level loss: `cross_entropy_tax` bound to `family_targets` / `family_dims`."""
    return cross_entropy_tax(
        family_targets, family_dims, input, target,
        weight=weight, size_average=size_average, ignore_index=ignore_index,
        reduce=reduce, reduction=reduction,
    )

def cross_entropy_genus(input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    """Genus-level loss: `cross_entropy_tax` bound to `genus_targets` / `genus_dims`."""
    return cross_entropy_tax(
        genus_targets, genus_dims, input, target,
        weight=weight, size_average=size_average, ignore_index=ignore_index,
        reduce=reduce, reduction=reduction,
    )


def joint_loss(input, target, w=1, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    """Weighted blend of the species-level and genus-level cross-entropy losses.

    Args:
        input: unnormalised class scores.
        target: integer class indices.
        w: weight of the species term; the genus term gets `1 - w`.
        weight, size_average, ignore_index, reduce, reduction: forwarded unchanged
            to both component losses.

    Returns:
        `w * ce_species + (1 - w) * ce_genus`.
    """
    # Forward the caller's options. They used to be replaced with hard-coded defaults,
    # so e.g. a request for reduction='none' (per-item losses) still got a mean.
    ce_species = cross_entropy_species(input, target, weight=weight, size_average=size_average,
                                       ignore_index=ignore_index, reduce=reduce, reduction=reduction)

    ce_genus = cross_entropy_genus(input, target, weight=weight, size_average=size_average,
                                   ignore_index=ignore_index, reduce=reduce, reduction=reduction)

    # Linear combination of the cross-entropy scores at the 2 levels in hierarchy.
    return w*ce_species+(1-w)*ce_genus

In [3]:
def get_x(a):
    """Return the first element of the pair `a`."""
    x_part = a[0]
    return x_part
def get_y(a):
    """Return the second element of the pair `a`."""
    y_part = a[1]
    return y_part

In [49]:
# Load the three exported learners used below: a tabular model without the
# latitude/longitude features, the full tabular model, and the image model.
# NOTE(review): presumably the metric/loss helpers and get_x/get_y defined above must
# already exist when these exports are deserialised — confirm before reordering cells.
no_lat_tab_learn = load_learner('models/v0.3/tabular-f4-no-lat-long.pkl')
tab_learn = load_learner('models/v0.3/tabular-f4.pkl')
img_learn = load_learner('models/v0.3/resnet101-f7-fp16-h1.0.pkl')

In [5]:
# Read the v0.3 training observations into a DataFrame; every later cell derives from `data`.
file = "dbs/training/training-data-v0-3.csv"
data = pd.read_csv(file)

In [38]:
def get_tab_data(data):
    """Return an independent copy of `data` restricted to the tabular-model columns."""
    tab_columns = [
        'kg',
        'elu_class1',
        'elu_class2',
        'elu_class3',
        'decimallatitude',
        'decimallongitude',
        'species',
        'normalized_month',
        'season',
    ]
    selected = data[tab_columns]
    return selected.copy()

def get_img_data(data):
    """Build the image-model frame: an `img` path column plus `species`.

    The path is 'dbs/images/224/<gbifid>-<imgid>.png', with `imgid` cast to int
    first. The input frame is left unmodified.
    """
    gbif_part = data.gbifid.astype(str)
    img_part = data.imgid.astype(int).astype(str)
    paths = 'dbs/images/224/' + gbif_part + '-' + img_part + '.png'
    with_paths = data.assign(img=paths)
    return with_paths[['img', 'species']]

def get_species(indices):
    """Translate class indices into labels using the image learner's vocab."""
    names = []
    for idx in indices:
        names.append(img_learn.dls.vocab[idx])
    return names

def print_top50(probs):
    """Print the 50 highest entries of `probs` as (value, index, species name) triples."""
    top_values, top_indices = torch.topk(probs, 50)
    top_names = get_species(top_indices)
    triples = zip(top_values, top_indices, top_names)
    print(L(triples))

def get_results(learner, data):
    """Run `learner.predict` on a single item and return the third element of its result.

    `predict` is expected to return exactly three values; the first two are discarded.
    """
    _, _, probs = learner.predict(data)
    return probs

def get_bounding_box(lat, lon, dist, earth_radius_m=6378137):
    """Return the corners of a lat/lon box extending `dist` from a point in each direction.

    Args:
        lat: latitude of the centre, in degrees.
        lon: longitude of the centre, in degrees.
        dist: half-width of the box, in the same length unit as `earth_radius_m`.
        earth_radius_m: sphere radius used for the conversion (default 6378137).

    Returns:
        ((min_lat, min_lon), (max_lat, max_lon)) in degrees. Values are not wrapped
        or clamped, so boxes near the poles or the antimeridian can leave the usual
        [-90, 90] / [-180, 180] ranges.
    """
    # Angle subtended by `dist` along a great circle, converted to degrees.
    latdiff = math.degrees(dist / earth_radius_m)
    # Meridians converge with latitude, so the same distance spans more degrees of
    # longitude. math.cos expects radians; passing degrees gave a wrong (sometimes
    # negative) width.
    londiff = latdiff / math.cos(math.radians(lat))
    return (lat - latdiff, lon - londiff), (lat + latdiff, lon + londiff)

def get_db_species(conn, observation, dist):
    """Count the species observed inside a bounding box around `observation`.

    Args:
        conn: open database connection exposing `execute` (e.g. sqlite3).
        observation: object with `decimallatitude` and `decimallongitude` attributes.
        dist: half-width of the search box, passed to `get_bounding_box`.

    Returns:
        List of (species, count) rows, ordered by ascending count.
    """
    # Use the observation that was passed in; the previous code read the global
    # `tab_item`, so the argument was silently ignored.
    p1, p2 = get_bounding_box(observation.decimallatitude, observation.decimallongitude, dist)
    print(p1, p2)
    cursor = conn.execute("""
    SELECT species, COUNT(*)
    FROM validobservations v 
    JOIN trainingspecies t ON v.specieskey = t.specieskey
    WHERE decimallatitude BETWEEN ? AND ? 
    AND decimallongitude BETWEEN ? AND ? 
    GROUP BY 1 ORDER BY 2;""",
                 (p1[0], p2[0], p1[1], p2[1]))
    results = cursor.fetchall()
    print(len(results))
    # Hand the rows back to the caller; previously they were fetched and dropped.
    return results
                 

In [39]:
# Tabular frame: keep rows whose imgid is not greater than 1. Rows with a missing imgid
# are kept as well, because `NaN > 1` evaluates to False before the negation.
tab_data = data.loc[~(data.imgid > 1)].copy()
tab_data = get_tab_data(tab_data)

# Rows that have an image: build the image frame and the matching tabular frame from
# the same subset so both share row order and index.
has_img_data = data.loc[data.imgid.notna()].copy()
img_data = get_img_data(has_img_data)
img_tab_data = get_tab_data(has_img_data)

In [124]:
# Pick one random observation that has an image. randint's upper bound is inclusive,
# so subtract 1 to stay inside the valid positional range for .iloc.
# NOTE: `id` shadows the builtin; the name is kept because other cells may read it.
id = randint(0, len(img_data) - 1)
print(id)
print(img_tab_data.iloc[id],img_data.iloc[id])
tab_item = img_tab_data.iloc[id]
# with sqlite3.connect('dbs/fungid.sqlite') as conn:
#     get_db_species(conn, tab_item, 100000)
# Prediction from the tabular model for the sampled row.
tab_probs = get_results(tab_learn, tab_item)
print_top50(tab_probs)
# no_lat_probs = get_results(no_lat_tab_learn, tab_item)
# print_top50(no_lat_probs)
# Prediction from the image model for the same observation.
img_probs = get_results(img_learn, img_data.iloc[id].img)
print_top50(img_probs)
# Ad-hoc ensemble: image output plus the tabular output scaled by 100.
# NOTE(review): the recorded image outputs exceed 1, so they look like raw scores
# rather than probabilities — confirm that this sum mixes compatible scales.
print_top50(img_probs + (tab_probs * 100))

453399
kg                                                  15
elu_class1                          Cold Wet Mountains
elu_class2              Non-Carbonate Sedimentary Rock
elu_class3          Mostly Needleleaf/Evergreen Forest
decimallatitude                               57.06108
decimallongitude                           -135.327541
species                               Hydnellum peckii
normalized_month                                    10
season                                          autumn
Name: 1612715, dtype: object img        dbs/images/224/3355568145-1.png
species                   Hydnellum peckii
Name: 1612715, dtype: object


[(tensor(0.0212), tensor(90), 'Amanita muscaria'), (tensor(0.0179), tensor(1603), 'Lobaria pulmonaria'), (tensor(0.0150), tensor(1974), 'Peltigera membranacea'), (tensor(0.0124), tensor(1466), 'Laetiporus conifericola'), (tensor(0.0119), tensor(1602), 'Lobaria oregana'), (tensor(0.0114), tensor(2156), 'Pleurocybella porrigens'), (tensor(0.0112), tensor(1411), 'Lactarius fallax'), (tensor(0.0110), tensor(358), 'Cantharellus formosus'), (tensor(0.0107), tensor(615), 'Coprinus comatus'), (tensor(0.0103), tensor(1965), 'Peltigera britannica'), (tensor(0.0101), tensor(1967), 'Peltigera collina'), (tensor(0.0100), tensor(2149), 'Platismatia glauca'), (tensor(0.0080), tensor(976), 'Fomitopsis ochracea'), (tensor(0.0077), tensor(523), 'Clavulina coralloides'), (tensor(0.0074), tensor(1323), 'Icmadophila ericetorum'), (tensor(0.0073), tensor(2617), 'Sphaerophorus globosus'), (tensor(0.0072), tensor(1601), 'Lobaria linita'), (tensor(0.0071), tensor(854), 'Dolichousnea longissima'), (tensor(0.006

[(TensorBase(18.5630), TensorBase(1196), 'Hydnellum peckii'), (TensorBase(15.5111), TensorBase(1191), 'Hydnellum ferrugineum'), (TensorBase(12.3529), TensorBase(1184), 'Hydnellum aurantiacum'), (TensorBase(12.1856), TensorBase(1188), 'Hydnellum concrescens'), (TensorBase(11.9205), TensorBase(1198), 'Hydnellum spongiosipes'), (TensorBase(11.6931), TensorBase(2721), 'Thelephora terrestris'), (TensorBase(11.5354), TensorBase(1375), 'Laccaria amethysteo-occidentalis'), (TensorBase(11.3519), TensorBase(2545), 'Sarcodon imbricatus'), (TensorBase(11.1884), TensorBase(1379), 'Laccaria ochropurpurea'), (TensorBase(10.7803), TensorBase(2028), 'Phaeolus schweinitzii'), (TensorBase(10.7747), TensorBase(1468), 'Laetiporus persicinus'), (TensorBase(10.5181), TensorBase(1189), 'Hydnellum cyanopodium'), (TensorBase(10.4231), TensorBase(2424), 'Rhodofomes cajanderi'), (TensorBase(10.1791), TensorBase(1016), 'Ganoderma tsugae'), (TensorBase(10.0328), TensorBase(1194), 'Hydnellum lundellii'), (TensorBase

In [84]:
# Sanity check: both learners should expose the same number of classes, since their
# outputs are added index-by-index above.
len(img_learn.dls.vocab), len(tab_learn.dls.vocab)

(2957, 2957)

In [34]:
# Smallest value the tabular model assigned to any class for the sampled observation.
tab_probs.min()

tensor(3.0625e-07)

In [125]:
# Download a single photo, pad-resize it to 224 and run it through the image model.
url = 'https://inaturalist-open-data.s3.amazonaws.com/photos/3897840/large.jpg'
filename = 'tmp/tmp.jpg'
resized = 'tmp/tmp-resized.jpg'
# Pad with zeros rather than crop or squish, so the aspect ratio is preserved.
resize = Resize(224, ResizeMethod.Pad, pad_mode='zeros')
# NOTE(review): both files are written under 'tmp/', which presumably has to exist already — confirm.
download_url(url, filename)
img = PILImage.create(filename)
resize(img).save(resized)
# NOTE(review): the recorded output contains values above 1, so `test_probs` appears
# to hold raw scores rather than normalised probabilities — confirm.
test_probs = get_results(img_learn, resized)
print_top50(test_probs)

[(TensorBase(11.1817), TensorBase(883), 'Entoloma hochstetteri'), (TensorBase(10.8582), TensorBase(902), 'Entoloma virescens'), (TensorBase(3.0313), TensorBase(864), 'Entocybe nitida'), (TensorBase(2.9586), TensorBase(876), 'Entoloma euchroum'), (TensorBase(2.6364), TensorBase(1766), 'Mycena interrupta'), (TensorBase(2.4956), TensorBase(415), 'Chlorociboria aeruginascens'), (TensorBase(2.2694), TensorBase(2713), 'Terana coerulea'), (TensorBase(1.8844), TensorBase(157), 'Arrhenia chlorocyanea'), (TensorBase(1.7535), TensorBase(329), 'Caloscypha fulgens'), (TensorBase(1.0795), TensorBase(521), 'Clavogaster virescens'), (TensorBase(0.8473), TensorBase(2612), 'Sparassis crispa'), (TensorBase(0.7782), TensorBase(770), 'Cyanoboletus pulverulentus'), (TensorBase(0.7490), TensorBase(2423), 'Rhodocollybia maculata'), (TensorBase(0.4456), TensorBase(1744), 'Mycena amicta'), (TensorBase(0.3630), TensorBase(300), 'Byssocorticium atrovirens'), (TensorBase(0.2923), TensorBase(2627), 'Stereocaulon co