In [1]:
#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.data.external import *
from ipywidgets import *
import pandas as pd
import numpy as np
import math
import sqlite3
from random import randint

In [2]:
def accuracy_species(inp, targ, axis=-1):
    "Fraction of samples whose arg-max prediction along `axis` equals the target."
    predicted = inp.argmax(dim=axis)
    # flatten_check returns both tensors in a directly comparable form.
    predicted, targ = flatten_check(predicted, targ)
    hits = predicted == targ
    return hits.float().mean()

def top_5(inp, targ, axis=-1):
    "Top-5 accuracy: delegates to `top_n` with n=5."
    return top_n(n=5, inp=inp, targ=targ, axis=axis)

def top_10(inp, targ, axis=-1):
    "Top-10 accuracy: delegates to `top_n` with n=10."
    return top_n(n=10, inp=inp, targ=targ, axis=axis)

def top_n(n, inp, targ, axis=-1):
    """Top-`n` accuracy.

    Args:
        n: how many of the highest-scoring classes count as a hit.
        inp: score tensor with the class scores laid out along `axis`.
        targ: tensor of target class indices (one per sample).
        axis: dimension of `inp` that holds the class scores.

    Returns:
        Scalar tensor: fraction of samples whose target index is among the
        `n` highest-scoring classes.
    """
    # topk raises if asked for more entries than exist, so cap n at the class count.
    k = min(n, inp.shape[axis])
    # Rank along the same axis that is used for the comparison below
    # (previously topk always used the last dimension regardless of `axis`).
    _, idx = torch.topk(inp, k, dim=axis)
    return (idx == targ.unsqueeze(axis)).any(axis).float().mean()

def accuracy_tax(tax_targets, inp, targ, axis=-1):
    """Accuracy measured after mapping class indices through `tax_targets`.

    Args:
        tax_targets: 1-D lookup (tensor or sequence) where entry `i` is the
            taxonomy index that class `i` belongs to.
        inp: score tensor with class scores along `axis`.
        targ: tensor of target class indices.
        axis: dimension of `inp` holding the class scores.

    Returns:
        Scalar tensor: fraction of samples whose predicted class maps to the
        same taxonomy index as the target class.
    """
    # One vectorised lookup instead of a Python loop over every sample.
    lookup = torch.as_tensor(tax_targets, device=inp.device)
    pred_taxa = lookup[inp.argmax(dim=axis)].reshape(-1)
    targ_taxa = lookup[targ].reshape(-1)
    return (pred_taxa == targ_taxa).float().mean()

def accuracy_family(inp, targ, axis=-1):
    "`accuracy_tax` evaluated with the global `family_targets` lookup."
    return accuracy_tax(tax_targets=family_targets, inp=inp, targ=targ, axis=axis)

def accuracy_genus(inp, targ, axis=-1):
    "`accuracy_tax` evaluated with the global `genus_targets` lookup."
    return accuracy_tax(tax_targets=genus_targets, inp=inp, targ=targ, axis=axis)

def cross_entropy_species(input, target, weight=None, size_average=None, ignore_index=-100,
                          reduce=None, reduction='mean'):
    """Cross-entropy of raw class scores against target class indices.

    Args:
        input: raw (un-normalised) class scores, classes along the last dim.
        target: tensor of target class indices.
        weight, size_average, reduce: accepted for signature compatibility
            only; they are not forwarded to `nll_loss`.
        ignore_index: target value that is excluded from the loss.
        reduction: reduction mode forwarded to `nll_loss`.

    Returns:
        The negative log-likelihood loss tensor.
    """
    # log_softmax is evaluated in log space; log(softmax(x)) underflows to
    # -inf when one score dominates the others.
    log_probs = torch.log_softmax(input, dim=-1)
    return nll_loss(log_probs, target, None, None, ignore_index, None, reduction)

def cross_entropy_tax(tax_targets, target_dims, input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    """Cross-entropy computed at a coarser taxonomy level.

    Class probabilities are summed per taxonomy group and the loss is taken
    against the group that the target class belongs to.

    Args:
        tax_targets: 1-D integer tensor; entry `i` is the taxonomy index of
            class `i`.
        target_dims: number of taxonomy groups (width of the summed output).
        input: raw class scores of shape (batch, n_classes).
        target: tensor of target class indices.
        weight, size_average, reduce: accepted for signature compatibility
            only; they are not forwarded to `nll_loss`.
        ignore_index: target value that is excluded from the loss.
        reduction: reduction mode forwarded to `nll_loss`.

    Returns:
        The negative log-likelihood loss tensor at the taxonomy level.
    """
    # softmax to convert scores to probabilities
    input_p = torch.softmax(input, dim=1)
    batch_size = len(input_p)

    # Keep the lookup on the same device as the scores (no hard-coded
    # 'cuda:0') and in the int64 dtype that scatter_add_ requires.
    tax_targets = tax_targets.to(device=input_p.device, dtype=torch.long)

    # Sum the probabilities for each taxonomy classification.
    # expand() gives every row the same index vector without copying it.
    tax_index = tax_targets.unsqueeze(0).expand(batch_size, -1)
    new_input = torch.zeros(batch_size, target_dims, dtype=input_p.dtype, device=input_p.device)
    new_input.scatter_add_(1, tax_index, input_p)

    # Create the new target
    new_target = TensorCategory(tax_targets[target].long())

    # A group whose summed probability is exactly 0 (no member classes, or
    # underflow) would give log(0) = -inf and NaN gradients; clamp first.
    tiny = torch.finfo(new_input.dtype).tiny
    log_new_input = torch.log(new_input.clamp(min=tiny))
    return nll_loss(log_new_input, new_target, None, None, ignore_index, None, reduction)

def cross_entropy_family(input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    "`cross_entropy_tax` evaluated with the global `family_targets` / `family_dims`."
    return cross_entropy_tax(
        family_targets, family_dims, input, target,
        weight=weight, size_average=size_average, ignore_index=ignore_index,
        reduce=reduce, reduction=reduction,
    )

def cross_entropy_genus(input, target, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    "`cross_entropy_tax` evaluated with the global `genus_targets` / `genus_dims`."
    return cross_entropy_tax(
        genus_targets, genus_dims, input, target,
        weight=weight, size_average=size_average, ignore_index=ignore_index,
        reduce=reduce, reduction=reduction,
    )


def joint_loss(input, target, w=1, weight=None, size_average=None, ignore_index=-100,
                  reduce=None, reduction='mean'):
    """Weighted blend of the species-level and genus-level cross-entropy.

    Args:
        input: raw class scores.
        target: tensor of target class indices.
        w: weight of the species term; the genus term gets `1 - w`.
        weight, size_average, reduce: forwarded unchanged to both terms.
        ignore_index: target value excluded from both terms.
        reduction: reduction mode applied to both terms.

    Returns:
        `w * ce_species + (1 - w) * ce_genus`.
    """
    # Forward the caller's settings; they used to be replaced by hard-coded
    # defaults, so e.g. reduction='none' was silently ignored.
    ce_species = cross_entropy_species(input, target, weight=weight, size_average=size_average,
                                       ignore_index=ignore_index, reduce=reduce, reduction=reduction)

    ce_genus = cross_entropy_genus(input, target, weight=weight, size_average=size_average,
                                   ignore_index=ignore_index, reduce=reduce, reduction=reduction)

    # Linear combination of the cross-entropy scores at the 2 levels in hierarchy.
    return w*ce_species+(1-w)*ce_genus

In [3]:
def get_x(a):
    "Return the element at index 0 of `a`."
    first = a[0]
    return first
def get_y(a):
    "Return the element at index 1 of `a`."
    second = a[1]
    return second

In [4]:
# Load the three exported learners: two tabular models (one whose file name says
# "no-lat-long") and one image model.
# NOTE(review): load_learner presumably unpickles these files, which would be why the
# custom metric/loss functions defined above must already exist in this namespace —
# confirm. Only load .pkl files that come from a trusted source.
no_lat_tab_learn = load_learner('models/v0.3/tabular-f4-no-lat-long.pkl')
tab_learn = load_learner('models/v0.3/tabular-f4.pkl')
img_learn = load_learner('models/v0.3/resnet101-f7-fp16-h1.0.pkl')

In [6]:
# Read the training CSV into `data`; the tabular and image subsets used by
# other cells are sliced from this frame.
file = "dbs/training/training-data-v0-4.csv"
data = pd.read_csv(file)

In [43]:
# Load the per-species likelihood statistics from the local SQLite database.
# `with sqlite3.connect(...)` only manages a transaction and leaves the
# connection open, so close it explicitly once the query has been read.
con = sqlite3.connect('dbs/fungid.sqlite')
try:
    species_stats = pd.read_sql_query("SELECT s.species, s.stat, s.value, s.likelihood FROM speciesstats s;", con)
finally:
    con.close()

# Index by (stat, value, species) and sort, so that `.loc[(stat, value)]`
# returns the likelihoods of every species for that stat/value pair.
species_stats = species_stats.set_index(['stat', 'value', 'species'])
species_stats = species_stats.sort_index(level=species_stats.index.names)
species_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,likelihood
stat,value,species,Unnamed: 3_level_1
elu_class1,,Agaricus arvensis,0.001766
elu_class1,,Agaricus bitorquis,0.000710
elu_class1,,Agaricus campestris,0.005053
elu_class1,,Agaricus cupreobrunneus,0.015625
elu_class1,,Agaricus xanthodermus,0.002222
...,...,...,...
season,winter,Xylodon flaviporus,0.917563
season,winter,Xylodon radula,0.775200
season,winter,Xylodon sambuci,1.000000
season,winter,Xylodon subtropicus,0.983122


In [127]:
def get_tab_data(data):
    "Return an independent copy of `data` restricted to the tabular columns (including `species`)."
    wanted = [
        'kg',
        'elu_class1', 'elu_class2', 'elu_class3',
        'decimallatitude', 'decimallongitude',
        'species',
        'normalized_month', 'season',
    ]
    subset = data[wanted]
    return subset.copy()

def get_img_data(data):
    """Build an (`img`, `species`) frame from `data`.

    `img` is the path 'dbs/images/224/<gbifid>-<imgid>.png', with `imgid`
    cast to int before formatting. `data` itself is not modified.
    """
    gbif_part = data.gbifid.astype(str)
    img_part = data.imgid.astype(int).astype(str)
    paths = 'dbs/images/224/' + gbif_part + '-' + img_part + '.png'
    # assign() works on a copy, so the caller's frame keeps its columns.
    return data.assign(img=paths)[['img', 'species']]

def get_results(learner, data):
    "Run `learner.predict(data)` and return the third element of its 3-tuple result."
    _item, _label, probabilities = learner.predict(data)
    return probabilities

def get_bounding_box(lat, lon, dist, radius=6378137.0):
    """Corners of a lat/lon box extending `dist` from a point in each direction.

    Args:
        lat: latitude of the centre, in degrees.
        lon: longitude of the centre, in degrees.
        dist: half-width of the box, in the same length unit as `radius`.
        radius: sphere radius used for the conversion (default 6378137).

    Returns:
        ((lat_min, lon_min), (lat_max, lon_max)) in degrees.
    """
    # Angle, in degrees, subtended by `dist` on a sphere of the given radius.
    latdiff = math.degrees(dist / radius)
    # math.cos expects radians; `lat` is given in degrees.
    cos_lat = math.cos(math.radians(lat))
    if abs(cos_lat) < 1e-12:
        # At the poles every longitude is within reach.
        londiff = 180.0
    else:
        londiff = min(latdiff / abs(cos_lat), 180.0)
    return (lat - latdiff, lon - londiff), (lat + latdiff, lon + londiff)

def get_db_species(conn, observation, dist):
    """Count observations per species inside a box around `observation`.

    Args:
        conn: open sqlite3 connection that provides `validobservations`
            and `trainingspecies`.
        observation: record exposing `decimallatitude` and `decimallongitude`.
        dist: half-width of the search box, passed to `get_bounding_box`.

    Returns:
        List of (species, count) rows ordered by ascending count.
    """
    # Use the observation passed in rather than the notebook-global `tab_item`.
    p1, p2 = get_bounding_box(observation.decimallatitude, observation.decimallongitude, dist)
    print(p1, p2)
    cursor = conn.execute("""
    SELECT species, COUNT(*)
    FROM validobservations v 
    JOIN trainingspecies t ON v.specieskey = t.specieskey
    WHERE decimallatitude BETWEEN ? AND ? 
    AND decimallongitude BETWEEN ? AND ? 
    GROUP BY 1 ORDER BY 2;""",
                 (p1[0], p2[0], p1[1], p2[1]))
    results = cursor.fetchall()
    print(len(results))
    # Hand the rows back so callers can use them (previously only the count was printed).
    return results
                 

In [124]:
# Frame indexed by the image learner's vocabulary, with no other columns;
# get_stats joins per-species likelihoods onto it.
vocab_df = pd.DataFrame(img_learn.dls.vocab, columns=['species']).set_index('species')

In [125]:
def get_stats(observation, vocab_df):
    """Sum the per-stat likelihoods of every species for one observation.

    Looks up the global `species_stats` table for each (stat, value) pair
    taken from `observation`, adds the likelihoods per species and joins the
    totals onto `vocab_df`.

    Args:
        observation: record exposing `kg`, `elu_class1..3`,
            `normalized_month` and `season`.
        vocab_df: frame indexed by species; its index defines the rows (and
            their order) of the result.

    Returns:
        `vocab_df` joined with a `likelihood` column; species without any
        matching stats row get NaN from the join.

    Raises:
        KeyError: if a (stat, value) pair is absent from `species_stats`.
    """
    # (stat name as stored in species_stats, value observed for this record).
    # str() is used instead of .astype(str) so plain Python scalars work as
    # well as NumPy scalars.
    lookups = [
        ('kg', str(observation.kg)),
        ('elu_class1', observation.elu_class1),
        ('elu_class2', observation.elu_class2),
        ('elu_class3', observation.elu_class3),
        ('normalizedmonth', str(observation.normalized_month)),
        ('season', observation.season),
    ]
    per_stat = [species_stats.loc[key] for key in lookups]
    stats = pd.concat(per_stat).groupby('species').sum()
    return vocab_df.join(stats)
    
# Spot-check get_stats on one fixed row.
# NOTE(review): `img_tab_data` is only assigned in a later cell (In [69]), so this
# cell fails on a fresh top-to-bottom run; `id` also shadows the Python builtin.
id = 1647265
tab_item = img_tab_data.iloc[id]

lh = get_stats(tab_item, vocab_df)
print(lh)

                        likelihood
species                           
Abortiporus biennis       1.474794
Acanthophysium oakesii    0.729179
Acarospora fuscata        2.890364
Acarospora moenium        2.609207
Acarospora sinopica       2.639458
...                            ...
Xylodon flaviporus        1.765200
Xylodon radula            1.195467
Xylodon sambuci           1.588659
Xylodon subtropicus       1.320004
Xylopsora friesii         2.116894

[2957 rows x 1 columns]


In [69]:
# Tabular subset: drop rows whose imgid is greater than 1. Rows with a missing
# imgid are kept, because NaN > 1 evaluates to False before the negation.
tab_data = data.loc[~(data.imgid > 1)].copy()
tab_data = get_tab_data(tab_data)

# Rows that have an image: derive the image-path frame and the tabular frame
# from the same rows so the two can be indexed with the same position.
has_img_data = data.loc[data.imgid.notna()].copy()
img_data = get_img_data(has_img_data)
img_tab_data = get_tab_data(has_img_data)

In [229]:
def compare(probs1, probs2, vals2, num):
    """Tabulate the `num` highest entries of `probs1` next to `probs2`.

    One row per top entry, with columns (in order): vocabulary label from the
    global `img_learn`, the `probs1` value, the entry's rank in `probs2`
    (0 = highest), the `probs2` value, and the `vals2` value.
    """
    top_values, top_indices = torch.topk(probs1, num)
    # Double argsort turns scores into ranks (descending order).
    ranks2 = probs2.argsort(descending=True).argsort()
    top_values_np = top_values.cpu().detach().numpy()
    probs2_np = probs2.cpu().detach().numpy()

    rows = []
    for position, class_idx in enumerate(top_indices):
        rows.append((
            img_learn.dls.vocab[class_idx],
            top_values_np[position],
            ranks2[class_idx].item(),
            probs2_np[class_idx],
            vals2[class_idx].item(),
        ))
    return pd.DataFrame(rows)

# 124978
# 2119839 - Bad
# 1292929 - Negative numbers =
# 454673 - Dropped to 10 from 1
# 1068383 - Artificial messes with stuff.
# 193834 & 2073568 - Totally throws the data into mayhem
# 1370890 - Ignore Surface Water in ELU?
# Need to only pull 4-5 images per obs.
# randint is inclusive at both ends, so the upper bound must be len - 1 to
# stay inside the valid iloc range.
id = randint(0, len(img_data) - 1)
print(id)
print(img_tab_data.iloc[id],img_data.iloc[id])
tab_item = img_tab_data.iloc[id]
lh = get_stats(tab_item, vocab_df)
lh_tensor = torch.tensor(lh['likelihood'].values)
print((lh.max(), lh.min(), lh.loc[tab_item.species]))
# with sqlite3.connect('dbs/fungid.sqlite') as conn:
#     get_stats(conn, tab_item)
# tab_probs = get_results(tab_learn, tab_item)
# print_top50(tab_probs)
# no_lat_probs = get_results(no_lat_tab_learn, tab_item)
# print_top50(no_lat_probs)
img_probs = get_results(img_learn, img_data.iloc[id].img)
# Compare the raw image scores with the same scores re-weighted by the
# summed likelihoods.
compare(img_probs, img_probs * lh_tensor, lh_tensor, 50)
# img_probs.argsort(descending=True)

2056054
kg                                                   8
elu_class1                        Warm Moist Mountains
elu_class2              Non-Carbonate Sedimentary Rock
elu_class3          Mostly Needleleaf/Evergreen Forest
decimallatitude                              38.101116
decimallongitude                           -120.379223
species                             Aphroditeola olida
normalized_month                                     4
season                                          spring
Name: 10794698, dtype: object img        dbs/images/224/2005363073-1.png
species                 Aphroditeola olida
Name: 10794698, dtype: object
(likelihood    5.125481
dtype: float64, likelihood    0.01703
dtype: float64, likelihood    1.107878
Name: Aphroditeola olida, dtype: float64)


Unnamed: 0,0,1,2,3,4
0,Aphroditeola olida,18.358494,131,20.338967,1.107878
1,Clitopilus geminus,15.757133,1721,7.587691,0.48154
2,Paralepista flaccida,15.079522,973,11.161418,0.740171
3,Lentinellus cochleatus,14.916476,1956,6.568201,0.440332
4,Paralepista gilva,14.721306,363,16.516587,1.121951
5,Clitocybe costata,14.71471,653,13.350602,0.907296
6,Hygrophoropsis pallida,13.770617,514,14.639034,1.063063
7,Picipes melanopus,13.476491,151,19.82382,1.470993
8,Panus conchatus,13.120254,93,21.377804,1.629374
9,Hygrophoropsis aurantiaca,13.060636,320,17.016294,1.302869


In [202]:
# Largest summed likelihood over all species for the current `lh`.
lh.max()

likelihood    4.0
dtype: float64

In [191]:
# Summed likelihood of a single species in the current `lh`.
lh.loc['Clitocybe rivulosa']

likelihood    4.0
Name: Clitocybe rivulosa, dtype: float64

In [84]:
# Compare the vocabulary sizes of the image and tabular learners.
len(img_learn.dls.vocab), len(tab_learn.dls.vocab)

(2957, 2957)

In [34]:
# NOTE(review): `tab_probs` is only assigned in a commented-out line of another
# cell, so this cell fails on a fresh kernel.
tab_probs.min()

tensor(3.0625e-07)

In [125]:
# Download one photo, pad-resize it to 224 with zero padding, and run the image learner on it.
url = 'https://inaturalist-open-data.s3.amazonaws.com/photos/3897840/large.jpg'
filename = 'tmp/tmp.jpg'
resized = 'tmp/tmp-resized.jpg'
resize = Resize(224, ResizeMethod.Pad, pad_mode='zeros')
download_url(url, filename)
img = PILImage.create(filename)
# The resized copy is written to disk because get_results is given a file path here.
resize(img).save(resized)
test_probs = get_results(img_learn, resized)
# NOTE(review): `print_top50` is not defined anywhere in the visible part of the
# notebook — confirm it exists before relying on this cell.
print_top50(test_probs)

[(TensorBase(11.1817), TensorBase(883), 'Entoloma hochstetteri'), (TensorBase(10.8582), TensorBase(902), 'Entoloma virescens'), (TensorBase(3.0313), TensorBase(864), 'Entocybe nitida'), (TensorBase(2.9586), TensorBase(876), 'Entoloma euchroum'), (TensorBase(2.6364), TensorBase(1766), 'Mycena interrupta'), (TensorBase(2.4956), TensorBase(415), 'Chlorociboria aeruginascens'), (TensorBase(2.2694), TensorBase(2713), 'Terana coerulea'), (TensorBase(1.8844), TensorBase(157), 'Arrhenia chlorocyanea'), (TensorBase(1.7535), TensorBase(329), 'Caloscypha fulgens'), (TensorBase(1.0795), TensorBase(521), 'Clavogaster virescens'), (TensorBase(0.8473), TensorBase(2612), 'Sparassis crispa'), (TensorBase(0.7782), TensorBase(770), 'Cyanoboletus pulverulentus'), (TensorBase(0.7490), TensorBase(2423), 'Rhodocollybia maculata'), (TensorBase(0.4456), TensorBase(1744), 'Mycena amicta'), (TensorBase(0.3630), TensorBase(300), 'Byssocorticium atrovirens'), (TensorBase(0.2923), TensorBase(2627), 'Stereocaulon co