In [None]:
#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.data.external import *
from ipywidgets import *
import pandas as pd
import numpy as np
import math
import sqlite3
from random import randint

In [60]:
# Local stuff
from api.helpers import *
import api.tab_model as tm
import api.imageclassifier as ic
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Build the project's image-classifier wrapper from the v0.4 model pickle.
# NOTE(review): the file name suggests a ResNet-101 exported in fp16 — confirm
# against api.imageclassifier, which is not visible from this notebook.
img_learn = ic.ImageClassifier('models/v0.4/resnet101-f7-fp16-h1.0.pkl')

In [8]:
# Build the project's tabular-model wrapper, handing it the path of the SQLite
# database file. NOTE(review): presumably the model queries this database when
# predicting — confirm against api.tab_model.
tab_learn = tm.TabModel('dbs/fungid.sqlite')

In [94]:
# Load the v0.4 training table and keep only the rows that carry an image id.
file = "dbs/training/training-data-v0-4.csv"
data = (
    pd.read_csv(file)
    .loc[lambda frame: frame["imgid"].notna()]
    .copy()
)

# Path of each observation's image file: "<gbifid>-<imgid>.png" under dbs/images/500/.
data["img"] = (
    "dbs/images/500/"
    + data["gbifid"].astype(str)
    + "-"
    + data["imgid"].astype(int).astype(str)
    + ".png"
)

In [13]:
def get_bounding_box(lat, lon, dist):
    """Return two opposite corners of a lat/lon box centred on a point.

    Args:
        lat: Latitude of the centre point, in decimal degrees.
        lon: Longitude of the centre point, in decimal degrees.
        dist: Half-width of the box, in metres (same unit as the Earth
            radius constant used below).

    Returns:
        A pair ``((lat_min, lon_min), (lat_max, lon_max))`` in decimal degrees.

    Note:
        The longitude span grows without bound as ``lat`` approaches +/-90,
        because meridians converge at the poles.
    """
    # 6378137 m is the WGS-84 equatorial radius; dist / radius is the angular
    # offset in radians, which is then expressed in degrees.
    latdiff = math.degrees(dist / 6378137)
    # A degree of longitude shrinks by cos(latitude). math.cos works in
    # radians, so the latitude (given in degrees) must be converted first.
    londiff = latdiff / math.cos(math.radians(lat))
    return (lat - latdiff, lon - londiff), (lat + latdiff, lon + londiff)

def get_db_species(conn, observation, dist):
    """Count training species observed within a box around an observation.

    Args:
        conn: Database connection exposing ``execute(sql, params)`` with
            ``?`` placeholders (presumably a ``sqlite3.Connection`` — confirm).
        observation: Object with ``decimallatitude`` and ``decimallongitude``
            attributes, in decimal degrees.
        dist: Half-width of the search box, passed to ``get_bounding_box``.

    Returns:
        List of ``(species, count)`` rows, ordered by ascending count.
    """
    # Use the observation that was passed in, not a notebook-level global.
    p1, p2 = get_bounding_box(observation.decimallatitude, observation.decimallongitude, dist)
    print(p1, p2)
    cursor = conn.execute("""
    SELECT species, COUNT(*)
    FROM validobservations v 
    JOIN trainingspecies t ON v.specieskey = t.specieskey
    WHERE decimallatitude BETWEEN ? AND ? 
    AND decimallongitude BETWEEN ? AND ? 
    GROUP BY 1 ORDER BY 2;""",
                 (p1[0], p2[0], p1[1], p2[1]))
    results = cursor.fetchall()
    print(len(results))
    # Hand the rows back to the caller; previously they were fetched and dropped.
    return results
    
def get_tab_data(data):
    """Return an independent copy of the tabular-feature columns of ``data``.

    The selected columns, in order, are: kg, the three ELU class columns,
    latitude, longitude, species, normalizedmonth and season. A ``KeyError``
    is raised by pandas if any of them is missing.
    """
    wanted = [
        'kg',
        'elu_class1',
        'elu_class2',
        'elu_class3',
        'decimallatitude',
        'decimallongitude',
        'species',
        'normalizedmonth',
        'season',
    ]
    subset = data[wanted]
    return subset.copy()

def get_results(learner, data):
    """Run ``learner.predict(data)`` and return only its third element.

    ``predict`` must return exactly three values; the first two are discarded
    and the third (named ``probs`` by the original author) is returned.
    """
    _, _, probabilities = learner.predict(data)
    return probabilities
                 


In [95]:
def compare(probs1, probs2, vals2, num) -> pd.DataFrame:
    """Tabulate two score series side by side, ordered by the first one's rank.

    The frame is indexed like ``probs1``; the other inputs are aligned to that
    index. Columns: ``probs1``, ``rank1`` (descending rank of ``probs1``),
    ``probs2``, ``rank2`` (descending rank of ``probs2``) and ``vals2``.
    Only the ``num`` best rows by ``rank1`` are returned.
    """
    table = probs1.to_frame(name="probs1").assign(
        rank1=probs1.rank(ascending=False),
        probs2=probs2,
        rank2=probs2.rank(ascending=False),
        vals2=vals2,
    )
    ordered = table.sort_values(by="rank1")
    return ordered.head(num)

# 124978
# 2119839 - Bad
# 1292929 - Negative numbers =
# 454673 - Dropped to 10 from 1
# 1068383 - Artificial messes with stuff.
# 193834 & 2073568 - Totally throws the data into mayhem
# 1370890 - Ignore Surface Water in ELU?
# Need to only pull 4-5 images per obs.

# Pick one random row of `data` and compare the image-only ranking with the
# ranking obtained after weighting by the tabular model's predictions.
# randint includes both endpoints, so the upper bound is len(data) - 1;
# the row count comes from `data`, the frame that is actually indexed below.
# NOTE(review): `id` shadows the builtin; the name is kept in case later cells use it.
id = randint(0, len(data) - 1)
print(id)
item = data.iloc[id]
print(item)

obs = obs_from_series(item)

tab_preds = tab_learn.get_predictions(obs)
# Show the spread of the tabular scores and the score given to the true
# species of the sampled row (`item`, not a leftover kernel variable).
print((tab_preds.max(), tab_preds.min(), tab_preds.loc[item.species]))
img_preds = img_learn.get_predictions(obs.image)
# probs1/rank1 = image model alone; probs2/rank2 = image x tabular; vals2 = tabular.
df = compare(img_preds, img_preds * tab_preds, tab_preds, 50)
df

1893670
gbifid                                   3335137899
_family                                Hypocreaceae
genus                                     Hypomyces
species                      Hypomyces lactifluorum
familykey                                    8419.0
genuskey                                    7805465
specieskey                                  2561802
eventyear                                      2021
eventmonth                                        7
eventday                                         28
eventdate                       2021-07-28 13:40:00
decimallatitude                           43.779442
decimallongitude                         -75.206864
kg                                               26
elu                                             375
elu_class1                           Cold Wet Hills
elu_class2                         Metamorphic Rock
elu_class3                  Mostly Deciduous Forest
normalizedmonth                                   8
seas

Unnamed: 0_level_0,probs1,rank1,probs2,rank2,vals2
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Hypomyces lactifluorum,22.644676,1.0,81.857754,1.0,3.614879
Clathrus columnatus,10.448808,2.0,3.532539,1092.0,0.338081
Rhizina undulata,9.683091,3.0,30.533531,3.0,3.153283
Ceriporia spissa,9.517293,4.0,26.441506,6.0,2.778259
Trametes coccinea,9.410911,5.0,8.557209,366.0,0.909286
Aleuria aurantia,9.070466,6.0,17.817844,44.0,1.96438
Cantharellus roseocanus,8.931212,7.0,7.321134,479.0,0.819725
Hypomyces aurantius,8.71139,8.0,28.477597,4.0,3.269007
Clathrus ruber,8.353658,9.0,14.092861,98.0,1.687029
Laetiporus conifericola,8.323435,10.0,4.675251,870.0,0.561697


In [125]:
# Ad-hoc check: run the image classifier on a single photo fetched from a URL.
url = 'https://inaturalist-open-data.s3.amazonaws.com/photos/3897840/large.jpg'
# NOTE(review): both paths assume a local tmp/ directory already exists — confirm.
filename = 'tmp/tmp.jpg'
resized = 'tmp/tmp-resized.jpg'
# Resize transform targeting size 224 with the Pad method and zero padding
# (presumably fastai's Resize, brought in by the star imports — confirm).
resize = Resize(224, ResizeMethod.Pad, pad_mode='zeros')
# Save the remote photo to `filename`, then open it as an image.
download_url(url, filename)
img = PILImage.create(filename)
# Write the resized copy to disk so the learner can be given a file path.
resize(img).save(resized)
# get_results returns the third element of img_learn.predict(...).
test_probs = get_results(img_learn, resized)
# print_top50 is not defined in this notebook (presumably from api.helpers).
print_top50(test_probs)

[(TensorBase(11.1817), TensorBase(883), 'Entoloma hochstetteri'), (TensorBase(10.8582), TensorBase(902), 'Entoloma virescens'), (TensorBase(3.0313), TensorBase(864), 'Entocybe nitida'), (TensorBase(2.9586), TensorBase(876), 'Entoloma euchroum'), (TensorBase(2.6364), TensorBase(1766), 'Mycena interrupta'), (TensorBase(2.4956), TensorBase(415), 'Chlorociboria aeruginascens'), (TensorBase(2.2694), TensorBase(2713), 'Terana coerulea'), (TensorBase(1.8844), TensorBase(157), 'Arrhenia chlorocyanea'), (TensorBase(1.7535), TensorBase(329), 'Caloscypha fulgens'), (TensorBase(1.0795), TensorBase(521), 'Clavogaster virescens'), (TensorBase(0.8473), TensorBase(2612), 'Sparassis crispa'), (TensorBase(0.7782), TensorBase(770), 'Cyanoboletus pulverulentus'), (TensorBase(0.7490), TensorBase(2423), 'Rhodocollybia maculata'), (TensorBase(0.4456), TensorBase(1744), 'Mycena amicta'), (TensorBase(0.3630), TensorBase(300), 'Byssocorticium atrovirens'), (TensorBase(0.2923), TensorBase(2627), 'Stereocaulon co