In [9]:
"""

This script collects image data from the biodiversity project of the Belgian
Royal Institute, which contains 4,200 images and taxonomy entries of
Coleoptera (beetles).
"""

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import os
import urllib.request as req
from urllib.request import urlopen
import re

from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')


In [0]:
# Work inside the beetles folder on the mounted drive; the relative file names
# used later (downloaded images, master_csv.csv) resolve against this folder.
os.chdir('/content/drive/My Drive/beetles')

In [292]:
# Parse every HTML table on results page 3 into a list of DataFrames and print
# the first one as a sample of the page layout.
tables = pd.read_html('http://projects.biodiversity.be/openuprbins/search?page=3')
print(tables[0])

             0            1
0      Family:  Buprestidae
1   Subfamily:            /
2       Genus:    Buprestis
3     Species:      rufipes
4  Subspecies:            /


In [0]:
# Example image URL (pictures_only path):
# http://projects.biodiversity.be/openup/rbins/pictures_only/1.jpg

In [0]:
# 1) PREP DATA
# the data for each entry is always split into 3 tables... we need to combine the 3 tables 
# to get the total information for a given entry
# ASSUMPTION : the relevant tables for each entry are always [0,1,2], [3,4,5], ... [15,16,17]
# ASSUMPTION : Always 6 entries per page

# 2) EXTRACT LABELS
# We assume that the Family, Genus, Species, and View are relevant.
# Only if the View is DORSAL will we collect the labels

# 3) COLLECT IMAGE
# If there is a dorsal view, we will download the image.
# We must find the url in the html code.
# ASSUMPTION : the order of our regex for the files matches order of data 
#                 present on the webpage

# 4) CREATE CSV
# The csv will contain everything needed to do the classification. This is
# namely the images and the labels in a convenient package.



In [104]:
# Sanity check: list the start index of each entry's table triple (0, 3, ..., 15).
# NOTE(review): `i` stays defined after this loop; a later cell calls
# collectEntry(tables, i) without assigning `i` itself, so it depends on
# whichever loop ran last.
for i in range(0,17,3):
  print(i)

0
3
6
9
12
15


In [0]:
# Build the first entry on the page by hand.
# The details of one entry are spread over three consecutive tables, so stack
# tables 0-2 into a single two-column table.
entry1 = pd.concat([tables[0], tables[1], tables[2]])
# For convenience, use the first column (Family:, Subspecies:, ...) as row labels.
label_column = entry1.columns[0]
entry1 = entry1.set_index(label_column)

entry1

In [0]:
def collectEntry(tables,index):
  """Assemble the label row for the entry whose tables start at `index`.

  The three tables tables[index], tables[index+1] and tables[index+2] are
  stacked, keyed by their first column, reduced to the 'Family:', 'Genus:',
  'Species:' and 'View:' rows and transposed so that those labels become
  columns. A 'Filename:' column filled with NaN is appended for the caller
  to fill in.

  Returns a DataFrame with the columns
  'Family:', 'Genus:', 'Species:', 'View:', 'Filename:'.
  """
  # The three consecutive tables that together describe one entry.
  pieces = [tables[index + offset] for offset in range(3)]
  stacked = pd.concat(pieces)

  # First column holds the field names (Family:, Subspecies:, ...); use it as row labels.
  keyed = stacked.set_index(stacked.columns[0])

  # Keep only the labels of interest plus the view, then turn rows into columns.
  wanted = ['Family:',"Genus:","Species:","View:"]
  record = keyed.loc[wanted].T

  # Placeholder until the image file name is known.
  record['Filename:'] = np.nan
  return record

In [236]:
# Try collectEntry on one entry and fill in a placeholder file name.
# NOTE(review): `i` is not assigned in this cell; it is whatever value an
# earlier loop left behind, so the entry shown depends on execution order.
# TODO: pass an explicit index.
entry_oi = collectEntry(tables,i)
entry_oi['Filename:'] = 'hi'
entry_oi

Unnamed: 0,Family:,Genus:,Species:,View:,Filename:
1,Brentidae,Eutrachelus,temmincki,Lateral,hi


In [0]:
## CHECK IF VIEW IS DORSAL 

def whichImages(tables):
  """Download the dorsal-view images of one results page and list their names.

  Walks the six entries of a page (three tables each, see collectEntry) and,
  for every entry whose view is 'Dorsal', downloads the matching picture into
  the current working directory.

  Parameters
  ----------
  tables : list of pandas.DataFrame
      Tables parsed from one results page with pd.read_html.

  Returns
  -------
  list of str
      File names (e.g. '13.jpg') of the images that were downloaded, in page
      order.

  Notes
  -----
  NOTE(review): `thumbnails` is not a parameter. It is read from the notebook
  namespace and must hold the page's thumbnail URLs in page order
  (getBeetleDataset builds such a list with re.findall). TODO: pass it in.
  """
  images_to_pull = []
  for i in range(0,17,3):
    entry_oi = collectEntry(tables,i)
    # collectEntry returns a single-row frame whose *columns* are the labels,
    # so 'View:' must be looked up as a column (a row lookup raises KeyError).
    if (entry_oi.loc[ : , 'View:'] == 'Dorsal').iloc[0]:
      # Position of this entry on the page (0-5) selects its thumbnail.
      image_order_ = i // 3
      # The file name is the last segment of the thumbnail URL, e.g. '13.jpg'.
      image_id = thumbnails[image_order_].split('/')[5]
      # image_id already ends in '.jpg', so it is used as the local name as-is.
      req.urlretrieve("http://projects.biodiversity.be/openup/rbins/pictures_only/" + image_id, image_id)
      images_to_pull.append(image_id)

  return(images_to_pull)

In [307]:
# Collect the first three entries of the page (table offsets 0, 3 and 6) and
# stack them into one label table.
entry1, entry2, entry3 = [collectEntry(tables, offset) for offset in (0, 3, 6)]

allentry = pd.concat([entry1, entry2, entry3])
allentry

Unnamed: 0,Family:,Genus:,Species:,View:,Filename:
1,Buprestidae,Buprestis,rufipes,Dorsal,
1,Buprestidae,Calodema,kirbyi,Dorsal,
1,Buprestidae,Calodema,plebeja,Dorsal,


In [0]:

def getBeetleDataset(URL):
  """Scrape one results page: download its dorsal images and return their labels.

  Parameters
  ----------
  URL : str
      Address of a search-results page, e.g. '.../search?page=3'.

  Returns
  -------
  pandas.DataFrame
      One row per dorsal-view entry with the columns produced by collectEntry
      ('Family:', 'Genus:', 'Species:', 'View:', 'Filename:'), where
      'Filename:' holds the downloaded image name. An empty DataFrame is
      returned when the page has no dorsal-view entry.

  Side effects
  ------------
  For every dorsal entry the working directory is set to the beetles folder on
  the mounted drive and the image is saved there; progress is printed.
  """
  tables = pd.read_html(URL)
  # Read the raw HTML and close the HTTP response right away.
  with urlopen(URL) as response:
    html = response.read()

  # Thumbnail URLs in the order they appear in the page source.
  # ASSUMPTION: the n-th match belongs to the n-th entry on the page.
  thumbnails = re.findall(pattern = r"http://projects\.biodiversity\.be/openup/rbins/[0-9]+\.jpg", string=str(html))

  # Each entry spans three consecutive tables and a page shows at most six
  # entries (18 tables). Only walk the complete triples that are actually
  # present, so a page with fewer entries no longer raises IndexError.
  usable_tables = min(len(tables), 18)

  dorsal_entries = []
  for i in range(0, usable_tables - 2, 3):
    entry_oi = collectEntry(tables,i)
    # entry_oi has a single row; only dorsal views are kept.
    if not (entry_oi.loc[ : , 'View:'] == 'Dorsal').iloc[0]:
      continue

    os.chdir('/content/drive/My Drive/beetles')
    image_order = i // 3
    image_id = thumbnails[image_order].split('/')[5] # this grabs the jpg name of interest
    print('Grabbing...'+ image_id)

    entry_oi['Filename:'] = image_id
    dorsal_entries.append(entry_oi)
    req.urlretrieve("http://projects.biodiversity.be/openup/rbins/pictures_only/" + image_id,  image_id)
  print('Complete')

  # Concatenate once at the end; pd.concat rejects an empty list, so fall back
  # to an empty frame when nothing was collected.
  relevant_tables = pd.concat(dorsal_entries) if dorsal_entries else pd.DataFrame()
  return(relevant_tables)

In [13]:
# Scrape results pages 1-679 and stack the per-page label tables into one frame.
# Every call also downloads that page's dorsal images, so this cell needs
# network access and takes a long time to run.
# NOTE(review): master_csv grows page by page, so the pages scraped so far stay
# available if a later request fails; the price is that pd.concat re-copies the
# accumulated frame on every iteration.
master_csv = pd.DataFrame()
for i in range(1,680):
  beetles = getBeetleDataset('http://projects.biodiversity.be/openuprbins/search?page=' + str(i))
  print(beetles)
  master_csv = pd.concat([master_csv,beetles])

Grabbing...1.jpg
Grabbing...3.jpg
Grabbing...4.jpg
Grabbing...5.jpg
Complete
0      Family:       Genus:   Species:   View: Filename:
1  Anthribidae  Phaeotragus      gigas  Dorsal     1.jpg
1  Anthribidae  Tophroderes  phrenatus  Dorsal     3.jpg
1    Brentidae  Eutrachelus  temmincki  Dorsal     4.jpg
1    Brentidae  Eutrachelus  temmincki  Dorsal     5.jpg
Grabbing...7.jpg
Grabbing...9.jpg
Grabbing...10.jpg
Grabbing...12.jpg
Complete
0      Family:       Genus:     Species:   View: Filename:
1  Buprestidae        Aaata       finchi  Dorsal     7.jpg
1  Buprestidae    Actenodes  costipennis  Dorsal     9.jpg
1  Buprestidae  Archardella    americana  Dorsal    10.jpg
1  Buprestidae    Buprestis   panamensis  Dorsal    12.jpg
Grabbing...13.jpg
Grabbing...14.jpg
Grabbing...15.jpg
Grabbing...17.jpg
Complete
0      Family:     Genus: Species:   View: Filename:
1  Buprestidae  Buprestis  rufipes  Dorsal    13.jpg
1  Buprestidae   Calodema   kirbyi  Dorsal    14.jpg
1  Buprestidae   Calodem

In [0]:
# Write the label table to master_csv.csv in the current working directory,
# with a header row.
# NOTE(review): `index = None` is falsy, so presumably no index column is
# written; to_csv is documented to return None when given a path, so
# `export_csv` presumably ends up None. Confirm and drop the assignment.
export_csv = master_csv.to_csv(r'master_csv.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path


In [0]:
from urllib.request import urlopen
import re
# FIXME: this statement was left unfinished (unterminated string literal, and no
# `file` object is defined at notebook level in the visible cells). It is
# commented out so the notebook can run top to bottom; the intended path and
# purpose are unknown.
# file.open('/content/drive/  )

In [21]:
# Preview the first rows of the scraped label table.
master_csv.head()

Unnamed: 0,Family:,Genus:,Species:,View:,Filename:
1,Anthribidae,Phaeotragus,gigas,Dorsal,1.jpg
1,Anthribidae,Tophroderes,phrenatus,Dorsal,3.jpg
1,Brentidae,Eutrachelus,temmincki,Dorsal,4.jpg
1,Brentidae,Eutrachelus,temmincki,Dorsal,5.jpg
1,Buprestidae,Aaata,finchi,Dorsal,7.jpg


In [22]:
# Per-column summary (count / unique / top / freq for these text columns).
master_csv.describe()

Unnamed: 0,Family:,Genus:,Species:,View:,Filename:
count,2982,2982,2982,2982,2982
unique,20,804,1573,1,2982
top,Scarabaeidae,Phanaeus,bicornis,Dorsal,2589.jpg
freq,1468,65,21,2982,1


In [38]:
# Number of rows per family.
# NOTE(review): the saved output of this cell has dtype bool, which a plain
# value_counts() does not produce; it looks like a leftover from a
# `value_counts() > threshold` run. Re-run the cell to refresh the output.
master_csv['Family:'].value_counts()

Scarabaeidae        True
Cerambycidae        True
Lucanidae           True
Buprestidae         True
Carabidae           True
Cicindelidae        True
Geotrupidae        False
Silphidae          False
Chrysomelidae      False
Curculionidae      False
Meloidae           False
Elateridae         False
Dytiscidae         False
Brentidae          False
Cleridae           False
Anthribidae        False
Zopheridae         False
Trictenotomidae    False
Lampyridae         False
Lycidae            False
Name: Family:, dtype: bool

In [28]:
# Share of the dataset per family (normalize=True turns counts into fractions).
master_csv['Family:'].value_counts(normalize=True)

Scarabaeidae       0.492287
Cerambycidae       0.241449
Lucanidae          0.100604
Buprestidae        0.069081
Carabidae          0.048625
Cicindelidae       0.030181
Geotrupidae        0.007042
Silphidae          0.001677
Chrysomelidae      0.001677
Curculionidae      0.001341
Meloidae           0.001006
Elateridae         0.001006
Dytiscidae         0.000671
Brentidae          0.000671
Cleridae           0.000671
Anthribidae        0.000671
Zopheridae         0.000335
Trictenotomidae    0.000335
Lampyridae         0.000335
Lycidae            0.000335
Name: Family:, dtype: float64

In [0]:
# we can see that Scarabaeidae comprises ~half of our dataset.
# only 6 families have about 100 or more individuals which isn't evenly distributed.

#This will make classification less than ideal due to the imbalance.
# Another question is what is the distribution between the Genus in
# Scarabaeidae

# Maybe this will make a good classification 

In [0]:
# Boolean mask: True for the rows whose family is Scarabaeidae.
scarabs = master_csv['Family:'] == 'Scarabaeidae'

In [32]:
# Peek at the first values of `scarabs`.
scarabs.head()

1    False
1    False
1    False
1    False
1    False
Name: Family:, dtype: bool

In [0]:
# Keep only the Scarabaeidae rows. The filter is recomputed from master_csv
# instead of indexing with the `scarabs` mask, because this cell overwrites
# `scarabs` with the filtered frame: re-running it would otherwise index
# master_csv with a DataFrame rather than a boolean mask.
scarabs = master_csv[master_csv['Family:'] == 'Scarabaeidae']

In [34]:
# Per-column summary of the Scarabaeidae subset.
scarabs.describe()

Unnamed: 0,Family:,Genus:,Species:,View:,Filename:
count,1468,1468,1468,1468,1468
unique,1,346,801,1,1468
top,Scarabaeidae,Phanaeus,bicornis,Dorsal,2716.jpg
freq,1468,65,21,1468,1


In [35]:
# Number of rows per genus within the Scarabaeidae subset.
scarabs['Genus:'].value_counts()

Phanaeus              65
Eudicella             33
Chrysina              30
Heterorhina           27
Trigonophorus         26
Dicranocephalus       25
Proagoderus           24
Rhomborrhina          24
Dicheros              24
Lomaptera             23
Phaedimus             23
Sulcophanaeus         23
Torynorrhina          22
Potosia               21
Gymnetis              19
Protaetia             18
Euchroea              18
Netocia               18
Pachnoda              17
Ischiopsopha          16
Macraspis             16
Cyprolais             15
Pelidnota             14
Anoplognathus         14
Heliocopris           14
Pseudotorynorrhina    14
Catharsius            13
Neptunides            13
Stephanorrhina        12
Anochilia             12
                      ..
Telaugis               1
Wernoryctes            1
Anomala                1
Gnorimus               1
Megaphonia             1
Archophanes            1
Paratelaugis           1
Coelodera              1
Poecilocharis          1


In [0]:
## Test-run classification, using only families with 90+ examples?

In [43]:
# Flag the families that have more than 88 rows.
# NOTE(review): the note in the previous cell speaks of "90+ examples" while
# the cutoff here is 88. Confirm which threshold is intended and name it as a
# constant.
master_csv['Family:'].value_counts() > 88

Scarabaeidae        True
Cerambycidae        True
Lucanidae           True
Buprestidae         True
Carabidae           True
Cicindelidae        True
Geotrupidae        False
Silphidae          False
Chrysomelidae      False
Curculionidae      False
Meloidae           False
Elateridae         False
Dytiscidae         False
Brentidae          False
Cleridae           False
Anthribidae        False
Zopheridae         False
Trictenotomidae    False
Lampyridae         False
Lycidae            False
Name: Family:, dtype: bool