In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the table stored under the given key of the HDF5 file into a DataFrame.
media_df = pd.read_hdf(
    '/mnt/gbif/clean_data.h5',
    key='media_merged_filtered-by-species_350pt',
)

In [3]:
media_df.columns.to_list()

['gbifID',
 'format',
 'identifier_media',
 'identifier_obs',
 'eventDate',
 'decimalLatitude',
 'decimalLongitude',
 'taxonID',
 'taxonRank',
 'taxonomicStatus',
 'taxonKey',
 'acceptedTaxonKey',
 'phylumKey',
 'classKey',
 'orderKey',
 'familyKey',
 'genusKey',
 'speciesKey',
 'species',
 'acceptedScientificName',
 'verbatimScientificName',
 'media_count_per_taxonID']

In [4]:
len(media_df['speciesKey'].unique())

2388

In [5]:
media_df['speciesKey'].dtype

dtype('float64')

In [6]:
media_df[media_df['speciesKey'].isna()]['species'].isna().any()

True

In [7]:
# NOTE(review): len() of the boolean mask returns the number of rows whose speciesKey is NaN,
# regardless of whether `species` is NaN in those rows. If the count of NaN `species` values
# was intended, `.sum()` is needed instead of `len(...)` — confirm.
len(media_df[media_df['speciesKey'].isna()]['species'].isna())

16026

In [8]:
len(media_df['species'].unique())

2388

In [9]:
media_df[media_df['speciesKey'].isna()]['acceptedScientificName'].isna().any()

False

In [10]:
len(media_df['acceptedScientificName'].unique()) # use this

2451

In [11]:
media_df[media_df['speciesKey'].isna()]['verbatimScientificName'].isna().any()

False

In [12]:
len(media_df['verbatimScientificName'].unique())

2505

In [13]:
len(media_df['taxonID'].unique())

2507

In [14]:
media_df['taxonRank'].unique()

array(['SPECIES', 'VARIETY', 'SUBSPECIES', 'FAMILY', 'GENUS', 'FORM'],
      dtype=object)

In [15]:
len(media_df['taxonKey'].unique())

2495

In [16]:
media_df['taxonKey'].head()

0    2608086
5    2548311
6    7241318
7    2605405
8    2605405
Name: taxonKey, dtype: int64

In [17]:
# Distinct accepted scientific names, in order of first appearance.
# The position of a name in this array is used as its integer label further down.
species_map = pd.unique(media_df['acceptedScientificName'])

In [18]:
len(species_map)

2451

In [19]:
species_map[:5]

array(['Niebla homalea (Ach.) Rundel & Bowler',
       'Trametes versicolor (L.) Lloyd', 'Ganoderma tsugae Murrill',
       'Letharia columbiana (Nutt.) J.W.Thomson',
       'Amanita muscaria (L.) Lam.'], dtype=object)

In [20]:
def apply_map(species_str, names=None):
    """Return the integer label of ``species_str``: its position in the name array.

    Parameters
    ----------
    species_str : str
        Scientific name to look up.
    names : array-like of str, optional
        Names whose positions serve as labels. Defaults to the notebook-level
        ``species_map`` array.

    Returns
    -------
    numpy integer
        Position of the first element of ``names`` equal to ``species_str``.

    Raises
    ------
    IndexError
        If ``species_str`` does not occur in ``names``.
    """
    if names is None:
        names = species_map
    # Coerce to an object array so `==` is always element-wise (also for plain lists).
    names = np.asarray(names, dtype=object)
    res = np.where(names == species_str)[0]

    if len(res) == 0:
        # Previously this surfaced as an opaque "index 0 is out of bounds" error.
        raise IndexError(
            '{!r} is not present in the species map'.format(species_str)
        )
    if len(res) > 1:
        # Ambiguous lookup: keep returning the first match, but say so.
        import warnings
        warnings.warn(
            '{!r} occurs {} times in the species map; using the first match'.format(
                species_str, len(res)
            )
        )

    return res[0]

# Build the name -> label lookup once and let pandas do a hashed lookup per row,
# instead of calling apply_map (a linear scan over species_map) for every row.
label_by_name = {name: label for label, name in enumerate(species_map)}
new_col = media_df['acceptedScientificName'].map(label_by_name)
# A dict-based map yields NaN for unmatched names; fail loudly rather than emit float labels.
assert new_col.notna().all(), 'acceptedScientificName contains values missing from species_map'

In [21]:
new_col

0             0
5             1
6             2
7             3
8             3
           ... 
2803458    2110
2803459    2110
2803460    2110
2803483    2303
2803517    1457
Name: acceptedScientificName, Length: 665803, dtype: int64

In [22]:
species_map

array(['Niebla homalea (Ach.) Rundel & Bowler',
       'Trametes versicolor (L.) Lloyd', 'Ganoderma tsugae Murrill', ...,
       'Pluteus podospileus Sacc. & Cub.',
       'Hydropus praedecurrens (Murrill) Redhead',
       'Synchytrium australe Speg.'], dtype=object)

In [23]:
len(new_col.unique())

2451

In [24]:
len(species_map)

2451

In [25]:
import json

In [26]:
# Persist the label -> scientific-name lookup as JSON (json writes the integer keys as strings).
index_to_name = {idx: name for idx, name in enumerate(species_map)}
with open('/mnt/gbif/map.json', 'w') as map_file:
    json.dump(index_to_name, map_file, indent=2)

In [27]:
# Persist the per-row labels as JSON. Keys are row positions (0 .. len(new_col) - 1)
# produced by enumerate, not the values of the Series index.
position_to_label = {pos: label for pos, label in enumerate(new_col)}
with open('/mnt/gbif/labels.json', 'w') as map_file:
    json.dump(position_to_label, map_file, indent=2)

In [28]:
new_col.value_counts().describe()

count    2451.000000
mean      271.645451
std       107.778660
min       101.000000
25%       168.000000
50%       333.000000
75%       350.000000
max      1050.000000
Name: acceptedScientificName, dtype: float64

In [29]:
species_map_df = pd.DataFrame(species_map)

In [30]:
# Rebuild from species_map instead of from species_map_df itself so this cell is idempotent:
# the self-referential form operates on an already-reset frame when the cell is re-run.
species_map_df = pd.DataFrame(species_map).reset_index()

In [31]:
species_map_df.columns = ['mapping', 'acceptedScientificName']

In [32]:
species_map_df.head()

Unnamed: 0,mapping,acceptedScientificName
0,0,Niebla homalea (Ach.) Rundel & Bowler
1,1,Trametes versicolor (L.) Lloyd
2,2,Ganoderma tsugae Murrill
3,3,Letharia columbiana (Nutt.) J.W.Thomson
4,4,Amanita muscaria (L.) Lam.


In [33]:
# Store the label/name table under key 'map' in the HDF5 file. `key` is passed by keyword
# because recent pandas deprecates positional arguments to to_hdf other than the path.
species_map_df.to_hdf('/mnt/gbif/clean_data.h5', key='map')

In [34]:
type(new_col)

pandas.core.series.Series

In [35]:
new_col

0             0
5             1
6             2
7             3
8             3
           ... 
2803458    2110
2803459    2110
2803460    2110
2803483    2303
2803517    1457
Name: acceptedScientificName, Length: 665803, dtype: int64

In [36]:
len(new_col)

665803

In [39]:
# Turn the label Series into a two-column frame: reset_index() moves the Series index
# into the first column, and the integer labels become the second column.
# NOTE(review): given that layout, the first column holds the original row index and the
# second holds the integer label, so the names assigned below read as swapped — confirm
# before renaming, since labels_df is written to the HDF5 store with these names.
labels_df = new_col.to_frame().reset_index()
labels_df.columns = ['mapping_index', 'acceptedScientificName']

In [40]:
labels_df.head()

Unnamed: 0,mapping_index,acceptedScientificName
0,0,0
1,5,1
2,6,2
3,7,3
4,8,3


In [41]:
# Store the per-row label table under key 'labels' in the HDF5 file. `key` is passed by keyword
# because recent pandas deprecates positional arguments to to_hdf other than the path.
labels_df.to_hdf('/mnt/gbif/clean_data.h5', key='labels')