In [1]:
import pandas as pd
from pathlib import Path
import requests
import json
import time
import numpy as np

from process_files import process_inat_data, add_concatenated_columns


In [2]:
def log_df(df, nrows=5):
    print(df.shape)
    return df.head(nrows)

In [3]:
def print_json(obj):
    print(json.dumps(obj,  indent=4))

In [4]:
all_taxa =  Path('outputs', 'combine_taxa_list.csv')
all_taxa_inat =  Path('outputs', 'combine_taxa_list_inat_data.csv')
de_taxa =  Path('outputs', 'de_taxa_list.csv')


# create combine taxa list

create one taxa file from all the project observations files and LA indicator list

In [7]:
dfs = []

taxa_cols = [ 
    'scientific_name', 
    'common_name',
    'iconic_taxon_name', 
    'taxon_id', 
    'taxon_kingdom_name',
    'taxon_phylum_name',
    'taxon_class_name', 
    'taxon_order_name',
    'taxon_family_name', 
    'taxon_genus_name', 
    'taxon_species_name',
]

new_cols = {
    'taxon_kingdom_name': 'kingdom',
    'taxon_phylum_name': 'phylum',
    'taxon_class_name': 'class',
    'taxon_order_name': 'order',
    'taxon_family_name': 'family',
    'taxon_genus_name': 'genus',
    'taxon_species_name': 'species',
    
}

for file_path in Path().glob('../data/**/observations*.csv'):
    df = pd.read_csv(file_path,  usecols=taxa_cols, dtype=str)
    df = df.dropna(subset=['taxon_id'])
    dfs.append(df)


indicator_cols =  [ 
    'scientific_name', 
    'common_name',
    'iconic_taxon_name', 
    'taxon_id', 
    'kingdom',
    'phylum',
    'class', 
    'order',
    'family', 
    'genus', 
    'species',
]    
file = '../data/los-angeles-bioblitz/indicator_species_with_inat.csv'
df = pd.read_csv(file,  dtype=str, usecols=indicator_cols)
dfs.append(df)
    
    
combine_df = pd.concat(dfs)
combine_df = combine_df.drop_duplicates(subset='taxon_id')
combine_df.rename(columns=new_cols, inplace=True)

combine_df.to_csv(all_taxa, index=False)

log_df(combine_df)

(3851, 18)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,species,species.1,kingdom.1,phylum.1,class.1,order.1,family.1,genus.1
0,Deidamia inscriptum,Lettered Sphinx,Insecta,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,Deidamia inscriptum,,,,,,,
1,Acer,maples,Plantae,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,,,,,,,,
2,Plantago lanceolata,ribwort plantain,Plantae,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,Plantago lanceolata,,,,,,,
3,Poa pratensis,Kentucky bluegrass,Plantae,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,Poa pratensis,,,,,,,
4,Magnoliopsida,dicots,Plantae,47124,Plantae,Tracheophyta,Magnoliopsida,,,,,,,,,,,


# add inat data to combine taxa list

get info from inat API for each taxa

In [8]:
inat_df = pd.read_csv(all_taxa, dtype=str)
log_df(inat_df)

(3851, 11)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,species
0,Magnoliopsida,dicots,Plantae,47124,Plantae,Tracheophyta,Magnoliopsida,,,,
1,Taraxacum officinale,common dandelion,Plantae,47602,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Taraxacum,Taraxacum officinale
2,Trifolium repens,white clover,Plantae,55745,Plantae,Tracheophyta,Magnoliopsida,Fabales,Fabaceae,Trifolium,Trifolium repens
3,Plantae,plants,Plantae,47126,Plantae,,,,,,
4,Turdus migratorius,American Robin,Aves,12727,Animalia,Chordata,Aves,Passeriformes,Turdidae,Turdus,Turdus migratorius


break up taxon_ids into strings with multiple ids to send multiple ids per api call

In [11]:
            
ids = []
indexes = {}
temp_ids =[]

limit = 20
for index, row in inat_df.iterrows():
    
    temp_ids.append(str(row['taxon_id']))    
    indexes[row['taxon_id']] = index
    
    if index % limit == 0 or index == len(inat_df) - 1:
        str_ids = ','.join(temp_ids)
        ids.append(str_ids)
        temp_ids = []

        
print(ids[0:3] )


['47124', '47602,55745,47126,12727,78882,42223,46017,47125,46217,47603,47434,7089,51875,6930,211194,49487,53438,48230,55830,46095', '311249,47170,53178,51876,47336,129902,58732,52856,52720,57278,48484,50278,865017,53196,84227,47604,60307,57849,47219,7251']


In [84]:
for index, value in enumerate(ids):
    time.sleep(2)
    print(index, end=' ')
    url = f'https://api.inaturalist.org/v1/taxa/{value}'
    
    response = requests.get(url)
    if response.status_code == 200:
        for json_data in response.json()['results']:
            index = indexes[str(json_data['id'])]
            result  = process_inat_data(json_data)
            
            for col in result:
                inat_df.at[index, col] = result[col]
   

log_df(inat_df)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 (3851, 209)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,...,subterclass_id,subterclass_iconic_taxon_name,subterclass_photo_url,subterclass_photo_attribution,subterclass_photo_license_code,stateofmatter_iconic_taxon_name,stateofmatter_common_name,stateofmatter_photo_license_code,subterclass_wikipedia_url,subterclass_common_name
0,Deidamia inscriptum,Lettered Sphinx,Insecta,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,...,,,,,,,,,,
1,Acer,maples,Plantae,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,...,,,,,,,,,,
2,Plantago lanceolata,ribwort plantain,Plantae,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,...,,,,,,,,,,
3,Poa pratensis,Kentucky bluegrass,Plantae,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,...,,,,,,,,,,
4,Magnoliopsida,dicots,Plantae,47124,Plantae,Tracheophyta,Magnoliopsida,,,,...,,,,,,,,,,


In [86]:
inat_df.to_csv(all_taxa_inat, index=False)

## add concatnated taxon columns

In [31]:
inat_df = pd.read_csv(all_taxa_inat, dtype=str)
inat_df = inat_df.fillna('')
log_df(inat_df)

(3851, 212)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,...,subterclass_photo_attribution,subterclass_photo_license_code,stateofmatter_iconic_taxon_name,stateofmatter_common_name,stateofmatter_photo_license_code,subterclass_wikipedia_url,subterclass_common_name,taxon_ids,names,common_names
0,Deidamia inscriptum,Lettered Sphinx,Insecta,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,...,,,,,,,,1|47120|47158|47157|47213|143454|143452,Animalia|Arthropoda|Insecta|Lepidoptera|Sphing...,Animals|Arthropods|Insects|Butterflies and Mot...
1,Acer,maples,Plantae,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,...,,,,,,,,47126|211194|47124|47729|58321|47727|,Plantae|Tracheophyta|Magnoliopsida|Sapindales|...,"plants|vascular plants|dicots|soapberries, cas..."
2,Plantago lanceolata,ribwort plantain,Plantae,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,...,,,,,,,,47126|211194|47124|48151|50638|50636|53178,Plantae|Tracheophyta|Magnoliopsida|Lamiales|Pl...,"plants|vascular plants|dicots|mints, plantains..."
3,Poa pratensis,Kentucky bluegrass,Plantae,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,...,,,,,,,,47126|211194|47163|47162|47434|52809|60307,Plantae|Tracheophyta|Liliopsida|Poales|Poaceae...,"plants|vascular plants|monocots|grasses, sedge..."
4,Magnoliopsida,dicots,Plantae,47124,Plantae,Tracheophyta,Magnoliopsida,,,,...,,,,,,,,47126|211194|47124||||,Plantae|Tracheophyta|Magnoliopsida||||,plants|vascular plants|dicots||||


In [11]:
add_concatenated_columns(inat_df)

In [12]:
inat_df['taxon_ids'][0:3]

0       1|47120|47158|47157|47213|143454|143452
1         47126|211194|47124|47729|58321|47727|
2    47126|211194|47124|48151|50638|50636|53178
Name: taxon_ids, dtype: object

In [13]:
inat_df['scientific_names'][0:3]

0    Animalia|Arthropoda|Insecta|Lepidoptera|Sphing...
1    Plantae|Tracheophyta|Magnoliopsida|Sapindales|...
2    Plantae|Tracheophyta|Magnoliopsida|Lamiales|Pl...
Name: names, dtype: object

In [14]:
inat_df['common_names'][0:3]

0    Animals|Arthropods|Insects|Butterflies and Mot...
1    plants|vascular plants|dicots|soapberries, cas...
2    plants|vascular plants|dicots|mints, plantains...
Name: common_names, dtype: object

In [16]:
inat_df.to_csv(all_taxa_inat, index=False)

## browse data

In [54]:
inat_df = pd.read_csv(all_taxa_inat, dtype=str)

In [55]:
inat_df[inat_df['rank'] == 'genus'].shape

(776, 212)

some names are used multiple times for different organisisms

In [28]:
len(inat_df['genus'].dropna().unique())

1864

In [29]:
len(inat_df['genus_id'].dropna().unique())

1867

In [30]:
inat_df['rank'].unique()

array(['species', 'genus', 'class', 'order', 'kingdom', 'complex',
       'family', 'superfamily', 'subspecies', 'subfamily', 'phylum',
       'zoosubsection', 'section', 'subgenus', 'subphylum', 'suborder',
       'subtribe', 'infraorder', 'hybrid', 'tribe', 'subclass', 'variety',
       'zoosection', 'superorder', 'stateofmatter', 'infraclass',
       'epifamily', 'subterclass'], dtype=object)

# normalize taxa list

create a taxa list  with separate records for all higher level taxa

In [8]:
inat_df = pd.read_csv(all_taxa_inat, dtype=str)
log_df(inat_df)

(3851, 212)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,...,subterclass_photo_attribution,subterclass_photo_license_code,stateofmatter_iconic_taxon_name,stateofmatter_common_name,stateofmatter_photo_license_code,subterclass_wikipedia_url,subterclass_common_name,taxon_ids,scientific_names,common_names
0,Deidamia inscriptum,Lettered Sphinx,Insecta,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,...,,,,,,,,1|47120|47158|47157|47213|143454|143452,Animalia|Arthropoda|Insecta|Lepidoptera|Sphing...,Animals|Arthropods|Insects|Butterflies and Mot...
1,Acer,maples,Plantae,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,...,,,,,,,,47126|211194|47124|47729|58321|47727|,Plantae|Tracheophyta|Magnoliopsida|Sapindales|...,"plants|vascular plants|dicots|soapberries, cas..."
2,Plantago lanceolata,ribwort plantain,Plantae,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,...,,,,,,,,47126|211194|47124|48151|50638|50636|53178,Plantae|Tracheophyta|Magnoliopsida|Lamiales|Pl...,"plants|vascular plants|dicots|mints, plantains..."
3,Poa pratensis,Kentucky bluegrass,Plantae,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,...,,,,,,,,47126|211194|47163|47162|47434|52809|60307,Plantae|Tracheophyta|Liliopsida|Poales|Poaceae...,"plants|vascular plants|monocots|grasses, sedge..."
4,Magnoliopsida,dicots,Plantae,47124,Plantae,Tracheophyta,Magnoliopsida,,,,...,,,,,,,,47126|211194|47124||||,Plantae|Tracheophyta|Magnoliopsida||||,plants|vascular plants|dicots||||


In [9]:
combine_df = pd.read_csv(all_taxa, dtype=str)
log_df(combine_df)

(3851, 11)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,species
0,Magnoliopsida,dicots,Plantae,47124,Plantae,Tracheophyta,Magnoliopsida,,,,
1,Taraxacum officinale,common dandelion,Plantae,47602,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Taraxacum,Taraxacum officinale
2,Trifolium repens,white clover,Plantae,55745,Plantae,Tracheophyta,Magnoliopsida,Fabales,Fabaceae,Trifolium,Trifolium repens
3,Plantae,plants,Plantae,47126,Plantae,,,,,,
4,Turdus migratorius,American Robin,Aves,12727,Animalia,Chordata,Aves,Passeriformes,Turdidae,Turdus,Turdus migratorius


In [11]:
# list(inat_df.columns)

In [67]:
columns = [ 
    'id', 
    'iconic_taxon_name',
    'common_name', 
    'wikipedia_url', 
    'photo_url',
    'photo_attribution', 
    'photo_license_code',
]

ranks = [
    'kingdom', 'phylum', 'class','order','family', 'genus', 'species'
]

# https://en.wikipedia.org/wiki/Taxonomic_rank
all_ranks = [
    'stateofmatter',
    'kingdom',
    'phylum', 'subphylum',
    'superclass','class','subclass', 'infraclass', 'subterclass',
    'superorder','order','suborder', 'infraorder',
    'zoosection', 'zoosubsection', 

    'superfamily','epifamily','family','subfamily', 
    'supertribe','tribe','subtribe',
    'genus', 'subgenus',
    'section',
    'complex',

    'species',
    'subspecies','variety','form',
    
        'genushybrid', 'hybrid',

]

    
def get_columns_for_rank(rank):
    return ranks + [f'{rank}_{col}' for col in columns] + ['scientific_name'] 

def normalize_column_names(rank, rank_cols):
    temp = ranks + ['taxon_id'] + columns[1:]

    return dict(zip(rank_cols, temp))


def get_taxa_for_rank(inat_df, rank):
    rank_columns = get_columns_for_rank(rank)
    df = inat_df[rank_columns].copy()
    df = df.dropna(subset=[f'{rank}_id'])
    df = df.drop_duplicates(subset=[f'{rank}_id'])
    
    normalized_columns = normalize_column_names(rank, rank_columns)
    df = df.rename(columns=normalized_columns)
    df['rank'] = rank    
    
    return df
    
    

In [68]:
dfs = []
for rank in inat_df['rank'].unique():
    normalized_df = get_taxa_for_rank(inat_df, rank)
    dfs.append(normalized_df)
    
all_df = pd.concat(dfs)
all_df = all_df.drop_duplicates(subset='taxon_id')

log_df(all_df)

(5665, 16)


Unnamed: 0,kingdom,phylum,class,order,family,genus,species,taxon_id,iconic_taxon_name,common_name,wikipedia_url,photo_url,photo_attribution,photo_license_code,scientific_name,rank
0,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,Deidamia inscriptum,143452,Insecta,Lettered Sphinx,http://en.wikipedia.org/wiki/Deidamia_inscriptum,https://static.inaturalist.org/photos/16015652...,"(c) Steven Daniel, all rights reserved",,Deidamia inscriptum,species
2,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,Plantago lanceolata,53178,Plantae,ribwort plantain,http://en.wikipedia.org/wiki/Plantago_lanceolata,https://static.inaturalist.org/photos/39432141...,"(c) Tig, all rights reserved",,Plantago lanceolata,species
3,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,Poa pratensis,60307,Plantae,Smooth Meadow-grass,http://en.wikipedia.org/wiki/Poa_pratensis,https://inaturalist-open-data.s3.amazonaws.com...,"(c) Matt Lavin, some rights reserved (CC BY-SA)",cc-by-sa,Poa pratensis,species
6,Animalia,Chordata,Mammalia,Primates,Hominidae,Homo,Homo sapiens,43584,Mammalia,Human,http://en.wikipedia.org/wiki/Homo_sapiens,https://inaturalist-open-data.s3.amazonaws.com...,no rights reserved,cc0,Homo sapiens,species
12,Animalia,Arthropoda,Insecta,Orthoptera,Tettigoniidae,Meconema,Meconema thalassinum,119170,Insecta,Oak Bush-cricket,http://en.wikipedia.org/wiki/Meconema_thalassinum,https://inaturalist-open-data.s3.amazonaws.com...,"(c) Martin Grimm, some rights reserved (CC BY-NC)",cc-by-nc,Meconema thalassinum,species


In [69]:
len(set(inat_df['taxon_id']) - set(all_df['taxon_id']))

0

In [66]:
all_df.to_csv(de_taxa, index=False)