In [1]:
import pandas as pd
from pathlib import Path
import requests
import json
import numpy as np

from process_inat_data import format_inat_data
from create_normalized_taxa import (
     create_taxa_df,   all_ranks, get_row_ranks,
    main_ranks, create_taxon, add_count_column, create_taxa_la_df
)



In [2]:
# Locations of the taxa lists produced by earlier processing steps.
inat_taxa = Path('outputs') / 'combine_taxa_list_inat_data.csv'
de_taxa = Path('outputs') / 'de_taxa_list.csv'


In [3]:
def log_df(df, nrows=5):
    """Print the shape of ``df`` and return its first ``nrows`` rows.

    Returning the preview (rather than printing it) lets the notebook render
    it as a rich table when the call is the last expression of a cell.
    """
    shape = df.shape
    print(shape)
    preview = df.head(nrows)
    return preview

In [4]:
# Recursively find every iNaturalist observation export under ../data.
raw_data_paths = Path('../data').rglob('observations*.csv')
data_paths = list(map(str, raw_data_paths))
print(data_paths)

['../data/clarkstown-high-school-north/observations-200303.csv', '../data/cedar-creek-reserve/observations-199064.csv', '../data/los-angeles-bioblitz/observations-190446.csv', '../data/ciencia-ciudadana-peru-bats/observations-199065.csv', '../data/ciencia-ciudadana-peru-bees/observations-199066.csv']


# Add iNat data to LA indicator species

In [34]:
# Load the LA indicator species list; every column is kept as text.
file = '../data/los-angeles-bioblitz/indicator_species.tsv'
indicator_cols = ['type', 'taxon_group', 'taxon_id']

# read_table uses a tab separator by default, which matches the .tsv input.
indicator_df = pd.read_table(file, usecols=indicator_cols, dtype=str)
log_df(indicator_df)

(38, 3)


Unnamed: 0,taxon_group,type,taxon_id
0,Birds,Park/Natural Area Species,1409
1,Birds,Park/Natural Area Species,1986
2,Birds,Stream/Riparian Species,4956
3,Birds,Neighborhood Species,5212
4,Birds,Stream/Riparian Species,7109


In [35]:
# Enrich every indicator species row with taxonomy and photo details fetched
# from the iNaturalist taxa API, looked up by the row's taxon_id.
failed_taxon_ids = []  # taxon ids whose lookup failed; reported after the loop

for index, row in indicator_df.iterrows():
    print(index, end=' ')

    try:
        # A timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(
            f'https://api.inaturalist.org/v1/taxa/{row["taxon_id"]}', timeout=30
        )
    except requests.RequestException:
        failed_taxon_ids.append(row['taxon_id'])
        continue

    if response.status_code != 200:
        failed_taxon_ids.append(row['taxon_id'])
        continue

    results = response.json().get('results') or []
    if not results:
        failed_taxon_ids.append(row['taxon_id'])
        continue

    json_data = results[0]
    result = format_inat_data(json_data)

    for col in result:
        indicator_df.at[index, col] = result[col]

    # A taxon may have no common name or no default photo, so these optional
    # fields fall back to None instead of raising KeyError/TypeError.
    default_photo = json_data.get('default_photo') or {}
    indicator_df.at[index, 'common_name'] = json_data.get('preferred_common_name')
    indicator_df.at[index, 'iconic_taxon_name'] = json_data.get('iconic_taxon_name')
    indicator_df.at[index, 'scientific_name'] = json_data['name']
    indicator_df.at[index, 'image_url'] = default_photo.get('medium_url')
    indicator_df.at[index, 'user_login'] = default_photo.get('attribution')
    indicator_df.at[index, 'parent_id'] = json_data['parent_id']

# Surface failures instead of silently leaving those rows without iNat data.
if failed_taxon_ids:
    print(f'\nNo iNaturalist data retrieved for taxon ids: {failed_taxon_ids}')

    

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 

In [36]:
# parent_id was filled in row by row above; store it as an integer column.
indicator_df = indicator_df.astype({'parent_id': int})


In [37]:
# Preview the indicator frame (shape and first rows) after the iNaturalist lookups.
log_df(indicator_df)

(38, 211)


Unnamed: 0,taxon_group,type,taxon_id,rank,ancestor_ids,species_id,species_wikipedia_url,species_iconic_taxon_name,species_common_name,species_parent_id,...,subspecies_id,subspecies_wikipedia_url,subspecies_iconic_taxon_name,subspecies_common_name,subspecies_parent_id,subspecies_ancestor_ids,subspecies,subspecies_photo_url,subspecies_photo_attribution,subspecies_photo_license_code
0,Birds,Park/Natural Area Species,1409,species,48460|1|2|355675|3|573|1278|1405|1409,1409,http://en.wikipedia.org/wiki/California_quail,Aves,California Quail,1405,...,,,,,,,,,,
1,Birds,Park/Natural Area Species,1986,species,48460|1|2|355675|3|1623|1627|1985|1986,1986,http://en.wikipedia.org/wiki/Greater_roadrunner,Aves,Greater Roadrunner,1985,...,,,,,,,,,,
2,Birds,Stream/Riparian Species,4956,species,48460|1|2|355675|3|67566|4929|597395|4950|4956,4956,http://en.wikipedia.org/wiki/Great_blue_heron,Aves,Great Blue Heron,4950,...,,,,,,,,,,
3,Birds,Neighborhood Species,5212,species,48460|1|2|355675|3|71261|5067|5179|5212,5212,https://en.wikipedia.org/wiki/Red-tailed_hawk,Aves,Red-tailed Hawk,5179,...,,,,,,,,,,
4,Birds,Stream/Riparian Species,7109,species,48460|1|2|355675|3|6888|6912|7108|7109,7109,https://en.wikipedia.org/wiki/Hooded_merganser,Aves,Hooded Merganser,7108,...,,,,,,,,,,


In [38]:
# Persist the enriched indicator species so later cells can reload them
# without calling the API again.
file = '../data/los-angeles-bioblitz/indicator_species_with_inat.csv'
indicator_df.to_csv(path_or_buf=file, index=False)

# create observation json

Don't convert with dtype=str here, since some columns (latitude, longitude, taxon_ids) need to be numbers.

In [65]:
# Load the taxon_id -> taxon_ids lookup that is merged onto the observations.
taxa_cols = ['taxon_id', 'taxon_ids']
taxa_df = (
    pd.read_csv(de_taxa, usecols=taxa_cols)
    .astype({'taxon_id': int})
)

log_df(taxa_df)

(5665, 2)


Unnamed: 0,taxon_id,taxon_ids
0,1,1
1,47120,1|47120
2,47158,1|47120|47158
3,47157,1|47120|47158|47157
4,47213,1|47120|47158|47157|47213


In [66]:
# Columns read from each observations CSV when building observations.json.
all_cols = [
    'time_observed_at', 'image_url',
    'latitude', 'longitude',
    'user_login',
    'scientific_name', 'common_name', 'taxon_id',
    'id',
    'geoprivacy', 'taxon_geoprivacy', 'coordinates_obscured',
    'quality_grade', 'license', 'description',
    'positional_accuracy',
]

In [67]:
# Combine multiple observations csvs for a project into one json.

# Safari won't parse dates in the format given by iNaturalist, so the
# timestamps are rewritten as ISO 8601. A "UTC" suffix becomes "Z"; a numeric
# offset such as "-0700" is kept as "-07:00" so the instant is unchanged
# (rewriting it to "Z" would relabel local time as UTC).
utc_time_pattern = r'([\d-]+) ([\d:]+) UTC'
offset_time_pattern = r'([\d-]+) ([\d:]+) ([+-]\d{2}):?(\d{2})'

for dir_path in Path().glob('../data/**/'):
    if dir_path.name == 'data':
        continue

    dfs = []
    for file_path in Path().glob('../data/' + dir_path.name + '/observations*.csv'):
        print(file_path.name)
        df = pd.read_csv(file_path, usecols=all_cols)

        # Observations without a taxon get id 0 so the column can be cast to int.
        df['taxon_id'] = df['taxon_id'].fillna(0).astype(int)

        df = df.merge(taxa_df, on="taxon_id", how="left")

        df['time_observed_at'] = (
            df['time_observed_at']
            .replace(utc_time_pattern, r'\1T\2Z', regex=True)
            .replace(offset_time_pattern, r'\1T\2\3:\4', regex=True)
        )

        dfs.append(df)

    # "**" also yields directories that hold no observation files (e.g. nested
    # ones); pd.concat raises on an empty list, so skip them.
    if not dfs:
        continue

    combine_df = pd.concat(dfs)

    new_path = Path('..', 'app', 'src', 'lib', 'data', dir_path.name)
    new_path.mkdir(parents=True, exist_ok=True)
    combine_df.to_json(new_path / 'observations.json', orient="records")


observations-200303.csv
observations-199064.csv
observations-190446.csv
observations-199065.csv
observations-199066.csv


# create taxa json and csv


In [26]:
# Load the combined iNaturalist taxa list with every column kept as text.
inat_df = pd.read_csv(filepath_or_buffer=inat_taxa, dtype=str)

log_df(inat_df)

(3851, 336)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,...,genushybrid_id,genushybrid_wikipedia_url,genushybrid_iconic_taxon_name,genushybrid_common_name,genushybrid_parent_id,genushybrid_ancestor_ids,genushybrid,genushybrid_photo_url,genushybrid_photo_attribution,genushybrid_photo_license_code
0,Deidamia inscriptum,Lettered Sphinx,Insecta,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,...,,,,,,,,,,
1,Acer,maples,Plantae,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,...,,,,,,,,,,
2,Plantago lanceolata,ribwort plantain,Plantae,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,...,,,,,,,,,,
3,Poa pratensis,Kentucky bluegrass,Plantae,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,...,,,,,,,,,,
4,Magnoliopsida,dicots,Plantae,47124,Plantae,Tracheophyta,Magnoliopsida,,,,...,,,,,,,,,,


In [27]:
# Build taxa.json and taxa.csv for each project from its observation csvs.

for dir_path in Path().glob('../data/**/'):
    if dir_path.name == 'data':
        continue

    obs_cols = ['taxon_id', 'user_login', 'image_url', 'id']
    dfs = [
        pd.read_csv(file_path, dtype=str, usecols=obs_cols)
        for file_path in Path().glob('../data/' + dir_path.name + '/observations-*.csv')
    ]

    # "**" also yields directories that hold no observation files (e.g. nested
    # ones); pd.concat raises on an empty list, so skip them.
    if not dfs:
        continue

    combine_df = pd.concat(dfs)
    combine_df = combine_df.merge(inat_df, on="taxon_id", how="left")
    adjust_df = add_count_column(combine_df, "observations_count")

    cols = [
        "id", "taxon_id", "common_name", "scientific_name",
        "user_login", "image_url", "rank", "parent_id",
        "taxon_ids", "common_names", "scientific_names",
        "observations_count", "taxa_count"
    ]
    # .copy() makes the column subset an independent frame, so the assignments
    # below do not trigger SettingWithCopyWarning.
    taxa_df = create_taxa_df(adjust_df)[cols].copy()

    # Rows with no parent get parent_id 0 so the column can be cast to int.
    taxa_df['parent_id'] = taxa_df['parent_id'].fillna(0)
    taxa_df = taxa_df.astype({'taxon_id': int, 'id': int, 'parent_id': int})

    new_path = Path('..', 'app', 'src', 'lib', 'data') / dir_path.name
    new_path.mkdir(parents=True, exist_ok=True)
    print(new_path)
    taxa_df.to_json(new_path / "taxa.json", orient="records")
    taxa_df.to_csv(new_path / "taxa.csv", index=False)


../app/src/lib/data/clarkstown-high-school-north
../app/src/lib/data/cedar-creek-reserve
../app/src/lib/data/los-angeles-bioblitz
../app/src/lib/data/ciencia-ciudadana-peru-bats
../app/src/lib/data/ciencia-ciudadana-peru-bees


# create taxa json for LA indicator species

In [51]:
# Reload the Los Angeles taxa written above, reading every value as text.
path = '../app/src/lib/data/los-angeles-bioblitz/taxa.json'
df = pd.read_json(path_or_buf=path, dtype=str)
df.shape

(4161, 13)

Pandas treats null in JSON as None, which will be saved as the string 'None' if it isn't replaced.

In [52]:
# Turn the literal 'None' strings back into real missing values, then spot-check
# one taxon.
df = df.replace({'None': np.nan})
df.loc[df['taxon_id'].eq('244378')]

Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,common_names,scientific_names,observations_count,taxa_count
55,90899495,244378,,Mallophora fautrix,nenetzinr,https://inaturalist-open-data.s3.amazonaws.com...,species,61636,1|47120|47158|47822|47982|61636|244378,Animals|Arthropods|Insects|True Flies|Robber F...,Animalia|Arthropoda|Insecta|Diptera|Asilidae|M...,39,39


In [53]:
# Distinct taxon ids already present in taxa.json.
taxa_ids = pd.unique(df['taxon_id'])
len(taxa_ids)

4161

In [54]:
# Reload the enriched indicator species as text so taxon_id compares equal to
# the string ids read from taxa.json.
file = '../data/los-angeles-bioblitz/indicator_species_with_inat.csv'
indicator_df = pd.read_csv(filepath_or_buffer=file, dtype=str)

log_df(indicator_df)

(38, 211)


Unnamed: 0,taxon_group,type,taxon_id,rank,ancestor_ids,species_id,species_wikipedia_url,species_iconic_taxon_name,species_common_name,species_parent_id,...,subspecies_id,subspecies_wikipedia_url,subspecies_iconic_taxon_name,subspecies_common_name,subspecies_parent_id,subspecies_ancestor_ids,subspecies,subspecies_photo_url,subspecies_photo_attribution,subspecies_photo_license_code
0,Birds,Park/Natural Area Species,1409,species,48460|1|2|355675|3|573|1278|1405|1409,1409,http://en.wikipedia.org/wiki/California_quail,Aves,California Quail,1405,...,,,,,,,,,,
1,Birds,Park/Natural Area Species,1986,species,48460|1|2|355675|3|1623|1627|1985|1986,1986,http://en.wikipedia.org/wiki/Greater_roadrunner,Aves,Greater Roadrunner,1985,...,,,,,,,,,,
2,Birds,Stream/Riparian Species,4956,species,48460|1|2|355675|3|67566|4929|597395|4950|4956,4956,http://en.wikipedia.org/wiki/Great_blue_heron,Aves,Great Blue Heron,4950,...,,,,,,,,,,
3,Birds,Neighborhood Species,5212,species,48460|1|2|355675|3|71261|5067|5179|5212,5212,https://en.wikipedia.org/wiki/Red-tailed_hawk,Aves,Red-tailed Hawk,5179,...,,,,,,,,,,
4,Birds,Stream/Riparian Species,7109,species,48460|1|2|355675|3|6888|6912|7108|7109,7109,https://en.wikipedia.org/wiki/Hooded_merganser,Aves,Hooded Merganser,7108,...,,,,,,,,,,


get all indicator species that are not in taxa.json 

In [55]:
# Indicator species whose taxon_id does not appear in taxa.json.
is_observed = indicator_df['taxon_id'].isin(taxa_ids)
no_observations_df = indicator_df[~is_observed].copy()
no_observations_df.shape

(10, 211)

In [56]:
# These species have no observations, so give them a placeholder id and a zero count.
no_observations_df = no_observations_df.assign(id=0, observations_count=0)

no_observations_df.shape

(10, 213)

In [57]:

# Build taxa rows for the indicator species that have no observations.
# NOTE(review): create_taxa_la_df is imported from create_normalized_taxa and is
# not visible here; presumably it also emits rows for ancestor taxa — confirm.
taxa_df = create_taxa_la_df(no_observations_df)
taxa_df.shape

(46, 15)

get taxa that aren't already in taxa.json

In [58]:
# Keep only the rows whose taxon_id is not already listed in taxa.json.
already_listed = taxa_df['taxon_id'].isin(taxa_ids)
taxa_df = taxa_df[~already_listed]
log_df(taxa_df)

(19, 15)


Unnamed: 0,taxon_id,common_name,scientific_name,rank,parent_id,taxon_ids,common_names,scientific_names,id,user_login,image_url,observations_count,taxon_group,type,taxa_count
32,9535,Western Meadowlark,Sturnella neglecta,species,9526,1|2|3|7251|11989|9526|9535,Animals|Chordates|Birds|Perching Birds|New Wor...,Animalia|Chordata|Aves|Passeriformes|Icteridae...,0,"(c) greglasley, some rights reserved (CC BY-NC...",https://inaturalist-open-data.s3.amazonaws.com...,0,Birds,Park/Natural Area Species,1
60,346970,Lotus Hairstreak,Callophrys dumetorum,species,58554,1|47120|47158|47157|47923|58554|346970,Animals|Arthropods|Insects|Butterflies and Mot...,Animalia|Arthropoda|Insecta|Lepidoptera|Lycaen...,0,"(c) Ken-ichi Ueda, some rights reserved (CC BY)",https://inaturalist-open-data.s3.amazonaws.com...,0,Invertebrates,Park/Natural Area Species,1
37,27474,Black-bellied Slender Salamander,Batrachoseps nigriventris,species,27444,1|2|20978|26718|26909|27444|27474,Animals|Chordates|Amphibians|Salamanders|Lungl...,Animalia|Chordata|Amphibia|Caudata|Plethodonti...,0,"(c) Marshal Hedin, some rights reserved (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,Amphibians,Stream/Riparian Species,1
36,27444,Slender Salamanders,Batrachoseps,genus,26909,1|2|20978|26718|26909|27444,Animals|Chordates|Amphibians|Salamanders|Lungl...,Animalia|Chordata|Amphibia|Caudata|Plethodonti...,0,"(c) Marshal Hedin, some rights reserved (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,,1
35,26909,Lungless Salamanders,Plethodontidae,family,26718,1|2|20978|26718|26909,Animals|Chordates|Amphibians|Salamanders|Lungl...,Animalia|Chordata|Amphibia|Caudata|Plethodontidae,0,"(c) Marshal Hedin, some rights reserved (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,,1


add taxon_group and type to taxa.json data

In [59]:
# Tag the taxa.json rows that are indicator species with their taxon_group and type.
# A per-taxon lookup is mapped onto df in one pass. This avoids creating float
# NaN columns and then writing strings into them row by row, and avoids
# rescanning df once per indicator row. When a taxon_id is listed more than
# once, the last row wins, as it did with the row-by-row loop.
indicator_lookup = (
    indicator_df
    .dropna(subset=['taxon_id'])
    .drop_duplicates(subset='taxon_id', keep='last')
    .set_index('taxon_id')
)

# Taxa that are not indicator species are left as NaN.
df['taxon_group'] = df['taxon_id'].map(indicator_lookup['taxon_group'])
df['type'] = df['taxon_id'].map(indicator_lookup['type'])

log_df(df)
     

(4161, 15)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,common_names,scientific_names,observations_count,taxa_count,taxon_group,type
0,97386721,36204,Western Fence Lizard,Sceloporus occidentalis,loganc516,https://inaturalist-open-data.s3.amazonaws.com...,species,36141,1|2|26036|26172|36074|36141|36204,Animals|Chordates|Reptiles|Snakes and Lizards|...,Animalia|Chordata|Reptilia|Squamata|Phrynosoma...,342,342,,
1,94055566,199840,House Finch,Haemorhous mexicanus,marty_and_the_mamas,https://inaturalist-open-data.s3.amazonaws.com...,species,199910,1|2|3|7251|9079|199910|199840,Animals|Chordates|Birds|Perching Birds|Finches...,Animalia|Chordata|Aves|Passeriformes|Fringilli...,146,146,,
2,94889284,48662,Monarch,Danaus plexippus,belis1,https://static.inaturalist.org/photos/15746072...,species,48663,1|47120|47158|47157|47922|48663|48662,Animals|Arthropods|Insects|Butterflies and Mot...,Animalia|Arthropoda|Insecta|Lepidoptera|Nympha...,144,144,Invertebrates,Neighborhood Species
3,91586172,5212,Red-tailed Hawk,Buteo jamaicensis,ki6h,https://static.inaturalist.org/photos/15151968...,species,5179,1|2|3|71261|5067|5179|5212,"Animals|Chordates|Birds|Hawks, Eagles, Kites, ...",Animalia|Chordata|Aves|Accipitriformes|Accipit...,140,142,Birds,Neighborhood Species
4,92725078,3454,Mourning Dove,Zenaida macroura,jennbastian,https://static.inaturalist.org/photos/15357316...,species,3438,1|2|3|2708|2715|3438|3454,Animals|Chordates|Birds|Pigeons and Doves|Pige...,Animalia|Chordata|Aves|Columbiformes|Columbida...,126,126,,


In [60]:
# Put the new rows' columns in the same order as the taxa.json frame, then
# compare the two column indexes element by element.
taxa_df = taxa_df.loc[:, df.columns]

df.columns == taxa_df.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [61]:
# Append the unobserved indicator taxa to the taxa loaded from taxa.json.
combine_df = pd.concat(objs=[df, taxa_df], axis=0)
log_df(combine_df)

(4180, 15)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,common_names,scientific_names,observations_count,taxa_count,taxon_group,type
0,97386721,36204,Western Fence Lizard,Sceloporus occidentalis,loganc516,https://inaturalist-open-data.s3.amazonaws.com...,species,36141,1|2|26036|26172|36074|36141|36204,Animals|Chordates|Reptiles|Snakes and Lizards|...,Animalia|Chordata|Reptilia|Squamata|Phrynosoma...,342,342,,
1,94055566,199840,House Finch,Haemorhous mexicanus,marty_and_the_mamas,https://inaturalist-open-data.s3.amazonaws.com...,species,199910,1|2|3|7251|9079|199910|199840,Animals|Chordates|Birds|Perching Birds|Finches...,Animalia|Chordata|Aves|Passeriformes|Fringilli...,146,146,,
2,94889284,48662,Monarch,Danaus plexippus,belis1,https://static.inaturalist.org/photos/15746072...,species,48663,1|47120|47158|47157|47922|48663|48662,Animals|Arthropods|Insects|Butterflies and Mot...,Animalia|Arthropoda|Insecta|Lepidoptera|Nympha...,144,144,Invertebrates,Neighborhood Species
3,91586172,5212,Red-tailed Hawk,Buteo jamaicensis,ki6h,https://static.inaturalist.org/photos/15151968...,species,5179,1|2|3|71261|5067|5179|5212,"Animals|Chordates|Birds|Hawks, Eagles, Kites, ...",Animalia|Chordata|Aves|Accipitriformes|Accipit...,140,142,Birds,Neighborhood Species
4,92725078,3454,Mourning Dove,Zenaida macroura,jennbastian,https://static.inaturalist.org/photos/15357316...,species,3438,1|2|3|2708|2715|3438|3454,Animals|Chordates|Birds|Pigeons and Doves|Pige...,Animalia|Chordata|Aves|Columbiformes|Columbida...,126,126,,


In [21]:
# Number of rows in the combined frame that carry a taxon_group.
combine_df.dropna(subset=['taxon_group']).shape

(38, 15)

In [22]:
# Number of rows loaded from taxa.json that carry a taxon_group.
df.dropna(subset=['taxon_group']).shape

(28, 15)

In [23]:
# Rows that carry a taxon_group and whose observations_count equals 0.
has_group = combine_df['taxon_group'].notna()
zero_count = combine_df['observations_count'].eq(0)
combine_df.loc[has_group & zero_count].shape

(10, 15)

In [62]:
# Spot-check one taxon in the combined frame.
combine_df.loc[combine_df['taxon_id'].eq('244378')]

Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,common_names,scientific_names,observations_count,taxa_count,taxon_group,type
55,90899495,244378,,Mallophora fautrix,nenetzinr,https://inaturalist-open-data.s3.amazonaws.com...,species,61636,1|47120|47158|47822|47982|61636|244378,Animals|Arthropods|Insects|True Flies|Robber F...,Animalia|Arthropoda|Insecta|Diptera|Asilidae|M...,39,39,,


In [63]:
# Store the id and count columns as integers before writing the JSON.
combine_df = combine_df.astype({
    'taxon_id': int,
    'id': int,
    'parent_id': int,
    'observations_count': int,
    'taxa_count': int,
})

In [64]:

# Overwrite the Los Angeles taxa.json with the combined taxa.
new_path = Path('..') / 'app' / 'src' / 'lib' / 'data' / 'los-angeles-bioblitz'
new_path.mkdir(parents=True, exist_ok=True)
combine_df.to_json(new_path / 'taxa.json', orient='records')

