In [1]:
import pandas as pd
from pathlib import Path
import requests
import json
import numpy as np

from process_inat_data import format_inat_data
from create_normalized_taxa import (
     create_taxa_df,   all_ranks, get_row_ranks,
    main_ranks, create_taxon, add_count_column, 
    create_taxa_la_df,
    create_taxa_gosea_df
)



In [2]:
# CSV files under the outputs/ directory, relative to the working directory.
inat_taxa = Path('outputs') / 'combine_taxa_list_inat_data.csv'
de_taxa = Path('outputs') / 'de_taxa_list.csv'
interactions_path = Path('outputs') / 'interactions.csv'


In [3]:
def log_df(df, nrows=5):
    """Print the frame's shape and return its first ``nrows`` rows for display.

    Args:
        df: DataFrame to summarise.
        nrows: number of leading rows to return (default 5).

    Returns:
        ``df.head(nrows)``.
    """
    preview = df.head(nrows)
    print(df.shape)
    return preview

In [4]:
# Recursively find every observations*.csv below ../data and list the paths as strings.
raw_data_paths = Path('../data').rglob('observations*.csv')
data_paths = list(map(str, raw_data_paths))
print(data_paths)

['../data/gosea/observations-209247.csv', '../data/clarkstown-high-school-north/observations-200303.csv', '../data/cedar-creek-reserve/observations-199064.csv', '../data/los-angeles-bioblitz/observations-190446.csv', '../data/ciencia-ciudadana-peru-bats/observations-199065.csv', '../data/ciencia-ciudadana-peru-bees/observations-199066.csv']


# create observation json

Don't convert to dtype=str, since there are columns (latitude, longitude, taxon_ids) that need to be numbers.

In [5]:
# Read only the lookup columns from the taxa list and make taxon_id an integer,
# so it can be used as the merge key for the observations.
taxa_cols = ['taxon_id', 'taxon_ids', 'rank']
taxa_df = pd.read_csv(de_taxa, usecols=taxa_cols).astype({'taxon_id': int})

log_df(taxa_df)

(7315, 3)


Unnamed: 0,taxon_id,rank,taxon_ids
0,1,kingdom,1
1,47534,phylum,1|47534
2,48921,class,1|47534|48921
3,152823,order,1|47534|48921|152823
4,117304,family,1|47534|48921|152823|117304


In [6]:
# Columns read from each observations CSV (passed as usecols to pd.read_csv).
all_cols = [
    'time_observed_at', 'image_url',
    'latitude', 'longitude',
    'user_login',
    'scientific_name', 'common_name', 'taxon_id',
    'id',
    'geoprivacy', 'taxon_geoprivacy', 'coordinates_obscured',
    'quality_grade', 'license', 'description',
]

In [7]:
# Combine every observations*.csv of a project into one observations.csv
# under ../app/src/lib/data/<project>, with rank and taxon_ids merged in
# from taxa_df.

# Safari won't parse dates in the format given by iNaturalist, so each
# timestamp is rewritten as "<date>T<time>Z".
# NOTE(review): the -0700/-0800 patterns discard the UTC offset and label the
# local wall-clock time with "Z" — confirm this is what the app expects.
# Raw strings keep "\d" from being treated as an (invalid) string escape.
timestamp_patterns = [
    r'([\d-]+) ([\d:]+) (UTC)',
    r'([\d-]+) ([\d:]+) -0700',
    r'([\d-]+) ([\d:]+) -0800',
]

for dir_path in Path().glob('../data/**/'):
    # skip any directory named 'data' (e.g. the ../data root); only its
    # sub-directories are treated as projects
    if dir_path.name == 'data':
        continue

    dfs = []
    for file_path in Path().glob('../data/' + dir_path.name + '/observations*.csv'):
        print(file_path.name)
        df = pd.read_csv(file_path, usecols=all_cols)
        # Missing taxon ids become 0 so the column can be cast to int.
        # Assigning the result back (instead of fillna(inplace=True) on the
        # column selection) avoids chained assignment, which is a no-op
        # under pandas Copy-on-Write.
        df['taxon_id'] = df['taxon_id'].fillna(0).astype(int)

        df = df.merge(taxa_df, on="taxon_id", how="left")

        for pattern in timestamp_patterns:
            df['time_observed_at'] = df['time_observed_at'].replace(pattern, r'\1T\2Z', regex=True)

        dfs.append(df)

    # directories without any observation export produce no output file
    if len(dfs) == 0:
        continue
    combine_df = pd.concat(dfs)

    new_path = Path('..', 'app', 'src', 'lib', 'data', dir_path.name)
    new_path.mkdir(parents=True, exist_ok=True)
    # combine_df.to_json(new_path/ 'observations.json', orient = "records")
    combine_df.to_csv(new_path / 'observations.csv', index=False)


observations-209247.csv
observations-200303.csv
observations-199064.csv
observations-190446.csv
observations-199065.csv
observations-199066.csv


# create taxa  csv


In [5]:
# Read the iNaturalist taxa list, keeping every column as text.
inat_df = pd.read_csv(filepath_or_buffer=inat_taxa, dtype=str)

log_df(inat_df)

(4752, 336)


Unnamed: 0,scientific_name,common_name,iconic_taxon_name,taxon_id,kingdom,phylum,class,order,family,genus,...,genushybrid_id,genushybrid_wikipedia_url,genushybrid_iconic_taxon_name,genushybrid_common_name,genushybrid_parent_id,genushybrid_ancestor_ids,genushybrid,genushybrid_photo_url,genushybrid_photo_attribution,genushybrid_photo_license_code
0,Physalia physalis,Portuguese Man o' War,Animalia,117302,Animalia,Cnidaria,Hydrozoa,Siphonophorae,Physaliidae,Physalia,...,,,,,,,,,,
1,Aurelia marginalis,Southern Moon Jelly,Animalia,986245,Animalia,Cnidaria,Scyphozoa,Semaeostomeae,Ulmaridae,Aurelia,...,,,,,,,,,,
2,Chrysaora colorata,Purple-striped Sea Nettle,Animalia,69839,Animalia,Cnidaria,Scyphozoa,Semaeostomeae,Pelagiidae,Chrysaora,...,,,,,,,,,,
3,Chrysaora fuscescens,Pacific Sea Nettle,Animalia,48479,Animalia,Cnidaria,Scyphozoa,Semaeostomeae,Pelagiidae,Chrysaora,...,,,,,,,,,,
4,Chrysaora achlyos,Black Jelly,Animalia,83415,Animalia,Cnidaria,Scyphozoa,Semaeostomeae,Pelagiidae,Chrysaora,...,,,,,,,,,,


In [6]:
# Build one taxa.csv per project: read the project's observations-*.csv files,
# merge in the iNaturalist taxon columns from inat_df, pass the result through
# the imported helpers add_count_column and create_taxa_df, and write the
# selected columns to ../app/src/lib/data/<project>/taxa.csv.
# NOTE(review): this loop rebinds the notebook-level name `taxa_df`, which an
# earlier cell uses for the taxon lookup table merged into the observations.

# columns read from each observations file
observation_cols = ['taxon_id', 'user_login', 'image_url', 'id']
# columns (and their order) written to taxa.csv
taxa_output_cols = [
    "id", "taxon_id", "common_name", "scientific_name",
    "user_login", "image_url", "rank", "parent_id",
    "taxon_ids",
    "observations_count", "taxa_count"
]

for dir_path in Path().glob('../data/**/'):
    # skip any directory named 'data' (e.g. the ../data root)
    if dir_path.name == 'data':
        continue

    dfs = [
        pd.read_csv(file_path, dtype=str, usecols=observation_cols)
        for file_path in Path().glob('../data/' + dir_path.name + '/observations-*.csv')
    ]
    # directories without any observation export produce no output file
    if len(dfs) == 0:
        continue

    combine_df = pd.concat(dfs)
    combine_df = combine_df.merge(inat_df, on="taxon_id", how="left")
    adjust_df = add_count_column(combine_df, "observations_count")

    # .copy() gives an independent frame, so the dtype conversions below do
    # not assign into a selection of another frame (SettingWithCopyWarning).
    taxa_df = create_taxa_df(adjust_df)[taxa_output_cols].copy()

    taxa_df['taxon_id'] = taxa_df['taxon_id'].astype(int)
    taxa_df['id'] = taxa_df['id'].astype(int)
    # rows without a parent get parent_id 0 so the column can be cast to int
    taxa_df['parent_id'] = taxa_df['parent_id'].fillna(0).astype(int)

    new_path = Path('..', 'app', 'src', 'lib', 'data') / dir_path.name
    new_path.mkdir(parents=True, exist_ok=True)
    print(new_path, len(taxa_df))
    # taxa_df.to_json(new_path/ "taxa.json", orient = "records")
    taxa_df.to_csv(new_path / "taxa.csv", index=False)


../app/src/lib/data/clarkstown-high-school-north 831
../app/src/lib/data/cedar-creek-reserve 1852
../app/src/lib/data/los-angeles-bioblitz 4161
../app/src/lib/data/ciencia-ciudadana-peru-bats 28
../app/src/lib/data/go-sea 2035
../app/src/lib/data/ciencia-ciudadana-peru-bees 57


# create taxa json for LA indicator species

In [7]:
# Load the Los Angeles project's taxa table with every column as text.
path = '../app/src/lib/data/los-angeles-bioblitz/taxa.csv'
df = pd.read_csv(filepath_or_buffer=path, dtype=str)
df.shape

(4161, 11)

Pandas treats null in JSON as None, which will be saved as the string 'None' if it isn't replaced.

In [8]:
# Turn literal 'None' strings into real missing values, then display one taxon as a check.
df = df.replace(to_replace='None', value=np.nan)
df.loc[df['taxon_id'] == '244378']

Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,observations_count,taxa_count
55,90899495,244378,,Mallophora fautrix,nenetzinr,https://inaturalist-open-data.s3.amazonaws.com...,species,61636,1|47120|47158|47822|47982|61636|244378,39,39


In [9]:
# Distinct taxon ids already present in the project's taxa table.
taxa_ids = pd.unique(df['taxon_id'])
len(taxa_ids)

4161

In [10]:
# Load the indicator species list with every column as text, so taxon_id
# compares equal to the text ids in the taxa table.
file = '../data/los-angeles-bioblitz/indicator_species_with_inat.csv'
indicator_df = pd.read_csv(filepath_or_buffer=file, dtype=str)

log_df(indicator_df)

(38, 211)


Unnamed: 0,taxon_group,type,taxon_id,rank,ancestor_ids,species_id,species_wikipedia_url,species_iconic_taxon_name,species_common_name,species_parent_id,...,subspecies_id,subspecies_wikipedia_url,subspecies_iconic_taxon_name,subspecies_common_name,subspecies_parent_id,subspecies_ancestor_ids,subspecies,subspecies_photo_url,subspecies_photo_attribution,subspecies_photo_license_code
0,Birds,Park/Natural Area Species,1409,species,48460|1|2|355675|3|573|1278|1405|1409,1409,http://en.wikipedia.org/wiki/California_quail,Aves,California Quail,1405,...,,,,,,,,,,
1,Birds,Park/Natural Area Species,1986,species,48460|1|2|355675|3|1623|1627|1985|1986,1986,http://en.wikipedia.org/wiki/Greater_roadrunner,Aves,Greater Roadrunner,1985,...,,,,,,,,,,
2,Birds,Stream/Riparian Species,4956,species,48460|1|2|355675|3|67566|4929|597395|4950|4956,4956,http://en.wikipedia.org/wiki/Great_blue_heron,Aves,Great Blue Heron,4950,...,,,,,,,,,,
3,Birds,Neighborhood Species,5212,species,48460|1|2|355675|3|71261|5067|5179|5212,5212,https://en.wikipedia.org/wiki/Red-tailed_hawk,Aves,Red-tailed Hawk,5179,...,,,,,,,,,,
4,Birds,Stream/Riparian Species,7109,species,48460|1|2|355675|3|6888|6912|7108|7109,7109,https://en.wikipedia.org/wiki/Hooded_merganser,Aves,Hooded Merganser,7108,...,,,,,,,,,,


Get all indicator species that are not yet in the project's taxa.csv.

In [11]:
# Indicator species whose taxon_id does not occur in the taxa table.
is_observed = indicator_df['taxon_id'].isin(taxa_ids)
no_observations_df = indicator_df[~is_observed].copy()
no_observations_df.shape

(10, 211)

In [12]:
# Unobserved species get the placeholder observation id 0 and a count of 0.
no_observations_df = no_observations_df.assign(id=0, observations_count=0)

no_observations_df.shape

(10, 213)

In [13]:

# Build taxa rows for the unobserved indicator species with the imported helper.
taxa_df = create_taxa_la_df(no_observations_df)
log_df(taxa_df)

(46, 13)


Unnamed: 0,taxon_id,common_name,scientific_name,rank,parent_id,taxon_ids,id,user_login,image_url,observations_count,taxon_group,type,taxa_count
32,9535,Western Meadowlark,Sturnella neglecta,species,9526,1|2|3|7251|11989|9526|9535,0,"(c) greglasley, some rights reserved (CC BY-NC...",https://inaturalist-open-data.s3.amazonaws.com...,0,Birds,Park/Natural Area Species,1
62,26172,Snakes and Lizards,Squamata,order,26036,1|2|26036|26172,0,"(c) dianaterryhibbitts, some rights reserved (...",https://inaturalist-open-data.s3.amazonaws.com...,0,,,1
39,43698,Rodents,Rodentia,order,40151,1|2|40151|43698,0,"Peterson B Moose, U.S. Fish and Wildlife Servi...",https://inaturalist-open-data.s3.amazonaws.com...,0,,,1
38,40151,Mammals,Mammalia,class,2,1|2|40151,0,"Peterson B Moose, U.S. Fish and Wildlife Servi...",https://inaturalist-open-data.s3.amazonaws.com...,0,,,1
60,346970,Lotus Hairstreak,Callophrys dumetorum,species,58554,1|47120|47158|47157|47923|58554|346970,0,"(c) Ken-ichi Ueda, some rights reserved (CC BY)",https://inaturalist-open-data.s3.amazonaws.com...,0,Invertebrates,Park/Natural Area Species,1


Keep only the taxa that aren't already in taxa.csv.

In [14]:
# Drop rows whose taxon_id already exists in the taxa table.
already_listed = taxa_df['taxon_id'].isin(taxa_ids)
taxa_df = taxa_df[~already_listed]
log_df(taxa_df)

(19, 13)


Unnamed: 0,taxon_id,common_name,scientific_name,rank,parent_id,taxon_ids,id,user_login,image_url,observations_count,taxon_group,type,taxa_count
32,9535,Western Meadowlark,Sturnella neglecta,species,9526,1|2|3|7251|11989|9526|9535,0,"(c) greglasley, some rights reserved (CC BY-NC...",https://inaturalist-open-data.s3.amazonaws.com...,0,Birds,Park/Natural Area Species,1
60,346970,Lotus Hairstreak,Callophrys dumetorum,species,58554,1|47120|47158|47157|47923|58554|346970,0,"(c) Ken-ichi Ueda, some rights reserved (CC BY)",https://inaturalist-open-data.s3.amazonaws.com...,0,Invertebrates,Park/Natural Area Species,1
37,27474,Black-bellied Slender Salamander,Batrachoseps nigriventris,species,27444,1|2|20978|26718|26909|27444|27474,0,"(c) Marshal Hedin, some rights reserved (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,Amphibians,Stream/Riparian Species,1
36,27444,Slender Salamanders,Batrachoseps,genus,26909,1|2|20978|26718|26909|27444,0,"(c) Marshal Hedin, some rights reserved (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,,1
35,26909,Lungless Salamanders,Plethodontidae,family,26718,1|2|20978|26718|26909,0,"(c) Marshal Hedin, some rights reserved (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,,1


Add taxon_group and type to the taxa.csv data.

In [15]:
# Tag each taxon in df with the taxon_group and type of the matching
# indicator species; taxa that are not indicator species keep NaN.
# The columns are created with object dtype up front because they receive
# text values: writing text into an all-NaN float column is an
# incompatible-dtype assignment, which pandas has deprecated.
df['taxon_group'] = pd.Series(np.nan, index=df.index, dtype=object)
df['type'] = pd.Series(np.nan, index=df.index, dtype=object)

# Iterate over just the three needed columns instead of materialising a full
# row Series per species with iterrows(). Rows are applied in order, so a
# later duplicate taxon_id overrides an earlier one, as before.
indicator_rows = zip(indicator_df['taxon_id'], indicator_df['taxon_group'], indicator_df['type'])
for taxon_id, taxon_group, species_type in indicator_rows:
    matches = df['taxon_id'] == taxon_id
    df.loc[matches, 'taxon_group'] = taxon_group
    df.loc[matches, 'type'] = species_type

log_df(df)
     

(4161, 13)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,observations_count,taxa_count,taxon_group,type
0,97386721,36204,Western Fence Lizard,Sceloporus occidentalis,loganc516,https://inaturalist-open-data.s3.amazonaws.com...,species,36141,1|2|26036|26172|36074|36141|36204,342,342,,
1,94055566,199840,House Finch,Haemorhous mexicanus,marty_and_the_mamas,https://inaturalist-open-data.s3.amazonaws.com...,species,199910,1|2|3|7251|9079|199910|199840,146,146,,
2,94889284,48662,Monarch,Danaus plexippus,belis1,https://static.inaturalist.org/photos/15746072...,species,48663,1|47120|47158|47157|47922|48663|48662,144,144,Invertebrates,Neighborhood Species
3,91586172,5212,Red-tailed Hawk,Buteo jamaicensis,ki6h,https://static.inaturalist.org/photos/15151968...,species,5179,1|2|3|71261|5067|5179|5212,140,142,Birds,Neighborhood Species
4,92725078,3454,Mourning Dove,Zenaida macroura,jennbastian,https://static.inaturalist.org/photos/15357316...,species,3438,1|2|3|2708|2715|3438|3454,126,126,,


In [16]:
# Put the new taxa rows' columns in the same order as df, then confirm
# element-wise that both column indexes agree.
taxa_df = taxa_df.loc[:, df.columns]

taxa_df.columns == df.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [17]:
# Stack the observed taxa and the additional taxa rows into one table.
combine_df = pd.concat(objs=[df, taxa_df])
log_df(combine_df)

(4180, 13)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,observations_count,taxa_count,taxon_group,type
0,97386721,36204,Western Fence Lizard,Sceloporus occidentalis,loganc516,https://inaturalist-open-data.s3.amazonaws.com...,species,36141,1|2|26036|26172|36074|36141|36204,342,342,,
1,94055566,199840,House Finch,Haemorhous mexicanus,marty_and_the_mamas,https://inaturalist-open-data.s3.amazonaws.com...,species,199910,1|2|3|7251|9079|199910|199840,146,146,,
2,94889284,48662,Monarch,Danaus plexippus,belis1,https://static.inaturalist.org/photos/15746072...,species,48663,1|47120|47158|47157|47922|48663|48662,144,144,Invertebrates,Neighborhood Species
3,91586172,5212,Red-tailed Hawk,Buteo jamaicensis,ki6h,https://static.inaturalist.org/photos/15151968...,species,5179,1|2|3|71261|5067|5179|5212,140,142,Birds,Neighborhood Species
4,92725078,3454,Mourning Dove,Zenaida macroura,jennbastian,https://static.inaturalist.org/photos/15357316...,species,3438,1|2|3|2708|2715|3438|3454,126,126,,


In [18]:
# Rows of the combined table that carry a taxon_group.
combine_df.dropna(subset=['taxon_group']).shape

(38, 13)

In [19]:
# Rows of the observed-taxa table that carry a taxon_group.
df.dropna(subset=['taxon_group']).shape

(28, 13)

In [20]:
# Rows that carry a taxon_group but have an observation count of 0.
has_group = combine_df['taxon_group'].notna()
never_observed = combine_df['observations_count'] == 0
combine_df[has_group & never_observed].shape

(10, 13)

In [21]:
# Display the row(s) for taxon 244378 (ids are still text at this point).
combine_df.loc[combine_df['taxon_id'] == '244378']

Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,observations_count,taxa_count,taxon_group,type
55,90899495,244378,,Mallophora fautrix,nenetzinr,https://inaturalist-open-data.s3.amazonaws.com...,species,61636,1|47120|47158|47822|47982|61636|244378,39,39,,


In [22]:
# Cast the id and count columns to integers before writing the CSV.
int_columns = ['taxon_id', 'id', 'parent_id', 'observations_count', 'taxa_count']
combine_df = combine_df.astype({column: int for column in int_columns})

In [23]:

# Write the combined table over the Los Angeles project's taxa.csv.
new_path = Path('..') / 'app' / 'src' / 'lib' / 'data' / 'los-angeles-bioblitz'
new_path.mkdir(parents=True, exist_ok=True)
# combine_df.to_json(new_path/ 'taxa.json', orient = "records")
combine_df.to_csv(new_path / 'taxa.csv', index=False)



# create taxa json for GOSEA field guide species

In [24]:
# Load the GO-SEA project's taxa table with every column as text.
path = '../app/src/lib/data/go-sea/taxa.csv'
df = pd.read_csv(filepath_or_buffer=path, dtype=str)
log_df(df)

(2035, 11)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,observations_count,taxa_count
0,99955539,117302,Portuguese Man o' War,Physalia physalis,arnim,https://inaturalist-open-data.s3.amazonaws.com...,species,117305,1|47534|48921|152823|117304|117305|117302,394,394
1,99998229,59698,By-the-wind Sailor,Velella velella,natael51,https://inaturalist-open-data.s3.amazonaws.com...,species,59699,1|47534|48921|48922|59692|59699|59698,321,321
2,99955344,254081,Dwarf Violet Snail,Janthina exigua,arnim,https://inaturalist-open-data.s3.amazonaws.com...,species,121656,1|47115|47114|122558|121656|254081,101,101
3,92086359,606078,the beachcomber,Eurynebria complanata,charcoscompanhia,https://inaturalist-open-data.s3.amazonaws.com...,species,606079,1|47120|47158|47208|49567|606079|606078,86,86
4,99853593,59683,Blue Button,Porpita porpita,samarah2,https://inaturalist-open-data.s3.amazonaws.com...,species,59679,1|47534|48921|48922|59692|59679|59683,82,82


In [25]:
# Distinct taxon ids already present in the project's taxa table.
taxa_ids = pd.unique(df['taxon_id'])
len(taxa_ids)

2035

In [26]:
# Load the field guide species list with every column as text.
file = '../data/go-sea/field_guide_species.csv'
guide_df = pd.read_csv(filepath_or_buffer=file, dtype=str)

log_df(guide_df)

(14, 181)


Unnamed: 0,name,field_guide,taxon_id,scientific_name,image_url,user_login,parent_id,iconic_taxon_name,rank,ancestor_ids,...,subfamily_id,subfamily_wikipedia_url,subfamily_iconic_taxon_name,subfamily_common_name,subfamily_parent_id,subfamily_ancestor_ids,subfamily,subfamily_photo_url,subfamily_photo_attribution,subfamily_photo_license_code
0,Porpita,True,59679,Porpita,https://static.inaturalist.org/photos/89090274...,"(c) Tsz-Yan NG, all rights reserved",59692.0,Animalia,genus,48460|1|47534|48921|551473|48922|813988|59692|...,...,,,,,,,,,,
1,Velella,True,59699,Velella,https://inaturalist-open-data.s3.amazonaws.com...,"(с) Abhishek Jamalabad, некоторые права защище...",59692.0,Animalia,genus,48460|1|47534|48921|551473|48922|813988|59692|...,...,,,,,,,,,,
2,Physalia,True,117305,Physalia,https://static.inaturalist.org/photos/14273574...,"(c) Flight69, tutti i diritti riservati",117304.0,Animalia,genus,48460|1|47534|48921|551473|152823|777050|11730...,...,,,,,,,,,,
3,Actinecta,True,1210955,Actinecta,https://inaturalist-open-data.s3.amazonaws.com...,"(c) kmiller34, certains droits réservés (CC BY...",814008.0,Animalia,genus,48460|1|47534|47533|202756|47797|813978|813994...,...,,,,,,,,,,
4,Dosima fascicularis,True,462187,Dosima fascicularis,https://static.inaturalist.org/photos/5935878/...,"(c) Donna Eriwata, all rights reserved",462188.0,Animalia,species,48460|1|47120|85493|473790|1091452|144117|2102...,...,,,,,,,,,,


In [27]:
# Field guide species whose taxon_id does not occur in the taxa table.
is_observed = guide_df['taxon_id'].isin(taxa_ids)
no_observations_df = guide_df[~is_observed].copy()
log_df(no_observations_df)

(1, 181)


Unnamed: 0,name,field_guide,taxon_id,scientific_name,image_url,user_login,parent_id,iconic_taxon_name,rank,ancestor_ids,...,subfamily_id,subfamily_wikipedia_url,subfamily_iconic_taxon_name,subfamily_common_name,subfamily_parent_id,subfamily_ancestor_ids,subfamily,subfamily_photo_url,subfamily_photo_attribution,subfamily_photo_license_code
3,Actinecta,True,1210955,Actinecta,https://inaturalist-open-data.s3.amazonaws.com...,"(c) kmiller34, certains droits réservés (CC BY...",814008.0,Animalia,genus,48460|1|47534|47533|202756|47797|813978|813994...,...,,,,,,,,,,


In [28]:
# Unobserved species get the placeholder observation id 0 and a count of 0.
no_observations_df = no_observations_df.assign(id=0, observations_count=0)

no_observations_df.shape

(1, 183)

In [29]:

# Build taxa rows for the unobserved field guide species with the imported helper.
taxa_df = create_taxa_gosea_df(no_observations_df)
log_df(taxa_df)

(6, 12)


Unnamed: 0,taxon_id,common_name,scientific_name,rank,parent_id,taxon_ids,id,user_login,image_url,observations_count,field_guide,taxa_count
4,814008,,Minyadidae,family,47797,1|47534|47533|47797|814008,0,"(c) kmiller34, certains droits réservés (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,1
3,47797,Sea Anemones,Actiniaria,order,47533,1|47534|47533|47797,0,"(c) kmiller34, certains droits réservés (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,1
1,47534,Cnidarians,Cnidaria,phylum,1,1|47534,0,"(c) kmiller34, certains droits réservés (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,1
2,47533,Sea Anemones and Corals,Anthozoa,class,47534,1|47534|47533,0,"(c) kmiller34, certains droits réservés (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,1
5,1210955,,Actinecta,genus,814008,1|47534|47533|47797|814008|1210955,0,"(c) kmiller34, certains droits réservés (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,True,1


In [30]:
# Drop rows whose taxon_id already exists in the taxa table.
already_listed = taxa_df['taxon_id'].isin(taxa_ids)
taxa_df = taxa_df[~already_listed]
log_df(taxa_df)

(2, 12)


Unnamed: 0,taxon_id,common_name,scientific_name,rank,parent_id,taxon_ids,id,user_login,image_url,observations_count,field_guide,taxa_count
4,814008,,Minyadidae,family,47797,1|47534|47533|47797|814008,0,"(c) kmiller34, certains droits réservés (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,,1
5,1210955,,Actinecta,genus,814008,1|47534|47533|47797|814008|1210955,0,"(c) kmiller34, certains droits réservés (CC BY...",https://inaturalist-open-data.s3.amazonaws.com...,0,True,1


In [31]:
# Flag the taxa that appear in the field guide list; all other taxa keep False.
# guide_df was read with dtype=str, so its field_guide values are text. The
# column is therefore created with object dtype: writing text into a bool
# column is an incompatible-dtype assignment, which pandas has deprecated.
# NOTE(review): the column ends up mixing the bool False with text flags —
# confirm whether downstream consumers would prefer real booleans.
df['field_guide'] = pd.Series(False, index=df.index, dtype=object)

# Iterate over just the two needed columns instead of materialising a full row
# Series per species with iterrows().
for taxon_id, in_field_guide in zip(guide_df['taxon_id'], guide_df['field_guide']):
    df.loc[df['taxon_id'] == taxon_id, 'field_guide'] = in_field_guide

log_df(df)

(2035, 12)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,observations_count,taxa_count,field_guide
0,99955539,117302,Portuguese Man o' War,Physalia physalis,arnim,https://inaturalist-open-data.s3.amazonaws.com...,species,117305,1|47534|48921|152823|117304|117305|117302,394,394,False
1,99998229,59698,By-the-wind Sailor,Velella velella,natael51,https://inaturalist-open-data.s3.amazonaws.com...,species,59699,1|47534|48921|48922|59692|59699|59698,321,321,False
2,99955344,254081,Dwarf Violet Snail,Janthina exigua,arnim,https://inaturalist-open-data.s3.amazonaws.com...,species,121656,1|47115|47114|122558|121656|254081,101,101,False
3,92086359,606078,the beachcomber,Eurynebria complanata,charcoscompanhia,https://inaturalist-open-data.s3.amazonaws.com...,species,606079,1|47120|47158|47208|49567|606079|606078,86,86,False
4,99853593,59683,Blue Button,Porpita porpita,samarah2,https://inaturalist-open-data.s3.amazonaws.com...,species,59679,1|47534|48921|48922|59692|59679|59683,82,82,False


In [32]:
# reorder columns to match taxa.json
taxa_df = taxa_df[df.columns]

df.columns == taxa_df.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [33]:
# Stack the observed taxa and the additional taxa rows into one table.
combine_df = pd.concat(objs=[df, taxa_df])
log_df(combine_df)



(2037, 12)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,parent_id,taxon_ids,observations_count,taxa_count,field_guide
0,99955539,117302,Portuguese Man o' War,Physalia physalis,arnim,https://inaturalist-open-data.s3.amazonaws.com...,species,117305,1|47534|48921|152823|117304|117305|117302,394,394,False
1,99998229,59698,By-the-wind Sailor,Velella velella,natael51,https://inaturalist-open-data.s3.amazonaws.com...,species,59699,1|47534|48921|48922|59692|59699|59698,321,321,False
2,99955344,254081,Dwarf Violet Snail,Janthina exigua,arnim,https://inaturalist-open-data.s3.amazonaws.com...,species,121656,1|47115|47114|122558|121656|254081,101,101,False
3,92086359,606078,the beachcomber,Eurynebria complanata,charcoscompanhia,https://inaturalist-open-data.s3.amazonaws.com...,species,606079,1|47120|47158|47208|49567|606079|606078,86,86,False
4,99853593,59683,Blue Button,Porpita porpita,samarah2,https://inaturalist-open-data.s3.amazonaws.com...,species,59679,1|47534|48921|48922|59692|59679|59683,82,82,False


In [34]:

# Cast the id and count columns to integers before writing the CSV.
int_columns = ['taxon_id', 'id', 'parent_id', 'observations_count', 'taxa_count']
combine_df = combine_df.astype({column: int for column in int_columns})


In [35]:

# Write the combined table over the GO-SEA project's taxa.csv.
new_path = Path('..') / 'app' / 'src' / 'lib' / 'data' / 'go-sea'
combine_df.to_csv(new_path / 'taxa.csv', index=False)
