In [62]:
import pandas as pd
from pathlib import Path
import requests
import json
import numpy as np

from process_files import process_inat_data, add_concatenated_columns



In [63]:
# Combined taxa lookup table (CSV) that later cells read with pd.read_csv.
all_taxa = Path('outputs') / 'combine_taxa_list_inat_data.csv'

In [64]:
def log_df(df, nrows=5):
    """Print the frame's shape and return a preview of its first rows.

    Args:
        df: DataFrame to inspect.
        nrows: Number of leading rows to return (default 5).

    Returns:
        ``df.head(nrows)``; when this call is the last expression of a cell the
        preview is what gets displayed.
    """
    dimensions = df.shape
    print(dimensions)
    preview = df.head(nrows)
    return preview

In [65]:
# Recursively collect every export named observations*.csv below ../data.
raw_data_paths = Path('../data').rglob('observations*.csv')
data_paths = list(map(str, raw_data_paths))
print(data_paths)

['../data/clarkstown-high-school-north/observations-200303.csv', '../data/cedar-creek-reserve/observations-199064.csv', '../data/los-angeles-bioblitz/observations-190446.csv', '../data/ciencia-ciudadana-peru-bats/observations-199065.csv', '../data/ciencia-ciudadana-peru-bees/observations-199066.csv']


# create observation json

In [70]:
# Lookup table keyed by taxon_id; taxon_ids holds the pipe-delimited lineage ids.
inat_ids_cols = ['taxon_id', 'taxon_ids']
inat_ids_df = (
    pd.read_csv(all_taxa, dtype=str, usecols=inat_ids_cols)
    .astype({'taxon_id': int})  # integer key so it can be merged with the observations
    .fillna('')
)
log_df(inat_ids_df)

(3851, 2)


Unnamed: 0,taxon_id,taxon_ids
0,143452,1|47120|47158|47157|47213|143454|143452
1,47727,47126|211194|47124|47729|58321|47727|
2,53178,47126|211194|47124|48151|50638|50636|53178
3,60307,47126|211194|47163|47162|47434|52809|60307
4,47124,47126|211194|47124||||


In [71]:
# Columns read (via usecols) from every observations export.
all_cols = [
    # observation time, photo and location
    'time_observed_at', 'image_url', 'latitude', 'longitude',
    # observer and identification
    'user_login', 'scientific_name', 'common_name', 'taxon_id', 'id',
    # privacy flags
    'geoprivacy', 'taxon_geoprivacy', 'coordinates_obscured',
]

In [72]:
# Combine multiple observations csvs for a project into one observations.json.

def to_iso8601(observed_at):
    """Rewrite iNaturalist timestamps as ISO 8601 strings.

    Safari won't parse dates in the format given by iNaturalist
    (e.g. '2021-10-01 17:05:00 UTC' or '2021-10-01 10:05:00 -0700').

    Args:
        observed_at: Series of timestamp strings.

    Returns:
        Series in which 'UTC' values become '<date>T<time>Z' and numeric
        offsets are kept as '<date>T<time>-07:00'; values in any other format
        are returned unchanged.
    """
    observed_at = observed_at.replace(r'([\d-]+) ([\d:]+) (UTC)', r'\1T\2Z', regex=True)
    # Keep the real offset: labelling a local (-0700) time as 'Z' (UTC) would
    # move the observation by the size of the offset.
    return observed_at.replace(
        r'([\d-]+) ([\d:]+) ([+-]\d{2}):?(\d{2})', r'\1T\2\3:\4', regex=True
    )


for dir_path in Path().glob('../data/**/'):
    if dir_path.name == 'data':
        continue

    # sorted() makes the record order in the json independent of the filesystem
    csv_paths = sorted(dir_path.glob('observations*.csv'))
    if not csv_paths:
        # pd.concat raises ValueError on an empty list; nothing to export here
        continue

    dfs = []
    for file_path in csv_paths:
        print(file_path.name)
        df = pd.read_csv(file_path, usecols=all_cols)
        # observations without a taxon get id 0 so the column can be cast to int
        df['taxon_id'] = df['taxon_id'].fillna(0).astype(int)
        df = df.merge(inat_ids_df, on="taxon_id", how="left")
        df['time_observed_at'] = to_iso8601(df['time_observed_at'])
        dfs.append(df)

    combine_df = pd.concat(dfs, ignore_index=True)

    new_path = Path('..', 'app', 'src', 'lib', 'data', dir_path.name)
    new_path.mkdir(parents=True, exist_ok=True)
    combine_df.to_json(new_path / 'observations.json', orient="records")


observations-200303.csv
observations-199064.csv
observations-190446.csv
observations-199065.csv
observations-199066.csv


# create taxa json and csv


# Lookup table keyed by taxon_id with the pipe-delimited lineage names and ids.
inat_names_cols = ['taxon_id', 'scientific_names', 'common_names', 'taxon_ids']
inat_names_df = (
    pd.read_csv(all_taxa, dtype=str, usecols=inat_names_cols)
    .astype({'taxon_id': int})  # integer key so it can be merged with the observations
    .fillna('')
)
log_df(inat_names_df)


# Identification columns shared by the basic and the full column sets.
basic_taxa_cols = ['scientific_name', 'common_name', 'taxon_id']

# Full taxonomy: identification columns, the iconic taxon, then one name per rank.
taxa_cols = [
    *basic_taxa_cols,
    'iconic_taxon_name',
    'taxon_kingdom_name', 'taxon_phylum_name', 'taxon_class_name',
    'taxon_order_name', 'taxon_family_name', 'taxon_genus_name',
    'taxon_species_name',
]

# Both column sets also carry the photo url and the observer login.
basic_cols = [*basic_taxa_cols, 'image_url', 'user_login']
cols = [*taxa_cols, 'image_url', 'user_login']



def create_taxa_df_v1(df):
    """Collapse observations into one row per taxon with an observation count.

    Side effect: ``df`` itself is modified -- ``taxon_id`` is cast to int and an
    ``is_species`` column is added/adjusted in place.

    Args:
        df: Observations with ``taxon_id``, ``scientific_name`` and the
            ``taxon_<rank>_name`` columns.

    Returns:
        A new frame keeping the first row seen for each ``taxon_id``, with a
        ``count`` column (rows per taxon), sorted by ``count`` descending.
    """
    df['taxon_id'] = df['taxon_id'].astype(int)
    df['is_species'] = df['taxon_species_name'].notna()
    adjust_is_species_for_higher_ranks_v1(df)

    # two columns: taxon_id and the number of rows carrying it
    count_df = df.groupby('taxon_id').size().reset_index(name='count')

    unique_taxa = df.drop_duplicates(subset=['taxon_id'])
    return unique_taxa.merge(count_df).sort_values('count', ascending=False)


def adjust_is_species_for_higher_ranks_v1(df):
    """Update ``df['is_species']`` in place, rank by rank from genus up to kingdom.

    After the per-rank pass, every row whose ``scientific_name`` contains ' × '
    is flagged as a species as well (presumably hybrids -- confirm).
    """
    rank_columns = (
        'taxon_genus_name',
        'taxon_family_name',
        'taxon_order_name',
        'taxon_class_name',
        'taxon_phylum_name',
        'taxon_kingdom_name',
    )
    for rank_column in rank_columns:
        adjust_is_species_for_rank_v1(df, rank_column)

    has_cross = df['scientific_name'].str.contains(' × ') == True
    df.loc[has_cross, 'is_species'] = True

def adjust_is_species_for_rank_v1(df, rank):
    """Flag rows identified only down to ``rank`` as species, in place.

    A row that is not yet a species and has a value in column ``rank`` becomes
    ``is_species = True`` when that value does not appear on any row that is
    already flagged as a species.

    Args:
        df: Frame with a boolean ``is_species`` column; modified in place.
        rank: Name of the rank column to inspect (e.g. 'taxon_genus_name').

    Returns:
        None.
    """
    already_species = df['is_species'] == True
    taxa_with_species = df.loc[already_species, rank].unique()

    promote = (
        (df['is_species'] == False)
        & df[rank].notna()
        & ~df[rank].isin(taxa_with_species)
    )
    # Assign through a positional boolean array: label-based `.at[index]` would
    # flip every row sharing that label when the index has duplicates (e.g.
    # after pd.concat of several files without ignore_index).
    df.loc[promote.to_numpy(), 'is_species'] = True
        

# For every project directory: build taxa_list_v1.csv and taxa_v1.json from its
# observations-*.csv exports.
for dir_path in Path().glob('../data/**/'):
    if dir_path.name == 'data':
        continue

    csv_paths = sorted(dir_path.glob('observations-*.csv'))
    if not csv_paths:
        # pd.concat raises ValueError on an empty list; nothing to export here
        continue

    dfs = []
    for file_path in csv_paths:
        df = pd.read_csv(file_path, usecols=cols)
        df = df.dropna(subset=['taxon_id'])
        df['taxon_id'] = df['taxon_id'].astype(int)
        df = df.merge(inat_names_df, on="taxon_id", how="left")
        dfs.append(df)

    # ignore_index gives every observation a unique label; the is_species
    # adjustment addresses rows by label and must not hit rows of another file.
    combine_df = pd.concat(dfs, ignore_index=True)
    taxa_df = create_taxa_df_v1(combine_df)

    taxa_list_path = Path('..', 'data', dir_path.name, 'taxa_list_v1.csv')
    taxa_df.to_csv(taxa_list_path, index=False)

    new_path = Path('..', 'app', 'src', 'lib', 'data') / dir_path.name
    new_path.mkdir(parents=True, exist_ok=True)
    basic_df = taxa_df[basic_cols + ['count', 'is_species', 'taxon_ids', 'scientific_names', 'common_names']]
    basic_df.to_json(new_path / "taxa_v1.json", orient="records")


In [170]:
# Columns read from the combined taxa table: the taxon itself, its lineage
# strings, and for every rank its name, id and common name.
ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
columns = (
    ['taxon_id', 'scientific_name', 'common_name',
     'rank',
     'taxon_ids', 'scientific_names', 'common_names']
    + ranks
    + [rank + '_id' for rank in ranks]
    + [rank + '_common_name' for rank in ranks]
)

# Missing values are deliberately left as NaN here (no fillna).
inat_names_df = pd.read_csv(all_taxa, dtype=str, usecols=columns)
inat_names_df['taxon_id'] = inat_names_df['taxon_id'].astype(int)
log_df(inat_names_df)

(3851, 28)


Unnamed: 0,scientific_name,common_name,taxon_id,kingdom,phylum,class,order,family,genus,species,...,class_common_name,order_id,order_common_name,family_id,family_common_name,genus_id,genus_common_name,taxon_ids,scientific_names,common_names
0,Deidamia inscriptum,Lettered Sphinx,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,Deidamia inscriptum,...,Insects,47157.0,Butterflies and Moths,47213.0,Sphinx Moths,143454.0,,1|47120|47158|47157|47213|143454|143452,Animalia|Arthropoda|Insecta|Lepidoptera|Sphing...,Animals|Arthropods|Insects|Butterflies and Mot...
1,Acer,maples,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,,...,dicots,47729.0,"soapberries, cashews, mahoganies, and allies",58321.0,soapberry family,47727.0,maples,47126|211194|47124|47729|58321|47727|,Plantae|Tracheophyta|Magnoliopsida|Sapindales|...,"plants|vascular plants|dicots|soapberries, cas..."
2,Plantago lanceolata,ribwort plantain,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,Plantago lanceolata,...,dicots,48151.0,"mints, plantains, olives, and allies",50638.0,plantain family,50636.0,plantain,47126|211194|47124|48151|50638|50636|53178,Plantae|Tracheophyta|Magnoliopsida|Lamiales|Pl...,"plants|vascular plants|dicots|mints, plantains..."
3,Poa pratensis,Kentucky bluegrass,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,Poa pratensis,...,monocots,47162.0,"grasses, sedges, cattails, and allies",47434.0,grasses,52809.0,Meadow-grasses,47126|211194|47163|47162|47434|52809|60307,Plantae|Tracheophyta|Liliopsida|Poales|Poaceae...,"plants|vascular plants|monocots|grasses, sedge..."
4,Magnoliopsida,dicots,47124,Plantae,Tracheophyta,Magnoliopsida,,,,,...,dicots,,,,,,,47126|211194|47124||||,Plantae|Tracheophyta|Magnoliopsida||||,plants|vascular plants|dicots||||


In [171]:
def add_count_column(df, count_col):
    # count the number of taxon_id
    temp = df.copy()
    temp['temp_count'] = 1
    temp['temp_count'] = temp['temp_count'].astype(int)

    # create a df with 2 columns: taxon_id and temp_count       
    count_df = temp.groupby(['taxon_id'])['temp_count'].sum().reset_index()
    count_df.rename(columns={"temp_count": count_col}, inplace=True)

    df = df.merge(count_df)
    return df

def append_df(df):
    """Add ``is_species`` (in place) and return ``df`` with ``observations_count``.

    The ``is_species`` flag starts as "has a species name" and is then adjusted
    to try to match the species count shown on the iNaturalist project page.

    Args:
        df: Observations merged with the taxa table; gains an ``is_species``
            column as a side effect.

    Returns:
        A new frame with ``observations_count``: the number of observations
        carrying each ``taxon_id``.
    """
    df['is_species'] = df['species'].notna()
    adjust_is_species_for_higher_ranks(df)

    return add_count_column(df, 'observations_count')

    
def add_row(row, rank, index):
    """Build the record describing ``row``'s ancestor (or self) at ``rank``.

    Args:
        row: Mapping/Series with the observation columns, the ``<rank>``,
            ``<rank>_id`` and ``<rank>_common_name`` columns and the
            pipe-delimited ``taxon_ids`` / ``common_names`` / ``scientific_names``.
        rank: Rank name, e.g. 'order'.
        index: Position of ``rank`` in the lineage; the lineage strings are cut
            after this position.

    Returns:
        dict with the taxon's id and names, photo credit and url, rank and
        truncated lineage strings. The observation's ``user_login`` /
        ``image_url`` are used when ``user_login`` is present, otherwise the
        ``<rank>_photo_attribution`` / ``<rank>_photo_url`` values.
    """
    def lineage_up_to_rank(column):
        # keep the first index + 1 entries of the pipe-delimited lineage
        return '|'.join(row[column].split('|')[:index + 1])

    if pd.notna(row['user_login']):
        photo_credit, photo_url = row['user_login'], row['image_url']
    else:
        photo_credit = row[rank + '_photo_attribution']
        photo_url = row[rank + '_photo_url']

    # key order matters: it becomes the column order of the resulting frame
    return {
        'id': row['id'],
        'taxon_id': row[rank + '_id'],
        'common_name': row[rank + '_common_name'],
        'scientific_name': row[rank],
        'user_login': photo_credit,
        'image_url': photo_url,
        'rank': rank,
        'taxon_ids': lineage_up_to_rank('taxon_ids'),
        'common_names': lineage_up_to_rank('common_names'),
        'scientific_names': lineage_up_to_rank('scientific_names'),
    }

def create_taxa_df(df):
    """Expand observations into one row per taxon, including every higher taxon.

    Args:
        df: Observations carrying the rank columns used by ``add_row`` plus
            ``taxon_id``, ``is_species`` and ``observations_count``.

    Returns:
        One row per ``taxon_id`` (taken from the observation with the highest
        ``id``), with ``observations_count`` (observations identified exactly
        as that taxon, 0 if none) and ``taxa_count`` (observations of the taxon
        or anything below it), sorted by ``observations_count`` descending.
    """
    rank_names = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    # one record for each taxon and each higher taxon of every observation
    new_rows = [
        add_row(row, rank, depth)
        for _, row in df.iterrows()
        for depth, rank in enumerate(rank_names)
        if pd.notna(row[rank])
    ]
    new_df = pd.DataFrame(new_rows)

    # rank ids come from a table read as text and may look like '47157.0',
    # which a direct astype(int) rejects
    new_df['taxon_id'] = pd.to_numeric(new_df['taxon_id']).astype(int)

    per_taxon = df[['is_species', 'observations_count', 'taxon_id']].drop_duplicates()
    new_df = new_df.merge(per_taxon, how='left', on='taxon_id')

    # higher taxa that were never observed directly have no count yet
    new_df['observations_count'] = new_df['observations_count'].fillna(0).astype(int)

    # count the total number of occurrences for a taxon including lower taxa
    new_df = add_count_column(new_df, 'taxa_count')

    # sort newest observations first so that we use the photo of the newest observation
    new_df = new_df.sort_values(['id'], ascending=False)
    new_df = new_df.drop_duplicates(subset=['taxon_id'])
    new_df = new_df.sort_values(['observations_count'], ascending=False)

    return new_df



def adjust_is_species_for_higher_ranks(df):
    """Update ``df['is_species']`` in place, rank by rank from genus up to kingdom.

    Tries to match the species count that is shown on the iNaturalist project
    page: if a rank higher than species is the lowest occurrence of a taxon, it
    is treated as a species. E.g. if there are no species for genus AA, genus
    AA gets ``is_species = True``.

    Rows whose ``scientific_name`` contains ' × ' are flagged as species as
    well (presumably hybrids -- confirm).
    """
    for rank in ('genus', 'family', 'order', 'class', 'phylum', 'kingdom'):
        adjust_is_species_for_rank(df, rank)

    has_cross = df['scientific_name'].str.contains(' × ') == True
    df.loc[has_cross, 'is_species'] = True

def adjust_is_species_for_rank(df, rank):
    """Flag rows identified only down to ``rank`` as species, in place.

    A row that is not yet a species and has a value in column ``rank`` becomes
    ``is_species = True`` when that value does not appear on any row that is
    already flagged as a species.

    Args:
        df: Frame with a boolean ``is_species`` column; modified in place.
        rank: Name of the rank column to inspect (e.g. 'genus').

    Returns:
        None.
    """
    already_species = df['is_species'] == True
    taxa_with_species = df.loc[already_species, rank].unique()

    promote = (
        (df['is_species'] == False)
        & df[rank].notna()
        & ~df[rank].isin(taxa_with_species)
    )
    # Assign through a positional boolean array: label-based `.at[index]` would
    # flip every row sharing that label when the index has duplicates (e.g.
    # after pd.concat of several files without ignore_index).
    df.loc[promote.to_numpy(), 'is_species'] = True

In [172]:
# Build one taxa.json per project from all of its observations-*.csv exports.

# Only these observation columns are needed; the taxonomy comes from inat_names_df.
# A dedicated name keeps the global `cols` list defined earlier untouched.
observation_cols = ['taxon_id', 'user_login', 'image_url', 'id']

for dir_path in Path().glob('../data/**/'):
    if dir_path.name == 'data':
        continue

    csv_paths = sorted(dir_path.glob('observations-*.csv'))
    if not csv_paths:
        # pd.concat raises ValueError on an empty list; nothing to export here
        continue

    dfs = []
    for file_path in csv_paths:
        df = pd.read_csv(file_path, usecols=observation_cols)
        df = df.dropna(subset=['taxon_id'])
        df['taxon_id'] = df['taxon_id'].astype(int)
        df = df.merge(inat_names_df, on="taxon_id", how="left")
        dfs.append(df)

    # ignore_index gives every observation a unique label; the is_species
    # adjustment addresses rows by label and must not hit rows of another file.
    combine_df = pd.concat(dfs, ignore_index=True)
    adjust_df = append_df(combine_df)
    taxa_df = create_taxa_df(adjust_df)

    new_path = Path('..', 'app', 'src', 'lib', 'data') / dir_path.name
    new_path.mkdir(parents=True, exist_ok=True)
    print(new_path)
    taxa_df.to_json(new_path / "taxa.json", orient="records")


../app/src/lib/data/clarkstown-high-school-north
../app/src/lib/data/cedar-creek-reserve
../app/src/lib/data/los-angeles-bioblitz
../app/src/lib/data/ciencia-ciudadana-peru-bats
../app/src/lib/data/ciencia-ciudadana-peru-bees


# create taxa json for LA indicator species

In [173]:
# Load the Los Angeles taxa.json file.
# NOTE(review): dtype=str is requested for every column, so the count and
# is_species columns presumably come back as text -- confirm this is what the
# app expects when the frame is written out again.
path = '../app/src/lib/data/los-angeles-bioblitz/taxa.json'
df = pd.read_json(path, dtype=str).astype({'taxon_id': int})

log_df(df)


(3827, 13)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,taxon_ids,common_names,scientific_names,is_species,observations_count,taxa_count
0,97386721,36204,Western Fence Lizard,Sceloporus occidentalis,loganc516,https://inaturalist-open-data.s3.amazonaws.com...,species,1|2|26036|26172|36074|36141|36204,Animals|Chordates|Reptiles|Snakes and Lizards|...,Animalia|Chordata|Reptilia|Squamata|Phrynosoma...,True,342,342
1,94055566,199840,House Finch,Haemorhous mexicanus,marty_and_the_mamas,https://inaturalist-open-data.s3.amazonaws.com...,species,1|2|3|7251|9079|199910|199840,Animals|Chordates|Birds|Perching Birds|Finches...,Animalia|Chordata|Aves|Passeriformes|Fringilli...,True,146,146
2,94889284,48662,Monarch,Danaus plexippus,belis1,https://static.inaturalist.org/photos/15746072...,species,1|47120|47158|47157|47922|48663|48662,Animals|Arthropods|Insects|Butterflies and Mot...,Animalia|Arthropoda|Insecta|Lepidoptera|Nympha...,True,144,144
3,91586172,5212,Red-tailed Hawk,Buteo jamaicensis,ki6h,https://static.inaturalist.org/photos/15151968...,species,1|2|3|71261|5067|5179|5212,"Animals|Chordates|Birds|Hawks, Eagles, Kites, ...",Animalia|Chordata|Aves|Accipitriformes|Accipit...,True,140,142
4,92725078,3454,Mourning Dove,Zenaida macroura,jennbastian,https://static.inaturalist.org/photos/15357316...,species,1|2|3|2708|2715|3438|3454,Animals|Chordates|Birds|Pigeons and Doves|Pige...,Animalia|Chordata|Aves|Columbiformes|Columbida...,True,126,126


In [174]:
# Indicator species list (tab-separated): one taxon_id per row with its group and type.
file = '../data/los-angeles-bioblitz/indicator_species.tsv'

indicator_cols = ['type', 'taxon_group', 'taxon_id']
indicator_df = (
    pd.read_csv(file, sep='\t', usecols=indicator_cols, dtype=str)
    .astype({'taxon_id': int})  # integer key to match df['taxon_id']
)
log_df(indicator_df)


(38, 3)


Unnamed: 0,taxon_group,type,taxon_id
0,Birds,Park/Natural Area Species,1409
1,Birds,Park/Natural Area Species,1986
2,Birds,Stream/Riparian Species,4956
3,Birds,Neighborhood Species,5212
4,Birds,Stream/Riparian Species,7109


In [175]:
# Left join: every taxon is kept; taxon_group / type stay NaN for non-indicator taxa.
df = pd.merge(df, indicator_df, how='left')
log_df(df)

(3827, 15)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,taxon_ids,common_names,scientific_names,is_species,observations_count,taxa_count,taxon_group,type
0,97386721,36204,Western Fence Lizard,Sceloporus occidentalis,loganc516,https://inaturalist-open-data.s3.amazonaws.com...,species,1|2|26036|26172|36074|36141|36204,Animals|Chordates|Reptiles|Snakes and Lizards|...,Animalia|Chordata|Reptilia|Squamata|Phrynosoma...,True,342,342,,
1,94055566,199840,House Finch,Haemorhous mexicanus,marty_and_the_mamas,https://inaturalist-open-data.s3.amazonaws.com...,species,1|2|3|7251|9079|199910|199840,Animals|Chordates|Birds|Perching Birds|Finches...,Animalia|Chordata|Aves|Passeriformes|Fringilli...,True,146,146,,
2,94889284,48662,Monarch,Danaus plexippus,belis1,https://static.inaturalist.org/photos/15746072...,species,1|47120|47158|47157|47922|48663|48662,Animals|Arthropods|Insects|Butterflies and Mot...,Animalia|Arthropoda|Insecta|Lepidoptera|Nympha...,True,144,144,Invertebrates,Neighborhood Species
3,91586172,5212,Red-tailed Hawk,Buteo jamaicensis,ki6h,https://static.inaturalist.org/photos/15151968...,species,1|2|3|71261|5067|5179|5212,"Animals|Chordates|Birds|Hawks, Eagles, Kites, ...",Animalia|Chordata|Aves|Accipitriformes|Accipit...,True,140,142,Birds,Neighborhood Species
4,92725078,3454,Mourning Dove,Zenaida macroura,jennbastian,https://static.inaturalist.org/photos/15357316...,species,1|2|3|2708|2715|3438|3454,Animals|Chordates|Birds|Pigeons and Doves|Pige...,Animalia|Chordata|Aves|Columbiformes|Columbida...,True,126,126,,



### Connect to the iNaturalist API to get data for indicator species with no observations

In [176]:
# Indicator species whose taxon_id does not occur in df.
no_observations_df = indicator_df.loc[
    ~indicator_df['taxon_id'].isin(df['taxon_id'].unique())
].copy()
log_df(no_observations_df)

(11, 3)


Unnamed: 0,taxon_group,type,taxon_id
1,Birds,Park/Natural Area Species,1986
4,Birds,Stream/Riparian Species,7109
7,Birds,Park/Natural Area Species,9535
12,Amphibians,Stream/Riparian Species,27474
18,Mammals,Park/Natural Area Species,44749


In [177]:
# Fetch taxon details from the iNaturalist API for every indicator species
# without observations and store the processed fields on its row.
# NOTE(review): `ranks` is not referenced in this cell; the assignment is kept
# in case other code relies on it.
ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
for index, row in no_observations_df.iterrows():
    taxon_id = row["taxon_id"]
    # a timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(f'https://api.inaturalist.org/v1/taxa/{taxon_id}', timeout=30)
    if response.status_code != 200:
        # best effort: report the failure instead of skipping silently
        print(f'taxon {taxon_id}: request failed with status {response.status_code}')
        continue

    results = response.json()['results']
    if not results:
        print(f'taxon {taxon_id}: no results returned')
        continue

    result = process_inat_data(results[0])
    for col in result:
        no_observations_df.at[index, col] = result[col]
    

In [178]:
# These taxa have no observation, so the observation-derived columns stay empty;
# add_row then falls back to the taxon's own photo url and attribution.
for placeholder_col in ('id', 'user_login', 'image_url'):
    no_observations_df[placeholder_col] = np.nan
add_concatenated_columns(no_observations_df)

log_df(no_observations_df)

(11, 75)


Unnamed: 0,taxon_group,type,taxon_id,rank,species_id,species_wikipedia_url,species_iconic_taxon_name,species_common_name,species,species_photo_url,...,subspecies,subspecies_photo_url,subspecies_photo_attribution,subspecies_photo_license_code,id,user_login,image_url,taxon_ids,scientific_names,common_names
1,Birds,Park/Natural Area Species,1986,species,1986,http://en.wikipedia.org/wiki/Greater_roadrunner,Aves,Greater Roadrunner,Geococcyx californianus,https://static.inaturalist.org/photos/30952802...,...,,,,,,,,1|2|3|1623|1627|1985|1986,Animalia|Chordata|Aves|Cuculiformes|Cuculidae|...,Animals|Chordates|Birds|Cuckoos|Cuckoos|Roadru...
4,Birds,Stream/Riparian Species,7109,species,7109,https://en.wikipedia.org/wiki/Hooded_merganser,Aves,Hooded Merganser,Lophodytes cucullatus,https://inaturalist-open-data.s3.amazonaws.com...,...,,,,,,,,1|2|3|6888|6912|7108|7109,Animalia|Chordata|Aves|Anseriformes|Anatidae|L...,"Animals|Chordates|Birds|Waterfowl|Ducks, Geese..."
7,Birds,Park/Natural Area Species,9535,species,9535,http://en.wikipedia.org/wiki/Western_meadowlark,Aves,Western Meadowlark,Sturnella neglecta,https://inaturalist-open-data.s3.amazonaws.com...,...,,,,,,,,1|2|3|7251|11989|9526|9535,Animalia|Chordata|Aves|Passeriformes|Icteridae...,Animals|Chordates|Birds|Perching Birds|New Wor...
12,Amphibians,Stream/Riparian Species,27474,species,27474,http://en.wikipedia.org/wiki/Black-bellied_sle...,Amphibia,Black-bellied Slender Salamander,Batrachoseps nigriventris,https://inaturalist-open-data.s3.amazonaws.com...,...,,,,,,,,1|2|20978|26718|26909|27444|27474,Animalia|Chordata|Amphibia|Caudata|Plethodonti...,Animals|Chordates|Amphibians|Salamanders|Lungl...
18,Mammals,Park/Natural Area Species,44749,species,44749,http://en.wikipedia.org/wiki/Dusky-footed_woodrat,Mammalia,Dusky-footed Woodrat,Neotoma fuscipes,https://inaturalist-open-data.s3.amazonaws.com...,...,,,,,,,,1|2|40151|43698|55986|44737|44749,Animalia|Chordata|Mammalia|Rodentia|Cricetidae...,"Animals|Chordates|Mammals|Rodents|Hamsters, Vo..."


In [179]:
def create_taxa_df(df):
    """Build taxa rows (each taxon and its higher taxa) for unobserved species.

    NOTE(review): this reuses the name ``create_taxa_df`` of the
    observation-based function defined earlier in the notebook and replaces it
    once this cell has run; re-running the earlier export cell afterwards
    would call this version.

    Args:
        df: Rows carrying the rank columns used by ``add_row`` plus
            ``taxon_group`` and ``type``.

    Returns:
        One row per ``taxon_id`` (first occurrence kept) with ``is_species``
        set to NaN and ``observations_count`` / ``taxa_count`` set to 0.
    """
    rank_names = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    # create a record for each taxon and each higher taxon
    new_rows = []
    for _, row in df.iterrows():
        for depth, rank in enumerate(rank_names):
            if pd.isna(row[rank]):
                continue

            temp = add_row(row, rank, depth)
            # key order matters: it becomes the column order of the frame
            temp.update({
                'is_species': np.nan,
                'observations_count': 0,
                'taxa_count': 0,
                'taxon_group': row['taxon_group'],
                'type': row['type'],
            })
            new_rows.append(temp)

    new_df = pd.DataFrame(new_rows)
    # to_numeric also accepts ids stored as text such as '1986' or '1986.0'
    new_df['taxon_id'] = pd.to_numeric(new_df['taxon_id']).astype(int)

    new_df = new_df.drop_duplicates(subset=['taxon_id'])

    return new_df


In [180]:
# Expand the unobserved indicator species into rows for each taxon and its higher taxa.
all_taxa_df = create_taxa_df(no_observations_df)
log_df(all_taxa_df)

(48, 15)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,taxon_ids,common_names,scientific_names,is_species,observations_count,taxa_count,taxon_group,type
0,,1,Animals,Animalia,no rights reserved,https://inaturalist-open-data.s3.amazonaws.com...,kingdom,1,Animals,Animalia,,0,0,Birds,Park/Natural Area Species
1,,2,Chordates,Chordata,no rights reserved,https://inaturalist-open-data.s3.amazonaws.com...,phylum,1|2,Animals|Chordates,Animalia|Chordata,,0,0,Birds,Park/Natural Area Species
2,,3,Birds,Aves,"(c) Kenny P., some rights reserved (CC BY-NC)",https://inaturalist-open-data.s3.amazonaws.com...,class,1|2|3,Animals|Chordates|Birds,Animalia|Chordata|Aves,,0,0,Birds,Park/Natural Area Species
3,,1623,Cuckoos,Cuculiformes,"(c) Andrew Allen, some rights reserved (CC BY)",https://inaturalist-open-data.s3.amazonaws.com...,order,1|2|3|1623,Animals|Chordates|Birds|Cuckoos,Animalia|Chordata|Aves|Cuculiformes,,0,0,Birds,Park/Natural Area Species
4,,1627,Cuckoos,Cuculidae,"(c) NHMLA Community Science Program, some righ...",https://inaturalist-open-data.s3.amazonaws.com...,family,1|2|3|1623|1627,Animals|Chordates|Birds|Cuckoos|Cuckoos,Animalia|Chordata|Aves|Cuculiformes|Cuculidae,,0,0,Birds,Park/Natural Area Species


In [181]:
# Sanity check before concatenating the two frames: element-wise comparison of
# the column labels; every entry should be True (same names, same order).
df.columns  == all_taxa_df.columns


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [182]:
# Keep only the taxa that are not already present in df.
new_taxa = all_taxa_df.loc[~all_taxa_df['taxon_id'].isin(df['taxon_id'].unique())]
log_df(new_taxa)

(19, 15)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,taxon_ids,common_names,scientific_names,is_species,observations_count,taxa_count,taxon_group,type
5,,1985,Roadrunners,Geococcyx,"(c) Nick Chill, some rights reserved (CC BY-NC...",https://inaturalist-open-data.s3.amazonaws.com...,genus,1|2|3|1623|1627|1985,Animals|Chordates|Birds|Cuckoos|Cuckoos|Roadru...,Animalia|Chordata|Aves|Cuculiformes|Cuculidae|...,,0,0,Birds,Park/Natural Area Species
6,,1986,Greater Roadrunner,Geococcyx californianus,"(c) Kim Moore, all rights reserved",https://static.inaturalist.org/photos/30952802...,species,1|2|3|1623|1627|1985|1986,Animals|Chordates|Birds|Cuckoos|Cuckoos|Roadru...,Animalia|Chordata|Aves|Cuculiformes|Cuculidae|...,,0,0,Birds,Park/Natural Area Species
12,,7108,,Lophodytes,"(c) Steve Voght, some rights reserved (CC BY-SA)",https://inaturalist-open-data.s3.amazonaws.com...,genus,1|2|3|6888|6912|7108,"Animals|Chordates|Birds|Waterfowl|Ducks, Geese...",Animalia|Chordata|Aves|Anseriformes|Anatidae|L...,,0,0,Birds,Stream/Riparian Species
13,,7109,Hooded Merganser,Lophodytes cucullatus,"(c) Ashley M Bradford, some rights reserved (C...",https://inaturalist-open-data.s3.amazonaws.com...,species,1|2|3|6888|6912|7108|7109,"Animals|Chordates|Birds|Waterfowl|Ducks, Geese...",Animalia|Chordata|Aves|Anseriformes|Anatidae|L...,,0,0,Birds,Stream/Riparian Species
19,,9526,Eastern and Western Meadowlarks,Sturnella,"(c) Steve Berardi, alcuni diritti riservati (C...",https://inaturalist-open-data.s3.amazonaws.com...,genus,1|2|3|7251|11989|9526,Animals|Chordates|Birds|Perching Birds|New Wor...,Animalia|Chordata|Aves|Passeriformes|Icteridae...,,0,0,Birds,Park/Natural Area Species


In [183]:
# Append the taxa that have no observations below the observed taxa.
# NOTE(review): df was loaded with dtype=str while new_taxa holds numeric 0 /
# NaN values, so the combined columns presumably mix types -- confirm.
combine_df = pd.concat([df, new_taxa])        
log_df(combine_df)

(3846, 15)


Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,taxon_ids,common_names,scientific_names,is_species,observations_count,taxa_count,taxon_group,type
0,97386721,36204,Western Fence Lizard,Sceloporus occidentalis,loganc516,https://inaturalist-open-data.s3.amazonaws.com...,species,1|2|26036|26172|36074|36141|36204,Animals|Chordates|Reptiles|Snakes and Lizards|...,Animalia|Chordata|Reptilia|Squamata|Phrynosoma...,True,342,342,,
1,94055566,199840,House Finch,Haemorhous mexicanus,marty_and_the_mamas,https://inaturalist-open-data.s3.amazonaws.com...,species,1|2|3|7251|9079|199910|199840,Animals|Chordates|Birds|Perching Birds|Finches...,Animalia|Chordata|Aves|Passeriformes|Fringilli...,True,146,146,,
2,94889284,48662,Monarch,Danaus plexippus,belis1,https://static.inaturalist.org/photos/15746072...,species,1|47120|47158|47157|47922|48663|48662,Animals|Arthropods|Insects|Butterflies and Mot...,Animalia|Arthropoda|Insecta|Lepidoptera|Nympha...,True,144,144,Invertebrates,Neighborhood Species
3,91586172,5212,Red-tailed Hawk,Buteo jamaicensis,ki6h,https://static.inaturalist.org/photos/15151968...,species,1|2|3|71261|5067|5179|5212,"Animals|Chordates|Birds|Hawks, Eagles, Kites, ...",Animalia|Chordata|Aves|Accipitriformes|Accipit...,True,140,142,Birds,Neighborhood Species
4,92725078,3454,Mourning Dove,Zenaida macroura,jennbastian,https://static.inaturalist.org/photos/15357316...,species,1|2|3|2708|2715|3438|3454,Animals|Chordates|Birds|Pigeons and Doves|Pige...,Animalia|Chordata|Aves|Columbiformes|Columbida...,True,126,126,,


In [184]:

# Overwrite the Los Angeles taxa.json with the combined frame.
new_path = Path('..') / 'app' / 'src' / 'lib' / 'data' / 'los-angeles-bioblitz'
new_path.mkdir(parents=True, exist_ok=True)
combine_df.to_json(new_path / 'taxa.json', orient="records")

