In [5]:
import pandas as pd
from pathlib import Path
import requests
import json

In [6]:
all_taxa =  Path('outputs', 'combine_taxa_list_inat_data.csv')

In [7]:
def log_df(df, nrows=5):
    print(df.shape)
    return df.head(nrows)

In [8]:
raw_data_paths = Path('../data').rglob('observations*.csv')
data_paths = [str(path) for path in raw_data_paths]
print(data_paths)

['../data/clarkstown-high-school-north/observations-200303.csv', '../data/cedar-creek-reserve/observations-199064.csv', '../data/los-angeles-bioblitz/observations-190446.csv', '../data/ciencia-ciudadana-peru-bats/observations-199065.csv', '../data/ciencia-ciudadana-peru-bees/observations-199066.csv']


# create observation json

In [9]:
inat_ids_cols =['taxon_id', 'taxon_ids']
inat_ids_df = pd.read_csv(all_taxa, dtype=str, usecols=inat_ids_cols)
inat_ids_df['taxon_id'] = inat_ids_df['taxon_id'].astype(int)

inat_ids_df = inat_ids_df.fillna('')
log_df(inat_ids_df)

(3851, 2)


Unnamed: 0,taxon_id,taxon_ids
0,143452,1|47120|47158|47157|47213|143454|143452
1,47727,47126|211194|47124|47729|58321|47727|
2,53178,47126|211194|47124|48151|50638|50636|53178
3,60307,47126|211194|47163|47162|47434|52809|60307
4,47124,47126|211194|47124||||


In [46]:
all_cols = [
    'time_observed_at',
    'image_url',  
    'latitude', 
    'longitude',
    'user_login', 
    'scientific_name', 
    'common_name',
    'taxon_id',
    'id',
    'geoprivacy',
    'taxon_geoprivacy',
    'coordinates_obscured'
]

In [47]:
# combine multiple observations csvs for a project into one json         

for dir_path in Path().glob('../data/**/'):
    if dir_path.name != 'data':
        dfs = []
        for file_path in  Path().glob('../data/' + dir_path.name + '/observations*.csv'  ):
            print(file_path.name)
            df = pd.read_csv(file_path,  usecols=all_cols)
            df['taxon_id'].fillna(0, inplace=True)
            df['taxon_id'] = df['taxon_id'].astype(int)
            df = df.merge(inat_ids_df, on="taxon_id", how="left")
            
            # Safari won't parse dates in the format given by iNaturalist          
            df['time_observed_at'] = df['time_observed_at'].replace('([\d-]+) ([\d:]+) (UTC)', r'\1T\2Z', regex = True) 
            df['time_observed_at'] = df['time_observed_at'].replace('([\d-]+) ([\d:]+) -0700', r'\1T\2Z', regex = True) 

            dfs.append(df)
            
        combine_df = pd.concat(dfs)
        
        new_path = Path('..','inat_data_explorer', 'src', 'lib', 'data', dir_path.name)
        new_path.mkdir(parents=True, exist_ok=True)
        combine_df.to_json(new_path/ 'observations.json', orient = "records")


observations-200303.csv
observations-199064.csv
observations-190446.csv
observations-199065.csv
observations-199066.csv


# create taxa json and csv


inat_names_cols =['taxon_id', 'scientific_names', 'common_names', 'taxon_ids']
inat_names_df = pd.read_csv(all_taxa, dtype=str, usecols=inat_names_cols)
inat_names_df['taxon_id'] = inat_names_df['taxon_id'].astype(int)
inat_names_df = inat_names_df.fillna('')
log_df(inat_names_df)


taxa_cols = [ 
    'scientific_name', 
    'common_name',
    'taxon_id', 
    'iconic_taxon_name', 
    'taxon_kingdom_name',
    'taxon_phylum_name',
    'taxon_class_name', 
    'taxon_order_name',
    'taxon_family_name', 
    'taxon_genus_name', 
    'taxon_species_name',
]
basic_taxa_cols = ['scientific_name', 'common_name', 'taxon_id']

basic_cols = basic_taxa_cols + ['image_url', 'user_login']
cols = taxa_cols + ['image_url', 'user_login']



def create_taxa_df_v1(df):
    df['taxon_id'] = df['taxon_id'].astype(int)
    
    df['is_species'] = df['taxon_species_name'].notna()
    adjust_is_species_for_higher_ranks_v1(df)    

    temp = df.copy()
    temp['temp_count'] = 1
    temp['temp_count'] = temp['temp_count'].astype(int)

    # create a df with 2 columns: taxon_id and temp_count       
    count_df = temp.groupby(['taxon_id'])['temp_count'].sum().reset_index()
    count_df.rename(columns={"temp_count": "count"}, inplace=True)


    df = df.drop_duplicates(subset=['taxon_id'])
    df = df.merge(count_df)
    df = df.sort_values('count', ascending=False)
    
    return df


def adjust_is_species_for_higher_ranks_v1(df):
    adjust_is_species_for_rank_v1(df, 'taxon_genus_name')    
    adjust_is_species_for_rank_v1(df, 'taxon_family_name')   
    adjust_is_species_for_rank_v1(df, 'taxon_order_name')  
    adjust_is_species_for_rank_v1(df, 'taxon_class_name')  
    adjust_is_species_for_rank_v1(df, 'taxon_phylum_name')    
    adjust_is_species_for_rank_v1(df, 'taxon_kingdom_name') 
    df.loc[df['scientific_name'].str.contains(' × ') == True, 'is_species'] = True

def adjust_is_species_for_rank_v1(df, rank):
    tmp = df.copy()
    taxa = list(df[df['is_species'] == True][rank].unique())
    tmp = tmp[(tmp['is_species'] == False) & (tmp[rank].notna())]
    for index, row in tmp[~ tmp[rank].isin(taxa)].iterrows():
        df.at[index, 'is_species'] = True
        

for dir_path in Path().glob('../data/**/'):
    if dir_path.name != 'data':
        dfs = []
        for file_path in  Path().glob('../data/' +  dir_path.name + '/observations-*.csv'  ):
            df = pd.read_csv(file_path,  usecols=cols)
            df = df.dropna(subset=['taxon_id'])
            df['taxon_id'] = df['taxon_id'].astype(int)
            df = df.merge(inat_names_df, on="taxon_id", how="left")

                
            dfs.append(df)

        combine_df = pd.concat(dfs)
        taxa_df = create_taxa_df_v1(combine_df)

        taxa_list_path = Path('..', 'data',  dir_path.name, 'taxa_list_v1.csv' )
        taxa_df.to_csv(taxa_list_path, index=False)

        new_path = Path('..','inat_data_explorer', 'src', 'lib', 'data') /dir_path.name
        new_path.mkdir(parents=True, exist_ok=True)
        basic_df = taxa_df[basic_cols + ['count', 'is_species', 'taxon_ids', 'scientific_names', 'common_names']]
        basic_df.to_json(new_path/ "taxa_v1.json", orient = "records")


In [36]:
columns = []
ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
[columns.append(rank + '_id') for rank in ranks]
[columns.append(rank + '_common_name') for rank in ranks]
columns = ['taxon_id', 'scientific_name', 'common_name',
           'rank', 
           'taxon_ids', 'scientific_names', 'common_names'] + ranks + columns 

inat_names_df = pd.read_csv(all_taxa, dtype=str, usecols=columns)
inat_names_df['taxon_id'] = inat_names_df['taxon_id'].astype(int)
log_df(inat_names_df)

(3851, 28)


Unnamed: 0,scientific_name,common_name,taxon_id,kingdom,phylum,class,order,family,genus,species,...,class_common_name,order_id,order_common_name,family_id,family_common_name,genus_id,genus_common_name,taxon_ids,scientific_names,common_names
0,Deidamia inscriptum,Lettered Sphinx,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,Deidamia inscriptum,...,Insects,47157.0,Butterflies and Moths,47213.0,Sphinx Moths,143454.0,,1|47120|47158|47157|47213|143454|143452,Animalia|Arthropoda|Insecta|Lepidoptera|Sphing...,Animals|Arthropods|Insects|Butterflies and Mot...
1,Acer,maples,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,,...,dicots,47729.0,"soapberries, cashews, mahoganies, and allies",58321.0,soapberry family,47727.0,maples,47126|211194|47124|47729|58321|47727|,Plantae|Tracheophyta|Magnoliopsida|Sapindales|...,"plants|vascular plants|dicots|soapberries, cas..."
2,Plantago lanceolata,ribwort plantain,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,Plantago lanceolata,...,dicots,48151.0,"mints, plantains, olives, and allies",50638.0,plantain family,50636.0,plantain,47126|211194|47124|48151|50638|50636|53178,Plantae|Tracheophyta|Magnoliopsida|Lamiales|Pl...,"plants|vascular plants|dicots|mints, plantains..."
3,Poa pratensis,Kentucky bluegrass,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,Poa pratensis,...,monocots,47162.0,"grasses, sedges, cattails, and allies",47434.0,grasses,52809.0,Meadow-grasses,47126|211194|47163|47162|47434|52809|60307,Plantae|Tracheophyta|Liliopsida|Poales|Poaceae...,"plants|vascular plants|monocots|grasses, sedge..."
4,Magnoliopsida,dicots,47124,Plantae,Tracheophyta,Magnoliopsida,,,,,...,dicots,,,,,,,47126|211194|47124||||,Plantae|Tracheophyta|Magnoliopsida||||,plants|vascular plants|dicots||||


In [43]:
def add_count_column(df, count_col):
    # count the number of taxon_id
    temp = df.copy()
    temp['temp_count'] = 1
    temp['temp_count'] = temp['temp_count'].astype(int)

    # create a df with 2 columns: taxon_id and temp_count       
    count_df = temp.groupby(['taxon_id'])['temp_count'].sum().reset_index()
    count_df.rename(columns={"temp_count": count_col}, inplace=True)

    df = df.merge(count_df)
    return df

def append_df(df):  
    #  try to match the species count that is shown on inaturalist project page     
    df['is_species'] = df['species'].notna()
    adjust_is_species_for_higher_ranks(df)  
        
    # count the number of observations for a taxon_id
    df = add_count_column(df, 'observations_count')
    return df

    
def add_row(row, rank, index):
    temp = {}
    temp['id'] = row['id']
    temp['taxon_id'] = row[rank + '_id']
    temp['common_name'] = row[rank + '_common_name']
    temp['scientific_name'] = row[rank]
    temp['user_login'] = row['user_login']
    temp['image_url'] = row['image_url']
    temp['rank'] = rank
    temp['taxon_ids'] = ('|').join(row['taxon_ids'].split('|')[0: index +1])
    temp['common_names'] = ('|').join(row['common_names'].split('|')[0: index+1])
    temp['scientific_names'] = ('|').join(row['scientific_names'].split('|')[0: index+1])
    return temp

def create_taxa_df(df):
    # create a new df with rows for each taxa and eac higher taxa
    new_rows = []
    for index, row in df.iterrows():
        for index, rank in enumerate(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']):
            if pd.isna(row[rank]):
                continue
                
            temp = add_row(row, rank, index)
            new_rows.append(temp)
            
    new_df =  pd.DataFrame(new_rows) 
    
    temp = df[['is_species', 'observations_count', 'taxon_id']].drop_duplicates()
    new_df['taxon_id'] = new_df['taxon_id'].astype(int)
    new_df = new_df.merge(temp, how='left')
    
    new_df.loc[new_df['observations_count'].isna(), 'observations_count'] = 0
    new_df['observations_count'] = new_df['observations_count'].astype(int)
     
    # count the total number of occurences for a taxa including higher taxa
    new_df = add_count_column(new_df, 'taxa_count')

    # sort newest observations first so that we get use the photo for the newest observations
    new_df = new_df.sort_values(['id'], ascending=False)
    new_df = new_df.drop_duplicates(subset=['taxon_id'])
    new_df = new_df.sort_values(['taxa_count'], ascending=False)
    
    return new_df



# try to match the species count that is shown on inaturalist project page   
# if a rank higher than species is the lowest occurence of the taxa, it is treated as
# a species. e.g. if there are no species for genus AA, genus AA 'is_species' is True 
def adjust_is_species_for_higher_ranks(df):
    adjust_is_species_for_rank(df, 'genus')    
    adjust_is_species_for_rank(df, 'family')   
    adjust_is_species_for_rank(df, 'order')  
    adjust_is_species_for_rank(df, 'class')  
    adjust_is_species_for_rank(df, 'phylum')    
    adjust_is_species_for_rank(df, 'kingdom') 
    df.loc[df['scientific_name'].str.contains(' × ') == True, 'is_species'] = True

def adjust_is_species_for_rank(df, rank):
    tmp = df.copy()
    taxa = list(df[df['is_species'] == True][rank].unique())
    tmp = tmp[(tmp['is_species'] == False) & (tmp[rank].notna())]
    for index, row in tmp[~ tmp[rank].isin(taxa)].iterrows():
        df.at[index, 'is_species'] = True

In [44]:
# combine multiple observations csvs for a project into one json         

for dir_path in Path().glob('../data/**/'):
    if dir_path.name != 'data':
        dfs = []
        for file_path in  Path().glob('../data/' +  dir_path.name + '/observations-*.csv'  ):
            print(file_path)
            cols = ['taxon_id', 'user_login', 'image_url', 'id']
            df = pd.read_csv(file_path, usecols=cols)
            df = df.dropna(subset=['taxon_id'])
            df['taxon_id'] = df['taxon_id'].astype(int)
            df = df.merge(inat_names_df, on="taxon_id", how="left")
                                    
            dfs.append(df)


        combine_df = pd.concat(dfs)        
        adjust_df = append_df(combine_df)
        taxa_df = create_taxa_df(adjust_df)

        new_path = Path('..','inat_data_explorer', 'src', 'lib', 'data') /dir_path.name
        new_path.mkdir(parents=True, exist_ok=True)
        taxa_df.to_json(new_path/ "taxa.json", orient = "records")


../data/clarkstown-high-school-north/observations-200303.csv
../data/cedar-creek-reserve/observations-199064.csv
../data/los-angeles-bioblitz/observations-190446.csv
../data/ciencia-ciudadana-peru-bats/observations-199065.csv
../data/ciencia-ciudadana-peru-bees/observations-199066.csv


# create taxa json for LA indicator species

In [124]:
taxa_file = '../data/los-angeles-bioblitz/indicator_species.tsv'

indicator_cols = [ 'type', 'taxon_group', 'taxon_id']
df = pd.read_csv(taxa_file, sep='\t',  usecols=indicator_cols)
log_df(df)


(38, 3)


Unnamed: 0,taxon_group,type,taxon_id
0,Birds,Park/Natural Area Species,1409
1,Birds,Park/Natural Area Species,1986
2,Birds,Stream/Riparian Species,4956
3,Birds,Neighborhood Species,5212
4,Birds,Stream/Riparian Species,7109


In [125]:
observation_file = '../data/los-angeles-bioblitz/observations-190446.csv'
df2 = pd.read_csv(observation_file, dtype=str, usecols=cols)
df2 = df2.dropna(subset=['taxon_id'])
log_df(df2)

(14838, 13)


Unnamed: 0,user_login,image_url,scientific_name,common_name,iconic_taxon_name,taxon_id,taxon_kingdom_name,taxon_phylum_name,taxon_class_name,taxon_order_name,taxon_family_name,taxon_genus_name,taxon_species_name
0,gardenchris01,https://inaturalist-open-data.s3.amazonaws.com...,Nyctaginaceae,four o'clock family,Plantae,50256,Plantae,Tracheophyta,Magnoliopsida,Caryophyllales,Nyctaginaceae,,
1,amyeweiss,https://static.inaturalist.org/photos/13460686...,Metaltella simoni,South American Toothed Hacklemesh Weaver,Arachnida,224975,Animalia,Arthropoda,Arachnida,Araneae,Desidae,Metaltella,Metaltella simoni
2,jklabin,https://static.inaturalist.org/photos/13463191...,Canis latrans,Coyote,Mammalia,42051,Animalia,Chordata,Mammalia,Carnivora,Canidae,Canis,Canis latrans
3,jklabin,https://static.inaturalist.org/photos/13463220...,Buteo jamaicensis,Red-tailed Hawk,Aves,5212,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo jamaicensis
4,jklabin,https://static.inaturalist.org/photos/13463623...,Buteo jamaicensis,Red-tailed Hawk,Aves,5212,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo jamaicensis


In [126]:
inat_names_cols =['taxon_id', 'scientific_names', 'common_names', 'taxon_ids']
inat_names_df = pd.read_csv(all_taxa, dtype=str, usecols=inat_names_cols)
inat_names_df['taxon_id'] = inat_names_df['taxon_id'].astype(int)
inat_names_df = inat_names_df.fillna('')
log_df(inat_names_df)

(3851, 4)


Unnamed: 0,taxon_id,taxon_ids,scientific_names,common_names
0,143452,1|47120|47158|47157|47213|143454|143452,Animalia|Arthropoda|Insecta|Lepidoptera|Sphing...,Animals|Arthropods|Insects|Butterflies and Mot...
1,47727,47126|211194|47124|47729|58321|47727|,Plantae|Tracheophyta|Magnoliopsida|Sapindales|...,"plants|vascular plants|dicots|soapberries, cas..."
2,53178,47126|211194|47124|48151|50638|50636|53178,Plantae|Tracheophyta|Magnoliopsida|Lamiales|Pl...,"plants|vascular plants|dicots|mints, plantains..."
3,60307,47126|211194|47163|47162|47434|52809|60307,Plantae|Tracheophyta|Liliopsida|Poales|Poaceae...,"plants|vascular plants|monocots|grasses, sedge..."
4,47124,47126|211194|47124||||,Plantae|Tracheophyta|Magnoliopsida||||,plants|vascular plants|dicots||||


In [127]:
taxa_df = create_taxa_df(df2)
log_df(taxa_df)

(2775, 15)


Unnamed: 0,user_login,image_url,scientific_name,common_name,iconic_taxon_name,taxon_id,taxon_kingdom_name,taxon_phylum_name,taxon_class_name,taxon_order_name,taxon_family_name,taxon_genus_name,taxon_species_name,is_species,count
159,reese_bernstein,https://inaturalist-open-data.s3.amazonaws.com...,Sceloporus occidentalis,Western Fence Lizard,Reptilia,36204,Animalia,Chordata,Reptilia,Squamata,Phrynosomatidae,Sceloporus,Sceloporus occidentalis,True,342
227,naturephotosuze,https://static.inaturalist.org/photos/13499159...,Haemorhous mexicanus,House Finch,Aves,199840,Animalia,Chordata,Aves,Passeriformes,Fringillidae,Haemorhous,Haemorhous mexicanus,True,146
166,kikster,https://inaturalist-open-data.s3.amazonaws.com...,Danaus plexippus,Monarch,Insecta,48662,Animalia,Arthropoda,Insecta,Lepidoptera,Nymphalidae,Danaus,Danaus plexippus,True,144
3,jklabin,https://static.inaturalist.org/photos/13463220...,Buteo jamaicensis,Red-tailed Hawk,Aves,5212,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo jamaicensis,True,140
4,jklabin,https://static.inaturalist.org/photos/13463649...,Zenaida macroura,Mourning Dove,Aves,3454,Animalia,Chordata,Aves,Columbiformes,Columbidae,Zenaida,Zenaida macroura,True,126


In [128]:
taxa_df = taxa_df.merge(inat_names_df, on="taxon_id", how="left")
log_df(taxa_df)

(2775, 18)


Unnamed: 0,user_login,image_url,scientific_name,common_name,iconic_taxon_name,taxon_id,taxon_kingdom_name,taxon_phylum_name,taxon_class_name,taxon_order_name,taxon_family_name,taxon_genus_name,taxon_species_name,is_species,count,taxon_ids,scientific_names,common_names
0,reese_bernstein,https://inaturalist-open-data.s3.amazonaws.com...,Sceloporus occidentalis,Western Fence Lizard,Reptilia,36204,Animalia,Chordata,Reptilia,Squamata,Phrynosomatidae,Sceloporus,Sceloporus occidentalis,True,342,1|2|26036|26172|36074|36141|36204,Animalia|Chordata|Reptilia|Squamata|Phrynosoma...,Animals|Chordates|Reptiles|Snakes and Lizards|...
1,naturephotosuze,https://static.inaturalist.org/photos/13499159...,Haemorhous mexicanus,House Finch,Aves,199840,Animalia,Chordata,Aves,Passeriformes,Fringillidae,Haemorhous,Haemorhous mexicanus,True,146,1|2|3|7251|9079|199910|199840,Animalia|Chordata|Aves|Passeriformes|Fringilli...,Animals|Chordates|Birds|Perching Birds|Finches...
2,kikster,https://inaturalist-open-data.s3.amazonaws.com...,Danaus plexippus,Monarch,Insecta,48662,Animalia,Arthropoda,Insecta,Lepidoptera,Nymphalidae,Danaus,Danaus plexippus,True,144,1|47120|47158|47157|47922|48663|48662,Animalia|Arthropoda|Insecta|Lepidoptera|Nympha...,Animals|Arthropods|Insects|Butterflies and Mot...
3,jklabin,https://static.inaturalist.org/photos/13463220...,Buteo jamaicensis,Red-tailed Hawk,Aves,5212,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo jamaicensis,True,140,1|2|3|71261|5067|5179|5212,Animalia|Chordata|Aves|Accipitriformes|Accipit...,"Animals|Chordates|Birds|Hawks, Eagles, Kites, ..."
4,jklabin,https://static.inaturalist.org/photos/13463649...,Zenaida macroura,Mourning Dove,Aves,3454,Animalia,Chordata,Aves,Columbiformes,Columbidae,Zenaida,Zenaida macroura,True,126,1|2|3|2708|2715|3438|3454,Animalia|Chordata|Aves|Columbiformes|Columbida...,Animals|Chordates|Birds|Pigeons and Doves|Pige...


In [129]:
taxa_df = df.merge(taxa_df, how="outer")
taxa_df['count'].fillna(0, inplace=True)
taxa_df['count'] = taxa_df['count'].astype(int)
taxa_df = taxa_df.sort_values('count', ascending=False)
log_df(taxa_df)

(2785, 20)


Unnamed: 0,taxon_group,type,taxon_id,user_login,image_url,scientific_name,common_name,iconic_taxon_name,taxon_kingdom_name,taxon_phylum_name,taxon_class_name,taxon_order_name,taxon_family_name,taxon_genus_name,taxon_species_name,is_species,count,taxon_ids,scientific_names,common_names
38,,,36204,reese_bernstein,https://inaturalist-open-data.s3.amazonaws.com...,Sceloporus occidentalis,Western Fence Lizard,Reptilia,Animalia,Chordata,Reptilia,Squamata,Phrynosomatidae,Sceloporus,Sceloporus occidentalis,True,342,1|2|26036|26172|36074|36141|36204,Animalia|Chordata|Reptilia|Squamata|Phrynosoma...,Animals|Chordates|Reptiles|Snakes and Lizards|...
39,,,199840,naturephotosuze,https://static.inaturalist.org/photos/13499159...,Haemorhous mexicanus,House Finch,Aves,Animalia,Chordata,Aves,Passeriformes,Fringillidae,Haemorhous,Haemorhous mexicanus,True,146,1|2|3|7251|9079|199910|199840,Animalia|Chordata|Aves|Passeriformes|Fringilli...,Animals|Chordates|Birds|Perching Birds|Finches...
20,Invertebrates,Neighborhood Species,48662,kikster,https://inaturalist-open-data.s3.amazonaws.com...,Danaus plexippus,Monarch,Insecta,Animalia,Arthropoda,Insecta,Lepidoptera,Nymphalidae,Danaus,Danaus plexippus,True,144,1|47120|47158|47157|47922|48663|48662,Animalia|Arthropoda|Insecta|Lepidoptera|Nympha...,Animals|Arthropods|Insects|Butterflies and Mot...
3,Birds,Neighborhood Species,5212,jklabin,https://static.inaturalist.org/photos/13463220...,Buteo jamaicensis,Red-tailed Hawk,Aves,Animalia,Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo jamaicensis,True,140,1|2|3|71261|5067|5179|5212,Animalia|Chordata|Aves|Accipitriformes|Accipit...,"Animals|Chordates|Birds|Hawks, Eagles, Kites, ..."
40,,,3454,jklabin,https://static.inaturalist.org/photos/13463649...,Zenaida macroura,Mourning Dove,Aves,Animalia,Chordata,Aves,Columbiformes,Columbidae,Zenaida,Zenaida macroura,True,126,1|2|3|2708|2715|3438|3454,Animalia|Chordata|Aves|Columbiformes|Columbida...,Animals|Chordates|Birds|Pigeons and Doves|Pige...


### connect to inat api to get data for indicator species with no observations

In [130]:
for index, row in taxa_df[taxa_df['user_login'].isna()].iterrows():
    response = requests.get(f'https://api.inaturalist.org/v1/taxa/{row["taxon_id"]}')
    if response.status_code == 200:
        json_data = response.json()['results'][0]
        
        result = {
            'scientific_name': json_data['name'], 
            'common_name': json_data['preferred_common_name'],
            'image_url': json_data['default_photo']['medium_url'],
            'user_login':  json_data['default_photo']['attribution'],
        }

        for taxon in json_data['ancestors']:
            if taxon['rank'] == 'kingdom':
                result['taxon_kingdom_name'] = taxon['name']
            elif taxon['rank'] == 'phylum':
                result['taxon_phylum_name'] = taxon['name']
            elif taxon['rank'] == 'class':
                result['taxon_class_name'] = taxon['name']
            elif taxon['rank'] == 'order':
                result['taxon_order_name'] = taxon['name']
            elif taxon['rank'] == 'family':
                result['taxon_family_name'] = taxon['name']
            elif taxon['rank'] == 'genus':
                result['taxon_genus_name'] = taxon['name']
            elif taxon['rank'] == 'species':
                result['taxon_species_name'] = taxon['name']
                
        if json_data['rank'] == 'kingdom':
            result['taxon_kingdom_name'] = json_data['name']
        elif json_data['rank'] == 'phylum':
            result['taxon_phylum_name'] = json_data['name']
        elif json_data['rank'] == 'class':
            result['taxon_class_name'] = json_data['name']
        elif json_data['rank'] == 'order':
            result['taxon_order_name'] = json_data['name']
        elif json_data['rank'] == 'family':
            result['taxon_family_name'] = json_data['name']
        elif json_data['rank'] == 'genus':
            result['taxon_genus_name'] = json_data['name']
        elif json_data['rank'] == 'species':
            result['taxon_species_name'] = json_data['name']
            
            
        for col in result:
            taxa_df.at[index, col] = result[col]
           

In [132]:
taxa_list_path = Path('..', 'data',  'los-angeles-bioblitz', 'taxa_list.csv' )
taxa_df.to_csv(taxa_list_path, index=False)

new_path = Path('..','inat_data_explorer', 'src', 'lib', 'data', 'los-angeles-bioblitz')
new_path.mkdir(parents=True, exist_ok=True)
basic_df = taxa_df[basic_cols + ['taxon_group', 'type', 'count', 'is_species', 'taxon_ids', 'scientific_names', 'common_names']]
basic_df.to_json(new_path/ 'taxa.json', orient = "records")



# create  a list of all taxa ids for a project

In [150]:
path = '../data/clarkstown-high-school-north/observations-200303.csv'

cols = [
    'taxon_id', 'user_login', 'image_url', 'id',
    # 'taxon_species_name',
    # 'taxon_genus_name', 'taxon_family_name', 'taxon_order_name',
    # 'taxon_class_name','taxon_phylum_name','taxon_kingdom_name',
    # 'scientific_name',
    
]
df = pd.read_csv(path, dtype=str, usecols=cols)
df = df.dropna(subset=['taxon_id'])
log_df(df)

(1077, 4)


Unnamed: 0,id,user_login,image_url,taxon_id
0,1551812,lgertzer,https://inaturalist-open-data.s3.amazonaws.com...,143452
1,1556119,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,47727
2,1556120,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,53178
3,1556121,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,60307
4,1556122,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,47124


In [151]:
columns = []
ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
[columns.append(rank + '_id') for rank in ranks]
[columns.append(rank + '_common_name') for rank in ranks]
columns = ['taxon_id', 'scientific_name', 'common_name', 'rank'] + ranks + columns 

inat_names_df = pd.read_csv(all_taxa, dtype=str, usecols=columns)
log_df(inat_names_df)

(3851, 25)


Unnamed: 0,scientific_name,common_name,taxon_id,kingdom,phylum,class,order,family,genus,species,...,phylum_id,phylum_common_name,class_id,class_common_name,order_id,order_common_name,family_id,family_common_name,genus_id,genus_common_name
0,Deidamia inscriptum,Lettered Sphinx,143452,Animalia,Arthropoda,Insecta,Lepidoptera,Sphingidae,Deidamia,Deidamia inscriptum,...,47120,Arthropods,47158,Insects,47157.0,Butterflies and Moths,47213.0,Sphinx Moths,143454.0,
1,Acer,maples,47727,Plantae,Tracheophyta,Magnoliopsida,Sapindales,Sapindaceae,Acer,,...,211194,vascular plants,47124,dicots,47729.0,"soapberries, cashews, mahoganies, and allies",58321.0,soapberry family,47727.0,maples
2,Plantago lanceolata,ribwort plantain,53178,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Plantaginaceae,Plantago,Plantago lanceolata,...,211194,vascular plants,47124,dicots,48151.0,"mints, plantains, olives, and allies",50638.0,plantain family,50636.0,plantain
3,Poa pratensis,Kentucky bluegrass,60307,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Poa,Poa pratensis,...,211194,vascular plants,47163,monocots,47162.0,"grasses, sedges, cattails, and allies",47434.0,grasses,52809.0,Meadow-grasses
4,Magnoliopsida,dicots,47124,Plantae,Tracheophyta,Magnoliopsida,,,,,...,211194,vascular plants,47124,dicots,,,,,,


In [152]:
merge_df = df.merge(inat_names_df, how='left', on='taxon_id')
adjust_df = append_df(merge_df)
new_df = create_taxa_df(adjust_df)


(1077, 28)


Unnamed: 0,id,user_login,image_url,taxon_id,scientific_name,common_name,kingdom,phylum,class,order,...,phylum_id,phylum_common_name,class_id,class_common_name,order_id,order_common_name,family_id,family_common_name,genus_id,genus_common_name
0,1551812,lgertzer,https://inaturalist-open-data.s3.amazonaws.com...,143452,Deidamia inscriptum,Lettered Sphinx,Animalia,Arthropoda,Insecta,Lepidoptera,...,47120,Arthropods,47158,Insects,47157.0,Butterflies and Moths,47213.0,Sphinx Moths,143454.0,
1,1556119,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,47727,Acer,maples,Plantae,Tracheophyta,Magnoliopsida,Sapindales,...,211194,vascular plants,47124,dicots,47729.0,"soapberries, cashews, mahoganies, and allies",58321.0,soapberry family,47727.0,maples
2,1556120,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,53178,Plantago lanceolata,ribwort plantain,Plantae,Tracheophyta,Magnoliopsida,Lamiales,...,211194,vascular plants,47124,dicots,48151.0,"mints, plantains, olives, and allies",50638.0,plantain family,50636.0,plantain
3,1556121,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,60307,Poa pratensis,Kentucky bluegrass,Plantae,Tracheophyta,Liliopsida,Poales,...,211194,vascular plants,47163,monocots,47162.0,"grasses, sedges, cattails, and allies",47434.0,grasses,52809.0,Meadow-grasses
4,1556122,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,47124,Magnoliopsida,dicots,Plantae,Tracheophyta,Magnoliopsida,,...,211194,vascular plants,47124,dicots,,,,,,


Unnamed: 0,id,user_login,image_url,taxon_id,scientific_name,common_name,kingdom,phylum,class,order,...,class_id,class_common_name,order_id,order_common_name,family_id,family_common_name,genus_id,genus_common_name,is_species,observations_count
0,1551812,lgertzer,https://inaturalist-open-data.s3.amazonaws.com...,143452,Deidamia inscriptum,Lettered Sphinx,Animalia,Arthropoda,Insecta,Lepidoptera,...,47158,Insects,47157,Butterflies and Moths,47213,Sphinx Moths,143454,,True,1
1,1556119,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,47727,Acer,maples,Plantae,Tracheophyta,Magnoliopsida,Sapindales,...,47124,dicots,47729,"soapberries, cashews, mahoganies, and allies",58321,soapberry family,47727,maples,False,3
2,3306988,rebeccab2,https://inaturalist-open-data.s3.amazonaws.com...,47727,Acer,maples,Plantae,Tracheophyta,Magnoliopsida,Sapindales,...,47124,dicots,47729,"soapberries, cashews, mahoganies, and allies",58321,soapberry family,47727,maples,False,3
3,81561936,emilyd5,https://inaturalist-open-data.s3.amazonaws.com...,47727,Acer,maples,Plantae,Tracheophyta,Magnoliopsida,Sapindales,...,47124,dicots,47729,"soapberries, cashews, mahoganies, and allies",58321,soapberry family,47727,maples,False,3
4,1556120,mcgovernm97,https://inaturalist-open-data.s3.amazonaws.com...,53178,Plantago lanceolata,ribwort plantain,Plantae,Tracheophyta,Magnoliopsida,Lamiales,...,47124,dicots,48151,"mints, plantains, olives, and allies",50638,plantain family,50636,plantain,True,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072,81656257,moosty123,https://inaturalist-open-data.s3.amazonaws.com...,9915,Piranga rubra,Summer Tanager,Animalia,Chordata,Aves,Passeriformes,...,3,Birds,7251,Perching Birds,71305,Cardinals and Allies,9913,Piranga Tanagers,True,1
1073,81658816,moosty123,https://inaturalist-open-data.s3.amazonaws.com...,1097013,Schizomantodea,,Animalia,Arthropoda,Insecta,Mantodea,...,47158,Insects,48112,Mantises,,,,,True,1
1074,81659440,moosty123,https://inaturalist-open-data.s3.amazonaws.com...,50863,Chlorophyta,green algae,Plantae,Chlorophyta,,,...,,,,,,,,,True,1
1075,83406604,kwadwo,https://inaturalist-open-data.s3.amazonaws.com...,52989,Cirsium vulgare,Bull Thistle,Plantae,Tracheophyta,Magnoliopsida,Asterales,...,47124,dicots,47605,"sunflowers, bellflowers, fanflowers, and allies",47604,"sunflowers, daisies, asters, and allies",48561,thistles,True,1


In [155]:
new_df = create_taxa_df(adjust_df)


new_df 

Unnamed: 0,id,taxon_id,common_name,scientific_name,user_login,image_url,rank,is_species,observations_count,taxa_count
790,83683271,47126,plants,Plantae,leo25097,https://inaturalist-open-data.s3.amazonaws.com...,kingdom,False,31,666
1406,83431868,211194,vascular plants,Tracheophyta,maxkahann,https://inaturalist-open-data.s3.amazonaws.com...,phylum,False,11,618
2019,83431868,47124,dicots,Magnoliopsida,maxkahann,https://inaturalist-open-data.s3.amazonaws.com...,class,False,45,498
181,83431860,1,Animals,Animalia,maxkahann,https://inaturalist-open-data.s3.amazonaws.com...,kingdom,,0,372
2916,83431860,2,Chordates,Chordata,maxkahann,https://inaturalist-open-data.s3.amazonaws.com...,phylum,,0,204
...,...,...,...,...,...,...,...,...,...,...
6177,48148609,50790,common toadflax,Linaria vulgaris,kylasevim,https://inaturalist-open-data.s3.amazonaws.com...,species,True,1,1
6184,48151853,55844,lesser periwinkle,Vinca minor,matthewc6,https://inaturalist-open-data.s3.amazonaws.com...,species,True,1,1
6195,48154700,39771,Painted Turtle,Chrysemys picta,sarahb2449,https://inaturalist-open-data.s3.amazonaws.com...,species,,0,1
6194,48154700,39770,Painted Turtles,Chrysemys,sarahb2449,https://inaturalist-open-data.s3.amazonaws.com...,genus,,0,1
