In [1]:
import pandas as pd
import json 
import os

In [2]:
# load json
# using with...as allows for automatic closure of file instead of manually using f.close()
#TODO: loop through directory? might have better performance than going through each one

with open("../files/raw/inat_observations.json", "r", encoding="utf-8") as f:
    observations = json.load(f)

# Commenting these ones as I want to work right now just with observations

# with open("../files/raw/inat_identifications.json", "r", encoding="utf-8") as f:
#     identifications = json.load(f)

# with open("../files/raw/inat_profile.json", "r", encoding="utf-8") as f:
#     profile = json.load(f)

In [3]:
# create dataframe from json

df_observations = pd.json_normalize(observations)
# check all columns 
df_observations.columns.tolist()


['quality_grade',
 'time_observed_at',
 'taxon_geoprivacy',
 'annotations',
 'uuid',
 'id',
 'cached_votes_total',
 'identifications_most_agree',
 'species_guess',
 'identifications_most_disagree',
 'tags',
 'positional_accuracy',
 'comments_count',
 'site_id',
 'created_time_zone',
 'license_code',
 'observed_time_zone',
 'quality_metrics',
 'public_positional_accuracy',
 'reviewed_by',
 'oauth_application_id',
 'flags',
 'created_at',
 'description',
 'time_zone_offset',
 'project_ids_with_curator_id',
 'observed_on',
 'observed_on_string',
 'updated_at',
 'sounds',
 'place_ids',
 'captive',
 'ident_taxon_ids',
 'outlinks',
 'faves_count',
 'ofvs',
 'num_identification_agreements',
 'comments',
 'map_scale',
 'uri',
 'project_ids',
 'community_taxon_id',
 'owners_identification_from_vision',
 'identifications_count',
 'obscured',
 'num_identification_disagreements',
 'geoprivacy',
 'location',
 'votes',
 'spam',
 'mappable',
 'identifications_some_agree',
 'project_ids_without_curato

In [4]:
#subset of data I want to keep as total columns are like 499

df_observations = df_observations[
    ["quality_grade"
    , "observed_on"
    , "identifications_most_agree"
    , "num_identification_agreements"
    , "community_taxon_id"
    , "location"
    , "taxon.name"]
    ]


In [5]:
# add underscore to scientific names between genus and species epithet
df_observations['taxon.name'] = df_observations['taxon.name'].str.replace(' ', '_')

In [6]:
# fix location

df_observations[['lat', 'lon']] = df_observations['location'].str.split(',', expand=True)
df_observations = df_observations.drop(columns=['location'])

In [7]:
df_observations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   quality_grade                  499 non-null    object 
 1   observed_on                    499 non-null    object 
 2   identifications_most_agree     499 non-null    bool   
 3   num_identification_agreements  499 non-null    int64  
 4   community_taxon_id             201 non-null    float64
 5   taxon.name                     499 non-null    object 
 6   lat                            499 non-null    object 
 7   lon                            499 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 27.9+ KB


In [8]:
df_observations.to_csv("../files/tidy/inat_observations.csv", index=False)