# Data Preparation for iNaturalist Data Analysis

In [143]:
# Required Imports
import pandas as pd

In [144]:
# Loading Data
# Every input table is read up front so the later cells only transform in-memory frames.
# Plant observations; observed_on is parsed to a datetime column at load time.
obs = pd.read_csv('Data/plant_observations.csv', parse_dates=['observed_on'])
# Geocoded lookup table; joined to observations on latitude/longitude further down.
location = pd.read_csv('Data/geocoded_locations.csv')
# Invasive species list; its scientific_name column is used to flag observations.
invasives = pd.read_csv('Data/invasive_species.csv')
# Native species list; its 'Scientific Name' column is used to flag observations.
arche_native = pd.read_csv('Data/archewild_natives_cleaned.csv')
# Per-city statistics (pop2024, densityMi, areaMi, park_acres, vacant_acres are used later).
city_density = pd.read_csv('Data/us-cities-table.csv')

In [145]:
# Drop columns that are entirely Null/False, plus place_guess, which holds unnecessary data
obs = obs.drop(columns=['native', 'most_disagree', 'place_guess'])

# Ensure correct data types: id is an identifier, not a quantity, so keep it as object
obs['id'] = obs['id'].astype(object)

# Drop duplicate coordinates so each (latitude, longitude) pair appears once in the lookup table
location = location.drop_duplicates(subset=['latitude', 'longitude'])

# Duplicate rows in the observation data are deliberately kept.
## It is impossible to know whether the exact same plant is being identified repeatedly
## or whether multiple plants in the area are being identified separately.

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
obs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     10000 non-null  object        
 1   species                10000 non-null  object        
 2   common_name            10000 non-null  object        
 3   observed_on            10000 non-null  datetime64[ns]
 4   latitude               10000 non-null  float64       
 5   longitude              10000 non-null  float64       
 6   location               10000 non-null  object        
 7   quality_grade          10000 non-null  object        
 8   captive                10000 non-null  bool          
 9   identifications_count  10000 non-null  int64         
dtypes: bool(1), datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 713.0+ KB
None


In [146]:
# info() writes the summary itself and returns None; print() would echo that None.
arche_native.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829 entries, 0 to 828
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Scientific Name  829 non-null    object
 1   Common Name      829 non-null    object
dtypes: object(2)
memory usage: 13.1+ KB
None


In [147]:
# info() writes the summary itself and returns None; print() would echo that None.
invasives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   number           63 non-null     int64 
 1   scientific_name  63 non-null     object
 2   common_name      63 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.6+ KB
None


### Standardizing Species 
The observations and native plants dataframes have slightly different formatting and level of specificity in their species names. 
- First, I convert the species names in both to sentence case: first letter capitalized and the remaining letters lowercase. 
- Then, I separate iNaturalist observations into two separate dataframes - those that report specific species and those that only report the family. 
    - I will focus on specific species when checking for nativeness.

In [148]:
# Normalize Species - Case
# Sentence-case the names (first letter upper, rest lower) so that the same
# species compares equal across the observation and native-plant tables.
obs['species'] = obs['species'].str.capitalize()
obs['common_name'] = obs['common_name'].str.capitalize()

# .str.capitalize() returns a new Series; it must be assigned back, otherwise the
# native list keeps its original casing and the normalization has no effect.
arche_native['Scientific Name'] = arche_native['Scientific Name'].str.capitalize()

# Separate out by species specificity.
# A name is species-level when it has more than one word and is not a hybrid.
# Hybrids, identified by an "x" or "×" standing between words, do not include the species.
has_multiple_words = obs['species'].str.contains(' ')
is_hybrid = obs['species'].str.contains(r'\s[x×]\s', regex=True)
is_species_level = has_multiple_words & ~is_hybrid

# .copy() makes obs_has_species an independent frame, so the columns added to it
# later are not written through a slice of obs.
obs_has_species = obs[is_species_level].copy()
obs_just_family = obs[~is_species_level]

### Handling Species Varieties

Some observations include specific varieties of the species, whereas the data source for native species does not include variety information. 
I will identify these species and manually check for nativeness in the USDA Plant Finder.

In [149]:
# A three-word name is treated as "genus species variety".
species_word_counts = obs_has_species['species'].str.split().str.len()
obs_vars = obs_has_species[species_word_counts == 3]

# List each distinct variety-level name once for manual review.
print(obs_vars['species'].unique())

['Hedera helix helix' 'Pteridium aquilinum latiusculum'
 'Symphyotrichum racemosum subdumosum' 'Juniperus virginiana virginiana'
 'Pelargonium graveolens citrosum' 'Cakile edentula lacustris'
 'Silphium integrifolium integrifolium'
 'Ampelopsis glandulosa brevipedunculata' 'Phytolacca americana americana']


Of the 9 species identified in observations that include a specific native variety, none have other non-native varieties previously identified in the state. Therefore, I will ignore varieties when flagging observations as native. 

However, it is important to note that some species that don't include variety and are tagged as native may have varieties that ARE non-native in the state. For example, Viburnum opulus americanum is native in Cuyahoga county while Viburnum opulus opulus is not (previously known information). However, we do not have information about specific varieties of Viburnum opulus in any dataset. 

### Merging Location Data

In [150]:
# Attach the geocoded columns to each observation by matching on its coordinates.
# The default inner join keeps only observations whose coordinates appear in `location`.
coordinate_keys = ['longitude', 'latitude']
obs_has_species = pd.merge(obs_has_species, location, on=coordinate_keys)

In [151]:
# Align the key name with the 'City' column already present on the observations.
city_density = city_density.rename(columns={'city': 'City'})

# Bring over only the city-level statistics needed for the analysis.
city_columns = ['City', 'pop2024', 'densityMi', 'areaMi', 'park_acres', 'vacant_acres']
city_stats = city_density[city_columns]

# NOTE(review): the join key is the city name alone. If the city table lists the same
# name more than once (e.g. in several states), observations would be duplicated —
# confirm the table has one row per city.
obs_has_species = obs_has_species.merge(city_stats, on='City')

### Tagging Observations as Native

In [152]:
def first_two_words(name):
    """Return the first two whitespace-separated words of ``name``, joined by one space.

    Used to reduce "Genus species variety" to "Genus species". Names with fewer
    than two words are returned with whatever words they have.
    """
    leading_words = name.split()[:2]
    return " ".join(leading_words)

# Build the lookup set once; constructing it inside a per-row lambda would
# rebuild it from the whole native list for every observation.
native_species = set(arche_native['Scientific Name'])

# Compare on "Genus species" only, ignoring any variety suffix on the observation.
obs_has_species['native'] = obs_has_species['species'].map(first_two_words).isin(native_species)

# Confirm native plants marked properly 
display(obs_has_species[obs_has_species['native']].head())

Unnamed: 0,id,species,common_name,observed_on,latitude,longitude,location,quality_grade,captive,identifications_count,City,State,pop2024,densityMi,areaMi,park_acres,vacant_acres,native
0,263098951,Rudbeckia hirta,Black-eyed susan,2025-02-25,41.472049,-81.555201,"41.4720492438,-81.5552013741",needs_id,False,0,Shaker Heights,OH,28262,4492,6.291,262,135,True
2,262985952,Andropogon virginicus,Broomsedge bluestem,2025-02-24,41.521881,-81.5758,"41.521880755,-81.575800118",research,False,0,East Cleveland,OH,13219,4288,3.083,197,329,True
3,262966456,Hamamelis virginiana,American witch-hazel,2025-02-24,41.377991,-81.86575,"41.377991,-81.8657499",research,False,1,Berea,OH,17728,3121,5.68,458,257,True
9,262731541,Polystichum acrostichoides,Christmas fern,2025-02-22,41.390063,-81.545372,"41.3900633333,-81.5453716667",research,False,0,Bedford,OH,12650,2368,5.342,631,349,True
11,262721128,Carya cordiformis,Bitternut hickory,2025-02-21,41.409979,-81.883491,"41.4099794,-81.8834911",needs_id,False,0,North Olmsted,OH,31258,2678,11.673,807,375,True


### Tagging Observations as Invasive

In [153]:
# Build the lookup set once; constructing it inside a per-row lambda would
# rebuild it from the whole invasive list for every observation.
invasive_species = set(invasives['scientific_name'])

# Compare on "Genus species" only, ignoring any variety suffix on the observation.
obs_has_species['invasive'] = obs_has_species['species'].map(first_two_words).isin(invasive_species)

# Spot-check a few rows that were flagged as invasive.
display(obs_has_species[obs_has_species['invasive']].head())

Unnamed: 0,id,species,common_name,observed_on,latitude,longitude,location,quality_grade,captive,identifications_count,City,State,pop2024,densityMi,areaMi,park_acres,vacant_acres,native,invasive
90,259271340,Rhamnus cathartica,Common buckthorn,2025-01-22,41.348929,-81.841252,"41.3489290113,-81.8412520877",needs_id,False,0,Strongsville,OH,45206,1836,24.622,2511,1306,False,True
102,258409269,Lonicera japonica,Japanese honeysuckle,2025-01-13,41.430565,-81.66275,"41.4305649,-81.6627502",research,False,0,Cuyahoga Heights,OH,548,179,3.07,314,421,False,True
139,257189948,Ailanthus altissima,Tree-of-heaven,2024-09-21,41.465755,-81.565637,"41.465755,-81.5656366667",research,False,0,Shaker Heights,OH,28262,4492,6.291,262,135,False,True
141,257162152,Alliaria petiolata,Garlic mustard,2025-01-02,41.313794,-81.595131,"41.3137944444,-81.5951305556",research,False,1,Brecksville,OH,13922,716,19.448,4531,1306,False,True
143,257162083,Rosa multiflora,Multiflora rose,2025-01-02,41.313822,-81.595164,"41.3138222222,-81.5951638889",research,False,1,Brecksville,OH,13922,716,19.448,4531,1306,False,True


### Manual Tagging

The dataset for native plants, while extensive, is not exhaustive. Known native species (as confirmed by USDA Plant Finder) will be manually tagged as native when discovered. 

In [154]:
# Species missing from the native list that are tagged as native by hand.
# Matching is on the exact species string.
manually_confirmed_natives = [
    'Impatiens capensis',
    'Impatiens pallida',
    'Pinus strobus',
    'Cercis canadensis',
    'Conoclinium coelestinum',
    'Persicaria virginiana',
    'Eutrochium fistulosum',
]
is_confirmed_native = obs_has_species['species'].isin(manually_confirmed_natives)
obs_has_species.loc[is_confirmed_native, 'native'] = True





### Final Dataset Check

In [155]:
# info() writes the summary itself and returns None; print() would echo that None.
obs_has_species.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7632 entries, 0 to 7631
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     7632 non-null   object        
 1   species                7632 non-null   object        
 2   common_name            7632 non-null   object        
 3   observed_on            7632 non-null   datetime64[ns]
 4   latitude               7632 non-null   float64       
 5   longitude              7632 non-null   float64       
 6   location               7632 non-null   object        
 7   quality_grade          7632 non-null   object        
 8   captive                7632 non-null   bool          
 9   identifications_count  7632 non-null   int64         
 10  City                   7632 non-null   object        
 11  State                  7632 non-null   object        
 12  pop2024                7632 non-null   int64         
 13  den

In [156]:
# Persist the prepared species-level observations to the working directory.
# NOTE(review): to_csv defaults to index=True, so the row index is written as an
# unnamed first column; pass index=False if downstream readers do not need it — confirm.
obs_has_species.to_csv('obs_species.csv')