# Loading the Data

In [11]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import glob

# Read the two project CSV files from the working directory.
observation_df = pd.read_csv('observations.csv', encoding='utf-8')
species_info_df = pd.read_csv('species_info.csv', encoding='utf-8')

# Preview the first rows of each frame.
print(observation_df.head())
print(species_info_df.head())

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
observation_df.info()
species_info_df.info()

            scientific_name                            park_name  observations
0        Vicia benghalensis  Great Smoky Mountains National Park            68
1            Neovison vison  Great Smoky Mountains National Park            77
2         Prunus subcordata               Yosemite National Park           138
3      Abutilon theophrasti                  Bryce National Park            84
4  Githopsis specularioides  Great Smoky Mountains National Park            85
  category                scientific_name  \
0   Mammal  Clethrionomys gapperi gapperi   
1   Mammal                      Bos bison   
2   Mammal                     Bos taurus   
3   Mammal                     Ovis aries   
4   Mammal                 Cervus elaphus   

                                        common_names conservation_status  
0                           Gapper's Red-Backed Vole                 NaN  
1                              American Bison, Bison                 NaN  
2  Aurochs, Aurochs, Domestic 

# Exploring the Data

In [19]:
# Report the (rows, columns) dimensions of both tables.
species_shape = species_info_df.shape
observation_shape = observation_df.shape
print(f"species info shape: {species_shape}")
print(f"observation shape: {observation_shape}")

species info shape: (5824, 4)
observation shape: (23296, 3)


In [25]:
# Count the distinct scientific names in the species table.
unique_species_count = species_info_df["scientific_name"].nunique()
print(f"number of species:{unique_species_count}")


number of species:5541


In [23]:
# Summarise the distinct values of the `category` column.
category_column = species_info_df["category"]
print(f"number of categories:{category_column.nunique()}")
print(f"categories:{category_column.unique()}")

number of categories:7
categories:['Mammal' 'Bird' 'Reptile' 'Amphibian' 'Fish' 'Vascular Plant'
 'Nonvascular Plant']


In [26]:
# Number of rows in the species table for each category.
species_per_category = species_info_df.groupby("category").size()
species_per_category

category
Amphibian              80
Bird                  521
Fish                  127
Mammal                214
Nonvascular Plant     333
Reptile                79
Vascular Plant       4470
dtype: int64

### Conservation Status

In [29]:
# nunique() ignores NaN while unique() keeps it, so the count printed first
# is one less than the number of entries listed second.
status_column = species_info_df["conservation_status"]
print(f"number of conservation statuses: {status_column.nunique()}")
print(f"unique conservation statuses: {status_column.unique()}")

number of conservation statuses: 4
unique conservation statuses: [nan 'Species of Concern' 'Endangered' 'Threatened' 'In Recovery']


### NaN Values (species info)

In [30]:
# How many species rows have no conservation status recorded.
missing_status_count = species_info_df["conservation_status"].isna().sum()
print(f"number of na values:{missing_status_count}")

# groupby drops NaN keys by default, so only labelled statuses are listed here.
status_sizes = species_info_df.groupby("conservation_status").size()
print(status_sizes)

number of na values:5633
conservation_status
Endangered             16
In Recovery             4
Species of Concern    161
Threatened             10
dtype: int64


In [34]:
# Summarise the distinct values of the `park_name` column.
park_column = observation_df["park_name"]
print(f"number of parks:{park_column.nunique()}")
print(f"park names:{park_column.unique()}")

number of parks:4
park names:['Great Smoky Mountains National Park' 'Yosemite National Park'
 'Bryce National Park' 'Yellowstone National Park']


In [35]:
# Sum of the `observations` column over every row of the observations table.
total_observations = observation_df["observations"].sum()
print(f"number of observations:{total_observations}")

number of observations:3314739


# Analysis
- Converting NaN values to "No Intervention", as these species are not at risk and are not of any concern.

In [37]:
# Label species that have no recorded status as "No Intervention".
# The fill is restricted to the conservation_status column: a frame-wide
# fillna would also write this label into any other column that happens to
# contain a missing value.
species_info_df["conservation_status"] = (
    species_info_df["conservation_status"].fillna("No Intervention")
)

# Row count per status, now including the "No Intervention" group.
species_info_df.groupby("conservation_status").size()

conservation_status
Endangered              16
In Recovery              4
No Intervention       5633
Species of Concern     161
Threatened              10
dtype: int64

In [44]:
# Cross-tabulate species that carry a status other than "No Intervention":
# rows are conservation statuses, columns are categories, and each cell is the
# count of non-null scientific names. Combinations with no rows show as NaN.
at_risk_mask = species_info_df["conservation_status"].ne("No Intervention")
status_category_counts = (
    species_info_df.loc[at_risk_mask]
    .groupby(["conservation_status", "category"])["scientific_name"]
    .count()
)
conservation_category = status_category_counts.unstack()

conservation_category

category,Amphibian,Bird,Fish,Mammal,Nonvascular Plant,Reptile,Vascular Plant
conservation_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Endangered,1.0,4.0,3.0,7.0,,,1.0
In Recovery,,3.0,,1.0,,,
Species of Concern,4.0,72.0,4.0,28.0,5.0,5.0,43.0
Threatened,2.0,,4.0,2.0,,,2.0
