In [1]:
# Data science
import pandas as pd

# API
import requests
import json

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, text, inspect, func

In [2]:
# Read the CSV files
df1 = pd.read_csv('Resources/parks.csv')
df = pd.read_csv('Resources/species.csv')

In [3]:
df1.head()

Unnamed: 0,Park Code,Park Name,State,Acres,Latitude,Longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08


In [4]:
df.head()

Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Record Status,Occurrence,Nativeness,Abundance,Seasonality,Conservation Status
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,Approved,Present,Native,Rare,Resident,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",Approved,Present,Native,Abundant,,
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Approved,Present,Not Native,Common,,Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Approved,Not Confirmed,Native,,,Endangered
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",Approved,Present,Unknown,Common,Breeder,


In [5]:
#drop_list = ["Record Status", "Occurrence", "Nativeness", "Abundance", "Seasonality"]
df.drop(["Record Status", "Occurrence", "Nativeness", "Abundance", "Seasonality"], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Conservation Status
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Endangered
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",


In [7]:
# Create engine for conection to the SQLite
engine = create_engine('sqlite:///national_parks.sqlite')

# Save the dataframes to the database
df1.to_sql('parks', engine, index=False, if_exists='replace')
df.to_sql('species', engine, index=False, if_exists='replace')

119248

In [8]:
# Read the data from the database
with engine.connect() as connection:
    parks_table = pd.read_sql('SELECT * FROM parks', connection)
    species_table = pd.read_sql('SELECT * FROM species', connection)

species_table.head()

Unnamed: 0,Species ID,Park Name,Category,Order,Family,Scientific Name,Common Names,Conservation Status
0,ACAD-1000,Acadia National Park,Mammal,Artiodactyla,Cervidae,Alces alces,Moose,
1,ACAD-1001,Acadia National Park,Mammal,Artiodactyla,Cervidae,Odocoileus virginianus,"Northern White-Tailed Deer, Virginia Deer, Whi...",
2,ACAD-1002,Acadia National Park,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Species of Concern
3,ACAD-1003,Acadia National Park,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Endangered
4,ACAD-1004,Acadia National Park,Mammal,Carnivora,Canidae,Vulpes vulpes,"Black Fox, Cross Fox, Eastern Red Fox, Fox, Re...",


In [11]:
parks_table.head(20)

Unnamed: 0,Park Code,Park Name,State,Acres,Latitude,Longitude
0,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ARCH,Arches National Park,UT,76519,38.68,-109.57
2,BADL,Badlands National Park,SD,242756,43.75,-102.5
3,BIBE,Big Bend National Park,TX,801163,29.25,-103.25
4,BISC,Biscayne National Park,FL,172924,25.65,-80.08
5,BLCA,Black Canyon of the Gunnison National Park,CO,32950,38.57,-107.72
6,BRCA,Bryce Canyon National Park,UT,35835,37.57,-112.18
7,CANY,Canyonlands National Park,UT,337598,38.2,-109.93
8,CARE,Capitol Reef National Park,UT,241904,38.2,-111.17
9,CAVE,Carlsbad Caverns National Park,NM,46766,32.17,-104.44


In [12]:
# Split parks with multiple states into duplicate rows with one for each state

query = """
    WITH split_states AS (
        SELECT
            p."Park Code",
            p."Park Name",
            TRIM(CASE
                WHEN INSTR(p.State, ', ') > 0 THEN SUBSTR(p.State, 1, INSTR(p.State, ', ') - 1)
                ELSE p.State
            END) AS State,
            p.Acres,
            p.Latitude,
            p.Longitude
        FROM
            parks AS p
        UNION
        SELECT
            p."Park Code",
            p."Park Name",
            TRIM(CASE
                WHEN INSTR(p.State, ', ') > 0 THEN SUBSTR(p.State, INSTR(p.State, ', ') + 1)
                ELSE ''
            END) AS State,
            p.Acres,
            p.Latitude,
            p.Longitude
        FROM
            parks AS p
        UNION
        SELECT
            p."Park Code",
            p."Park Name",
            TRIM(CASE
                WHEN INSTR(p.State, ', ') > 0 THEN SUBSTR(p.State, INSTR(p.State, ', ') + 2)
                ELSE ''
            END) AS State,
            p.Acres,
            p.Latitude,
            p.Longitude
        FROM
            parks AS p
    )
    SELECT
        s."Species ID",
        s.Category,
        s."Order",
        s.Family,
        s."Scientific Name",
        s."Common Names",
        s."Conservation Status",
        t."Park Code",
        t."Park Name",
        TRIM(t.State) AS State,
        t.Acres,
        t.Latitude,
        t.Longitude
    FROM
        split_states as t
    JOIN
        species as s ON s."Park Name" = t."Park Name"
    WHERE
        s."Conservation Status" <> 'None'
        AND TRIM(t.State) <> '';
    """


combined_df = pd.read_sql(text(query), con=engine)
combined_df

Unnamed: 0,Species ID,Category,Order,Family,Scientific Name,Common Names,Conservation Status,Park Code,Park Name,State,Acres,Latitude,Longitude
0,ACAD-1002,Mammal,Carnivora,Canidae,Canis latrans,"Coyote, Eastern Coyote",Species of Concern,ACAD,Acadia National Park,ME,47390,44.35,-68.21
1,ACAD-1003,Mammal,Carnivora,Canidae,Canis lupus,"Eastern Timber Wolf, Gray Wolf, Timber Wolf",Endangered,ACAD,Acadia National Park,ME,47390,44.35,-68.21
2,ACAD-1020,Mammal,Chiroptera,Vespertilionidae,Eptesicus fuscus,"Big Brown Bat, Common Brown Bat",Species of Concern,ACAD,Acadia National Park,ME,47390,44.35,-68.21
3,ACAD-1021,Mammal,Chiroptera,Vespertilionidae,Lasionycteris noctivagans,Silver-Haired Bat,Species of Concern,ACAD,Acadia National Park,ME,47390,44.35,-68.21
4,ACAD-1024,Mammal,Chiroptera,Vespertilionidae,Myotis keenii,"Keen's Myotis, Northern Long-Eared Bat, Northe...",Species of Concern,ACAD,Acadia National Park,ME,47390,44.35,-68.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5195,ZION-2176,Vascular Plant,Fabales,Fabaceae,Pediomelum epipsilum,Kane Breadroot,Species of Concern,ZION,Zion National Park,UT,146598,37.30,-113.05
5196,ZION-2388,Vascular Plant,Myrtales,Onagraceae,Camissonia bairdii,Baird's Camissonia,Species of Concern,ZION,Zion National Park,UT,146598,37.30,-113.05
5197,ZION-2403,Vascular Plant,Myrtales,Onagraceae,Epilobium nevadense,Nevada Willow-Herb,Species of Concern,ZION,Zion National Park,UT,146598,37.30,-113.05
5198,ZION-2442,Vascular Plant,Poales,Cyperaceae,Carex haysii,Hays' Sedge,Species of Concern,ZION,Zion National Park,UT,146598,37.30,-113.05


In [14]:
# Confirm that states are split
combined_df.loc[combined_df["Park Name"] == "Death Valley National Park"].reset_index(drop=True)

Unnamed: 0,Species ID,Category,Order,Family,Scientific Name,Common Names,Conservation Status,Park Code,Park Name,State,Acres,Latitude,Longitude
0,DEVA-1013,Mammal,Artiodactyla,Bovidae,Ovis canadensis,"Bighorn Sheep, Bighorn Sheep",Species of Concern,DEVA,Death Valley National Park,CA,4740912,36.24,-116.82
1,DEVA-1013,Mammal,Artiodactyla,Bovidae,Ovis canadensis,"Bighorn Sheep, Bighorn Sheep",Species of Concern,DEVA,Death Valley National Park,NV,4740912,36.24,-116.82
2,DEVA-1021,Mammal,Carnivora,Canidae,Canis latrans,Coyote,Species of Concern,DEVA,Death Valley National Park,CA,4740912,36.24,-116.82
3,DEVA-1021,Mammal,Carnivora,Canidae,Canis latrans,Coyote,Species of Concern,DEVA,Death Valley National Park,NV,4740912,36.24,-116.82
4,DEVA-1037,Mammal,Carnivora,Mustelidae,Gulo gulo luteus,,Species of Concern,DEVA,Death Valley National Park,CA,4740912,36.24,-116.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,DEVA-5204,Insect,Odonata,Coenagrionidae,Megalagrion xanthomelas,,Proposed Endangered,DEVA,Death Valley National Park,NV,4740912,36.24,-116.82
490,DEVA-5206,Insect,Odonata,Corduliidae,Somatochlora hineana,Hine's Emerald,Endangered,DEVA,Death Valley National Park,CA,4740912,36.24,-116.82
491,DEVA-5206,Insect,Odonata,Corduliidae,Somatochlora hineana,Hine's Emerald,Endangered,DEVA,Death Valley National Park,NV,4740912,36.24,-116.82
492,DEVA-5365,Fungi,Teloschistales,Caliciaceae,Texosporium sancti-jacobi,,Species of Concern,DEVA,Death Valley National Park,CA,4740912,36.24,-116.82


In [15]:
# Save the dataframe to the database
combined_df.to_sql('combined', engine, index=False, if_exists='replace')

5200

In [16]:
# INSPECT to confirm existence

# Create the inspector and connect it to the engine
inspector_gadget = inspect(engine)

# Collect the names of tables within the database
tables = inspector_gadget.get_table_names()

# print metadata for each table
for table in tables:
    print(table)
    print("-----------")
    
    # get columns
    columns = inspector_gadget.get_columns(table)
    for column in columns:
        print(column["name"], column["type"])

    print()

combined
-----------
Species ID TEXT
Category TEXT
Order TEXT
Family TEXT
Scientific Name TEXT
Common Names TEXT
Conservation Status TEXT
Park Code TEXT
Park Name TEXT
State TEXT
Acres BIGINT
Latitude FLOAT
Longitude FLOAT

conservation
-----------
Park Name TEXT
Latitude FLOAT
Longitude FLOAT
Conservation Status TEXT
Species Count BIGINT
State TEXT
Acres BIGINT

parks
-----------
Park Code TEXT
Park Name TEXT
State TEXT
Acres BIGINT
Latitude FLOAT
Longitude FLOAT

species
-----------
Species ID TEXT
Park Name TEXT
Category TEXT
Order TEXT
Family TEXT
Scientific Name TEXT
Common Names TEXT
Conservation Status TEXT



In [17]:
print(combined_df["State"].nunique())
combined_df.State.sort_values().unique()

27


array(['AK', 'AR', 'AZ', 'CA', 'CO', 'FL', 'HI', 'KY', 'ME', 'MI', 'MN',
       'MT', 'MT, ID', 'NC', 'ND', 'NM', 'NV', 'OH', 'OR', 'SC', 'SD',
       'TN', 'TX', 'UT', 'VA', 'WA', 'WY'], dtype=object)

In [34]:
print(combined_df["Conservation Status"].nunique())
combined_df["Conservation Status"].unique()

11


array(['Species of Concern', 'Endangered', 'In Recovery', 'Threatened',
       'Under Review', 'Proposed Threatened', 'Extinct',
       'Proposed Endangered', 'Resident', 'Breeder', 'Migratory'],
      dtype=object)

In [18]:
# Q3: How does the size of different parks compare with the conservation status of the animals in each park across the country?

In [27]:
query = """
    SELECT
        "Park Name",
        Latitude,
        Longitude,
        "Conservation Status",
        COUNT ("Conservation Status") AS "Species Count",
        State,
        Acres
    FROM
        combined
    WHERE
        "Conservation Status" IN ('Endangered', 'Threatened', 'Species of Concern', 'Under Review')
    GROUP BY 
        "Park Name", "State", "Conservation Status"
    ORDER BY
        Acres DESC
    """

q3_df = pd.read_sql(text(query), con=engine)
q3_df.head(15)

Unnamed: 0,Park Name,Latitude,Longitude,Conservation Status,Species Count,State,Acres
0,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Endangered,5,AK,8323148
1,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Species of Concern,40,AK,8323148
2,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Threatened,3,AK,8323148
3,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Under Review,2,AK,8323148
4,Gates Of The Arctic National Park and Preserve,67.78,-153.3,Species of Concern,21,AK,7523898
5,Death Valley National Park,36.24,-116.82,Endangered,24,CA,4740912
6,Death Valley National Park,36.24,-116.82,Species of Concern,177,CA,4740912
7,Death Valley National Park,36.24,-116.82,Threatened,16,CA,4740912
8,Death Valley National Park,36.24,-116.82,Under Review,27,CA,4740912
9,Death Valley National Park,36.24,-116.82,Endangered,24,NV,4740912


In [28]:
q3_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Park Name            220 non-null    object 
 1   Latitude             220 non-null    float64
 2   Longitude            220 non-null    float64
 3   Conservation Status  220 non-null    object 
 4   Species Count        220 non-null    int64  
 5   State                220 non-null    object 
 6   Acres                220 non-null    int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 12.2+ KB


In [29]:
q3_df.value_counts("Conservation Status")

Conservation Status
Species of Concern    59
Threatened            56
Endangered            54
Under Review          51
Name: count, dtype: int64

In [30]:
# Total Species count per park
df3 = q3_df.groupby("Park Name")["Species Count"].sum()
df3 = df3.sort_values(ascending=False)
df3

Park Name
Death Valley National Park                        488
Great Smoky Mountains National Park               280
Yellowstone National Park                         182
Redwood National Park                             162
Channel Islands National Park                     139
Big Bend National Park                            136
Grand Canyon National Park                        132
Hawaii Volcanoes National Park                    115
Joshua Tree National Park                         107
Everglades National Park                          105
Carlsbad Caverns National Park                    102
Great Basin National Park                          99
Zion National Park                                 98
Saguaro National Park                              95
Yosemite National Park                             94
Shenandoah National Park                           91
Capitol Reef National Park                         91
Mesa Verde National Park                           89
Guadalupe Mountain

In [25]:
# df_split = q3_df.assign(State=q3_df["State"].str.split(", ")).explode("State")

# df_split = df_split.reset_index(drop=True)

# df_split.head(20)

Unnamed: 0,Park Name,Latitude,Longitude,Conservation Status,Species Count,State,Acres
0,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Endangered,5,AK,8323148
1,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Species of Concern,40,AK,8323148
2,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Threatened,3,AK,8323148
3,Wrangell - St Elias National Park and Preserve,61.0,-142.0,Under Review,2,AK,8323148
4,Gates Of The Arctic National Park and Preserve,67.78,-153.3,Species of Concern,21,AK,7523898
5,Death Valley National Park,36.24,-116.82,Endangered,48,CA,4740912
6,Death Valley National Park,36.24,-116.82,Species of Concern,354,CA,4740912
7,Death Valley National Park,36.24,-116.82,Threatened,32,CA,4740912
8,Death Valley National Park,36.24,-116.82,Under Review,54,CA,4740912
9,Katmai National Park and Preserve,58.5,-155.0,Endangered,6,AK,3674530


In [31]:
# Save the dataframes to the database
q3_df.to_sql('conservation', engine, index=False, if_exists='replace')

220