In [3]:
import pandas as pd
import numpy as np

# Adapted from the palmerpenguins R package:
#   Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer
#   Archipelago (Antarctica) penguin data. R package version 0.1.0.
#   https://allisonhorst.github.io/palmerpenguins/. doi:
#   10.5281/zenodo.3960218.

In [4]:
# Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
uri_adelie = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff"

# Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
uri_gentoo = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381"

# Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
uri_chinstrap = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462"

# Combining the URIs. The order here fixes the row order of the combined frame.
uris = [uri_adelie, uri_gentoo, uri_chinstrap]

# Strings to parse as missing values. "", "NA" and "NaN" are already among the
# pandas defaults; "." is the extra marker this notebook adds.
na_markers = ["", "NA", ".", "NaN"]

# Downloading and importing data: one read per URI, unpacked in `uris` order.
# The per-species names are kept so anything that refers to them still works.
penguins_raw_adelie, penguins_raw_gentoo, penguins_raw_chinstrap = [
    pd.read_csv(uri, na_values=na_markers) for uri in uris
]

# Combine data frames. ignore_index=True renumbers the rows 0..n-1; without it
# every source frame keeps its own 0-based labels, so the combined index holds
# duplicate labels and a label lookup such as .loc[0] matches several rows.
penguins_raw_df = pd.concat(
    [penguins_raw_adelie, penguins_raw_gentoo, penguins_raw_chinstrap],
    ignore_index=True,
)


In [5]:
def _first_word(label):
    """Return the part of ``label`` that precedes its first space."""
    return label.split(" ")[0]


def _lower_if_str(value):
    """Lowercase ``value`` when it is a string; return anything else unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


# Lowercase every column header so later lookups can use lowercase names
penguins_raw_df.columns = list(map(str.lower, penguins_raw_df.columns))

# Short species label: the first space-separated word of the full species string
penguins_raw_df["species_short"] = list(map(_first_word, penguins_raw_df["species"]))

# Lowercase the sex labels; non-string entries (such as missing values) pass through
penguins_raw_df["sex"] = list(map(_lower_if_str, penguins_raw_df["sex"]))

# Parse the egg date into datetimes and store it back in the same column
egg_dates = pd.to_datetime(penguins_raw_df["date egg"])
penguins_raw_df["date egg"] = egg_dates

# Keep just the calendar year of the egg date as its own column
penguins_raw_df["year"] = pd.DatetimeIndex(egg_dates).year

In [6]:
# Raw column -> output column (species_short --> species, culmen --> bill).
# The dict's insertion order is the column order of the output table.
column_renames = {
    "species_short": "species",
    "island": "island",
    "culmen length (mm)": "bill_length_mm",
    "culmen depth (mm)": "bill_depth_mm",
    "flipper length (mm)": "flipper_length_mm",
    "body mass (g)": "body_mass_g",
    "sex": "sex",
    "year": "year",
}

# Select only the columns of interest ...
penguins_df = penguins_raw_df[list(column_renames.keys())]

# ... and relabel them with the output names, position by position
penguins_df.columns = list(column_renames.values())

# Write to csv in the current directory. Default to_csv options are used, so
# the frame's row index is written too, as the first (unnamed) column.
penguins_df.to_csv("penguins.csv")