# Intro to Data Science Midterm Review

In [1]:
# Attach the core tidyverse packages (dplyr, tidyr, ggplot2, ...)
library(tidyverse)
# Attach palmerpenguins, which supplies the `penguins` data set used below
library(palmerpenguins)
# Limit notebook table displays to 6 rows; longer tables are shown truncated
options(repr.matrix.max.rows = 6)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [2]:
# Display the raw data set (the printout is truncated by the row limit set via options())
penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Adelie,Torgersen,39.1,18.7,181,3750,male,2007
Adelie,Torgersen,39.5,17.4,186,3800,female,2007
Adelie,Torgersen,40.3,18.0,195,3250,female,2007
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Chinstrap,Dream,49.6,18.2,193,3775,male,2009
Chinstrap,Dream,50.8,19.0,210,4100,male,2009
Chinstrap,Dream,50.2,18.7,198,3775,female,2009


## Data Wrangling

In [3]:
# filter() - keeps only the rows that satisfy every condition

# Adelie penguins recorded on Torgersen island
filter_penguins <- penguins |>
  filter(
    species == "Adelie",
    island == "Torgersen"
  )
filter_penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Adelie,Torgersen,39.1,18.7,181,3750,male,2007
Adelie,Torgersen,39.5,17.4,186,3800,female,2007
Adelie,Torgersen,40.3,18.0,195,3250,female,2007
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Adelie,Torgersen,44.1,18.0,210,4000,male,2009
Adelie,Torgersen,38.5,17.9,190,3325,female,2009
Adelie,Torgersen,43.1,19.2,197,3500,male,2009


In [4]:
# select() - keeps only the listed columns, in the order given

select_penguins <- penguins |>
  select(
    species,
    bill_length_mm,
    body_mass_g
  )
select_penguins

species,bill_length_mm,body_mass_g
<fct>,<dbl>,<int>
Adelie,39.1,3750
Adelie,39.5,3800
Adelie,40.3,3250
⋮,⋮,⋮
Chinstrap,49.6,3775
Chinstrap,50.8,4100
Chinstrap,50.2,3775


In [5]:
# mutate() - adds new columns or overwrites existing ones

# bill_ratio = bill length divided by bill depth
mutate_penguins <- penguins |>
  mutate(
    bill_ratio = bill_length_mm / bill_depth_mm
  )
mutate_penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,bill_ratio
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>,<dbl>
Adelie,Torgersen,39.1,18.7,181,3750,male,2007,2.090909
Adelie,Torgersen,39.5,17.4,186,3800,female,2007,2.270115
Adelie,Torgersen,40.3,18.0,195,3250,female,2007,2.238889
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Chinstrap,Dream,49.6,18.2,193,3775,male,2009,2.725275
Chinstrap,Dream,50.8,19.0,210,4100,male,2009,2.673684
Chinstrap,Dream,50.2,18.7,198,3775,female,2009,2.684492


In [10]:
# arrange() - reorders rows by the values of one or more columns

# Ascending order is the default; rows with a missing value end up last
arrange_penguins <- arrange(penguins, flipper_length_mm)
arrange_penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Adelie,Biscoe,37.9,18.6,172,3150,female,2007
Adelie,Biscoe,37.8,18.3,174,3400,female,2007
Adelie,Torgersen,40.2,17.0,176,3450,female,2009
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Gentoo,Biscoe,54.3,15.7,231,5650,male,2008
Adelie,Torgersen,,,,,,2007
Gentoo,Biscoe,,,,,,2009


In [9]:
# arrange() + desc() - sorts rows in descending order

# Heaviest penguins first; rows with a missing body mass still end up last
desc_penguins <- arrange(penguins, desc(body_mass_g))
desc_penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Gentoo,Biscoe,49.2,15.2,221,6300,male,2007
Gentoo,Biscoe,59.6,17.0,230,6050,male,2007
Gentoo,Biscoe,51.1,16.3,220,6000,male,2008
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Chinstrap,Dream,46.9,16.6,192,2700,female,2008
Adelie,Torgersen,,,,,,2007
Gentoo,Biscoe,,,,,,2009


In [15]:
# group_by()  - splits the data into one group per distinct value
# summarize() - collapses each group to a single row of summary statistics

# Mean body mass per species, ignoring missing values
grouped_penguins <- penguins |>
  group_by(species) |>
  summarize(
    avg_mass = mean(body_mass_g, na.rm = TRUE)
  )
grouped_penguins

species,avg_mass
<fct>,<dbl>
Adelie,3700.662
Chinstrap,3733.088
Gentoo,5076.016


In [16]:
# slice_*() - pick out specific rows

# slice_max() - keeps the n rows with the largest values of a column
heaviest_penguin <- penguins |>
  slice_max(body_mass_g, n = 1)
heaviest_penguin

# slice_min() - keeps the n rows with the smallest values of a column
lightest_3_penguins <- penguins |>
  slice_min(body_mass_g, n = 3)
lightest_3_penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Gentoo,Biscoe,49.2,15.2,221,6300,male,2007


species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Chinstrap,Dream,46.9,16.6,192,2700,female,2008
Adelie,Biscoe,36.5,16.6,181,2850,female,2008
Adelie,Biscoe,36.4,17.1,184,2850,female,2008


In [17]:
# rename() - changes column names using new_name = old_name

rename_penguins <- rename(penguins, mass = body_mass_g)
rename_penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,mass,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Adelie,Torgersen,39.1,18.7,181,3750,male,2007
Adelie,Torgersen,39.5,17.4,186,3800,female,2007
Adelie,Torgersen,40.3,18.0,195,3250,female,2007
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Chinstrap,Dream,49.6,18.2,193,3775,male,2009
Chinstrap,Dream,50.8,19.0,210,4100,male,2009
Chinstrap,Dream,50.2,18.7,198,3775,female,2009


In [18]:
# distinct() - keeps one row per unique combination of the listed columns

# Every species/island pairing that occurs in the data
unique_penguins <- penguins |>
  distinct(
    species,
    island
  )
unique_penguins

species,island
<fct>,<fct>
Adelie,Torgersen
Adelie,Biscoe
Adelie,Dream
Gentoo,Biscoe
Chinstrap,Dream


In [22]:
# across() - applies the same function to several columns at once

# Standardize every column whose name ends in "_mm" to z-scores
# (subtract the column mean, then divide by the column standard deviation).
# scale() returns a one-column matrix, so wrap it in as.numeric() to keep
# each result an ordinary <dbl> column rather than a <dbl[,1]> matrix column.
# body_mass_g is left unchanged because its name does not end in "_mm".
mutate_penguins_2 <- penguins |>
  mutate(across(ends_with("_mm"), \(x) as.numeric(scale(x))))
mutate_penguins_2

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",<int>,<fct>,<int>
Adelie,Torgersen,-0.8832047,0.7843001,-1.4162715,3750,male,2007
Adelie,Torgersen,-0.8099390,0.1260033,-1.0606961,3800,female,2007
Adelie,Torgersen,-0.6634077,0.4298326,-0.4206603,3250,female,2007
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Chinstrap,Dream,1.040019,0.5311090,-0.5628905,3775,male,2009
Chinstrap,Dream,1.259816,0.9362147,0.6460660,4100,male,2009
Chinstrap,Dream,1.149917,0.7843001,-0.2073150,3775,female,2009


In [23]:
# head() - shows the first n rows
# tail() - shows the last n rows

penguins |> head(n = 5)
penguins |> tail(n = 5)

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
Adelie,Torgersen,,,,,,2007
Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Chinstrap,Dream,55.8,19.8,207,4000,male,2009
Chinstrap,Dream,43.5,18.1,202,3400,female,2009
Chinstrap,Dream,49.6,18.2,193,3775,male,2009
Chinstrap,Dream,50.8,19.0,210,4100,male,2009
Chinstrap,Dream,50.2,18.7,198,3775,female,2009


In [24]:
# na.rm = TRUE
# tells a summary function to drop NAs before calculating

average_mass <- mean(penguins[["body_mass_g"]], na.rm = TRUE)
average_mass

# !is.na()
# TRUE for non-missing values, so filter() keeps only the complete rows

filtered_penguins <- filter(penguins, !is.na(body_mass_g))
filtered_penguins

species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
<fct>,<fct>,<dbl>,<dbl>,<int>,<int>,<fct>,<int>
Adelie,Torgersen,39.1,18.7,181,3750,male,2007
Adelie,Torgersen,39.5,17.4,186,3800,female,2007
Adelie,Torgersen,40.3,18.0,195,3250,female,2007
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
Chinstrap,Dream,49.6,18.2,193,3775,male,2009
Chinstrap,Dream,50.8,19.0,210,4100,male,2009
Chinstrap,Dream,50.2,18.7,198,3775,female,2009
