<a href="https://colab.research.google.com/github/corinneah/intro-to-R/blob/main/Tutorial_1_Intro_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. The R Environment 

In [None]:
# assign the value 12 to the name x using the assignment operator <-
x <- 12

In [None]:
# build the integers 0 through 9 by listing every element explicitly
some_numbers <- c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)

# the colon operator generates the same sequence more compactly;
# this second assignment replaces the vector created above
some_numbers <- 0:9

In [None]:
# typing an object's name on its own prints its contents
some_numbers

In [None]:
##  [1] 0 1 2 3 4 5 6 7 8 9

In [None]:
# arithmetic mean of the values 0 through 9
mean(some_numbers) # expected output: [1] 4.5

In [None]:
# install the tidyverse only when it is not already available, so that
# re-running this cell does not trigger a slow, network-dependent reinstall
if (!requireNamespace('tidyverse', quietly = TRUE)) {
  install.packages('tidyverse')
}

2. Getting Started with Data

In [None]:
library(tidyverse)

In [None]:
## ✔ ggplot2 3.1.1       ✔ purrr   0.2.5  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.1       ✔ stringr 1.4.0  
## ✔ readr   1.1.1       ✔ forcats 0.3.0

In [None]:
library(tidyverse)

In [None]:
# load the pay gap CSV into a data frame named paygap; the first line of the
# file supplies the column names (header = TRUE is the read.csv() default)
# NOTE(review): the path points at the filesystem root - confirm the file is
# actually uploaded there in the runtime you are using
paygap <- read.csv(file = '/UK Gender Pay Gap Data - 2019 to 2020.csv')

In [None]:
# preview the first five rows of the data frame
head(paygap, n = 5)

In [None]:
# list the column names
colnames(paygap)

In [None]:
# number of rows and number of columns, in that order
dim(paygap)

In [None]:
# class of the paygap object as a whole
class(paygap)

In [None]:
# class of a single column, extracted from the data frame with $
class(paygap$DiffMeanHourlyPercent)

3. Wrangling and Visualizing Data

In [None]:
# index with [row, column]: the value in the 1st row and 3rd column
paygap[1, 3]

In [None]:
# ranges work in both positions: rows 5 to 7 of columns 1 to 3
paygap[5:7, 1:3]

In [None]:
# pull the 'Address' column out of paygap as a plain vector and store it
# in a new object called 'Employment_address'
Employment_address <- paygap[['Address']]

In [None]:
# keep only the columns 'EmployerName' and 'EmployerId' and store the
# resulting two-column data frame in a new object called 'Employer_info'
Employer_info <- paygap %>%
  select(EmployerName, EmployerId)

In [None]:
## give a column a more descriptive name with rename()
# NOTE(review): this cell used to operate on 'cardiacdata', an object that is
# never created in this notebook, so it failed with "object not found"; the
# same rename() lesson is applied to paygap here - adjust the column choice
# if a different example is preferred

# rename 'Address' to 'EmployerAddress' (new name on the left, old on the right)
paygap <- rename(paygap, EmployerAddress = Address)

In [None]:
# preview the first rows of paygap (head() shows six rows by default)
head(paygap)

In [None]:
# add a new column computed from an existing one with mutate()
# NOTE(review): this cell used to operate on 'cardiacdata', an object that is
# never created in this notebook; the same mutate() lesson is applied to
# paygap here. Assumes DiffMeanHourlyPercent is numeric and expressed on a
# 0-100 percent scale - confirm against the data
paygap <- mutate(paygap, DiffMeanHourlyProportion = DiffMeanHourlyPercent / 100)

In [None]:
# recode a column's values into readable labels with mutate() and ifelse()
# NOTE(review): this cell used to operate on 'cardiacdata', an object that is
# never created in this notebook; the same recoding lesson is applied to
# paygap here. The labels go into a new column so the original
# SubmittedAfterTheDeadline column stays available to later cells.
# as.logical() accepts both logical values and 'True'/'False' text; anything
# it cannot interpret becomes NA, and ifelse() passes that NA through
paygap <- mutate(paygap,
                 SubmissionTiming = ifelse(as.logical(SubmittedAfterTheDeadline),
                                           'Late', 'On time'))

In [None]:
# preview the first rows of paygap again (head() shows six rows by default)
head(paygap)

In [None]:
# keep only the rows where the submission was NOT after the deadline
# read.csv() converts a column made up solely of True/False text into logical
# values; comparing a logical column against the string 'False' then matches
# nothing, because the logical is coerced to "FALSE" before the comparison.
# as.logical() handles both the logical and the text form of the column;
# rows where the flag is NA are dropped by filter()
paygap_on_time <- filter(paygap, !as.logical(SubmittedAfterTheDeadline))

In [None]:
# one row per EmployerSize group, holding the group mean of each of the three
# bonus columns; mean() is called without na.rm, so a group that contains a
# missing value in a column gets NA for that column's mean
paygap_byEmployerSize <- summarize(
  group_by(paygap, EmployerSize),
  FemaleBonusPercent = mean(FemaleBonusPercent),
  MaleBonusPercent = mean(MaleBonusPercent),
  DiffMeanBonusPercent = mean(DiffMeanBonusPercent)
)

In [None]:
# print the grouped summary table
paygap_byEmployerSize

In [None]:
# chain several verbs with the pipe: each step receives the previous result
# NOTE(review): this cell used to operate on 'cardiacdata', an object that is
# never created in this notebook; the same filter/select/mutate chain is
# applied to paygap here. The result goes into a new object so paygap itself
# stays intact for the cells that follow
paygap_on_time_bonus <- paygap %>%
  filter(!as.logical(SubmittedAfterTheDeadline)) %>%   # on-time rows only
  select(-SubmittedAfterTheDeadline) %>%               # drop the now-constant flag
  mutate(BonusPercentDiff = MaleBonusPercent - FemaleBonusPercent)

In [None]:
# print the grouped summary table again
paygap_byEmployerSize 

In [None]:
# a numeric vector in which one element is missing (NA)
some_numbers <- c(4, 5, 6, 7, NA, 8)

# a single NA makes the mean itself NA
mean(some_numbers)

In [None]:
# na.rm = TRUE removes missing values before the mean is computed
mean(some_numbers, na.rm = TRUE)


In [None]:
# histogram of MaleBonusPercent: counts per bin, with the range cut into 20 bins
ggplot(paygap, aes(x = MaleBonusPercent)) +
  geom_histogram(bins = 20)

In [None]:
# density-scaled histogram of MaleBonusPercent with axis label, title and theme
# after_stat(density) maps the y axis to the density computed by the histogram
# stat; it replaces the ..density.. notation, which is deprecated in current
# ggplot2 releases and triggers a warning
ggplot(data = paygap, mapping = aes(x = MaleBonusPercent)) +
  geom_histogram(aes(y = after_stat(density)), bins = 20, fill = 'lightblue') +
  xlab('Male bonus (percentage)') +
  ggtitle('distribution of male bonuses') +
  theme_light()

In [None]:
# the same density-scaled histogram without any extra styling
# after_stat(density) replaces the deprecated ..density.. notation
ggplot(data = paygap, mapping = aes(x = MaleBonusPercent)) +
  geom_histogram(aes(y = after_stat(density)), bins = 20)

In [None]:
# scatter plot: FemaleBonusPercent on the x axis against MaleBonusPercent on
# the y axis, one point per row, drawn with the dark theme
ggplot(paygap, aes(x = FemaleBonusPercent, y = MaleBonusPercent)) +
  geom_point() +
  theme_dark()

In [None]:
# the same scatter plot, with each point coloured by its EmployerSize value
ggplot(
  paygap,
  aes(x = FemaleBonusPercent, y = MaleBonusPercent, color = EmployerSize)
) +
  geom_point() +
  theme_light()

In [None]:
# one box per EmployerSize value showing the spread of MaleBonusPercent;
# labs() sets both axis labels in a single call
ggplot(paygap, aes(x = EmployerSize, y = MaleBonusPercent)) +
  geom_boxplot() +
  labs(x = 'Employer Size', y = 'Male bonus (percentage)') +
  theme_light()