Examining common themed variables

In [None]:
# Exercise 1
# Build a long table with one row per answered
# (learning_platform, usefulness) pair.
learning_platform_usefulness <- multiple_choice_responses %>%
  # keep only the columns whose name contains "LearningPlatformUsefulness"
  select(contains("LearningPlatformUsefulness")) %>%
  # reshape wide -> long with pivot_longer(), which replaces the superseded
  # gather(); values_drop_na = TRUE drops the NA (unanswered) cells as part
  # of the reshape. Rows come out respondent by respondent instead of column
  # by column; the set of rows and their values is the same.
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  # strip the first "LearningPlatformUsefulness" from each platform name
  mutate(
    learning_platform = str_replace(
      learning_platform, "LearningPlatformUsefulness", ""
    )
  )

In [None]:
# Exercise 2
# Collapse to one row per learning_platform / usefulness pair, together with
# the number of entries for that pair.
count(learning_platform_usefulness, learning_platform, usefulness)

In [None]:
# Exercise 3
# Turn usefulness into a 0/1 indicator: "Not Useful" becomes 0, every other
# answer becomes 1.
mutate(
  learning_platform_usefulness,
  usefulness = if_else(usefulness == "Not Useful", 0, 1)
)

In [None]:
# Average usefulness per learning platform, where each answer is scored
# 0 for "Not Useful" and 1 for anything else.
usefulness_by_platform <- learning_platform_usefulness %>%
  # one group per learning platform
  group_by(learning_platform) %>%
  # score each answer 0/1 and take the mean of the scores within the group
  summarize(
    avg_usefulness = mean(if_else(usefulness == "Not Useful", 0, 1))
  )

In [None]:
# Scatter plot: learning platform on x, average usefulness on y
usefulness_by_platform %>%
  ggplot(aes(x = learning_platform, y = avg_usefulness)) +
  geom_point()

Tricks of ggplot2

In [None]:
# Exercise 1
# Same scatter plot with percentage y axis, axis titles and rotated x labels.
usefulness_by_platform %>%
  ggplot(aes(x = learning_platform, y = avg_usefulness)) +
  geom_point() +
  # format y axis values as percentages
  scale_y_continuous(labels = scales::percent) +
  # axis titles
  labs(
    x = "Learning Platform",
    y = "Percent finding at least somewhat useful"
  ) +
  # turn the x-axis text by 90 degrees
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

In [None]:
# Exercise 2
# Scatter plot with platforms ordered on the x axis by average usefulness.
usefulness_by_platform %>%
  mutate(
    # order the platform levels by avg_usefulness, then reverse that order
    learning_platform = fct_rev(
      fct_reorder(learning_platform, avg_usefulness)
    )
  ) %>%
  ggplot(aes(x = learning_platform, y = avg_usefulness)) +
  geom_point() +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Learning Platform",
    y = "Percent finding at least somewhat useful"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Changing and creating variables with case_when()

In [None]:
# Exercise 1
# Smallest Age value, ignoring missing values
min(multiple_choice_responses[["Age"]], na.rm = TRUE)

# Largest Age value, ignoring missing values
max(multiple_choice_responses[["Age"]], na.rm = TRUE)

In [None]:
# Keep only the rows whose Age lies between 10 and 90
filter(multiple_choice_responses, between(Age, 10, 90))

In [None]:
# Keep the rows with Age between 10 and 90, then label each row with a
# generation derived from its Age band.
filter(multiple_choice_responses, between(Age, 10, 90)) %>%
  mutate(
    generation = case_when(
      between(Age, 10, 22) ~ "Gen Z",
      between(Age, 23, 37) ~ "Gen Y",
      between(Age, 38, 52) ~ "Gen X",
      between(Age, 53, 71) ~ "Baby Boomer",
      between(Age, 72, 90) ~ "Silent"
    )
  )

In [None]:
# Number of responses per generation, for rows with Age between 10 and 90.
filter(multiple_choice_responses, between(Age, 10, 90)) %>%
  # derive the generation label from the Age band
  mutate(
    generation = case_when(
      between(Age, 10, 22) ~ "Gen Z",
      between(Age, 23, 37) ~ "Gen Y",
      between(Age, 38, 52) ~ "Gen X",
      between(Age, 53, 71) ~ "Baby Boomer",
      between(Age, 72, 90) ~ "Silent"
    )
  ) %>%
  # tally the rows in each generation
  count(generation)

In [None]:
# Exercise 2
# Drop the respondents whose job title is "Data Scientist"
filter(multiple_choice_responses, CurrentJobTitleSelect != "Data Scientist")

In [None]:
# Drop the "Data Scientist" job title, then classify every remaining row by
# job title (Data Analyst or not) and data-science self-identification.
filter(multiple_choice_responses, CurrentJobTitleSelect != "Data Scientist") %>%
  mutate(
    job_identity = case_when(
      CurrentJobTitleSelect == "Data Analyst" &
        DataScienceIdentitySelect == "Yes" ~ "DS analysts",
      CurrentJobTitleSelect == "Data Analyst" &
        DataScienceIdentitySelect %in%
          c("No", "Sort of (Explain more)") ~ "NDS analyst",
      CurrentJobTitleSelect != "Data Analyst" &
        DataScienceIdentitySelect == "Yes" ~ "DS non-analysts",
      # Fallback for every row not matched above.
      # NOTE(review): this also captures Data Analysts whose
      # DataScienceIdentitySelect is missing - confirm that is intended.
      TRUE ~ "NDS non analysts"
    )
  )

In [None]:
# Average job satisfaction per job_identity class, excluding respondents
# whose job title is "Data Scientist".
filter(multiple_choice_responses, CurrentJobTitleSelect != "Data Scientist") %>%
  # classify rows by job title and data-science self-identification
  mutate(
    job_identity = case_when(
      CurrentJobTitleSelect == "Data Analyst" &
        DataScienceIdentitySelect == "Yes" ~ "DS analysts",
      CurrentJobTitleSelect == "Data Analyst" &
        DataScienceIdentitySelect %in%
          c("No", "Sort of (Explain more)") ~ "NDS analyst",
      CurrentJobTitleSelect != "Data Analyst" &
        DataScienceIdentitySelect == "Yes" ~ "DS non-analysts",
      # Fallback for every row not matched above.
      # NOTE(review): this also captures Data Analysts whose
      # DataScienceIdentitySelect is missing - confirm that is intended.
      TRUE ~ "NDS non analysts"
    )
  ) %>%
  group_by(job_identity) %>%
  # mean JobSatisfaction within each class, skipping missing values
  summarize(avg_js = mean(JobSatisfaction, na.rm = TRUE))