<center><h1>Introduction to ggplot2</h1></center>
<center><h3>Ellen Duong</h3></center>
<center><h3>Paul Stey</h3></center>
<center><h3>2023-10-12</h3></center>

# 1. What is _ggplot2_?

  - Hugely popular R package for visualization 
  - Authored by Hadley Wickham (of _dplyr_ and _tidyverse_ fame)
  - Implements the "grammar-of-graphics" design philosophy (hence "gg")
  - Easily produces beautiful and informative visualizations

# 2. Plotting Person-Level Characteristics in Arrests

  - The Pvd arrests data are at the _violation_-level
  - We want person-level data on the individuals arrested
  - This "_level-of-analysis_" or "_level-of-granularity_" problem is ubiquitous 

# 3. Generating Person-Level Data
  - We are aggregating "up" from the violation level
  - Will use the `group_by()` and `summarise()` idiom

In [None]:
# Load necessary packages
library(stringr)
library(dplyr)
library(ggplot2)

# Read the arrests CSV (the path is relative to the working directory)
arrests_df <- read.csv(file = "data/pvd_arrests_2021-10-03.csv")


## 3.1 Computing Number of Officers (correctly)

  - First, need to determine if `arresting_officers` is in _full-name_-format or _first-initial_-format

In [None]:
# Test, element by element, whether `chr` is one of the 26 uppercase letters
# in the built-in `LETTERS` constant. Anything else (lowercase, digits, "",
# NA) gives FALSE. Returns a logical vector the same length as `chr`.
is_uppercase <- function(chr) {
    match(chr, LETTERS, nomatch = 0L) > 0L
}

# Report whether an officer-name string looks like full-name format.
#
# The string is treated as first-initial format only when its first two
# characters are both uppercase letters; every other string counts as full
# names. Expects a single string, because `||` is a scalar operator.
has_full_names <- function(names_str) {
    first_is_upper <- is_uppercase(str_sub(names_str, 1, 1))
    second_is_upper <- is_uppercase(str_sub(names_str, 2, 2))

    !first_is_upper || !second_is_upper
}

### 3.1.1 Counting the Names
  - Want to correctly count names regardless of format
  - Update our `count_names()` function

In [None]:
# Count the officer names held in a single string.
#
# Surrounding whitespace is trimmed first. Strings in full-name format are
# split on "/ ", all others on ", "; the number of resulting pieces is
# returned.
count_names <- function(names_str) {
    trimmed <- str_trim(names_str)

    # The delimiter depends on which name format the string uses
    delimiter <- if (has_full_names(trimmed)) "/ " else ", "

    pieces <- unlist(str_split(trimmed, delimiter))
    length(pieces)
}

### 3.1.2 Counting Officers (correctly)
  - Note the sequence of function calls:
    + `count_all_names()` => `count_names()` => `has_full_names()` => `is_uppercase()`

In [None]:
# Count the officer names in every element of a column.
#
# Args:
#   col: vector of officer-name strings, one element per row.
#
# Returns:
#   Numeric vector the same length as `col`, holding the result of
#   `count_names()` for each element. A zero-length `col` gives `numeric(0)`.
count_all_names <- function(col) {

    cnts <- numeric(length(col))  # allocate vector of zeros to hold the counts

    # seq_along() yields an empty sequence for empty input, whereas
    # 1:length(col) would iterate over c(1, 0) and index out of range.
    for (i in seq_along(col)) {
        cnts[i] <- count_names(col[i])
    }
    return(cnts)
}

In [None]:
# Store the per-row officer count as a new `officer_cnt` column
arrests_df[["officer_cnt"]] <- count_all_names(arrests_df[["arresting_officers"]])

## 3.2 Add Violent Offense Flag

In [None]:
# Flag alleged violent crimes from the description of the statute violation.
#
# Args:
#   v: character vector of statute descriptions; may be empty or contain NA.
#
# Returns:
#   Logical vector the same length as `v`. An element is TRUE when its
#   lower-cased description contains any of the violent terms, and FALSE
#   otherwise (NA descriptions give FALSE).
is_violent_offense <- function(v) {

    violent_terms <- c("domestic-asslt", "assault", "battery", "murder")

    # Join the terms into one alternation pattern so a single vectorised
    # grepl() call replaces the nested loops. None of the terms contains a
    # regex metacharacter, so they match literally.
    violent_pattern <- paste(violent_terms, collapse = "|")

    # grepl() returns FALSE for NA input and logical(0) for empty input, so
    # zero-length vectors no longer fail the way `for (i in 1:n_obs)` did.
    is_violent <- grepl(violent_pattern, tolower(v))

    return(is_violent)
}

### 3.2.1 Test our Function (always!!)

In [None]:
# Four sample statute descriptions: two non-violent, then two violent
vio_vec <- c(
    "DISORDERLY CONDUCT",
    "RESISTING LEGAL OR ILLEGAL ARREST",
    "DOMESTIC-SIMPLE ASSAULT/BATTERY",
    "SIMPLE ASSAULT OR BATTERY"
)

# Expected output: FALSE, FALSE, TRUE, TRUE
is_violent_offense(vio_vec)

### 3.2.2 Create `violent` Column in `arrests_df`

In [None]:
# Flag each row whose statute description mentions a violent term
arrests_df[["violent"]] <- is_violent_offense(arrests_df[["statute_desc"]])

In [None]:
# Preview the first six rows of the arrests data
head(arrests_df, n = 6L)

## 3.3 Aggregating to _Person-Level_ DataFrame

  - Use the `group_by()` and `summarise()` pattern from _dplyr_ functions

In [None]:
# Collapse the violation-level rows into one row per arrested person
person_df <- arrests_df %>%
    group_by(arrestee_id) %>%
    summarise(
        total_charges    = n(),                      # rows for this person
        num_uniq_arrests = n_distinct(case_number),  # distinct case numbers
        prop_violent     = mean(violent),            # share of rows flagged violent
        mean_officer_cnt = mean(officer_cnt),
        age              = first(age),               # value from the person's first row
        gender           = first(gender)
    )

In [None]:
# Preview the first six rows of the person-level data
head(person_df, n = 6L)

# 4. Intro to _ggplot2_
  - Operates on `data.frame` objects
  - Map variables to aesthetics, and then display using "geom" (i.e., "geometric object")
  - Geom layers can be stacked on top of one another to add information

In [None]:
# With no geom layer, only an empty panel with an `age` axis is drawn
ggplot(data = person_df, mapping = aes(x = age))

## 4.1 Plotting Histogram of `age`

In [None]:
# Default histogram of age: no styling yet, so it looks rather plain
ggplot(data = person_df, mapping = aes(x = age)) +
    geom_histogram()

### 4.1.1 Adding `colour` and `fill` to `geom_histogram()`

In [None]:
# `fill` colours the bars, `colour` their outlines, `bins` sets the bin count
ggplot(data = person_df, mapping = aes(x = age)) +
    geom_histogram(bins = 30, colour = "lightblue", fill = "skyblue")

## 4.2 Density Plot of `age`

In [None]:
# Smoothed density estimate of age instead of binned counts
ggplot(data = person_df, mapping = aes(x = age)) +
    geom_density(colour = "lightblue", fill = "skyblue")

### 4.2.1 Adjusting `alpha`

In [None]:
# Same density plot, with `alpha` making the fill half-transparent
ggplot(data = person_df, mapping = aes(x = age)) +
    geom_density(alpha = 0.5, colour = "lightblue", fill = "skyblue")

### 4.2.2 Adding `gender` Variable Aesthetic

In [None]:
# Overlaid age densities, one per gender, scaled to counts rather than
# densities. `after_stat(count)` replaces `stat(count)`, which ggplot2
# deprecated in version 3.4.0.
ggplot(
    person_df,
    aes(x = age, y = after_stat(count), fill = gender, colour = gender)
) +
    geom_density(alpha = 0.4) +
    xlab("Age of Person Arrested") +
    ylab("Count")

## 4.3 Scatter Plot of `age` and `total_charges` 

In [None]:
# One point per person: age against total number of charges
ggplot(data = person_df, mapping = aes(x = age, y = total_charges)) +
    geom_point()

### 4.3.1 Adjusting `colour` and `alpha`

In [None]:
# Semi-transparent points make overlapping observations easier to see
ggplot(data = person_df, mapping = aes(x = age, y = total_charges)) +
    geom_point(alpha = 0.6, colour = "skyblue")

### 4.3.2 Using `geom_jitter` for Scatterplots

In [None]:
# Jitter adds a little random noise to each point to reduce overplotting
ggplot(data = person_df, mapping = aes(x = age, y = total_charges)) +
    geom_jitter(alpha = 0.4, colour = "purple")

## 4.4 Plotting `num_uniq_arrests` and `total_charges` with a `stat_smooth()` Layer

In [None]:
# Jittered scatter of arrests vs. charges with a fitted linear-model line
ggplot(
    data = person_df,
    mapping = aes(x = num_uniq_arrests, y = total_charges)
) +
    geom_jitter(alpha = 0.4, colour = "violet") +
    stat_smooth(formula = y ~ x, method = "lm")

## 4.5 Adding Third Variable to `aes()`

In [None]:
# Map a third variable, `prop_violent`, to point colour. The axis limits
# restrict the plot to 1-6 arrests and 1-15 charges.
ggplot(
    data = person_df,
    mapping = aes(
        x = num_uniq_arrests,
        y = total_charges,
        colour = prop_violent
    )
) +
    geom_jitter(alpha = 0.5) +
    xlim(1, 6) +
    ylim(1, 15)