# Visualize cumulative contribution by user over time for Project Rephetio

In [1]:
library(magrittr)

## Read datasets

In [2]:
# Load the three exported tables. Profiles are trimmed to the identifying
# columns; comments and notes are each tagged with their content type so the
# two can still be told apart once they are stacked together.
profile_df <- dplyr::select(
  readr::read_tsv('../process/table/profiles.tsv'),
  project, profile_id, username, first_name, last_name
)
comment_df <- dplyr::mutate(
  readr::read_tsv('../process/table/comments.tsv'),
  content_type = 'comment'
)
note_df <- dplyr::mutate(
  readr::read_tsv('../process/table/notes.tsv'),
  content_type = 'note'
)

Parsed with column specification:
cols(
  first_name = col_character(),
  last_name = col_character(),
  profile_id = col_integer(),
  project = col_character(),
  url = col_character(),
  username = col_character()
)
Parsed with column specification:
cols(
  comment_id = col_integer(),
  profile_id = col_integer(),
  project = col_character(),
  published = col_datetime(format = ""),
  thread_id = col_integer(),
  url = col_character(),
  word_count = col_integer(),
  character_count = col_integer()
)
Parsed with column specification:
cols(
  added = col_datetime(format = ""),
  comment_id = col_integer(),
  note_id = col_integer(),
  profile_id = col_integer(),
  project = col_character(),
  url = col_character(),
  word_count = col_integer(),
  character_count = col_integer()
)


## Combine comments and notes into a single dataframe

In [3]:
# Stack comments and notes, attach author details, and keep only Rephetio
# content in chronological order.
content_df <- dplyr::bind_rows(comment_df, note_df) %>%
  # Join keys are spelled out so the match cannot silently change if either
  # table gains or loses a shared column.
  dplyr::inner_join(profile_df, by = c('profile_id', 'project')) %>%
  # Comments carry `published` and notes carry `added`; use whichever is set.
  dplyr::mutate(date = dplyr::coalesce(published, added)) %>%
  dplyr::select(-comment_id, -note_id, -published, -added) %>%
  dplyr::filter(project == 'rephetio') %>%
  dplyr::arrange(date)

Joining, by = c("profile_id", "project")


## Create a dataframe where each row is the cumulative contribution of a user at a given datetime

In [4]:
# Running per-user totals: one row per contribution holding the author's
# cumulative word and character counts up to and including that contribution.
# content_df is already sorted by date, so cumsum() accumulates in time order.
cumulative_df <- content_df %>%
  dplyr::group_by(username) %>%
  dplyr::mutate(
    cum_words = cumsum(word_count),
    cum_chars = cumsum(character_count)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(username, date, cum_words, cum_chars)

min_date <- min(cumulative_df$date)
max_date <- max(cumulative_df$date)

# Hourly grid reaching six months beyond both ends of the observed range.
# The username here is only a placeholder: complete() below expands the grid
# to every username present in the data.
six_months <- lubridate::duration(6, 'months')
even_date_df <- dplyr::data_frame(
  username = 'dhimmel',
  date = seq(min_date - six_months, max_date + six_months, by = 'hours')
)

cumulative_df <- dplyr::bind_rows(
  # Every user starts from zero at the first grid point ...
  dplyr::data_frame(
    username = unique(cumulative_df$username),
    date = min_date - six_months,
    cum_words = 0,
    cum_chars = 0
  ),
  cumulative_df,
  # ... and holds their final totals at the last grid point.
  cumulative_df %>%
    dplyr::group_by(username) %>%
    dplyr::summarize(
      date = max_date + six_months,
      cum_words = max(cum_words),
      cum_chars = max(cum_chars)
    )
) %>%
  # Join keys are explicit so the merge does not depend on whichever columns
  # the two tables happen to share.
  dplyr::full_join(even_date_df, by = c('username', 'date')) %>%
  tidyr::complete(username, date) %>%
  # fill() is not grouped: it carries the previous row's totals downward.
  # NOTE(review): this relies on complete() returning rows ordered by username
  # then date, so each user's block begins with its zero row -- confirm.
  tidyr::fill(username, cum_words, cum_chars) %>%
  # Keep only the hourly grid points; the raw contribution timestamps go.
  dplyr::filter(date %in% even_date_df$date) %>%
  dplyr::mutate(cum_chars_trans = cum_chars ^ 0.5)

cumulative_df %>% tail(2)

Joining, by = c("username", "date")


username,date,cum_words,cum_chars,cum_chars_trans
vsmalladi,2017-08-09 04:55:24,320,2008,44.81071
vsmalladi,2017-08-09 05:55:24,320,2008,44.81071


## Create a dataframe of users

In [5]:
# Palette function that interpolates the 12-colour 'Paired' scheme to any
# requested number of colours.
get_colors <- colorRampPalette(RColorBrewer::brewer.pal(12, 'Paired'))

# One row per user with lifetime totals, ordered by first contribution.
# y_position is the midpoint of each user's band when the square-root totals
# are stacked in that order; color assigns one palette entry per row.
usage_df <- content_df %>%
  dplyr::group_by(username, first_name, last_name) %>%
  dplyr::summarize(
    first_date = min(date),
    total_chars = sum(character_count),
    total_words = sum(word_count),
    total_chars_trans = total_chars ^ 0.5
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(first_date) %>%
  dplyr::mutate(
    y_position = cumsum(total_chars_trans) - total_chars_trans / 2,
    color = get_colors(length(username))
  )

# Freeze the first-contribution ordering into the factor levels.
usage_df[['username']] <- factor(
  usage_df[['username']],
  levels = usage_df[['username']]
)

head(usage_df, 3)

username,first_name,last_name,first_date,total_chars,total_words,total_chars_trans,y_position,color
dhimmel,Daniel,Himmelstein,2015-01-14 05:55:24,561514,87822,749.34238,374.6712,#A6CEE3
jspauld,Jesse,Spaulding,2015-01-16 10:18:57,2485,465,49.84977,774.2673,#86B9D8
caseygreene,Casey,Greene,2015-01-22 20:43:07,6888,1211,82.99398,840.6891,#66A5CD


## Create a smoothed dataframe of cumulative contribution

In [6]:
# Observed date range expressed as fractional years, the numeric scale the
# kernel smoother below operates on.
min_date_decimal <- lubridate::decimal_date(min(content_df$date))
max_date_decimal <- lubridate::decimal_date(max(content_df$date))
# Number of hourly grid rows per fractional year of grid span.
constant <- nrow(even_date_df) /
  diff(lubridate::decimal_date(range(even_date_df$date)))

# Kernel-smooth one user's cumulative character count over time.
#
# Arguments:
#   df     Data frame with `date` and `cum_chars` columns (called once per
#          username group below).
#   bw     Smoothing bandwidth in fractional years, passed to density().
#   from   Start of the evaluation range in fractional years.
#   to     End of the evaluation range in fractional years.
#   scale  Divisor applied to the density() output; the default is the number
#          of grid rows per year, which presumably brings the weighted
#          density back to the scale of cum_chars -- TODO confirm.
#
# The defaults reproduce the previous hard-coded behaviour and are looked up
# when the function is called, so the globals must exist by then.
#
# Returns a data frame with columns `date` and the smoothed `cum_chars`.
densify <- function(df, bw = 0.03, from = min_date_decimal,
                    to = max_date_decimal, scale = constant) {
  # cum_chars is supplied as the kernel weights. suppressWarnings() presumably
  # hides density()'s complaint that the weights do not sum to one.
  den <- suppressWarnings(density(
    lubridate::decimal_date(df$date),
    weights = df$cum_chars, bw = bw,
    from = from, to = to
  ))
  dplyr::data_frame(
    date = lubridate::date_decimal(den$x),
    cum_chars = den$y / scale
  )
}

# Smooth each user's curve separately, then add the square-root scale that
# the plot stacks.
smooth_df <- dplyr::group_by(cumulative_df, username) %>%
  dplyr::do(densify(.)) %>%
  dplyr::mutate(cum_chars_trans = cum_chars ^ 0.5)

# Reuse the first-contribution ordering stored in usage_df's factor levels.
smooth_df$username <- factor(smooth_df$username, levels = usage_df$username)

# Sort by that user ordering, then chronologically within each user.
smooth_df <- dplyr::arrange(smooth_df, username, date)

## Plot contribution over time

In [7]:
# Label rows for users with at least 4500 total characters. `date` is set to
# the last observed contribution time; the two text columns hold the display
# name and the character total in thousands.
label_df <- usage_df %>%
  dplyr::filter(total_chars >= 4500) %>%
  dplyr::mutate(
    date = max_date,
    name_label = sprintf('- %s %s', first_name, last_name),
    nchar_label = sprintf('%.1fK ', total_chars / 1000)
  )

In [8]:
# Gridline positions counted from 1 January 2015 up to the last contribution:
# every three months for major breaks, every month for minor breaks.
major_breaks <- seq(
  from = lubridate::make_datetime(year = 2015, month = 1, day = 1),
  to = max_date,
  by = "3 months"
)
minor_breaks <- seq(
  from = lubridate::make_datetime(year = 2015, month = 1, day = 1),
  to = max_date,
  by = "1 months"
)
# Near-black used for axis text, ticks and the numeric labels.
light_black <- '#282828'

# Stacked area chart of each user's smoothed, square-root-transformed
# cumulative character count over time, with name and size labels drawn at
# the right-hand edge for the users in label_df.
gg = smooth_df %>%
  ggplot2::ggplot(ggplot2::aes(x = date)) +
  # One filled band per user. Stacking order is reversed; presumably so the
  # bands line up with y_position, which accumulates in first-contribution
  # order -- TODO confirm.
  ggplot2::geom_area(ggplot2::aes(y = cum_chars_trans, fill = username), 
            alpha = 0.9, size = 0.27, colour = "grey95",
            position=ggplot2::position_stack(reverse = TRUE)) +
  # User names, left-aligned at label_df's date and coloured per user.
  ggplot2::geom_text(data = label_df,
    mapping = ggplot2::aes(y = y_position, label=name_label, color=username), size = 4, hjust = 0) +
  # Character totals, right-aligned so they end at the same x position.
  ggplot2::geom_text(data = label_df,
    mapping = ggplot2::aes(y = y_position, label=nchar_label), size = 4, hjust = 1, color=light_black) +
  # NOTE(review): theme_minimal() is added after theme_dhimmel(); if both are
  # complete themes the later one replaces the earlier -- confirm that
  # theme_dhimmel() still has an effect here.
  hetior::theme_dhimmel() +
  ggplot2::theme_minimal() +
  # No legends: users are identified by the text labels instead.
  ggplot2::guides(colour = "none", fill = "none") +
  # The x range starts one day before the first contribution and runs past
  # the last one by (max_date - min_date) / 5.25, presumably to leave room for
  # the name labels.
  ggplot2::scale_x_datetime(date_labels = '%b %Y', breaks = major_breaks, minor_breaks = minor_breaks,
    limits = c(min_date - lubridate::period(1, 'day'), max_date + (max_date - min_date) / 5.25)) +
  # No y breaks, labels or padding.
  ggplot2::scale_y_continuous(breaks = NULL, minor_breaks = NULL, labels = NULL, expand = c(0, 0)) +
  # Both colour vectors are unnamed.
  # NOTE(review): this assumes colours are assigned by position, so it depends
  # on the row order of usage_df / label_df agreeing with the username factor
  # levels -- confirm.
  ggplot2::scale_fill_manual(values = usage_df$color) + 
  ggplot2::scale_color_manual(values = label_df$color) + 
  ggplot2::theme(
    axis.title.y = ggplot2::element_text(color = light_black, margin=ggplot2::margin(0,-9,0,0)),
    axis.ticks.x = ggplot2::element_line(size = 0.4, color = light_black),
    axis.text.x = ggplot2::element_text(color = light_black),
    axis.ticks.length = grid::unit(0.1, "cm"),
    panel.grid.major.x = ggplot2::element_line(colour = "grey70"),
    panel.grid.minor.x = ggplot2::element_line(colour = "grey98")) +
  # The y title is a plotmath expression built from sqrt(Total~Characters).
  ggplot2::labs(x = NULL, y = expression(sqrt(Total~Characters)))

# Export the figure at 8 x 4.3 as both a 300-dpi PNG and a PDF
ggplot2::ggsave(
  filename = 'rephetio-contribution.png',
  plot = gg,
  width = 8,
  height = 4.3,
  dpi = 300
)
ggplot2::ggsave(
  filename = 'rephetio-contribution.pdf',
  plot = gg,
  width = 8,
  height = 4.3
)

“`legend.margin` must be specified using `margin()`. For the old behavior use legend.spacing”

## Summary statistics

In [9]:
# Number of discussions: distinct thread ids, ignoring rows without one
content_df$thread_id %>% dplyr::n_distinct(na.rm = TRUE)

In [10]:
# Row counts per content type (comments versus notes)
table(content_df[['content_type']])


comment    note 
    607     190 

In [11]:
# Number of users: usage_df holds one row per
# username / first_name / last_name combination
nrow(usage_df)

In [12]:
# Count contributors whose username is not on the team roster below
team_members <- c(
  'dhimmel', 'pouyakhankhanian', 'alizee',
  'leobrueggeman', 'sabrinachen', 'idrdex',
  'chrissyhessler', 'arigreen', 'sergiobaranzini'
)

length(setdiff(x = usage_df$username, y = team_members))

In [13]:
# Total characters across all users
sum(usage_df[['total_chars']])

In [14]:
# Total words across all users (reused by the manuscript estimate)
total_words <- sum(usage_df[['total_words']])
total_words

In [15]:
# Content volume expressed in manuscripts, at 7000 words per manuscript
# http://academia.stackexchange.com/q/35133
round(total_words / 7000, digits = 2)

In [16]:
# Timestamps of the first and last Rephetio contribution
min_date
max_date
# Elapsed time between the two
max_date - min_date

[1] "2015-01-14 05:55:24 UTC"

[1] "2017-02-07 18:31:08 UTC"

Time difference of 755.5248 days

## Package information

In [17]:
# Record the R version, platform and package versions used for this run
sessionInfo()

R version 3.3.1 (2016-06-21)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.10

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] magrittr_1.5

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.8        munsell_0.4.3      uuid_0.1-2         colorspace_1.3-1  
 [5] R6_2.2.0           plyr_1.8.4         stringr_1.1.0      dplyr_0.5.0       
 [9] tools_3.3.1        grid_3.3.1         gtable_0.2.0       DBI_0.5-1         
[13] lazyeval_0.2.0     assertthat_0.1     digest_0.6.11      tibble_1.2        
[17] crayon_1.3.2       IRdispl