In [None]:
# Switch the working directory so that the relative paths used later in this
# notebook (e.g. 'include/database.r') resolve against this directory.
# NOTE(review): the absolute path is machine-specific.
setwd('/home/notebook/data-analysis')

In [None]:
# Drop INC_DATABASE_R if it is already defined before sourcing the include.
# NOTE(review): presumably this is an include guard set by include/database.r,
# so removing it forces the file to be evaluated again -- confirm.
if (exists('INC_DATABASE_R')) {
    rm(INC_DATABASE_R)
}
# NOTE(review): connect(), used further down, is not defined in this notebook;
# presumably it is provided by this include.
source('include/database.r')
# Plotting functions (ggplot, geom_bar, labs, theme) used by the chart cells.
library(ggplot2)
# NOTE(review): arrange(), used further down, is presumably taken from plyr.
library(plyr)

# Return the most frequent non-missing value of `data` (the statistical mode).
#
# NOTE: this definition shadows base::mode() for code that is evaluated in the
# global environment of this notebook.
#
# Arguments:
#   data:  an atomic vector. NA entries never take part in the count.
#   na.rm: accepted for call compatibility only. It is not consulted: missing
#          values are always dropped, whatever is passed here.
#
# Returns the value with the highest frequency. On ties the value that occurs
# first in `data` wins. If `data` has no non-missing values the result is NA.
mode <- function(data, na.rm=FALSE) {
    unique_data <- unique(data[!is.na(data)])
    # match() maps every element to its position in unique_data (NA for missing
    # values); tabulate() counts those positions and skips the NAs.
    counts <- tabulate(match(data, unique_data))
    unique_data[which.max(counts)]
}

In [None]:
# Open a database connection and count, per project, how many issues had their
# story points changed a given number of times.
#
# The inner query pairs every issue version with its direct predecessor
# (changelog_id + 1) and counts, per issue, the versions whose story points
# differ from the predecessor. Versions where either side is 99, 100, 122 or
# 999 are left out.
# NOTE(review): those values are presumably placeholder estimates -- confirm.
# NOTE(review): the WHERE clause compares against old_issue.story_points, so
# versions without a predecessor are filtered out and the LEFT JOIN effectively
# behaves as an inner join; num_changes therefore never comes out as 0.
# The outer query excludes projects flagged as support teams and counts the
# issues per project and per num_changes.
conn <- connect()
query <- 'SELECT COALESCE(project.quality_display_name, project.name) AS project, num_changes, COUNT(*) AS count FROM (
    SELECT issue.project_id, issue.issue_id, SUM(CASE WHEN old_issue.issue_id IS NULL THEN 0 ELSE 1 END) AS num_changes
    FROM gros.issue
    LEFT JOIN gros.issue AS old_issue
    ON issue.project_id = old_issue.project_id
    AND issue.issue_id = old_issue.issue_id
    AND issue.changelog_id = old_issue.changelog_id + 1
    WHERE issue.story_points <> old_issue.story_points
    AND issue.story_points NOT IN (99, 100, 122, 999)
    AND old_issue.story_points NOT IN (99, 100, 122, 999)
    GROUP BY issue.project_id, issue.issue_id
) AS changes
JOIN gros.project ON changes.project_id = project.project_id
WHERE COALESCE(project.is_support_team, FALSE) = FALSE
GROUP BY project.quality_display_name, project.name, num_changes ORDER BY project.quality_display_name, project.name, num_changes'
# Run the query; the result has the columns project, num_changes and count.
data <- dbGetQuery(conn, query)
# Display the result as the cell output.
data

In [None]:
# Total number of issues for each number of story point changes, summed over
# all projects. The result is a list named after the num_changes values.
lapply(split(data$count, data$num_changes), sum)

In [None]:
# Draw one bar chart per project: the number of issues (y) against the number
# of times their story points were changed (x).
lapply(split(data, data$project), function(rows) {
    chart_title <- paste("number of user stories in", rows[1, "project"])
    bars <- geom_bar(aes(x=num_changes, y=count), stat="identity")
    ggplot(rows) + bars + labs(title=chart_title) +
        theme(plot.title = element_text(hjust = 0.5))
})

In [None]:
# Fetch every individual story point change: one row per issue version whose
# story points differ from the directly preceding version (changelog_id + 1),
# with `change` holding the difference new - old. Versions where either side is
# 99 or higher are left out, as are projects flagged as support teams.
# NOTE(review): the WHERE clause compares against old_issue.story_points, so
# versions without a predecessor are dropped even though the join is a LEFT JOIN.
query <- 'SELECT COALESCE(project.quality_display_name, project.name) AS project, issue.issue_id, issue.story_points - old_issue.story_points AS change
    FROM gros.issue
    LEFT JOIN gros.issue AS old_issue
    ON issue.project_id = old_issue.project_id
    AND issue.issue_id = old_issue.issue_id
    AND issue.changelog_id = old_issue.changelog_id + 1
JOIN gros.project ON issue.project_id = project.project_id
WHERE issue.story_points <> old_issue.story_points
AND issue.story_points < 99
AND old_issue.story_points < 99
AND COALESCE(project.is_support_team, FALSE) = FALSE
    --GROUP BY project.quality_display_name, project.name, issue.issue_id
ORDER BY issue.issue_id, issue.changelog_id'
changes <- dbGetQuery(conn, query)
# Number of changes per issue, i.e. the number of result rows that share an
# issue_id. ave() writes each group size back at the positions of the rows of
# that group, so the column stays aligned with the rows no matter in which
# order the database returned them (the previous split()/unlist() construction
# was only correct while the SQL sort order matched R's ordering of issue_id).
# NOTE(review): issues are grouped by issue_id alone, as before; confirm that
# issue_id is unique across projects.
changes$num_changes <- ave(rep(1L, nrow(changes)), changes$issue_id, FUN=length)
# Sort for display: by project first, then by issue.
changes <- arrange(changes, project, issue_id)
changes

In [None]:
# For every number-of-changes bin, draw a bar chart with the most common story
# point change (the mode) of each project.
lapply(split(changes, changes$num_changes), function(bin) {
    # Debugging aid kept from the original cell:
    #return(c(min(bin$change), max(bin$change)))

    # An aes() expression is evaluated once over the whole layer data, so
    # calling mode(change) inside aes() produced a single bin-wide value that
    # was repeated for every row and then stacked per project. Compute the mode
    # per project up front and plot exactly one bar per project instead.
    change_by_project <- split(bin$change, bin$project, drop=TRUE)
    project_modes <- data.frame(
        project=names(change_by_project),
        change=vapply(change_by_project, mode, numeric(1), USE.NAMES=FALSE)
    )
    ggplot(project_modes) +
        geom_bar(aes(x=project, y=change), stat="identity") +
        # Keep the y axis label the original mapping produced.
        labs(title=paste("story points with", bin[1, "num_changes"], "change(s)"),
             y="mode(change)") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1), plot.title=element_text(hjust=0.5))
})

In [None]:
# Same change listing as the previous query, but versions without a predecessor
# are kept: a missing old story point value is treated as -1 in the filters, so
# such rows pass the WHERE clause. Their `change` (new - old) is NULL in SQL.
# Versions with story points of 99 or higher and support team projects are
# still excluded.
query <- 'SELECT COALESCE(project.quality_display_name, project.name) AS project, issue.issue_id, issue.story_points - old_issue.story_points AS change
    FROM gros.issue
    LEFT JOIN gros.issue AS old_issue
    ON issue.project_id = old_issue.project_id
    AND issue.issue_id = old_issue.issue_id
    AND issue.changelog_id = old_issue.changelog_id + 1
JOIN gros.project ON issue.project_id = project.project_id
WHERE issue.story_points <> COALESCE(old_issue.story_points, -1)
AND issue.story_points < 99
AND COALESCE(old_issue.story_points, -1) < 99
AND COALESCE(project.is_support_team, FALSE) = FALSE
    --GROUP BY project.quality_display_name, project.name, issue.issue_id
ORDER BY issue.issue_id, issue.changelog_id'
changes <- dbGetQuery(conn, query)
# Number of real changes per issue: rows with a missing `change` are not
# counted. ave() writes each per-issue sum back at the positions of the rows of
# that issue, so the column stays aligned with the rows no matter in which
# order the database returned them (the previous split()/unlist() construction
# was only correct while the SQL sort order matched R's ordering of issue_id).
# NOTE(review): issues are grouped by issue_id alone, as before; confirm that
# issue_id is unique across projects.
changes$num_changes <- ave(as.integer(!is.na(changes$change)),
                           changes$issue_id, FUN=sum)
# Sort by project first, then by issue.
changes <- arrange(changes, project, issue_id)

Van elke stap `#0 -> #1 -> #2 -> ...` en `#0 -> laatste`
- Gemiddelde verandering
- Modus
- Mediaan
- Uitsplitsen positief/negatief
- Aantallen
- Spreiding?

Kunnen we op basis van eerdere backlogs een correctiefactor vinden voor wat er mogelijk gaat veranderen?

Dezelfde analyse voor vervallen en extra stories

Vervolgstap: Hoe verhoudt de correctiefactor zich over de tijd?
Cross-validatie (of echt random eruit gooien)

- Opschonen storypoints

Met correctiefactor-bepaling: Per sprint bepalen
Verschil tussen projecten en over tijd

Correctiefactor totaal = CF_nieuw * CF_vervallen * CF_sp