Ekstrom2018/ekstrom.Rmd

# Oops.

This file is a slight mess, but it does contain all the code for my prediction. The end result was written up [here](http://sandsynligvis.dk/2018/06/14/hvem-vinder-vm-2018-og-hvem-er-bedst-til-at-pr%C3%A6diktere-det/). Apologies to the non-Danish readers.

In any case - here's the code to be posted before the first match is over.

```{r echo=FALSE}

library("dplyr")
library("magrittr")

## Expected total number of goals in a match (both teams combined).
## play_game() splits this total between the two teams.
normalgoals <- 2.75

## This data frame contains information about the teams, one row per team.
## You are free to add information here that you can use when determining winners.
## Columns:
##   number  team id (1-32); other code uses it to index the rows of this table
##   name    team name, matching the names used in group_match_data
##   group   group letter A-H, four consecutive teams per group
##   rating  fed to odds2probs() as decimal odds, so a lower value means a
##           stronger team (presumably bookmaker odds for winning the
##           tournament -- TODO confirm the source)
##   elo     Elo rating (source noted on the first line of values below)
team_data <- tibble(
  number = 1:32,
  name = c("Egypt","Russia","Saudi Arabia","Uruguay",
           "Iran","Morocco","Portugal","Spain",
           "Australia","Denmark","France","Peru",
           "Argentina","Croatia","Iceland","Nigeria",
           "Brazil","Costa Rica","Switzerland","Serbia",
           "Germany","South Korea","Mexico","Sweden",
           "Belgium","England","Panama","Tunisia",
           "Colombia","Japan","Poland","Senegal"),
  group = rep(LETTERS[1:8], each=4),
  rating = c(151, 41, 1001, 34,
             501, 501, 26, 7,
             301, 101, 7.5, 201,
             10, 34, 201, 201,
             5, 501, 101, 201,
             5.5, 751, 101, 151,
             12, 19, 1001, 751,
             41, 301, 51, 201),
  elo = c(1646, 1685, 1582, 1890, # From https://www.eloratings.net/, May 12th
          1793, 1711, 1975, 2048,
          1714, 1843, 1984, 1906,
          1985, 1853, 1787, 1699,
          2131, 1745, 1879, 1770,
          2092, 1746, 1859, 1796,
          1931, 1941, 1669, 1649,
          1935, 1693, 1831, 1747)
)

# Give the host nation a home-field advantage: raise Russia's Elo rating by 5%
# (rounded to a whole number).
team_data$elo[team_data$name=="Russia"] <- round( team_data$elo[team_data$name=="Russia"]*1.05, 0)

## Group-stage fixture list, one row per match, parsed from inline CSV text.
## Columns:
##   team1, team2    team names (must match team_data$name)
##   date            match date written as dd/mm/yyyy
##   goals1, goals2  goals scored by team1 / team2; left empty here, and
##                   find_group_winners() treats rows with missing goals as
##                   matches that have not been played yet
group_match_data <- read.csv(text=
"team1,team2,date,goals1,goals2
Russia,Saudi Arabia,14/06/2018,,
Egypt,Uruguay,15/06/2018,,
Morocco,Iran,15/06/2018,,
Portugal,Spain,15/06/2018,,
France,Australia,16/06/2018,,
Argentina,Iceland,16/06/2018,,
Peru,Denmark,16/06/2018,,
Croatia,Nigeria,16/06/2018,,
Costa Rica,Serbia,17/06/2018,,
Germany,Mexico,17/06/2018,,
Brazil,Switzerland,17/06/2018,,
Sweden,South Korea,18/06/2018,,
Belgium,Panama,18/06/2018,,
Tunisia,England,18/06/2018,,
Colombia,Japan,19/06/2018,,
Poland,Senegal,19/06/2018,,
Russia,Egypt,19/06/2018,,
Portugal,Morocco,20/06/2018,,
Uruguay,Saudi Arabia,20/06/2018,,
Iran,Spain,20/06/2018,,
Denmark,Australia,21/06/2018,,
France,Peru,21/06/2018,,
Argentina,Croatia,21/06/2018,,
Brazil,Costa Rica,22/06/2018,,
Nigeria,Iceland,22/06/2018,,
Serbia,Switzerland,22/06/2018,,
Belgium,Tunisia,23/06/2018,,
South Korea,Mexico,23/06/2018,,
Germany,Sweden,23/06/2018,,
England,Panama,24/06/2018,,
Japan,Senegal,24/06/2018,,
Poland,Colombia,24/06/2018,,
Saudi Arabia,Egypt,25/06/2018,,
Uruguay,Russia,25/06/2018,,
Iran,Portugal,25/06/2018,,
Spain,Morocco,25/06/2018,,
Australia,Peru,26/06/2018,,
Denmark,France,26/06/2018,,
Nigeria,Argentina,26/06/2018,,
Iceland,Croatia,26/06/2018,,
Mexico,Sweden,27/06/2018,,
South Korea,Germany,27/06/2018,,
Serbia,Brazil,27/06/2018,,
Switzerland,Costa Rica,27/06/2018,,
England,Belgium,28/06/2018,,
Senegal,Colombia,28/06/2018,,
Panama,Tunisia,28/06/2018,,
Japan,Poland,28/06/2018,,
",header=TRUE)
```


```{r echo=FALSE}
## Display copy of the team table with Danish column headers, printed as two
## side-by-side halves (teams 1-16 and 17-32) separated by a blank column.
## The first column (the team number) is dropped from both halves.
tt <- rename(team_data, odds = rating, ELO = elo, Gruppe = group, Navn = name)
knitr::kable(
  cbind(
    tt[1:16, -1],
    ` ` = rep("  ", 16),
    tt[17:32, -1]
  )
)
```


```{r echo=FALSE, fig.width=10, fig.height=8}
## Attach every package this chunk needs *before* it is used. ggplot() was
## previously called before any package providing it had been attached.
library("reshape2")
library("ggplot2")
library("RColorBrewer")
library("plotly")

## NOTE(review): elo2prob() is defined in this file, which lives at a
## machine-specific absolute path. The chunk cannot be knitted elsewhere until
## the helper is vendored or loaded from a package.
source("~/ku/praesentationer/erum-2018/socceR2018/socceR/R/elo.R")

## pw[i, j] is elo2prob() applied to the Elo ratings of team i and team j.
## The diagonal (a team against itself) is blanked out.
pw <- outer(team_data$elo, team_data$elo, elo2prob)
diag(pw) <- NA

## Long format: one row per (Var1, Var2) pair of team indices. The levels of v2
## are reversed so the y axis reads alphabetically from top to bottom.
griddata <- melt(pw) %>% 
  mutate(v1=factor(team_data$name[Var1], levels=sort(team_data$name)), 
         v2=factor(team_data$name[Var2], levels=rev(sort(team_data$name))))

## Heat map of 1 - value; the `text` aesthetic is only used as the plotly
## tooltip. coord_fixed() is added once (it used to be added twice, which makes
## ggplot2 replace the first coordinate system with a message).
p <- griddata %>% mutate(value=round(value, 2)) %>%
  ggplot(aes(v1, v2, fill = 1-value,
             text = paste('Sandsynlighed for at', v2,
                          '<br>slår', v1, ': ', (1-value)*100, '%')
             )) + geom_tile(colour="white",size=0.25) + 
  labs(x="", y="") + coord_fixed() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_distiller(palette = "RdBu", direction=-1) +
  scale_y_discrete(expand=c(0,0)) +    # remove extra space around the tiles
  theme(legend.position="bottom") +
  labs(fill='Chance\nfor at\nvinde kamp')

# p

ggplotly(p, tooltip="text") # %>% layer(paper_bgcolor='transparent')
```


```{r skellam, echo=FALSE}
## Probability mass function of the Skellam distribution, i.e. of the
## difference between two independent Poisson counts with means mu1 and mu2.
##
##   x    goal difference(s) at which to evaluate the mass function
##   mu1  mean of the first Poisson count
##   mu2  mean of the second Poisson count
##
## Returns the probability of each value in x.
dskellam <- function(x, mu1, mu2) {
  decay <- exp(-(mu1 + mu2))
  tilt <- (mu1 / mu2)^(x / 2)
  bessel <- besselI(2 * sqrt(mu1 * mu2), nu = x)
  decay * tilt * bessel
}


## Total expected number of goals in a match, and the grid of candidate goal
## rates for team 1 (team 2 gets the remainder, eta - beta).
eta <- 2.75
beta1 <- seq(0.05, eta-0.05, 0.05)

## Lookup table: for each candidate rate beta, the probability that team 1
## wins given that the match is not drawn. The numerator sums the Skellam mass
## over goal differences 1..12; the denominator sums it over -10..10 and
## removes the mass at 0 (the draw).
skellam <- data.frame(
  beta = beta1,
  prob = vapply(beta1, function(b) {
    win_mass <- sum(dskellam(1:12, b, eta-b))
    decisive_mass <- sum(dskellam(seq(-10,10,1), b, eta-b)) - dskellam(0, b, eta-b)
    win_mass / decisive_mass
  }, numeric(1))
)


## Translate win probabilities into a Poisson goal rate for team 1 using the
## global `skellam` lookup table.
##
##   prob  vector of probabilities that team 1 beats team 2
##
## For each probability the result is the smallest tabulated beta whose win
## probability exceeds it. Extreme inputs are clamped: below .009 the rate is
## .1, above .995 it is eta - .05.
FindParameter <- function(prob) {
  lookup_one <- function(p) {
    if (p < .009) {
      .1
    } else if (p > .995) {
      eta - .05
    } else {
      min(skellam$beta[skellam$prob > p])
    }
  }
  sapply(prob, lookup_one)
}

```


```{r echo=FALSE}
## This function fills out the missing matches in the order from top to bottom
## It returns a list of two data frames - one is the of the form

# Input: 
# Returns: 

## Convert decimal odds to probabilities.
##
##   odds     numeric vector of decimal odds; every value must be positive
##   rescale  if TRUE (default) the probabilities are normalised to sum to 1,
##            which removes the bookmaker's margin
##
## Returns a numeric vector the same length as `odds`.
## Stops with an error on missing, zero or negative odds. Zero odds used to
## slip through and produce Inf (and NaN after rescaling).
odds2probs <- function(odds, rescale=TRUE) {
    if (anyNA(odds))
        stop("Odds must not be missing")
    if (any(odds <= 0))
        stop("Odds must be positive")
    probs <- 1/(odds)  # Note: decimal odds here!
    if (rescale)
        probs <- probs/sum(probs)
    probs
}

## Simulate the score of one or more matches.
##
##   team_data       data frame with a `rating` column of decimal odds. It is
##                   indexed by team1/team2, so row i must describe team i.
##   team1, team2    equally long vectors of team indices, one pair per match
##   musthavewinner  if TRUE, every drawn match is decided by awarding one
##                   extra goal to a randomly chosen side
##   k               currently unused; kept so existing calls keep working
##
## Returns a matrix with one row per match and the columns Agoals and Bgoals.
##
## Model: the two ratings are turned into the probability that team 1 wins,
## FindParameter() maps that probability to team 1's Poisson goal rate, and
## team 2 gets the rest of the `normalgoals` total. Relies on the globals
## `normalgoals`, odds2probs() and FindParameter().
##
## Alternatives tried earlier (equal-strength Poisson goals, and an Elo based
## win/lose draw using 1/(1 + 10^((elo2 - elo1)/400))) were dead code and have
## been removed. They consumed random numbers without affecting the result.
play_game <- function(team_data, team1, team2, musthavewinner=FALSE, k=58) {
  # Sanity checks
  if (length(team1) != length(team2))
    stop("Lengths of team should be the same")
  
  if (any(team1==team2))
    stop("A team cannot play against itself")

  ## Skellam model: head-to-head win probability from the rescaled odds
  win_probs <- odds2probs(team_data$rating)
  p1 <- win_probs[team1]
  p2 <- win_probs[team2]
  
  prob <- p1 / (p1 + p2)
  lambdaA <- FindParameter(prob)
  Agoals <- rpois(length(prob), lambdaA)
  Bgoals <- rpois(length(prob), normalgoals-lambdaA)
  result <- cbind(Agoals, Bgoals)
  
  # If we MUST have a winner then one simple trick is to add a random goal 
  # to one of the two teams that have the same score. Penalty goals seem rather 
  # random anyway. (This used to compute the adjustment without storing it, so
  # draws were never actually broken.)
  if (musthavewinner) {
    tied <- which(result[, 1] == result[, 2])
    if (length(tied) > 0) {
      # Column 1 or 2, chosen by a fair coin flip for each drawn match
      lucky_col <- 2 - rbinom(length(tied), size=1, prob=.5)
      bonus <- cbind(tied, lucky_col)
      result[bonus] <- result[bonus] + 1L
    }
  }
  result
}


#
#
#  Uses the external team_data

## Play the group stage and rank the teams within each group.
##
##   team_data         team table with (at least) the columns number, name
##                     and group
##   group_match_data  fixture table with the columns team1, team2, goals1 and
##                     goals2; rows where a goal count is missing are treated
##                     as matches that still have to be played
##   FUN               function used to simulate matches, called as
##                     FUN(team_data, team1, team2, musthavewinner = FALSE)
##                     and expected to return a two-column matrix of goals
##
## Returns team_data sorted by group and rank, with the extra columns points,
## goalsFore, goalsAgainst, goalsDifference and groupRank. Teams are ranked on
## points, then goal difference, then goals scored.
find_group_winners <- function(team_data, group_match_data, FUN=play_game) {
  FUN <- match.fun(FUN)

  ## Create a copy of the matches that we can fill out
  group_match_results <- group_match_data
  home <- as.character(group_match_data$team1)
  away <- as.character(group_match_data$team2)

  ## Simulate only the matches that haven't already been played, so results
  ## that were entered by hand are kept
  pick <- !complete.cases(group_match_results[c("goals1", "goals2")])
  if (any(pick)) {
    simulated <- FUN(team_data,
                     team_data$number[match(home[pick], team_data$name)],
                     team_data$number[match(away[pick], team_data$name)],
                     musthavewinner = FALSE)
    group_match_results$goals1[pick] <- simulated[, 1]
    group_match_results$goals2[pick] <- simulated[, 2]
  }

  goals1 <- group_match_results$goals1
  goals2 <- group_match_results$goals2

  ## Points earned per team for each match: 3 for a win, 1 for a draw
  pointsForA <- 3*(goals1 > goals2) + 1*(goals1 == goals2)
  pointsForB <- 3*(goals1 < goals2) + 1*(goals1 == goals2)

  ## Sum a per-match quantity over all matches of one team, taking
  ## `when_team1` where the team is listed first and `when_team2` otherwise
  tally <- function(team, when_team1, when_team2) {
    sum(when_team1[home == team], when_team2[away == team])
  }
  team_names <- as.character(team_data$name)

  team_data$points <- vapply(team_names, tally, numeric(1),
                             when_team1 = pointsForA, when_team2 = pointsForB,
                             USE.NAMES = FALSE)
  team_data$goalsFore <- vapply(team_names, tally, numeric(1),
                                when_team1 = goals1, when_team2 = goals2,
                                USE.NAMES = FALSE)
  ## Goals conceded are the opponent's goals: goals2 when listed first and
  ## goals1 when listed second (this used to add goals2 in both cases)
  team_data$goalsAgainst <- vapply(team_names, tally, numeric(1),
                                   when_team1 = goals2, when_team2 = goals1,
                                   USE.NAMES = FALSE)

  team_data$goalsDifference <- team_data$goalsFore-team_data$goalsAgainst

  team_data %>% 
    group_by(group) %>% 
    arrange(desc(points), desc(goalsDifference), desc(goalsFore)) %>% 
    mutate(groupRank = row_number()) %>% 
    ungroup() %>%
    arrange(group, groupRank)
}


## Play a round of knockout matches and return the winners.
##
##   team_data   team table, passed on to FUN unchanged
##   match_data  two-column matrix of team numbers, one row per match
##   FUN         function used to simulate matches, called as
##               FUN(team_data, team1, team2, musthavewinner = TRUE) and
##               expected to return a two-column matrix of goals
##
## Returns the team numbers of the winners, one per row of match_data.
## If FUN still reports a draw, the winner is chosen by a fair coin flip
## rather than always handing the match to the second team.
find_knockout_winners <- function(team_data, match_data, FUN=play_game) {
  FUN <- match.fun(FUN)

  ## Get the results
  results <- FUN(team_data, match_data[, 1], match_data[, 2], musthavewinner=TRUE)

  ## Column (1 or 2) of match_data that holds the winner of each match
  winner_col <- ifelse(results[, 1] > results[, 2], 1, 2)
  drawn <- results[, 1] == results[, 2]
  if (any(drawn)) {
    winner_col[drawn] <- 2 - rbinom(sum(drawn), size=1, prob=.5)
  }

  ## Find the teams that won
  match_data[cbind(seq_len(nrow(results)), winner_col)]
}


## Simulate the whole tournament n times.
##
##   n              number of tournaments to simulate
##   FUN            function used to simulate matches; forwarded to
##                  find_group_winners() and find_knockout_winners(). The
##                  default used to name the non-existent `playgame`.
##   teams          team table (see team_data)
##   group_matches  group-stage fixture table (see group_match_data)
##
## Returns a 32 x n matrix of team numbers with one column per simulation,
## ordered from best to worst. Only rows 1-4 are individual placings: rows 5-8
## are the losing quarter-finalists, rows 9-16 the teams knocked out in the
## round of 16 and rows 17-32 the teams that did not leave their group.
##
## `teams` and `group_matches` are now used throughout; the knockout rounds and
## the fixture list used to read the global objects instead.
simulate_tournament <- function(n=10, FUN=play_game,
                                teams=team_data, 
                                group_matches=group_match_data) {
  FUN <- match.fun(FUN)

  vapply(seq_len(n), function(matchnumber) {

    ## Step 1: Find the results from the group matches
    group_results <- find_group_winners(team_data=teams,
                                        group_match_data=group_matches,
                                        FUN=FUN)

    ## Step 2: Design matches for the first part of the knockout stage.
    ## group_results is sorted by group and rank, so rows 1, 5, ..., 29 are the
    ## group winners. Each one meets the runner-up of the neighbouring group
    ## (A-B, C-D, E-F, G-H).
    eigth_matches <- cbind(group_results$number[seq(1, 32, by=4)],
                           group_results$number[c(6, 2, 14, 10, 22, 18, 30, 26)])
    ## and find the results
    eigth_winners <- find_knockout_winners(teams, eigth_matches, FUN=FUN)

    ## Step 3: Design matches for the quarter finals and run them
    quarter_matches <- cbind(eigth_winners[c(1, 2, 5, 6)], eigth_winners[c(3, 4, 7, 8)])
    quarter_winners <- find_knockout_winners(teams, quarter_matches, FUN=FUN)

    ## Step 4: Semi finals
    semi_matches <- cbind(quarter_winners[c(1,2)], quarter_winners[c(3,4)])
    semi_winners <- find_knockout_winners(teams, semi_matches, FUN=FUN)

    ## Steps 5 and 6: Find number 1-4. The bronze match is played between the
    ## two losing semi-finalists.
    bronze_match <- matrix(quarter_winners[!quarter_winners %in% semi_winners], ncol=2)
    bronze_winner <- find_knockout_winners(teams, bronze_match, FUN=FUN)

    final_match <- matrix(semi_winners, ncol=2)
    final_result <- find_knockout_winners(teams, final_match, FUN=FUN)

    ## Return a vector with the teams in ranked order. 
    ## Note only the first 4 are individuals - the rest are really groups
    c(final_result, # Number 1
      final_match[!(final_match %in% final_result)], #2
      bronze_winner, # Number 3
      bronze_match[!(bronze_match %in% bronze_winner)], #4
      quarter_matches[!(quarter_matches %in% quarter_winners)], # 5-8
      eigth_matches[!(eigth_matches %in% eigth_winners)], # 9-16
      seq(32)[!(seq(32) %in% eigth_matches)] # 17-32
    )
  }, numeric(32))
}
# The final code produces a vector of probabilities
```


```{r echo=FALSE, cache=TRUE}
## Run the full tournament simulation with a fixed seed. `result` has one
## column per simulated tournament and one row per final placing.
set.seed(140616)
result <- simulate_tournament(100000)

## Finalists: label every simulation with the alphabetically sorted names of
## the two teams in rows 1 and 2, then express each pairing as a percentage of
## all simulations, most frequent first.
final_pairs <- apply(result[1:2, ], 2, function(pair) {
  paste(sort(team_data$name[pair]), collapse=", ")
})
## deparse.level = 0 keeps the table dimension unnamed
finalteams <- sort(table(final_pairs, deparse.level = 0), decreasing = TRUE)
finalteams <- finalteams/ncol(result)*100

## How often each team finished first, relabelled from team number to name
winner <- table(result[1,])
names(winner) <- team_data$name[match(names(winner), team_data$number)]

## Share of tournament wins per team, in percent
DF2 <- as.data.frame(winner/sum(winner)*100)
library("ggplot2")
library("cowplot")

## Bar chart of the win percentages
p2 <- ggplot(DF2, aes(x=reorder(Var1, -Freq), y=Freq)) +
  geom_bar(stat="identity") +
  xlab("Country") +
  ylab("Sandsynligheden for at vinde") +
  coord_flip()

# p2

## The same numbers drawn as a lollipop chart (a point on top of a segment)
p3 <- ggplot(DF2, aes(x=reorder(Var1, -Freq), y=Freq)) +
  geom_point(size=3) +
  geom_segment(aes(x=reorder(Var1, -Freq), xend=reorder(Var1, -Freq), y=0, yend=Freq)) +
  xlab("Land") +
  ylab("Sandsynligheden for at vinde VM") +
  coord_flip()

# ggplotly(p3)

# Ranks are rows, columns are countries
final_matrix <- sapply(seq_len(32), function(team) {
  rowMeans(result == team)
})
```

```{r echo=FALSE}
# Render the lollipop chart of each team's share of simulated tournament wins
p3
```


```{r echo=FALSE}
## Table of the eight most frequent final pairings (finalteams is already
## sorted), with Danish column headers: Lande = countries, Hyppighed =
## frequency in percent.
res <- dplyr::rename(as.data.frame(finalteams), Lande = Var1, Hyppighed = Freq)
knitr::kable(res[1:8, ])
```


```{r echo=FALSE, cache=TRUE}
## Simulate the group stage n times and record, for every simulation, the team
## numbers of group C in ranked order (row 1 = group winner).
n <- 100000
set.seed(140618)
gres <- sapply(seq_len(n), function(sim) {
  ## Step 1: Find the results from the group matches
  standings <- find_group_winners(team_data = team_data, group_match_data)

  # Return for group C only
  standings$number[standings$group == "C"]
})

## Cross-tabulate team (Land) against finishing position (Placering) and
## express the counts as percentages of the n simulations, to two decimals
placements <- dplyr::rename(melt(gres), Land = value, Placering = Var1)
grouptab <- round(xtabs(~ Land + Placering, data = placements) / n * 100, 2)
# print(rownames(grouptab))

## Show team names instead of team numbers
rownames(grouptab) <- team_data$name[as.numeric(rownames(grouptab))]
knitr::kable(grouptab)


```