Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Code and images for Super Bowl pool analysis
- Loading branch information
1 parent
227754f
commit 9fec74c
Showing
7 changed files
with
190 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# File-Name: boxes_viz.R | ||
# Date: 2011-02-08 | ||
# Author: Drew Conway | ||
# Email: drew.conway@nyu.edu | ||
# Purpose: Visualize the probability of a win in the Super Bowl | ||
# boxes game given historical data | ||
# Data Used: see super_bowl.R | ||
# Packages Used: ggplot2, and see super_bowl.R for additional | ||
# Output File: | ||
# Data Output: | ||
# Machine: Drew Conway's MacBook Pro | ||
|
||
# Copyright (c) 2011, under the Simplified BSD License. | ||
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php | ||
# All rights reserved. | ||
|
||
# Creates data frames. Only run if you have not already create data | ||
# source(super_bowl.R) | ||
|
||
# Load libraries | ||
library(ggplot2) | ||
|
||
# Create a data frame for boxes | ||
positions<-do.call(rbind, lapply(0:9, function(x) cbind(0:9,x))) | ||
boxes<-data.frame(x=positions[,1],y=positions[,2]) | ||
|
||
# Create a heatmap of probability of winning given different digit combinations by quarter | ||
boxes.heatmap<-ggplot(boxes[1:10,], aes(xmin=x,xmax=x+1,ymin=y,ymax=x+1))+geom_rect(color="white") |
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
# File-Name: super_bowl.R | ||
# Date: 2011-02-07 | ||
# Author: Drew Conway | ||
# Email: drew.conway@nyu.edu | ||
# Purpose: Gather and manipulate historic Super Bowl score data | ||
# Data Used: Wikipedia Super Bowl data | ||
# Packages Used: ggplot2, RCurl, XML | ||
# Output File: | ||
# Data Output: | ||
# Machine: Drew Conway's MacBook Pro | ||
|
||
# Copyright (c) 2011, under the Simplified BSD License. | ||
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php | ||
# All rights reserved. | ||
|
||
# Load libraries | ||
library(ggplot2) | ||
library(RCurl) | ||
library(XML) | ||
|
||
# A function that converts a given integer into its Roman Numeral equivalent | ||
to.RomanNumeral<-function(x) { | ||
if(0 < x & x < 5000) { | ||
x<-as.integer(x) | ||
digits<-c(1000,900,500,400,100,90,50,40,10,9,5,4,1) | ||
numerals<-c("M","CM","D","CD","C","XC","L","XL","X","IX","V","IV","I") | ||
digits.numerals<-as.data.frame(cbind(digits,numerals), stringsAsFactors=FALSE) | ||
numeral<-"" | ||
for(i in 1:nrow(digits.numerals)) { | ||
while(x >= as.numeric(digits.numerals[i,1])) { | ||
numeral<-paste(numeral,digits.numerals[i,2],sep="") | ||
x<-x-as.numeric(digits.numerals[i,1]) | ||
} | ||
} | ||
return(numeral) | ||
} | ||
else { | ||
stop(paste(x,"is invalid. Input must be an integer between 1 and 4,999")) | ||
} | ||
} | ||
|
||
# Function returns quater scores from Wikipedia Super Bown pages | ||
get.scores<-function(numeral) { | ||
# Base URL for Wikipedia | ||
wp.url<-getURL(paste("http://en.wikipedia.org/wiki/Super_Bowl_",numeral,sep="")) | ||
wp.data<-htmlTreeParse(wp.url, useInternalNodes=TRUE) | ||
score.html<-getNodeSet(wp.data,"//table[@style='background-color:transparent;']") | ||
score.table<-readHTMLTable(score.html[[1]]) | ||
score.table<-transform(score.table, SB=numeral) | ||
return(score.table) | ||
} | ||
|
||
# Function returns the right-most digit of numer | ||
last.digit<-function(x) { | ||
digits<-strsplit(as.character(x),"")[[1]] | ||
last.digit<-digits[length(digits)] | ||
return(as.numeric(last.digit)) | ||
} | ||
|
||
# Function takes an nxm matrix of scores and returns a nxn matrix of | ||
# probability of winning given a combination of digits | ||
get.probs<-function(score.df) { | ||
digits.sums<-sum(score.df[,2:11])*10 | ||
prob.matrix<-matrix(ncol=10,nrow=10) | ||
for(i in 1:10) { | ||
for(j in 1:10) { | ||
prob.matrix[i,j]<-(score.df[1,i+1]+score.df[2,j+1])/digits.sums | ||
} | ||
} | ||
prob.df<-as.data.frame(prob.matrix, row.names=0:9) | ||
names(prob.df)<-0:9 | ||
return(prob.df) | ||
} | ||
|
||
# There have been 45 Super Bowls | ||
bowls<-lapply(1:45, to.RomanNumeral) | ||
|
||
# Create data frame of all Super Bowl scores | ||
scores.list<-lapply(bowls, get.scores) | ||
scores.df<-data.frame(do.call(rbind, scores.list)) | ||
|
||
# Fix the data types | ||
scores.df$X1<-as.numeric(as.character(scores.df$X1)) | ||
scores.df$X2<-as.numeric(as.character(scores.df$X2)) | ||
scores.df$X3<-as.numeric(as.character(scores.df$X3)) | ||
scores.df$X4<-as.numeric(as.character(scores.df$X4)) | ||
scores.df$Total<-as.numeric(as.character(scores.df$Total)) | ||
|
||
# Add quarter scoring data to data frame | ||
quarters.list<-lapply(1:nrow(scores.df), function(i) c(scores.df[i,2],sum(scores.df[i,2:3]),sum(scores.df[i,2:4]))) | ||
quarters.df<-as.data.frame(do.call(rbind, quarters.list)) | ||
|
||
# Final data set | ||
super.df<-cbind(scores.df, quarters.df,rep(as.factor(c("Home","Away")),45)) | ||
names(super.df)<-c("Team","Q1","Q2","Q3","Q4","Total","SB","Q1T","Q2T","Q3T","Type") | ||
|
||
# Get digit count totals in workable data frame for visualization | ||
digits.df<-ddply(super.df,.(SB,Type), summarise, Q1D=last.digit(Q1T), Q2D=last.digit(Q2T), Q3D=last.digit(Q3T), | ||
Q4D=last.digit(Total)) | ||
|
||
# Create separate data frames for each quarter | ||
q1.counts<-ddply(digits.df,.(Type), summarise, D0=length(which(Q1D==0)), D1=length(which(Q1D==1)), D2=length(which(Q1D==2)), | ||
D3=length(which(Q1D==3)), D4=length(which(Q1D==4)), D5=length(which(Q1D==5)), D6=length(which(Q1D==6)), D7=length(which(Q1D==7)), | ||
D8=length(which(Q1D==8)), D9=length(which(Q1D==9))) | ||
q1.probs<-get.probs(q1.counts) | ||
|
||
q2.counts<-ddply(digits.df,.(Type), summarise, D0=length(which(Q2D==0)), D1=length(which(Q2D==1)), D2=length(which(Q2D==2)), | ||
D3=length(which(Q2D==3)), D4=length(which(Q2D==4)), D5=length(which(Q2D==5)), D6=length(which(Q2D==6)), D7=length(which(Q2D==7)), | ||
D8=length(which(Q2D==8)), D9=length(which(Q2D==9))) | ||
q2.probs<-get.probs(q2.counts) | ||
|
||
q3.counts<-ddply(digits.df,.(Type), summarise, D0=length(which(Q3D==0)), D1=length(which(Q3D==1)), D2=length(which(Q3D==2)), | ||
D3=length(which(Q3D==3)), D4=length(which(Q3D==4)), D5=length(which(Q3D==5)), D6=length(which(Q3D==6)), D7=length(which(Q3D==7)), | ||
D8=length(which(Q3D==8)), D9=length(which(Q3D==9))) | ||
q3.probs<-get.probs(q3.counts) | ||
|
||
q4.counts<-ddply(digits.df,.(Type), summarise, D0=length(which(Q4D==0)), D1=length(which(Q4D==1)), D2=length(which(Q4D==2)), | ||
D3=length(which(Q4D==3)), D4=length(which(Q4D==4)), D5=length(which(Q4D==5)), D6=length(which(Q4D==6)), D7=length(which(Q4D==7)), | ||
D8=length(which(Q4D==8)), D9=length(which(Q4D==9))) | ||
q4.probs<-get.probs(q4.counts) | ||
|
||
### Create visualizations | ||
|
||
# Create a data frame for boxes | ||
positions<-do.call(rbind, lapply(0:9, function(x) cbind(0:9,x))) | ||
boxes<-data.frame(x=positions[,1],y=positions[,2]) | ||
boxes$Q1<-melt(q1.probs)$value | ||
boxes$Q2<-melt(q2.probs)$value | ||
boxes$Q3<-melt(q3.probs)$value | ||
boxes$Q4<-melt(q4.probs)$value | ||
|
||
# Create a heatmap of probability of winning given different digit combinations by quarter | ||
q1.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q1))+ | ||
scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ | ||
scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ | ||
scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ | ||
xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- First Quater") | ||
ggsave(plot=q1.heatmap, filename="images/q1_heatmap.png", height=12, width=12) | ||
|
||
q2.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q2))+ | ||
scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ | ||
scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ | ||
scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ | ||
xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Half Time") | ||
ggsave(plot=q2.heatmap, filename="images/q2_heatmap.png", height=12, width=12) | ||
|
||
q3.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q3))+ | ||
scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ | ||
scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ | ||
scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ | ||
xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Third Quarter") | ||
ggsave(plot=q3.heatmap, filename="images/q3_heatmap.png", height=12, width=12) | ||
|
||
q4.heatmap<-ggplot(boxes, aes(xmin=x,xmax=x+1,ymin=y,ymax=y+1))+geom_rect(aes(color="white", fill=Q4))+ | ||
scale_fill_gradient(limits=c(0,.047), low="lightgrey", high="darkred", name="Pr(Winning)")+ | ||
scale_color_manual(values=c("white"="white"), legend=FALSE)+theme_bw()+ | ||
scale_x_continuous(breaks=.5:9.5,labels=0:9)+scale_y_continuous(breaks=.5:9.5,labels=0:9)+ | ||
xlab("Home Team")+ylab("Away Team")+opts(title="Heat Map of Win Probabilties -- Final") | ||
ggsave(plot=q4.heatmap, filename="images/q4_heatmap.png", height=12, width=12) | ||
|
||
|
||
|