# extract_events.R
# Script to download/extract Retrosheet data and export as CSV
# Frank Firke
# Borrowing *substantially* from other sources
# Function taken from @bayesball
# https://gist.github.com/bayesball/8892981
# Download one season of Retrosheet event files, convert them to CSV with
# Chadwick's cwevent, build a combined roster CSV, and remove the
# intermediate files for that season.
#
# Works on Windows and on Unix-alikes (including Mac).
#
# Assumptions (all relative to the current working directory):
#   * download.folder/zipped exists    -- receives <season>eve.zip
#   * download.folder/unzipped exists  -- receives the extracted files and
#                                         the generated CSV files
#   * cwevent can be run from download.folder/unzipped (on Windows the
#     program cwevent.exe is expected to be in that folder)
#
# Args:
#   season: the season (year) to process, e.g. 2000.
#
# Side effects:
#   Writes all<season>.csv and roster<season>.csv into
#   download.folder/unzipped, then deletes this season's zip, event, roster
#   and TEAM files.
#
# Returns:
#   (invisibly) the path of the generated event CSV file.
parse.retrosheet2.pbp <- function(season) {
  zipped_dir <- file.path("download.folder", "zipped")
  unzipped_dir <- file.path("download.folder", "unzipped")

  # Path of the downloaded archive for a given year.
  zip_file <- function(year) {
    file.path(zipped_dir, paste0(year, "eve.zip"))
  }

  # Get the zip file from the Retrosheet website.
  download.retrosheet <- function(season) {
    download.file(
      url = paste0("http://www.retrosheet.org/events/", season, "eve.zip"),
      destfile = zip_file(season),
      # Binary mode so the archive is never mangled by text-mode transfer.
      mode = "wb"
    )
  }

  # Unzip the Retrosheet archive into the unzipped folder.
  unzip.retrosheet <- function(season) {
    unzip(zip_file(season), exdir = unzipped_dir)
  }

  # Run cwevent over the season's event files and redirect its output to
  # all<year>.csv.
  # http://chadwick.sourceforge.net/doc/cwevent.html#cwtools-cwevent
  # Example: shell("cwevent -y 2000 2000TOR.EVA > 2000TOR.bev")
  create.csv.file <- function(year) {
    command <- paste(
      paste("cwevent -y", year, "-f 0-96"),
      paste0(year, "*.EV*"),
      paste0("> all", year, ".csv")
    )
    # The command uses relative file names, so it has to run inside the
    # unzipped folder. on.exit guarantees the working directory is restored
    # even if the command errors.
    old_wd <- setwd(unzipped_dir)
    on.exit(setwd(old_wd), add = TRUE)
    if (.Platform$OS.type == "unix") {
      status <- system(command)
      if (!identical(as.integer(status), 0L)) {
        warning(
          "cwevent exited with status ", status, " for season ", year,
          call. = FALSE
        )
      }
    } else {
      # shell() itself warns when the command fails (mustWork = FALSE).
      shell(command)
    }
  }

  # Combine the season's team roster files into roster<year>.csv.
  create.csv.roster <- function(year) {
    # Roster files are named <3-character team code><year>.ROS.
    roster_files <- list.files(
      path = unzipped_dir,
      pattern = paste0("^.{3}", year, "\\.ROS$"),
      full.names = TRUE
    )
    if (length(roster_files) == 0) {
      stop(
        "No roster files found for season ", year, " in ", unzipped_dir,
        call. = FALSE
      )
    }
    rosters <- do.call(
      "rbind",
      lapply(roster_files, read.csv, header = FALSE)
    )
    names(rosters)[1:6] <- c(
      "Player.ID", "Last.Name", "First.Name",
      "Bats", "Pitches", "Team"
    )
    write.csv(
      rosters,
      file = file.path(unzipped_dir, paste0("roster", year, ".csv"))
    )
  }

  # Remove the Retrosheet files that are no longer needed.
  # Only files belonging to `year` are deleted: the folders are shared, and
  # when several seasons are processed in parallel a blanket delete would
  # remove files another season is still working on.
  cleanup <- function(year) {
    patterns <- c(
      paste0("^", year, ".*\\.EV[NA]$"),
      paste0("^.{3}", year, "\\.ROS$"),
      paste0("^TEAM", year, "$")
    )
    for (pattern in patterns) {
      unlink(list.files(unzipped_dir, pattern = pattern, full.names = TRUE))
    }
    unlink(zip_file(year))
  }

  download.retrosheet(season)
  unzip.retrosheet(season)
  create.csv.file(season)
  create.csv.roster(season)
  cleanup(season)

  invisible(file.path(unzipped_dir, paste0("all", season, ".csv")))
}
# Driver: process every season from 1930 to 2016 on a two-worker cluster and
# report the elapsed time.

# The working directory must be set BEFORE the cluster is created: the
# relative "download.folder" paths used by parse.retrosheet2.pbp are resolved
# inside the worker processes.
setwd("C:/Users/Frank/Documents/Blog/Retrosheet/R_script")

# library() fails immediately if a package is missing, whereas require()
# would only return FALSE and let the script fail later.
library(foreach)
library(doParallel)

# Parallelization speeds this up but it's still pretty time consuming
cl <- makeCluster(2)
registerDoParallel(cl)

start_time <- Sys.time()
# Always shut the worker processes down, even if a season fails.
tryCatch(
  foreach(i = 1930:2016) %dopar% parse.retrosheet2.pbp(i),
  finally = stopCluster(cl)
)
Sys.time() - start_time