Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris Diehl committed Nov 7, 2011
2 parents 619cf0d + 215251c commit ec8171a
Show file tree
Hide file tree
Showing 23 changed files with 2,123 additions and 0 deletions.
6 changes: 6 additions & 0 deletions NYC_2011/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# New York Data Without Borders Datadive #
_This is currently a placeholder for a repository that will be populated with resources from the event, which took place October 14-16, 2011._

- We are in the process of moving material from the Wiki at http://wiki.datawithoutborders.cc

This repository is the primary resource for all things developed during the Datadive. It will include subdirectories for the participating social organizations, which will contain the code, data, and analyses produced by the participants.
14 changes: 14 additions & 0 deletions SF_2011/Benetech/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
.DS_Store
.RData
# R writes its command history to `.Rhistory` (lower-case "h"). The original
# `.RHistory` spelling is kept as well so nothing that was ignored before
# becomes tracked on case-insensitive filesystems.
.Rhistory
.RHistory
*.tex
*.log
*.aux
*.out
*.nav
*.pdf
*.pdfsync
*.snm
*.synctex.gz
*.toc
*.swp
39 changes: 39 additions & 0 deletions SF_2011/Benetech/R/11_map_report.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# File-Name:       11_map_report.R
# Date: 2011-11-05
# Author: Drew Conway
# Email: drew.conway@nyu.edu
# Purpose:         Generates a map of where in the world Martus is being used
# Data Used: data/*.csv
# Packages Used:   maps, plyr, ggplot2 (RCurl and RJSONIO are also listed; presumably needed only when re-running the geocoding)
# Machine: Drew Conway's MacBook Pro

# Copyright (c) 2011, under the Simplified BSD License.
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
# All rights reserved.

library(maps)
# ddply()/summarise come from plyr and ggplot()/ggsave() from ggplot2. Load
# them here so the script does not rely on an earlier script having attached
# them.
library(plyr)
library(ggplot2)

# WARNING: Re-running the geocoding may take a very long time...you have been warned.

### NOTE: To update the geocoding, uncomment the lines below.
### benetech.full and geocode.addr() are not defined in this file; they are
### presumably provided by other project scripts -- confirm before running.

# geo.coded <- lapply(unique(benetech.full$location), function(l) geocode.addr(l))
# geo.data <- data.frame(do.call(rbind, geo.coded))

# write.csv(geo.data, '../data/geo_data.csv', row.names=FALSE)

# Read the cached geocoding results; the path is relative to the working directory.
geo.data <- read.csv('../data/geo_data.csv', stringsAsFactors=FALSE)
names(geo.data) <- c('location', 'lng', 'lat')

# One row per distinct coordinate pair, with the number of locations sharing it.
geo.frequency <- ddply(geo.data, .(lng, lat), summarise, count=length(location))
names(geo.frequency) <- c('lng', 'lat', 'count')

# World outline from the maps package (x = longitude, y = latitude).
globe <- data.frame(map(plot=FALSE)[c('x','y')])

# NOTE(review): the plotting calls below use the 2011-era ggplot2 API
# (opts(), theme_blank(), scale_*(to=...), legend=FALSE). Later ggplot2
# releases replaced these, so this script presumably needs the ggplot2
# version that was current when it was written.
map.plot <- ggplot(globe, aes(x=x, y=y))+geom_path(aes(alpha=0.25))+coord_map(projection='lagrange', ylim=c(-48,52))

# NOTE(review): the columns are labelled (location, lng, lat) above, yet the
# points are drawn with x=lat, y=lng on a base map whose x axis is longitude.
# If geo_data.csv actually stores latitude before longitude, the two swaps
# cancel and the plot is right. Confirm the CSV column order before renaming.
map.plot <- map.plot + geom_point(data=subset(geo.frequency, !is.na(lng)),
        aes(x=lat, y=lng, alpha=0.75, size=count, color=count))+scale_size(to=c(2,5), name='Martus Use')+
    scale_color_gradient(low='orange', high='darkred', name='Martus Use')+theme_bw()+
    scale_alpha(to=c(0.25,0.75), legend=FALSE)+xlab('')+ylab('')+
    opts(panel.grid.major=theme_blank(), panel.grid.minor=theme_blank(),
        axis.text.x=theme_blank(), axis.text.y=theme_blank(), axis.ticks=theme_blank())

# Write the finished map to the working directory.
ggsave(plot=map.plot, filename='benetech_report-map_plot.pdf', width=8, height=5)
116 changes: 116 additions & 0 deletions SF_2011/Benetech/R/Q1_server_usage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@

library( plyr )
library(xtable)

#source( "load_data.R" )
# save( benetech.full, file="thedatafile.rdata" )

# Interactive scratch block: never runs when the script is sourced. Paste the
# lines into a session to reload the cached data and inspect the
# server / original.server cross-tabulation.
if ( FALSE ) {
    bk <- benetech.full
    # File name now matches the save() call above (it used to read "thedatefile").
    load( file="thedatafile.rdata" )
    benetech.full <- bk
    table( benetech.full$server, benetech.full$original.server, useNA="ifany" )

    # for debugging -- speed up
    #benetech.full = benetech.full[ sample( 1:nrow(benetech.full), 5000 ), ]
}


# Abort before any tallying if a record lacks either identifier
# (public.code or bulletin.id).
stopifnot(
    sum( is.na( benetech.full$public.code ) ) == 0,
    sum( is.na( benetech.full$bulletin.id ) ) == 0
)

# sum.dat = ddply( benetech.full, c("server","original.server"), summarize,
#table( benetech.full$original.server )

# Cut up data by server.  For each server cut up by the original server
# and summarise.  Also compute overall stats for the server.
# The per-server total has to be computed separately to capture the number of
# unique public codes across all of the server's records; summing the
# per-group counts would give an invalid answer.
#
# Result: a list with one data frame per server.  Every data frame has the
# columns server, original.server, total.bull, total.Mb, total.acc,
# attach.pub, attach.priv, in that order.
big.sum.list <- dlply( benetech.full, "server", function( chunk ) {

    # One row per distinct original.server value within this server.
    # total.Mb is the plain sum of size..Kb., i.e. still in kilobytes here
    # despite its name.
    df <- ddply( chunk, .(original.server), summarise,
                 server=server[[1]],
                 total.bull = length(public.code),
                 total.Mb = sum( size..Kb. ),
                 total.acc = length( unique( public.code ) ),
                 attach.pub = sum( public.attachments ),
                 attach.priv = sum( private.attachments )
                 )

    has.breakdown <- nrow(df) > 1

    # Turn original.server into a row label.  The 1 + value indexing assumes
    # original.server is logical or 0/1 -- TODO confirm against load_data.R.
    df$original.server <- if ( has.breakdown ) {
        c("Mirr","Orig")[1+df$original.server]
    } else {
        c("Total","Total")[1+df$original.server]
    }

    # Put `server` first in BOTH cases.  Previously only the multi-row branch
    # reordered, so the column order of the combined table depended on whether
    # the first server happened to have a breakdown.
    lead.cols <- c( "server", "original.server" )
    df <- df[ c( lead.cols, setdiff( names(df), lead.cols ) ) ]

    if ( has.breakdown ) {
        # Whole-server totals, appended below the breakdown rows.
        df.full <- ddply( chunk, .(server), summarise,
                          original.server="Total",
                          total.bull = length(public.code),
                          total.Mb = sum( size..Kb. ),
                          total.acc = length( unique( public.code ) ),
                          attach.pub = sum( public.attachments ),
                          attach.priv = sum( private.attachments )
                          )
        rbind( df, df.full )
    } else {
        df
    }

} )

# Grand-total row computed over all of benetech.full.
# ddply() is given an empty grouping (c()); presumably plyr then treats the
# whole data frame as a single group -- verify against the installed plyr.
total.df = ddply( benetech.full, c(), summarise,
# placeholder value; this column is relabelled after the call
original.server="T",
total.bull = length(public.code),
# sum of size..Kb., so still in kilobytes despite the name
total.Mb = sum( size..Kb. ),

# distinct public codes across all records
total.acc = length( unique( public.code ) ),
attach.pub = sum( public.attachments ),
attach.priv = sum( private.attachments )
)
# Rename whatever column ddply() put first to "server", label the row
# "Total", and leave original.server as NA.
# NOTE(review): which column is first depends on ddply()'s output for an
# empty grouping -- confirm.
names(total.df)[1] = "server"
total.df$server = "Total"
total.df$original.server = NA


# Stack the per-server summaries and append the grand-total row.
summ <- rbind( do.call( rbind, big.sum.list ), total.df )
#summ

# TRUE on the first row of each server (despite the name `dups`).
dups <- !duplicated( summ$server )

# For pretty output: show the server name only on its first row.
summ$server[ !dups ] <- ""

# Derived columns.  total.Mb still holds kilobytes at this point, so kBperBul
# is kilobytes per bulletin.
summ$kBperBul <- summ$total.Mb / summ$total.bull
summ$attach.tot <- summ$attach.pub + summ$attach.priv

# Move kBperBul (column 8) to sit right after total.Mb.
summ <- summ[ c(1:4, 8, 5:7, 9) ]

# not that useful?
#summ$avg.attach = with( summ, attach.tot / total.bull )

#summ
# Kilobytes -> megabytes, shown as a whole number with thousands separators.
summ$total.Mb <- formatC( summ$total.Mb / 1024, format="d", big.mark=',' )
summ


# Share of attachments that are public, in percent.
summ$per.pub <- 100 * summ$attach.pub / summ$attach.tot
summ
# Only the attachment total and the public share are reported.
summ$attach.pub <- NULL
summ$attach.priv <- NULL
summ$per.pub <- paste( round( summ$per.pub, digits=1 ), "%", sep="" )
summ$attach.tot <- formatC( summ$attach.tot, format="d", big.mark=',' )


# Emit the table body as LaTeX, with a rule above each server's first row.
xtb <- xtable( summ, align="rrrrrrrrr", caption="Database Usage Statistics",
               digits=0 )
print( xtb,
       hline.after=( which(dups) - 1 ),
       include.rownames=FALSE,
       include.colnames=FALSE,
       only.contents=TRUE )


199 changes: 199 additions & 0 deletions SF_2011/Benetech/R/Q1b_server_growth.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@

# plot increase of four usage variables of interest over time.

library( plyr )
library(xtable)

#source( "load_data.R" )
# save( benetech.full, file="thedatafile.rdata" )

# Interactive scratch block: never runs when the script is sourced. Paste the
# lines into a session to reload the cached data and inspect the
# server / original.server cross-tabulation.
if ( FALSE ) {
    bk <- benetech.full
    # File name now matches the save() call above (it used to read "thedatefile").
    load( file="thedatafile.rdata" )
    benetech.full <- bk
    table( benetech.full$server, benetech.full$original.server, useNA="ifany" )

    # for debugging -- speed up
    #benetech.full = benetech.full[ sample( 1:nrow(benetech.full), 5000 ), ]
}


# Work only with records that have a known upload time.
bf <- subset( benetech.full, !is.na( date.uploaded ) )
if(FALSE){
    nrow(bf)
    nrow(benetech.full)
}

# Everything below indexes and samples rows of bf, which produces nonsense on
# an empty data frame.  Stop here with a clear message instead.
stopifnot( nrow(bf) > 0 )

# Chronological order, so the cumulative sums below are totals "as of" each
# upload.
bf <- bf[ order( bf$date.uploaded ), ]
#nrow(bf)

# TRUE the first time a public.code appears, i.e. when a new account shows up.
bf$new.account <- !duplicated( bf$public.code )

# compute running totals from the time of the first upload
bf$tot.bull <- seq_len( nrow(bf) )      # seq_len() rather than 1:nrow(): safe for any row count
bf$totKb <- cumsum( bf$size..Kb. )
bf$accounts <- cumsum( bf$new.account )
bf$tot.attach <- cumsum( bf$public.attachments + bf$private.attachments )


# select some subsample for plotting: up to 4000 evenly spaced row positions
pick <- round( seq( 1, nrow(bf), length.out=4000 ) )
# Keep one position per distinct upload time: the last one sampled.  This
# also removes repeated positions produced by round() when bf has fewer than
# 4000 rows.
pick <- pick[ !duplicated( bf$date.uploaded[ pick ], fromLast=TRUE ) ]

# make the full subsample
bfs.big <- bf[pick,]
N.big <- nrow(bfs.big)

# make the smoothed subsample: up to 60 evenly spaced rows of the big one.
# unique() matters when N.big < 60: round() would otherwise repeat positions,
# and repeated rows give zero time differences (hence Inf/NaN per-day rates).
pick <- pick[ unique( round( seq( 1, N.big, length.out=60 ) ) ) ]
N.little <- length(pick)
bfs.little <- bf[pick,]

#plot(pick)

# Calculate numerical derivatives.
#
# delt( X ) returns the successive differences X[i+1] - X[i].
#   X      : a vector that supports subtraction (numeric, or a date-time
#            vector, in which case the result is a difftime).
#   return : a vector of length max( length(X) - 1, 0 ).
#
# diff() replaces the hand-written X[2:N] - X[1:(N-1)], which returned
# spurious values for inputs of length 0 or 1 because 2:N then counts
# downwards.
delt <- function( X ) {
    diff( X )
}

# Time elapsed between consecutive rows of the smoothed subsample, in days.
del.ts <- as.double( delt( bfs.little[["date.uploaded"]] ), units="days" )
#del.ts
#summary(del.ts)

# Each rate is plotted against the later time point of its interval.
days <- bfs.little[["date.uploaded"]][ 2:N.little ]

# Change in each running total divided by the elapsed days.
bull.per.day    <- delt( bfs.little[["tot.bull"]] )   / del.ts
kb.per.day      <- delt( bfs.little[["totKb"]] )      / del.ts
account.per.day <- delt( bfs.little[["accounts"]] )   / del.ts
attach.per.day  <- delt( bfs.little[["tot.attach"]] ) / del.ts


#######
# PLOTS
# this code called in sweave
#######

# totals
#
# Draws a 2x2 grid of line plots of the running totals against upload time.
#   bfs : data frame with columns date.uploaded, tot.bull, totKb, accounts
#         and tot.attach.
# Returns NULL invisibly.  par() settings are changed and not restored.
plot.tot <- function( bfs ) {
    par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )

    when <- bfs$date.uploaded
    panels <- list(
        list( y=bfs$tot.bull,   label="Total Bulletins" ),
        list( y=bfs$totKb/1024, label="Total Megabytes" ),   # Kb -> Mb
        list( y=bfs$accounts,   label="Total Number of Distinct Accounts" ),
        list( y=bfs$tot.attach, label="Total Number of Attachments" )
    )

    for ( p in panels ) {
        plot( when, p$y, type="l", xlab="Time", ylab=p$label )
    }
}


# Same four running-total plots as a 2x2 grid, restricted to the most recent
# `years` years of data.
#   bfs   : data frame with columns date.uploaded, tot.bull, totKb, accounts
#           and tot.attach.
#   years : length of the window, counted back from the latest upload time
#           (a year is taken as 365 days).
# Returns NULL invisibly.  par() settings are changed and not restored.
plot.tot.from <- function( bfs, years ) {
    stamps <- bfs$date.uploaded
    span.days <- years * 365

    # Date objects count in days; other time stamps are treated as counting
    # in seconds, which is what the original arithmetic assumed.  Subtracting
    # seconds from a Date would push the cutoff back by a factor of 86400 and
    # silently keep every row.
    cutoff <- if ( inherits( stamps, "Date" ) ) {
        max( stamps ) - span.days
    } else {
        max( stamps ) - span.days * 24*60*60
    }
    keep <- stamps >= cutoff

    par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
    plot( stamps[keep], bfs$tot.bull[keep], type="l", xlab="Time", ylab="Total Bulletins" )
    plot( stamps[keep], bfs$totKb[keep]/1024, type="l", xlab="Time", ylab="Total Megabytes")
    plot( stamps[keep], bfs$accounts[keep], type="l", xlab="Time", ylab="Total Number of Distinct Accounts" )
    plot( stamps[keep], bfs$tot.attach[keep], type="l", xlab="Time", ylab="Total Number of Attachments" )
}




# no truncation
#
# Draws a 2x2 grid of the per-day rates against time.
#   bfs : accepted but never read; the data come from the script-level
#         vectors days, bull.per.day, kb.per.day, account.per.day and
#         attach.per.day.
# Returns NULL invisibly.  par() settings are changed and not restored.
plot.deriv <- function( bfs ) {
    par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )

    panel <- function( rate, label ) {
        plot( days, rate, type="l", xlab="Time", ylab=label )
    }

    panel( bull.per.day,    "Bulletins Per Day" )
    panel( kb.per.day/1024, "Megabytes per Day" )   # Kb -> Mb
    panel( account.per.day, "New Accounts per Day" )
    panel( attach.per.day,  "Attachments per Day" )
}

# truncated
#
# Same four per-day rate plots, but each y axis runs from 0 to that series'
# 95th percentile so the largest spikes are clipped.
#   bfs : accepted but never read; the data come from the script-level
#         vectors days, bull.per.day, kb.per.day, account.per.day and
#         attach.per.day.
# Returns NULL invisibly.  par() settings are changed and not restored.
plot.deriv.trunc <- function( bfs ) {
    par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )

    panel <- function( rate, label ) {
        plot( days, rate, ylim=c(0, quantile( rate, 0.95 )), type="l",
              xlab="Time", ylab=label )
    }

    panel( bull.per.day,    "Bulletins Per Day" )
    panel( kb.per.day/1024, "Megabytes per Day" )   # Kb -> Mb
    panel( account.per.day, "New Accounts per Day" )
    panel( attach.per.day,  "Attachments per Day" )
}



# The four per-day rate plots as a 2x2 grid, restricted to the most recent
# `years` years.
#   years : length of the window, counted back from the latest value in the
#           script-level vector `days` (a year is taken as 365 days).
# Reads the script-level vectors days, bull.per.day, kb.per.day,
# account.per.day and attach.per.day.
# Returns NULL invisibly.  par() settings are changed and not restored.
plot.deriv.from <- function( years ) {
    span.days <- years * 365

    # Date objects count in days; other time stamps are treated as counting
    # in seconds, which is what the original arithmetic assumed.  Subtracting
    # seconds from a Date would push the cutoff back by a factor of 86400 and
    # silently keep every point.
    cutoff <- if ( inherits( days, "Date" ) ) {
        max( days ) - span.days
    } else {
        max( days ) - span.days * 24*60*60
    }
    keep <- days >= cutoff

    par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
    plot( days[keep], bull.per.day[keep], type="l", xlab="Time", ylab="Bulletins Per Day" )
    plot( days[keep], kb.per.day[keep]/1024, type="l", xlab="Time", ylab="Megabytes per Day")
    plot( days[keep], account.per.day[keep], type="l", xlab="Time", ylab="New Accounts per Day" )
    plot( days[keep], attach.per.day[keep], type="l", xlab="Time", ylab="Attachments per Day" )
}




# Example calls for interactive use; the if (FALSE) guard keeps them from
# running when the script is sourced.
if (FALSE) {

# running totals, drawn from the large subsample
plot.tot( bfs.big )
# per-day rates; the argument is not read by these two functions, which plot
# the script-level rate vectors instead
plot.deriv( bfs.little )
plot.deriv.trunc( bfs.little )

}






###### DEAD CODE

# rg = range(bf$date.uploaded)
# K = 100
# cutpts = seq( rg[1], rg[2], length.out=K )
# midpts = seq( rg[1], rg[2], length.out=(K*2)-1 )
# midpts = midpts[ 2*(1:(K-1)) ]
# #cutpts
# #midpts

# bf.bk = bf
# bf = bf.bk[ sample( nrow(bf), K * 10000 ), ]
# nrow(bf)


# # res = sapply( 2:K, function(X) {
# cat( "tick", X,K,"\n" )
# tmp = subset( bf, cutpts[[X-1]] <= date.uploaded & date.uploaded <= cutpts[[X]] )

# bull.per.day=nrow(tmp)

# Kb.per.day=sum( tmp$size..Kb. )

# acc.per.day=sum(tmp$new.account)

# attach.per.day=sum(tmp$public.attachments + tmp$private.attachments)

# c( bull.per.day=bull.per.day, Kb.per.day=Kb.per.day, acc.per.day=acc.per.day, attach.per.day=attach.per.day ) / as.numeric((cutpts[X] - cutpts[X-1]))

# } )



# # rough plots
# par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
# plot( midpts, res[1,], type="l", xlab="Time", ylab="Bulletins Per Day" )
# plot( midpts, res[2,], type="l", xlab="Time", ylab="Megabytes per Day")
# plot( midpts, res[3,], type="l", xlab="Time", ylab="New Accounts per Day" )
# plot( midpts, res[4,], type="l", xlab="Time", ylab="Attachments per Day" )



# # smoothed plots
# par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
# plot( midpts, res[1,], type="n", xlab="Time", ylab="Bulletins Per Day" )
# lines( lowess(midpts, res[1,], f=1/20))
# plot( midpts, res[2,], type="n", xlab="Time", ylab="Megabytes per Day")
# lines( lowess(midpts, res[2,], f=1/20) )
# plot( midpts, res[3,], type="n", xlab="Time", ylab="New Accounts per Day" )
# lines( lowess(midpts, res[3,], f=1/20) )
# plot( midpts, res[4,], type="n", xlab="Time", ylab="Attachments per Day" )
# lines( lowess(midpts, res[4,], f=1/20) )



Loading

0 comments on commit ec8171a

Please sign in to comment.