Merge branch 'master' of https://github.com/datanoborders/Datadives

datakind · Nov 7, 2011 · ec8171a · ec8171a
2 parents 619cf0d + 215251c
commit ec8171a
Show file tree

Hide file tree

Showing 23 changed files with 2,123 additions and 0 deletions.
diff --git a/NYC_2011/README.md b/NYC_2011/README.md
@@ -0,0 +1,6 @@
+# New York Data Without Borders Datadive #
+_This is currently a placeholder for a repository that will be populated with resources from the event, which took place October 14-16, 2011._
+
+ - We are in the process of moving material from the Wiki at http://wiki.datawithoutborders.cc
+
+This repository is the primary resource for all things developed during the Datadive.  It will include subdirectories for the participating social organizations, which will contain the code, data, and analyses produced by the participants.
diff --git a/SF_2011/Benetech/.gitignore b/SF_2011/Benetech/.gitignore
@@ -0,0 +1,14 @@
+.DS_Store
+.RData
+.Rhistory
+*.tex
+*.log
+*.aux
+*.out
+*.nav
+*.pdf
+*.pdfsync
+*.snm
+*.synctex.gz
+*.toc
+*.swp
diff --git a/SF_2011/Benetech/R/11_map_report.R b/SF_2011/Benetech/R/11_map_report.R
@@ -0,0 +1,39 @@
+# File-Name:       11_map_report.R
+# Date:            2011-11-05
+# Author:          Drew Conway
+# Email:           drew.conway@nyu.edu                                      
+# Purpose:         Generates a map of where in the world Martus is being used
+# Data Used:       data/*.csv
+# Packages Used:   maps, plyr, ggplot2 (RCurl and RJSONIO presumably only for the optional geocoding step)
+# Machine:         Drew Conway's MacBook Pro
+
+# Copyright (c) 2011, under the Simplified BSD License.  
+# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
+# All rights reserved.
+
+library(maps)  # provides map(), used below to get the world outline coordinates
+
+# WARNING: This may take a very long time...you have been warned.  (Presumably refers to re-running the geocoding below.)
+
+### NOTE: To update the geocoding, uncomment the lines below.  They need benetech.full and geocode.addr(), neither of which is defined in this file.
+
+# geo.coded <- lapply(unique(benetech.full$location), function(l) geocode.addr(l))
+# geo.data <- data.frame(do.call(rbind, geo.coded))
+
+# write.csv(geo.data, '../data/geo_data.csv', row.names=FALSE)
+
+geo.data <- read.csv('../data/geo_data.csv', stringsAsFactors=FALSE)  # cached geocoding results; path is relative to the working directory
+names(geo.data) <- c('location', 'lng', 'lat')  # assumes the CSV has exactly three columns in this order -- TODO confirm
+
+geo.frequency <- ddply(geo.data, .(lng, lat), summarise, count=length(location))  # number of rows per distinct (lng, lat) pair; NOTE(review): ddply() is from plyr, which this file never loads -- presumably attached by an earlier script
+names(geo.frequency) <- c('lng', 'lat', 'count')
+
+globe <- data.frame(map(plot=FALSE)[c('x','y')])  # x/y outline coordinates only; plot=FALSE keeps map() from drawing
+map.plot <- ggplot(globe, aes(x=x, y=y))+geom_path(aes(alpha=0.25))+coord_map(projection='lagrange', ylim=c(-48,52))  # base layer: world outline; NOTE(review): ggplot2 is not loaded in this file either
+map.plot <- map.plot + geom_point(data=subset(geo.frequency, !is.na(lng)),   # point layer; rows with a missing lng are dropped
+    aes(x=lat, y=lng, alpha=0.75, size=count, color=count))+scale_size(to=c(2,5), name='Martus Use')+  # NOTE(review): x is mapped to `lat` and y to `lng`; confirm the names assigned to geo.data match the CSV's real column order
+    scale_color_gradient(low='orange', high='darkred', name='Martus Use')+theme_bw()+
+    scale_alpha(to=c(0.25,0.75), legend=FALSE)+xlab('')+ylab('')+
+    opts(panel.grid.major=theme_blank(), panel.grid.minor=theme_blank(),   # strip grid lines, axis text and ticks
+        axis.text.x=theme_blank(), axis.text.y=theme_blank(), axis.ticks=theme_blank())
+ggsave(plot=map.plot, filename='benetech_report-map_plot.pdf', width=8, height=5)  # relative file name, so the PDF lands in the working directory
diff --git a/SF_2011/Benetech/R/Q1_server_usage.R b/SF_2011/Benetech/R/Q1_server_usage.R
@@ -0,0 +1,116 @@
+
+library( plyr )	# ddply(), dlply() and summarise are used below
+library(xtable)	# xtable() for the LaTeX output at the end of the script
+
+#source( "load_data.R" )
+# save( benetech.full, file="thedatafile.rdata" )
+if ( FALSE ) {	# disabled: this block never runs
+	bk = benetech.full
+	load( file="thedatefile" )	# NOTE(review): name differs from "thedatafile.rdata" in the save() comment above -- confirm
+	benetech.full = bk
+	table( benetech.full$server, benetech.full$original.server, useNA="ifany" )
+
+	# for debugging -- speed up by working on a random 5000-row sample
+	#benetech.full = benetech.full[ sample( 1:nrow(benetech.full), 5000 ), ]
+}
+
+
+# make sure we have ids for all the records (benetech.full must already exist; stop if public.code or bulletin.id contains any NA)
+stopifnot( sum( is.na( benetech.full$public.code ) ) == 0 )
+stopifnot( sum( is.na( benetech.full$bulletin.id ) ) == 0 )
+
+#			sum.dat = ddply( benetech.full, c("server","original.server"), summarize,
+#table( benetech.full$original.server )
+
+# Cut up data by server.  For each server cut up by the original server
+# and summarize.  Also compute overall per-server stats (the "Total" row).
+# Need to do this separately to capture total unique codes across all records:
+# summing the per-group unique counts would give an invalid answer.
+big.sum.list = dlply( benetech.full, "server", function( chunk ) {	# yields one summary data frame per server
+
+	df = ddply( chunk, .(original.server), summarise,	# one row per original.server value within this server
+		server=server[[1]],
+		total.bull = length(public.code),	# number of rows in the group
+		total.Mb = sum( size..Kb. ),	# sum of size..Kb., so presumably still Kb here despite the name; divided by 1024 later in the script
+
+		total.acc = length( unique( public.code ) ),	# distinct public codes in the group
+		attach.pub = sum( public.attachments ),
+		attach.priv = sum( private.attachments )
+	)
+
+	if ( nrow(df) > 1 ) {	# more than one original.server value: add a per-server total row
+		df.full = ddply( chunk, .(server), summarise,
+			original.server="Total",
+			total.bull = length(public.code),
+			total.Mb = sum( size..Kb. ),
+
+			total.acc = length( unique( public.code ) ),	# recomputed over the whole chunk rather than summed across groups
+			attach.pub = sum( public.attachments ),
+			attach.priv = sum( private.attachments )
+			)
+		df$original.server = c("Mirr","Orig")[1+df$original.server]	# assumes original.server is 0/1 (or FALSE/TRUE): 0 -> "Mirr", 1 -> "Orig"
+		df = df[ c(2,1,3:ncol(df)) ]	# swap the first two columns so server comes first
+		rbind( df, df.full )
+	} else {
+		df$original.server = c("Total","Total")[1+df$original.server]	# single group: label it "Total" whichever flag value it has
+		df	# NOTE(review): the first two columns are not swapped in this branch, unlike the branch above
+	}
+
+} )
+
+total.df = ddply( benetech.full, c(), summarise,	# grand totals over every row (no grouping variables)
+			original.server="T",	# placeholder value; original.server is set to NA on the last line of this block
+			total.bull = length(public.code),
+			total.Mb = sum( size..Kb. ),	# sum of size..Kb., so presumably still Kb here despite the name
+
+			total.acc = length( unique( public.code ) ),	# distinct public codes across all servers
+			attach.pub = sum( public.attachments ),
+			attach.priv = sum( private.attachments )
+			)
+names(total.df)[1] = "server"	# NOTE(review): depends on what ddply() puts in the first column when there are no grouping variables -- confirm
+total.df$server = "Total"
+total.df$original.server = NA
+
+
+# put it all together: stack the per-server summaries, then append the grand-total row
+summ = do.call( rbind, big.sum.list )
+#summ
+summ = rbind( summ, total.df )
+
+
+
+
+dups = !duplicated( summ$server )	# TRUE on the first row of each server (despite the name, these are the non-duplicates)
+
+# for pretty output: blank the server name on all but the first row of each server
+summ$server[!dups] = ""
+
+# add some summary stats
+summ$kBperBul = with( summ, total.Mb / total.bull )	# computed before total.Mb is divided by 1024 below
+summ $attach.tot = with( summ, attach.pub + attach.priv)
+summ = summ[c(1,2,3,4,8,5,6,7,9)]	# positional reorder: column 8 moves to just after column 4 (assumes summ has exactly 9 columns in the expected order)
+
+# not that useful?
+#summ$avg.attach = with( summ, attach.tot / total.bull )
+
+#summ
+summ$total.Mb = summ$total.Mb / 1024	# presumably the Kb -> Mb conversion
+summ$total.Mb = formatC(summ$total.Mb, format="d", big.mark=',' )	# now a character column with thousands separators
+summ	# inspect the intermediate table
+
+
+summ$per.pub = 100 *summ$attach.pub / summ$attach.tot	# public attachments as a percentage of all attachments
+summ	# inspect the intermediate table
+summ$attach.pub = NULL	# drop the two component columns; only the total and the percentage are kept
+summ$attach.priv = NULL
+summ$per.pub = paste( round(summ$per.pub,digits=1), "%", sep="" )
+summ$attach.tot = formatC(summ$attach.tot, format="d", big.mark=',' )
+
+
+# make an xtable in latex.
+xtb = xtable( summ, align="rrrrrrrrr", caption="Database Usage Statistics",	# nine "r" entries in the align string
+			digits=0 )
+print( xtb, hline.after=(which(dups)-1), include.rownames=FALSE,	# hline.after: the row just before each server's first row
+		include.colnames=FALSE, only.contents=TRUE  )
+
+
diff --git a/SF_2011/Benetech/R/Q1b_server_growth.R b/SF_2011/Benetech/R/Q1b_server_growth.R
@@ -0,0 +1,199 @@
+
+# plot increase of four usage variables of interest over time (bulletins, Kb, distinct accounts, attachments).
+
+library( plyr )
+library(xtable)
+
+#source( "load_data.R" )
+# save( benetech.full, file="thedatafile.rdata" )
+if ( FALSE ) {	# disabled: this block never runs
+	bk = benetech.full
+	load( file="thedatefile" )	# NOTE(review): name differs from "thedatafile.rdata" in the save() comment above -- confirm
+	benetech.full = bk
+	table( benetech.full$server, benetech.full$original.server, useNA="ifany" )
+
+	# for debugging -- speed up by working on a random 5000-row sample
+	#benetech.full = benetech.full[ sample( 1:nrow(benetech.full), 5000 ), ]
+}
+
+
+bf = subset( benetech.full, !is.na( date.uploaded ) )	# keep only records that have an upload date
+if(FALSE){ 
+	nrow(bf)
+	nrow(benetech.full)
+}
+
+bf = bf[ order( bf$date.uploaded ), ]	# chronological order, so the cumulative sums below are running totals over time
+#nrow(bf)
+bf$new.account = !duplicated( bf$public.code )	# TRUE on the first (earliest) row of each public.code
+
+# compute running totals from the earliest upload onward
+bf$tot.bull = 1:nrow(bf)	# running row count; NOTE(review): assumes bf has at least one row
+bf$totKb = cumsum( bf$size..Kb. )
+bf$accounts = cumsum( bf$new.account )	# running count of distinct public codes
+bf$tot.attach = cumsum( bf$public.attachments + bf$private.attachments )
+
+
+# select some subsample for plotting
+pick = round( seq( 1, nrow(bf), length.out=4000 ) )	# 4000 evenly spaced row positions, rounded to integers
+pick = pick[ !duplicated( bf$date.uploaded[ pick ], fromLast=TRUE ) ]	# where picked rows share a date.uploaded, keep only the last of them
+
+# make the full subsample
+bfs.big = bf[pick,]
+N.big = nrow(bfs.big)
+
+# make the smoothed subsample: thin the picked rows to 60 evenly spaced positions
+pick = pick[ round( seq( 1, N.big, length.out=60 ) ) ]
+N.little = length(pick)
+bfs.little = bf[pick,]
+
+#plot(pick)
+
+# First differences of a vector: element i of the result is X[i+1] - X[i].
+# Used below as the numerators and denominators of the numerical derivatives
+# (change in a cumulative total divided by elapsed time).
+#
+# X: a vector whose elements can be subtracted (numbers; the script also
+#    passes date.uploaded values and converts the result with units="days").
+# Returns a vector with one element fewer than X, or an empty result when
+# X has fewer than two elements.
+#
+# Negative indexing replaces X[2:N] - X[1:(N-1)]: those sequences count
+# downward when N < 2 (2:1 is c(2, 1), 1:0 is c(1, 0)), so short input used
+# to produce NA/garbage instead of an empty result.  For N >= 2 the two
+# forms select exactly the same elements.
+delt = function( X ) {
+	N = length(X)
+	X[-1] - X[-N]	# drop the first element / drop the last element; X[-0] is X[0], i.e. empty
+}
+
+del.ts = as.double( delt( bfs.little$date.uploaded ), units="days")	# gap between consecutive smoothed sample points, in days (assumes the date difference is a difftime -- confirm date.uploaded's class)
+#del.ts
+#summary(del.ts)
+
+days = bfs.little$date.uploaded[2:N.little]	# x values for the rate plots: the later endpoint of each interval
+
+bull.per.day = delt( bfs.little$tot.bull ) / del.ts	# each rate = change in the running total / elapsed days
+kb.per.day = delt( bfs.little$totKb ) / del.ts
+account.per.day = delt( bfs.little$accounts ) / del.ts 
+attach.per.day = delt( bfs.little$tot.attach ) / del.ts
+
+
+#######
+# PLOTS
+#  this code called in sweave
+#######
+
+# Draw the four running totals as line charts on a 2x2 grid.
+#   bfs: data frame with columns date.uploaded, tot.bull, totKb, accounts
+#        and tot.attach (e.g. bfs.big).
+# totKb is divided by 1024 and labelled as megabytes.  The par() settings
+# are changed on the current graphics device and are not restored.
+plot.tot <- function( bfs ) {
+	par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
+	when <- bfs$date.uploaded
+	total.panel <- function( total, label ) {
+		plot( when, total, type="l", xlab="Time", ylab=label )
+	}
+	total.panel( bfs$tot.bull, "Total Bulletins" )
+	total.panel( bfs$totKb/1024, "Total Megabytes" )
+	total.panel( bfs$accounts, "Total Number of Distinct Accounts" )
+	total.panel( bfs$tot.attach, "Total Number of Attachments" )
+}
+
+
+# Draw the four running totals on a 2x2 grid, restricted to the rows whose
+# date.uploaded falls within the last `years` years of the data.
+#   bfs:   data frame with columns date.uploaded, tot.bull, totKb, accounts
+#          and tot.attach
+#   years: window length, counted back from the latest date.uploaded
+# NOTE(review): the window is computed as years * 365 * 24*60*60, i.e. in
+# seconds, which presumably expects date.uploaded to be POSIXct -- confirm.
+plot.tot.from <- function( bfs, years ) {
+	window.start <- max(bfs$date.uploaded) - years * 365 * 24*60*60
+	recent <- bfs$date.uploaded >= window.start
+	par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
+	when <- bfs$date.uploaded[recent]
+	total.panel <- function( total, label ) {
+		plot( when, total, type="l", xlab="Time", ylab=label )
+	}
+	total.panel( bfs$tot.bull[recent], "Total Bulletins" )
+	total.panel( bfs$totKb[recent]/1024, "Total Megabytes" )
+	total.panel( bfs$accounts[recent], "Total Number of Distinct Accounts" )
+	total.panel( bfs$tot.attach[recent], "Total Number of Attachments" )
+
+}
+
+
+
+
+# Plot the four per-day rate series on a 2x2 grid, with no truncation of
+# the y axis.
+# NOTE: the `bfs` argument is never used; the series are read from the
+# script-level vectors days, bull.per.day, kb.per.day, account.per.day and
+# attach.per.day.
+plot.deriv <- function( bfs ) {
+	par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
+	rate.panel <- function( rate, label ) {
+		plot( days, rate, type="l", xlab="Time", ylab=label )
+	}
+	rate.panel( bull.per.day, "Bulletins Per Day" )
+	rate.panel( kb.per.day/1024, "Megabytes per Day" )
+	rate.panel( account.per.day, "New Accounts per Day" )
+	rate.panel( attach.per.day, "Attachments per Day" )
+}
+
+# Plot the four per-day rate series on a 2x2 grid with a truncated y axis:
+# each panel's ylim runs from 0 to the 95% quantile of the series it shows.
+# NOTE: the `bfs` argument is never used; the series are read from the
+# script-level vectors days, bull.per.day, kb.per.day, account.per.day and
+# attach.per.day.
+plot.deriv.trunc <- function( bfs ) {
+	par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
+	capped.panel <- function( rate, label ) {
+		y.cap <- quantile( rate, 0.95 )
+		plot( days, rate, type="l", ylim=c(0, y.cap), xlab="Time", ylab=label )
+	}
+	capped.panel( bull.per.day, "Bulletins Per Day" )
+	capped.panel( kb.per.day/1024, "Megabytes per Day" )
+	capped.panel( account.per.day, "New Accounts per Day" )
+	capped.panel( attach.per.day, "Attachments per Day" )
+}
+
+
+
+# Plot the four per-day rate series on a 2x2 grid, restricted to the points
+# whose `days` value falls within the last `years` years.
+#   years: window length, counted back from max(days)
+# Reads the script-level vectors days, bull.per.day, kb.per.day,
+# account.per.day and attach.per.day.
+# NOTE(review): the window is computed as years * 365 * 24*60*60, i.e. in
+# seconds, which presumably expects `days` to be POSIXct -- confirm.
+plot.deriv.from <- function( years ) {
+	window.start <- max(days) - years * 365 * 24*60*60
+	recent <- days >= window.start
+	par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
+	when <- days[recent]
+	rate.panel <- function( rate, label ) {
+		plot( when, rate, type="l", xlab="Time", ylab=label )
+	}
+	rate.panel( bull.per.day[recent], "Bulletins Per Day" )
+	rate.panel( kb.per.day[recent]/1024, "Megabytes per Day" )
+	rate.panel( account.per.day[recent], "New Accounts per Day" )
+	rate.panel( attach.per.day[recent], "Attachments per Day" )
+}
+
+
+
+
+if (FALSE) {	# disabled example calls; never run when the script is sourced
+
+plot.tot( bfs.big )	# totals use the dense subsample
+plot.deriv( bfs.little )	# the argument is ignored by plot.deriv(), which reads the script-level rate vectors
+plot.deriv.trunc( bfs.little )	# likewise ignored by plot.deriv.trunc()
+
+}
+
+
+
+
+
+
+###### DEAD CODE
+
+# rg = range(bf$date.uploaded)
+# K = 100
+# cutpts = seq( rg[1], rg[2], length.out=K )
+# midpts = seq( rg[1], rg[2], length.out=(K*2)-1 )
+# midpts = midpts[ 2*(1:(K-1)) ]
+# #cutpts
+# #midpts
+
+# bf.bk = bf
+# bf = bf.bk[ sample( nrow(bf), K * 10000 ), ]
+# nrow(bf)
+
+
+# # res = sapply( 2:K, function(X) {
+	# cat( "tick", X,K,"\n" )
+	# tmp = subset( bf, cutpts[[X-1]] <= date.uploaded & date.uploaded <= cutpts[[X]] )
+
+	# bull.per.day=nrow(tmp)
+
+	# Kb.per.day=sum( tmp$size..Kb. ) 
+
+	# acc.per.day=sum(tmp$new.account)
+
+	# attach.per.day=sum(tmp$public.attachments + tmp$private.attachments)
+
+	# c( bull.per.day=bull.per.day, Kb.per.day=Kb.per.day, acc.per.day=acc.per.day, attach.per.day=attach.per.day ) / as.numeric((cutpts[X] - cutpts[X-1]))
+
+# } )
+
+
+
+# # rough plots
+# par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
+# plot( midpts, res[1,], type="l", xlab="Time", ylab="Bulletins Per Day" )
+# plot( midpts, res[2,], type="l", xlab="Time", ylab="Megabytes per Day")
+# plot( midpts, res[3,], type="l", xlab="Time", ylab="New Accounts per Day" )
+# plot( midpts, res[4,], type="l", xlab="Time", ylab="Attachments per Day" )
+
+
+
+# # smoothed plots
+# par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) )
+# plot( midpts, res[1,], type="n", xlab="Time", ylab="Bulletins Per Day" )
+# lines( lowess(midpts, res[1,], f=1/20))
+# plot( midpts, res[2,], type="n", xlab="Time", ylab="Megabytes per Day")
+# lines( lowess(midpts, res[2,], f=1/20) )
+# plot( midpts, res[3,], type="n", xlab="Time", ylab="New Accounts per Day" )
+# lines( lowess(midpts, res[3,], f=1/20) )
+# plot( midpts, res[4,], type="n", xlab="Time", ylab="Attachments per Day" )
+# lines( lowess(midpts, res[4,], f=1/20) )
+
+
+