-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of https://github.com/datanoborders/Datadives
- Loading branch information
Showing
23 changed files
with
2,123 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# New York Data Without Borders Datadive # | ||
_This is currently a placeholder for a repository that will be populated with resources from the event, which took place October 14-16, 2011._ | ||
|
||
- We are in the process of moving material from the Wiki at http://wiki.datawithoutborders.cc | ||
|
||
This repository is the primary resource for all things developed during the Datadive. It will include subdirectories for the participating social organizations, which will contain the code, data, and analyses produced by the participants. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
.DS_Store | ||
.RData | ||
.RHistory | ||
*.tex | ||
*.log | ||
*.aux | ||
*.out | ||
*.nav | ||
*.pdfsync | ||
*.snm | ||
*.synctex.gz | ||
*.toc | ||
*.swp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# File-Name: drew_reports.R | ||
# Date: 2011-11-05 | ||
# Author: Drew Conway | ||
# Email: drew.conway@nyu.edu | ||
# Purpose: Generates a map of where in the world Martus is being used | ||
# Data Used: data/*.csv | ||
# Packages Used: RCurl, RJSONIO, ggplot2 | ||
# Machine: Drew Conway's MacBook Pro | ||
|
||
# Copyright (c) 2011, under the Simplified BSD License. | ||
# For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php | ||
# All rights reserved. | ||
|
||
library(maps) | ||
|
||
# WARNING: This may take a very long time...you have been warned. | ||
|
||
### NOTE: To update the geocoding, uncomment the lines below | ||
|
||
# geo.coded <- lapply(unique(benetech.full$location), function(l) geocode.addr(l)) | ||
# geo.data <- data.frame(do.call(rbind, geo.coded)) | ||
|
||
# write.csv(geo.data, '../data/geo_data.csv', row.names=FALSE) | ||
|
||
# Read the cached geocoding results and label the three columns.
geo.data <- read.csv('../data/geo_data.csv', stringsAsFactors=FALSE)
names(geo.data) <- c('location', 'lng', 'lat')

# One row per distinct coordinate pair, with the number of locations at it.
geo.frequency <- ddply(geo.data, .(lng, lat), summarise, count=length(location))
names(geo.frequency) <- c('lng', 'lat', 'count')

# Outline coordinates returned by map(), used as the base layer.
globe <- data.frame(map(plot=FALSE)[c('x','y')])

# Base outline, then one point per coordinate pair sized and coloured by count,
# with grid lines, axis text and ticks blanked out.
# NOTE(review): the point layer maps x to `lat` and y to `lng`; confirm this
# matches the column order of geo_data.csv given the labels assigned above.
map.plot <- ggplot(globe, aes(x=x, y=y)) +
    geom_path(aes(alpha=0.25)) +
    coord_map(projection='lagrange', ylim=c(-48,52)) +
    geom_point(data=subset(geo.frequency, !is.na(lng)),
               aes(x=lat, y=lng, alpha=0.75, size=count, color=count)) +
    scale_size(to=c(2,5), name='Martus Use') +
    scale_color_gradient(low='orange', high='darkred', name='Martus Use') +
    theme_bw() +
    scale_alpha(to=c(0.25,0.75), legend=FALSE) +
    xlab('') +
    ylab('') +
    opts(panel.grid.major=theme_blank(),
         panel.grid.minor=theme_blank(),
         axis.text.x=theme_blank(),
         axis.text.y=theme_blank(),
         axis.ticks=theme_blank())

# Write the finished map to a PDF.
ggsave(plot=map.plot, filename='benetech_report-map_plot.pdf', width=8, height=5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
|
||
library( plyr ) | ||
library(xtable) | ||
|
||
#source( "load_data.R" ) | ||
# save( benetech.full, file="thedatafile.rdata" ) | ||
# Manual debugging scaffold. The `if (FALSE)` guard means nothing in here runs
# when the script is executed; the lines are meant to be run by hand.
if (FALSE) {
  bk <- benetech.full
  # File name corrected to match the commented-out save() call just above
  # (it previously read "thedatefile", which that save() never writes).
  load(file = "thedatafile.rdata")
  benetech.full <- bk
  table(benetech.full$server, benetech.full$original.server, useNA = "ifany")

  # for debugging -- speed up
  #benetech.full = benetech.full[ sample( 1:nrow(benetech.full), 5000 ), ]
}
|
||
|
||
# Every record must carry both identifiers; stop if either column has an NA.
stopifnot(!any(is.na(benetech.full$public.code)))
stopifnot(!any(is.na(benetech.full$bulletin.id)))
|
||
# sum.dat = ddply( benetech.full, c("server","original.server"), summarize, | ||
#table( benetech.full$original.server ) | ||
|
||
# Cut up data by server. For each server cut up by the original server | ||
# and do summarize. Also compute overall stats. | ||
# Need to do this separately to capture total unique across all records; | ||
# summing will give an invalid answer. | ||
# One summary table per server. Within a server the records are summarised per
# original.server value; when there is more than one such value an extra
# "Total" row is computed from the whole chunk, so that the distinct-account
# count is taken over all of the server's records rather than added up.
big.sum.list <- dlply(benetech.full, "server", function(chunk) {

  # Per-origin figures. total.Mb is the plain sum of size..Kb. at this point.
  by.origin <- ddply(chunk, .(original.server), summarise,
    server = server[[1]],
    total.bull = length(public.code),
    total.Mb = sum(size..Kb.),
    total.acc = length(unique(public.code)),
    attach.pub = sum(public.attachments),
    attach.priv = sum(private.attachments)
  )

  # Only one origin on this server: label its row "Total" and stop here.
  # (1 + original.server is used as an index, so the column is presumably
  # 0/1 or logical -- TODO confirm.)
  if (nrow(by.origin) <= 1) {
    by.origin$original.server <- c("Total", "Total")[1 + by.origin$original.server]
    return(by.origin)
  }

  # Whole-server row, computed from the raw chunk.
  overall <- ddply(chunk, .(server), summarise,
    original.server = "Total",
    total.bull = length(public.code),
    total.Mb = sum(size..Kb.),
    total.acc = length(unique(public.code)),
    attach.pub = sum(public.attachments),
    attach.priv = sum(private.attachments)
  )

  # Index 1 -> "Mirr", index 2 -> "Orig".
  by.origin$original.server <- c("Mirr", "Orig")[1 + by.origin$original.server]
  # Swap the first two columns so `server` leads, matching `overall`.
  by.origin <- by.origin[c(2, 1, 3:ncol(by.origin))]
  rbind(by.origin, overall)
})
|
||
# Grand totals over every record, with no grouping variables.
total.df <- ddply(benetech.full, c(), summarise,
  original.server = "T",
  total.bull = length(public.code),
  total.Mb = sum(size..Kb.),
  total.acc = length(unique(public.code)),
  attach.pub = sum(public.attachments),
  attach.priv = sum(private.attachments)
)

# Rename the first column to `server`, label the row "Total", and leave the
# origin column empty.
names(total.df)[1] <- "server"
total.df[["server"]] <- "Total"
total.df[["original.server"]] <- NA
|
||
|
||
# Stack the per-server tables and append the grand-total row.
summ <- rbind(do.call(rbind, big.sum.list), total.df)

# NOTE: despite its name, `dups` is TRUE on the FIRST row of each server and
# FALSE on the repeats.
dups <- !duplicated(summ$server)

# for pretty output: blank the repeated server labels
summ$server[!dups] <- ""

# Derived columns. kBperBul is computed while total.Mb still holds the raw
# size..Kb. sum, i.e. before the division by 1024 below.
summ$kBperBul <- summ$total.Mb / summ$total.bull
summ$attach.tot <- summ$attach.pub + summ$attach.priv
summ <- summ[c(1, 2, 3, 4, 8, 5, 6, 7, 9)]

# not that useful?
#summ$avg.attach = with( summ, attach.tot / total.bull )

# Divide by 1024 and format as an integer with thousands separators.
summ$total.Mb <- formatC(summ$total.Mb / 1024, format = "d", big.mark = ",")
summ

# Share of attachments that are public, in percent.
summ$per.pub <- 100 * summ$attach.pub / summ$attach.tot
summ

# Drop the two component counts and format what remains for display.
summ$attach.pub <- NULL
summ$attach.priv <- NULL
summ$per.pub <- paste(round(summ$per.pub, digits = 1), "%", sep = "")
summ$attach.tot <- formatC(summ$attach.tot, format = "d", big.mark = ",")

# make an xtable in latex; a rule is drawn above the first row of each server.
xtb <- xtable(summ,
              align = "rrrrrrrrr",
              caption = "Database Usage Statistics",
              digits = 0)
print(xtb,
      hline.after = (which(dups) - 1),
      include.rownames = FALSE,
      include.colnames = FALSE,
      only.contents = TRUE)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
|
||
# plot increase of four usage variables of interest over time. | ||
|
||
library( plyr ) | ||
library(xtable) | ||
|
||
#source( "load_data.R" ) | ||
# save( benetech.full, file="thedatafile.rdata" ) | ||
# Manual debugging scaffold. The `if (FALSE)` guard means nothing in here runs
# when the script is executed; the lines are meant to be run by hand.
if (FALSE) {
  bk <- benetech.full
  # File name corrected to match the commented-out save() call just above
  # (it previously read "thedatefile", which that save() never writes).
  load(file = "thedatafile.rdata")
  benetech.full <- bk
  table(benetech.full$server, benetech.full$original.server, useNA = "ifany")

  # for debugging -- speed up
  #benetech.full = benetech.full[ sample( 1:nrow(benetech.full), 5000 ), ]
}
|
||
|
||
# Keep only the records that have an upload time.
bf <- subset(benetech.full, !is.na(date.uploaded))
if (FALSE) {
  nrow(bf)
  nrow(benetech.full)
}

# Everything below indexes into bf, so fail with a clear message if no record
# has an upload time.
stopifnot(nrow(bf) > 0)

# Chronological order; a record opens a new account the first time its
# public.code appears.
bf <- bf[order(bf$date.uploaded), ]
#nrow(bf)
bf$new.account <- !duplicated(bf$public.code)

# compute total from time of start
bf$tot.bull <- seq_len(nrow(bf))
bf$totKb <- cumsum(bf$size..Kb.)
bf$accounts <- cumsum(bf$new.account)
bf$tot.attach <- cumsum(bf$public.attachments + bf$private.attachments)

# select some subsample for plotting: up to 4000 evenly spaced rows, keeping
# only the last sampled row for each distinct upload time
pick <- round(seq(1, nrow(bf), length.out = 4000))
pick <- pick[!duplicated(bf$date.uploaded[pick], fromLast = TRUE)]

# make the full subsample
bfs.big <- bf[pick, ]
N.big <- nrow(bfs.big)

# make the smoothed subsample: up to 60 evenly spaced rows of the full
# subsample. unique() stops a row being selected twice when fewer than 60 rows
# are available; a repeated row would give a zero time difference and a
# division by zero when the per-day rates are computed.
pick <- pick[unique(round(seq(1, N.big, length.out = 60)))]
N.little <- length(pick)
bfs.little <- bf[pick, ]
|
||
#plot(pick) | ||
|
||
# Calculate numerical derivatives | ||
# First differences of a vector: element i of the result is X[i + 1] - X[i].
#
# X: a vector whose elements can be subtracted; this script passes numeric
#    columns and the date.uploaded column.
# Returns a vector one element shorter than X, or an empty result when X has
# fewer than two elements.
delt <- function(X) {
  # Negative indexing drops the first / last element. Unlike 2:N and 1:(N-1),
  # it stays correct when length(X) is 0 or 1, where those ranges count
  # downwards and select the wrong elements.
  X[-1] - X[-length(X)]
}
|
||
# Time between consecutive rows of the smoothed subsample, in days.
del.ts <- as.double(delt(bfs.little[["date.uploaded"]]), units = "days")
#del.ts
#summary(del.ts)

# Each rate is plotted against the upload time at the end of its interval.
days <- bfs.little[["date.uploaded"]][2:N.little]

# Increase in each cumulative total divided by the elapsed days.
bull.per.day    <- delt(bfs.little[["tot.bull"]])   / del.ts
kb.per.day      <- delt(bfs.little[["totKb"]])      / del.ts
account.per.day <- delt(bfs.little[["accounts"]])   / del.ts
attach.per.day  <- delt(bfs.little[["tot.attach"]]) / del.ts
|
||
|
||
####### | ||
# PLOTS | ||
# this code called in sweave | ||
####### | ||
|
||
# totals | ||
# Draw the four cumulative totals against upload time in a 2x2 grid.
#
# bfs: data frame with columns date.uploaded, tot.bull, totKb, accounts and
#      tot.attach.
# Changes the graphics parameters mfrow, mar and mgp and does not restore them.
plot.tot <- function(bfs) {
  par(mfrow = c(2, 2), mar = c(3, 3, 1, 1), mgp = c(2, 1, 0))

  when <- bfs$date.uploaded
  # Panel label -> series, in drawing order. totKb is divided by 1024 for the
  # megabyte panel.
  panels <- list(
    "Total Bulletins" = bfs$tot.bull,
    "Total Megabytes" = bfs$totKb / 1024,
    "Total Number of Distinct Accounts" = bfs$accounts,
    "Total Number of Attachments" = bfs$tot.attach
  )

  for (label in names(panels)) {
    plot(when, panels[[label]], type = "l", xlab = "Time", ylab = label)
  }
}
|
||
|
||
# Same four panels as the cumulative-total plot, restricted to the most recent
# `years` years of data.
#
# bfs:   data frame with columns date.uploaded, tot.bull, totKb, accounts and
#        tot.attach.
# years: length of the window; it is converted to seconds (365-day years) and
#        subtracted from the latest upload time.
# Changes the graphics parameters mfrow, mar and mgp and does not restore them.
plot.tot.from <- function(bfs, years) {
  cutoff <- max(bfs$date.uploaded) - years * 365 * 24 * 60 * 60
  recent <- bfs$date.uploaded >= cutoff

  par(mfrow = c(2, 2), mar = c(3, 3, 1, 1), mgp = c(2, 1, 0))

  when <- bfs$date.uploaded[recent]
  # Panel label -> series within the window, in drawing order.
  panels <- list(
    "Total Bulletins" = bfs$tot.bull[recent],
    "Total Megabytes" = bfs$totKb[recent] / 1024,
    "Total Number of Distinct Accounts" = bfs$accounts[recent],
    "Total Number of Attachments" = bfs$tot.attach[recent]
  )

  for (label in names(panels)) {
    plot(when, panels[[label]], type = "l", xlab = "Time", ylab = label)
  }
}
|
||
|
||
|
||
|
||
# no truncation | ||
# Draw the four per-day rates against time in a 2x2 grid, untruncated.
#
# bfs: accepted but not used; the series are read from the script-level
#      vectors days, bull.per.day, kb.per.day, account.per.day and
#      attach.per.day.
# Changes the graphics parameters mfrow, mar and mgp and does not restore them.
plot.deriv <- function(bfs) {
  par(mfrow = c(2, 2), mar = c(3, 3, 1, 1), mgp = c(2, 1, 0))

  # Panel label -> series, in drawing order. kb.per.day is divided by 1024
  # for the megabyte panel.
  rates <- list(
    "Bulletins Per Day" = bull.per.day,
    "Megabytes per Day" = kb.per.day / 1024,
    "New Accounts per Day" = account.per.day,
    "Attachments per Day" = attach.per.day
  )

  for (label in names(rates)) {
    plot(days, rates[[label]], type = "l", xlab = "Time", ylab = label)
  }
}
|
||
# truncated | ||
# Draw the four per-day rates against time in a 2x2 grid, with each y axis
# running from 0 to the 95% quantile of the plotted series.
#
# bfs: accepted but not used; the series are read from the script-level
#      vectors days, bull.per.day, kb.per.day, account.per.day and
#      attach.per.day.
# Changes the graphics parameters mfrow, mar and mgp and does not restore them.
plot.deriv.trunc <- function(bfs) {
  par(mfrow = c(2, 2), mar = c(3, 3, 1, 1), mgp = c(2, 1, 0))

  # Panel label -> series, in drawing order. kb.per.day is divided by 1024
  # for the megabyte panel.
  rates <- list(
    "Bulletins Per Day" = bull.per.day,
    "Megabytes per Day" = kb.per.day / 1024,
    "New Accounts per Day" = account.per.day,
    "Attachments per Day" = attach.per.day
  )

  for (label in names(rates)) {
    rate <- rates[[label]]
    plot(days, rate,
         type = "l",
         ylim = c(0, quantile(rate, 0.95)),
         xlab = "Time",
         ylab = label)
  }
}
|
||
|
||
|
||
# Draw the four per-day rates in a 2x2 grid, restricted to the most recent
# `years` years.
#
# years: length of the window; it is converted to seconds (365-day years) and
#        subtracted from the latest value of the script-level vector `days`.
# The series are read from the script-level vectors bull.per.day, kb.per.day,
# account.per.day and attach.per.day.
# Changes the graphics parameters mfrow, mar and mgp and does not restore them.
plot.deriv.from <- function(years) {
  cutoff <- max(days) - years * 365 * 24 * 60 * 60
  recent <- days >= cutoff

  par(mfrow = c(2, 2), mar = c(3, 3, 1, 1), mgp = c(2, 1, 0))

  # Panel label -> series within the window, in drawing order.
  rates <- list(
    "Bulletins Per Day" = bull.per.day[recent],
    "Megabytes per Day" = kb.per.day[recent] / 1024,
    "New Accounts per Day" = account.per.day[recent],
    "Attachments per Day" = attach.per.day[recent]
  )

  for (label in names(rates)) {
    plot(days[recent], rates[[label]], type = "l", xlab = "Time", ylab = label)
  }
}
|
||
|
||
|
||
|
||
if (FALSE) { | ||
|
||
plot.tot( bfs.big ) | ||
plot.deriv( bfs.little ) | ||
plot.deriv.trunc( bfs.little ) | ||
|
||
} | ||
|
||
|
||
|
||
|
||
|
||
|
||
###### DEAD CODE | ||
|
||
# rg = range(bf$date.uploaded) | ||
# K = 100 | ||
# cutpts = seq( rg[1], rg[2], length.out=K ) | ||
# midpts = seq( rg[1], rg[2], length.out=(K*2)-1 ) | ||
# midpts = midpts[ 2*(1:(K-1)) ] | ||
# #cutpts | ||
# #midpts | ||
|
||
# bf.bk = bf | ||
# bf = bf.bk[ sample( nrow(bf), K * 10000 ), ] | ||
# nrow(bf) | ||
|
||
|
||
# # res = sapply( 2:K, function(X) { | ||
# cat( "tick", X,K,"\n" ) | ||
# tmp = subset( bf, cutpts[[X-1]] <= date.uploaded & date.uploaded <= cutpts[[X]] ) | ||
|
||
# bull.per.day=nrow(tmp) | ||
|
||
# Kb.per.day=sum( tmp$size..Kb. ) | ||
|
||
# acc.per.day=sum(tmp$new.account) | ||
|
||
# attach.per.day=sum(tmp$public.attachments + tmp$private.attachments) | ||
|
||
# c( bull.per.day=bull.per.day, Kb.per.day=Kb.per.day, acc.per.day=acc.per.day, attach.per.day=attach.per.day ) / as.numeric((cutpts[X] - cutpts[X-1])) | ||
|
||
# } ) | ||
|
||
|
||
|
||
# # rough plots | ||
# par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) ) | ||
# plot( midpts, res[1,], type="l", xlab="Time", ylab="Bulletins Per Day" ) | ||
# plot( midpts, res[2,], type="l", xlab="Time", ylab="Megabytes per Day") | ||
# plot( midpts, res[3,], type="l", xlab="Time", ylab="New Accounts per Day" ) | ||
# plot( midpts, res[4,], type="l", xlab="Time", ylab="Attachments per Day" ) | ||
|
||
|
||
|
||
# # smoothed plots | ||
# par( mfrow=c(2,2), mar=c(3,3,1,1), mgp=c(2,1,0) ) | ||
# plot( midpts, res[1,], type="n", xlab="Time", ylab="Bulletins Per Day" ) | ||
# lines( lowess(midpts, res[1,], f=1/20)) | ||
# plot( midpts, res[2,], type="n", xlab="Time", ylab="Megabytes per Day") | ||
# lines( lowess(midpts, res[2,], f=1/20) ) | ||
# plot( midpts, res[3,], type="n", xlab="Time", ylab="New Accounts per Day" ) | ||
# lines( lowess(midpts, res[3,], f=1/20) ) | ||
# plot( midpts, res[4,], type="n", xlab="Time", ylab="Attachments per Day" ) | ||
# lines( lowess(midpts, res[4,], f=1/20) ) | ||
|
||
|
||
|
Oops, something went wrong.