Skip to content

Commit

Permalink
version 0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
wzhang27 authored and cran-robot committed Dec 9, 2018
0 parents commit 835059e
Show file tree
Hide file tree
Showing 17 changed files with 606 additions and 0 deletions.
16 changes: 16 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Package: rSPARCS
Type: Package
Title: Data Management for the SPARCS
Version: 0.0.1
Author: Wangjian Zhang, Zhicheng Du, Ziqiang Lin, David Q. Rich, Sally W. Thurston, Jijin Yao, Xiaobo Xue, Shao Lin, Yuantao Hao
Maintainer: Wangjian Zhang <wzhang27@albany.edu>
Description: To clean and analyze the data from the Statewide Planning and Research Cooperative System (SPARCS), and generate sets for statistical modeling.
Additionally, other data with similar format or study objectives can also be handled.
License: GPL-3
Encoding: UTF-8
LazyData: true
Imports: foreign,data.table,spatialEco,geosphere,tigris,raster,sp
NeedsCompilation: no
Packaged: 2018-12-02 17:48:38 UTC; zwjed
Repository: CRAN
Date/Publication: 2018-12-09 15:10:09 UTC
16 changes: 16 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
7eafabac3fade13122371ec8b28b244f *DESCRIPTION
8927c120eaece26e6d1cbe26de42eccb *NAMESPACE
4db45197f2a5182d3b17a4b132879f5d *R/CXover.data.R
8b16ea8364b938775656e504a8db1e3e *R/DBFgeocode.R
1ad44bd688ec549851bd812303fb9366 *R/FIPS.name.R
ab216426bc16c71407a1ab610681ce5a *R/case.series.R
bb1f81674e0697a39ec3dfa46083eb0a *R/desc.comp.R
0ad93862fdb1744d2a96ee6e2cae2e85 *R/dupl.readm.R
ebfee48932401739b07fb0bc2e862341 *R/pick.cases.R
e5d9c776948025220385c40bddb03c56 *man/CXover.data.Rd
2cbf6cbf0c16bc815d983275663ee5e3 *man/DBFgeocode.Rd
a857a5e6e8a76c31de766535ec109960 *man/FIPS.name.Rd
88701ad21e32eedd5af25b34b54a838f *man/case.series.Rd
e78f4667dbd49360c0e78cf6f85f7634 *man/desc.comp.Rd
f0d1129f746468134bb353c55f8d74ef *man/dupl.readm.Rd
6fc07a7a448802a5e1a81d7e8d75f887 *man/pick.cases.Rd
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
exportPattern("^[[:alpha:]]+")
importFrom("foreign","write.dbf")
importFrom("data.table","as.data.table")
importFrom("geosphere","distGeo")
importFrom("tigris","tracts")
importFrom("raster","crs")
importFrom("sp","coordinates")
importFrom("spatialEco","point.in.poly")
importFrom("stats", "IQR", "aov", "chisq.test", "kruskal.test","median", "sd", "shapiro.test", "t.test", "wilcox.test")
51 changes: 51 additions & 0 deletions R/CXover.data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
CXover.data=function(data,date,ID=NULL,direction="pre4"){
if(length(ID)==0) data$ID=1:nrow(data) else data$ID=factor(data[,ID],levels=unique(data[,ID]))
test=as.character(data[,date])
test=test[test!=""&!is.na(test)]
if(any(!is.na(grep("/",test[1])),!is.na(grep("-",test[1])))) data$d1=as.Date(as.character(data[,date])) else data$d1=as.Date(paste(substr(data[,date],1,4),"/",substr(data[,date],5,6),"/",substr(data[,date],7,8),sep=""))
data$date1=as.character(data$d1-28)
data$date2=as.character(data$d1-21)
data$date3=as.character(data$d1-14)
data$date4=as.character(data$d1-7)
data$date5=as.character(data$d1+7)
data$date6=as.character(data$d1+14)
data$date7=as.character(data$d1+21)
data$date8=as.character(data$d1+28)
data$date1_1=substr(data$date1,6,7)
data$date2_1=substr(data$date2,6,7)
data$date3_1=substr(data$date3,6,7)
data$date4_1=substr(data$date4,6,7)
data$date5_1=substr(data$date5,6,7)
data$date6_1=substr(data$date6,6,7)
data$date7_1=substr(data$date7,6,7)
data$date8_1=substr(data$date8,6,7)

if(direction=="pre4") data$date5=data$date6=data$date7=data$date8=NA
if(direction=="month4"){
data$d1_1=substr(data$d1,6,7)
data$date1=ifelse(data$date1_1==data$d1_1,data$date1,NA)
data$date2=ifelse(data$date2_1==data$d1_1,data$date2,NA)
data$date3=ifelse(data$date3_1==data$d1_1,data$date3,NA)
data$date4=ifelse(data$date4_1==data$d1_1,data$date4,NA)
data$date5=ifelse(data$date5_1==data$d1_1,data$date5,NA)
data$date6=ifelse(data$date6_1==data$d1_1,data$date6,NA)
data$date7=ifelse(data$date7_1==data$d1_1,data$date7,NA)
data$date8=ifelse(data$date8_1==data$d1_1,data$date8,NA)
}
if(direction!="pre4"&direction!="month4") print("You may contact author (wzhang27@albany.edu) to add more options")
output=data[,c("ID","d1")]
names(output)[2]="Date"
output$status=1
for(j in 1:8){
test=data[,c("ID",paste("date",j,sep=""))]
names(test)[2]="Date"
test=test[which(!is.na(test$Date)),]
if(nrow(test)>0){
test$status=0
output=rbind(output,test)
}
output=output[order(output$ID),]
}
rownames(output)=1:nrow(output)
return(output)
}
14 changes: 14 additions & 0 deletions R/DBFgeocode.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
DBFgeocode <- function(data,cityname,roadaddress,mailbox=NULL,ZIP,output="data.csv"){
if(length(mailbox)==0) {
addr12=data[,roadaddress]
} else{
test1=nchar(as.character(data[,mailbox]))
test2=paste(data[,roadaddress],data[,mailbox])
addr12=ifelse(test1==0,data[,roadaddress],test2)
}
cityzip=paste(data[,cityname],data[,ZIP])
data$singleline=paste(addr12,", ",cityzip,sep="")
write.dbf(data,output,factor2char = TRUE)
print(paste("A dbf file has been written to ", getwd()," for geocoding",sep=""))
}

40 changes: 40 additions & 0 deletions R/FIPS.name.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
FIPS.name<-function(data,patco,level="county",add=c("name","FIPS"),state="36",county=NULL,map=NULL,long.case,lat.case,censusFIPS="GEOID"){
COUNTY=data.frame(COUNTY=c("Albany","Allegany","Bronx","Broome",
"Cattaraugus","Cayuga","Chautauqua","Chemung","Chenango","Clinton","Columbia","Cortland","Delaware",
"Dutchess","Erie","Essex","Franklin","Fulton","Genesee","Greene","Hamilton","Herkimer","Jefferson",
"Kings","Lewis","Livingston","Madison","Monroe","Montgomery","Nassau","New York","Niagara","Oneida",
"Onondaga","Ontario","Orange","Orleans","Oswego","Otsego","Putnam","Queens","Rensselaer","Richmond","Rockland",
"Saratoga","Schenectady","Schoharie","Schuyler","Seneca","St Lawrence","Steuben","Suffolk","Sullivan",
"Tioga","Tompkins","Ulster","Warren","Washington","Wayne","Westchester","Wyoming","Yates"),
FIPS=c(36001,36003,36005,36007,36009,36011,36013,36015,36017,36019,36021,36023,36025,36027,36029,
36031,36033,36035,36037,36039,36041,36043,36045,36047,36049,36051,36053,36055,36057,36059,36061,
36063,36065,36067,36069,36071,36073,36075,36077,36079,36081,36083,36085,36087,36091,36093,36095,
36097,36099,36089,36101,36103,36105,6107,36109,36111,36113,36115,36117,36119,36121,36123),
SPARCS=c(1,2,58,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,59,23,24,25,26,27,28,60,29,
30,31,32,33,34,35,36,37,61,38,62,39,41,42,43,44,45,40,46,47,48,49,50,51,52,53,54,55,56,57))
if(level=="county"){
if("FIPS"%in%add) data$FIPS=COUNTY$FIPS[match(data[,patco],COUNTY$SPARCS)]
if("name"%in%add) data$county=COUNTY$COUNTY[match(data[,patco],COUNTY$SPARCS)]
}
if(level=="census"){
name=names(data)
data$long.case=data[,long.case]
data$lat.case=data[,lat.case]
data$long.case=as.numeric(as.character(data$long.case))
data$lat.case=as.numeric(as.character(data$lat.case))
a=ncol(data)
data1=data[which(!is.na(data$long.case)&!is.na(data$lat.case)),]
sp::coordinates(data1)=~long.case+lat.case
if(length(map)==0) NYSmap=tracts(state=state,county=county) else NYSmap=map
raster::crs(data1)=raster::crs(NYSmap)
data1=point.in.poly(data1,NYSmap)
data1=as.data.frame(data1)
data1=cbind(data1[,1:(a-2)],data1[,censusFIPS])
names(data1)[ncol(data1)]="CSFIPS"
data$CSFIPS=NA
data$CSFIPS[which(!is.na(data$long.case)&!is.na(data$lat.case))]=as.character(data1$CSFIPS)
data=data[,c(name,"CSFIPS")]
}
return(data)
}

76 changes: 76 additions & 0 deletions R/case.series.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
case.series <-function(data,ICD=NULL,diagnosis,date="ADMDT",start="2001/1/1",end="2016/12/31",by1=NULL,by2=NULL,by3=NULL,by4=NULL,by5=NULL){
data$diag6=data$diag5=data$diag4=data$diag3=NA
data$diag3=substr(data[,diagnosis],1,3)
data$diag4=substr(data[,diagnosis],1,4)
data$diag5=substr(data[,diagnosis],1,5)
data$diag6=substr(data[,diagnosis],1,6)
data=data[data$diag3%in%ICD|data$diag4%in%ICD|data$diag5%in%ICD|data$diag6%in%ICD,]
test=as.character(data[,date])
test=test[test!=""&!is.na(test)]
if(any(!is.na(grep("/",test[1])),!is.na(grep("-",test[1])))) data[,date]=as.Date(as.character(data[,date])) else data[,date]=as.Date(paste(substr(data[,date],1,4),"/",substr(data[,date],5,6),"/",substr(data[,date],7,8),sep=""))
by=c(by1,by2,by3,by4,by5)
if(length(by)>0) by=by[!is.na(by)]

if(length(by)>0){
for(k in 1:length(by)){
a=paste(unique(data[,by[k]]))
b=paste("Level for", by[k],":",a[1])
for(t in 2:length(a)) b=paste(b,a[t])
print(b)
data$var.by=data[,by[k]]
names(data)[ncol(data)]=paste("var",k,sep="")
}
case=var1=var2=var3=var4=var5=date.test=NA
data$case=1
med=as.data.table(data)
if(length(by)==1) med=as.data.frame(med[,list(case=sum(case)),list(var1)])
if(length(by)==2) med=as.data.frame(med[,list(case=sum(case)),list(var1,var2)])
if(length(by)==3) med=as.data.frame(med[,list(case=sum(case)),list(var1,var2,var3)])
if(length(by)==4) med=as.data.frame(med[,list(case=sum(case)),list(var1,var2,var3,var4)])
if(length(by)==5) med=as.data.frame(med[,list(case=sum(case)),list(var1,var2,var3,var4,var5)])
names(med)[1:(ncol(med)-1)]=by
Output1=med

data$date.test=data[,date]
data=as.data.table(data)
if(length(by)==1) data=as.data.frame(data[,list(case=sum(case)),list(date.test,var1)])
if(length(by)==2) data=as.data.frame(data[,list(case=sum(case)),list(date.test,var1,var2)])
if(length(by)==3) data=as.data.frame(data[,list(case=sum(case)),list(date.test,var1,var2,var3)])
if(length(by)==4) data=as.data.frame(data[,list(case=sum(case)),list(date.test,var1,var2,var3,var4)])
if(length(by)==5) data=as.data.frame(data[,list(case=sum(case)),list(date.test,var1,var2,var3,var4,var5)])
data$code=data[,"date.test"]
for(t in 1:(ncol(Output1)-1)) data$code=paste(data$code,data[,paste("var",t,sep="")])

Output2=NULL
for(k in 1:nrow(Output1)){
med=data.frame(date=seq.Date(as.Date(start),as.Date(end),"1 day"))
for(t in 1:(ncol(Output1)-1)){
med$var=Output1[k,t]
names(med)[ncol(med)]=names(Output1)[t]
}
med$code=med$date
for(t in 1:(ncol(Output1)-1)) med$code=paste(med$code,med[,t+1])
med$case=data$case[match(med$code,data$code)]
med$case[is.na(med$case)]=0
med=med[,-which(names(med)=="code")]
Output2=rbind(Output2,med)
}
} else{
data$date.test=data[,date]
data$case=1
data=as.data.table(data)
data=as.data.frame(data[,list(case=sum(case)),list(date.test)])
med=data.frame(date=seq.Date(as.Date(start),as.Date(end),"1 day"))
med$case=data$case[match(med$date,data$date.test)]
med$case[is.na(med$case)]=0
Output1=NULL
Output2=med
}
print(Output1)
return(Output2)
}





83 changes: 83 additions & 0 deletions R/desc.comp.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
desc.comp <-function(data,variables,by=NULL,margin=2,avg.num="mean",test.num="metric"){
data0=as.data.frame(data[,variables])
names(data0)=names(data)[variables]
if(length(by)==0) group=rep(1,nrow(data0)) else group=data[,by]
if(is.factor(group)) level=levels(group) else level=unique(group)[order(unique(group))]
Output=NULL
for(i in 1:length(variables)){
data_med=data.frame(a=data0[,i],group=group)
names(data_med)[1]=names(data0)[i]
if(is.numeric(data0[,i])|is.integer(data0[,i])){
result=matrix(rep(0,1*(length(level)+2)),nrow=1)
mean.in=sprintf("%.2f",tapply(data_med[,1],data_med[,2],mean,na.rm=T))
sd.in=sprintf("%.2f",tapply(data_med[,1],data_med[,2],sd,na.rm=T))
median.in=sprintf("%.2f",tapply(data_med[,1],data_med[,2],median,na.rm=T))
IQR.in=sprintf("%.2f",tapply(data_med[,1],data_med[,2],IQR,na.rm=T))
test=NULL
try({test=tapply(data_med[,1],data_med[,2],shapiro.test)},silent=T)
shapiro=rep(0.5,length(level))
if(length(test)==0) print("Warning: The sample size for some groups may be too small") else {
for(k in 1:length(test)) shapiro[k]=test[[k]]$p.value
}
if(max(shapiro)>=0.05) result[1,1:length(level)]=paste(mean.in," (",sd.in,")",sep="") else result[1,1:length(level)]=paste(median.in," (",IQR.in,")",sep="")
if(avg.num=="mean") result[1,1:length(level)]=paste(mean.in," (",sd.in,")",sep="")
if(avg.num=="median") result[1,1:length(level)]=paste(median.in," (",IQR.in,")",sep="")

if((max(shapiro)>=0.05&test.num!="nonmetric")|test.num=="metric"){
if(length(level)==2){
result[1,ncol(result)-1]=paste("t =",sprintf("%.2f",t.test(data_med[,1]~data_med[,2],var.equal=T)$statistic))
test=t.test(data_med[,1]~data_med[,2],var.equal=T)$p.value
test=ifelse(test<0.001,"P<0.001",sprintf("%.3f",test))
result[1,ncol(result)]=test
}
if(length(level)>2){
result[1,ncol(result)-1]=paste("F =",sprintf("%.2f",summary(aov(data_med[,1]~data_med[,2]))[[1]]$"F value"[1]))
test=summary(aov(data_med[,1]~data_med[,2]))[[1]]$"Pr(>F)"[1]
test=ifelse(test<0.001,"P<0.001",sprintf("%.3f",test))
result[1,ncol(result)]=test
}

} else{
if(length(level)==2){
result[1,ncol(result)-1]=paste("W =",sprintf("%.2f",wilcox.test(data_med[,1]~data_med[,2],exact=F)$statistic))
test=wilcox.test(data_med[,1]~data_med[,2],exact=F)$p.value
test=ifelse(test<0.001,"P<0.001",sprintf("%.3f",test))
result[1,ncol(result)]=test
}
if(length(level)>2){
result[1,ncol(result)-1]=paste("W =",sprintf("%.2f",kruskal.test(data_med[,1]~data_med[,2])$statistic))
test=kruskal.test(data_med[,1]~data_med[,2])$p.value
test=ifelse(test<0.001,"P<0.001",sprintf("%.3f",test))
result[1,ncol(result)]=test
}
}
rownames(result)=names(data0)[i]
colnames(result)=c(level,"statistic","P")
} else {
result=matrix(rep(0,(length(levels(data_med[,1]))+1)*(length(level)+2)),nrow=length(levels(data_med[,1]))+1)
media=table(data_med[,1],data_med$group)
media1=round(prop.table(media,margin)*100,2)
media=as.data.frame.matrix(media)
media1=as.data.frame.matrix(media1)
for(k in 1:ncol(media1)){
media1[,k]=sprintf("%.2f",media1[,k])
result[-1,k]=paste(media[,k]," (",media1[,k],")",sep="")
}
result[1,]=""
result[,(ncol(result)-1):ncol(result)]=""
set.seed(4715)
result[2,ncol(result)-1]=paste("Chisq =",sprintf("%.2f",chisq.test(as.matrix(media),simulate.p.value=T)$statistic))
set.seed(4715)
test=chisq.test(as.matrix(media),simulate.p.value=T)$p.value
test=ifelse(test<0.001,"P<0.001",sprintf("%.3f",test))
result[2,ncol(result)]=test
rownames(result)=c(names(data0)[i],rownames(media))
colnames(result)=c(level,"statistic","P")
}
for(k in 1:ncol(result)) result[,k]=as.character(result[,k])
Output=rbind(Output,result)
}
return(Output)
}


40 changes: 40 additions & 0 deletions R/dupl.readm.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
dupl.readm=function(data,UniqueID,date,period=365){
data$d2=data$d3=NA
test=as.character(data[,date])
test=test[test!=""&!is.na(test)]
if(any(!is.na(grep("/",test[1])),!is.na(grep("-",test[1])))) data[,date]=as.Date(as.character(data[,date])) else data[,date]=as.Date(paste(substr(data[,date],1,4),"/",substr(data[,date],5,6),"/",substr(data[,date],7,8),sep=""))

test=which(duplicated(data[,c(UniqueID,date)]))
data$id.dupl=0
data$id.dupl[test]=1
data0=data[data$id.dup==0,]
data1=data[data$id.dup==1,]#save
if(nrow(data1)>0) data1$onlyone=0
test=which(data0[,UniqueID]%in%data0[which(duplicated(data0[,UniqueID])),UniqueID])
data0$onlyone=1
data0$onlyone[test]=0

test=unique(data0[data0$onlyone==0,UniqueID])

# admission time
for(i in 1:length(test)){
med=data0[data0[,UniqueID]==test[i],]
med=med[order(med[,date]),]
med$d2=c(0,diff(med[,date]))
med$d3=ifelse(med$d2<=period,1,0)
med$d3[1]=0
med1=data.frame(x=rep(rle(med$d3)$values,rle(med$d3)$lengths),
y=sequence(rle(med$d3)$lengths))
med1$y[med1$x==0]=0
med$d3=med1$y
data0[data0[,UniqueID]==test[i],]=med
}

data=rbind(data0,data1)
names(data)[names(data)=="d3"]="Nadmission"
names(data)[names(data)=="d2"]="Period"
return(data)
}



24 changes: 24 additions & 0 deletions R/pick.cases.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
pick.cases<-function(data,long.case,lat.case,long.sites,lat.sites,radius="15 miles"){
data$which.site=NA
a=ncol(data)
for(i in 1:length(long.sites)){
data$var=NA
data$var=distGeo(data[,c(long.case,lat.case)],c(long.sites[i],lat.sites[i]))*0.000621371
if(gregexpr("km",radius)[[1]][1]>0|gregexpr("kms",radius)[[1]][1]>0) data$var=data$var*1.60934
if(i==1) data$which.site[!is.na(data$var)]=1
if(i==1) names(data)[ncol(data)]=paste("distance.site",i,sep="")
if(i>1) data$which.site=ifelse(data$distance.site1<data$var,data$which.site,i)
if(i>1) data$distance.site1=ifelse(data$distance.site1<data$var,data$distance.site1,data$var)
}
data=cbind(data[,1:a],data[,"distance.site1"])
names(data)[ncol(data)]="minDIST"
a=gregexpr("km",radius)[[1]][1]
b=gregexpr("kms",radius)[[1]][1]
c=gregexpr("mile",radius)[[1]][1]
d=gregexpr("miles",radius)[[1]][1]
cut=as.numeric(substr(radius,1,unique(c(a,b,c,d)[c(a,b,c,d)>0])-2))
data$Select=0
data$Select[data$minDIST<=cut]=1
return(data)
}

0 comments on commit 835059e

Please sign in to comment.