version 1.0

cran · May 7, 2014 · cb24136 · cb24136
commit cb24136
Show file tree

Hide file tree

Showing 12 changed files with 367 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,14 @@
+Package: gendata
+Type: Package
+Title: Generate and modify synthetic datasets
+Version: 1.0
+Date: 2014-05-07
+Author: Francis Huang <flh3@hotmail.com>
+Maintainer: Francis Huang <flh3@hotmail.com>
+Description: Set of functions to create datasets using a correlation matrix. 
+License: GPL-3
+Suggests: psych
+Packaged: 2014-05-08 11:52:56 UTC; huangf
+NeedsCompilation: no
+Repository: CRAN
+Date/Publication: 2014-05-08 15:00:04
diff --git a/MD5 b/MD5
@@ -0,0 +1,11 @@
+7195afc887c32881466e57af6b4ed44d *DESCRIPTION
+df390c53434517b304ac5db487184641 *NAMESPACE
+342f1ce5c35aeb3cf31f40751e358665 *R/dtrans.R
+f4f8cedfa2020bf348b967e67aba1b87 *R/genmvnorm.R
+01ef3ab6e2a122c3c91d6870bc17a2e6 *R/recalib.R
+4cbd5ff37108a8c4cd99a19946511f89 *R/revcode.R
+67874b9ff91c1fd314102a5dc00cf566 *man/dtrans.Rd
+7ad81a8b5b71270c0033a82d41998a13 *man/gendata-package.Rd
+63f2bee019b17a527c52d9a5d84d6862 *man/genmvnorm.Rd
+0e7981599f1f44e390fb28f645fde363 *man/recalib.Rd
+72fe602011db9615eb043cb9a50193b6 *man/revcode.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1 @@
+exportPattern("^[[:alpha:]]+")
diff --git a/R/dtrans.R b/R/dtrans.R
@@ -0,0 +1,21 @@
+# Transform standardized variables to a specified mean and SD.
+#
+# Each column i of `data` is replaced by m[i] + sd[i] * data[, i], so the
+# columns are expected to be z scores (mean 0, SD 1) on input.
+#
+# Arguments:
+#   data  data frame or matrix with one variable per column
+#   m     vector of desired means, one per column
+#   sd    vector of desired standard deviations, one per column
+#   rnd   if TRUE, round the transformed values to whole numbers
+#
+# Returns `data` with every column rescaled (and rounded when rnd is TRUE).
+# Stops when `data` is not two-dimensional, when the number of means/SDs does
+# not match the number of columns, or when any column is constant.
+dtrans <- function(data, m, sd, rnd = FALSE) {
+  # basic checking
+  x <- dim(data)[2] # number of variables
+  if (is.null(x) || is.na(x)) {
+    stop("data must be a data frame or matrix")
+  }
+  cat("Number of variables in dataset:", x, "\n")
+  cat("Number of means specified:", length(m), "\n")
+  cat("Number of standard deviations in dataset:", length(sd), "\n")
+  if (length(m) != length(sd)) {
+    stop("Number of means/SDs should match.")
+  }
+  if (length(m) != x) {
+    stop("Incorrect number of means/SDs")
+  }
+
+  # every column has to vary; inspect column i, not only the first one
+  for (i in seq_len(x)) {
+    xrange <- range(data[, i])
+    if (xrange[1] - xrange[2] == 0) {
+      stop("Constant value detected")
+    }
+  }
+
+  # transforming; [, i] indexing works for data frames and matrices alike
+  for (i in seq_len(x)) {
+    data[, i] <- m[i] + sd[i] * data[, i]
+  }
+
+  if (isTRUE(as.logical(rnd))) {
+    return(round(data, 0))
+  }
+  data
+}
diff --git a/R/genmvnorm.R b/R/genmvnorm.R
@@ -0,0 +1,35 @@
+
+####################
+# Generate a multivariate normal dataset from a correlation structure.
+#
+# The correlation matrix is decomposed with an unrotated principal components
+# solution (psych::principal); the k x k loading matrix is then multiplied
+# with k columns of independent standard normal draws.
+#
+# Arguments:
+#   cor   either a correlation matrix with k columns, or a vector with the
+#         k*(k-1)/2 lower-triangle correlations. The vector is placed into
+#         the lower triangle column by column (r21, r31, ..., r32, ...).
+#   k     number of variables
+#   n     number of observations to generate
+#   seed  value passed to set.seed() for reproducible results; FALSE (the
+#         default) or NULL leaves the random number generator untouched
+#
+# Returns a data frame with n rows and k columns (X1 ... Xk).
+# Stops when `cor` does not match `k` or when the 'psych' package is missing.
+genmvnorm <- function(cor, k, n, seed = FALSE) {
+  # Validate and build the correlation matrix first, so bad input is
+  # reported before anything else happens.
+  if (is.matrix(cor)) {
+    if (ncol(cor) != k) {
+      stop("STOP: wrong correlation table")
+    }
+    cr.cor <- cor
+  } else {
+    if (length(cor) != k * (k - 1) / 2) {
+      stop("STOP: wrong correlation table")
+    }
+    cr.cor <- matrix(NA, k, k)
+    diag(cr.cor) <- 1
+    cr.cor[lower.tri(cr.cor)] <- cor
+    # mirror the lower triangle into the upper triangle
+    cr.cor[upper.tri(cr.cor)] <- t(cr.cor)[upper.tri(cr.cor)]
+  }
+
+  # 'psych' is only a suggested package: fail with a clear message instead of
+  # installing software from inside a function call.
+  if (!requireNamespace("psych", quietly = TRUE)) {
+    stop("Package 'psych' is required by genmvnorm(); ",
+         "install it with install.packages(\"psych\")")
+  }
+
+  # identical() rather than `!=` so that seed = 0 is honoured as a real seed
+  if (!is.null(seed) && !identical(seed, FALSE)) {
+    set.seed(seed)
+  }
+
+  fit <- psych::principal(cr.cor, rotate = "none", nfactors = k)
+  f <- matrix(fit$loadings[1:k, 1:k], nrow = k)
+
+  # k columns of independent N(0, 1) draws; filled column by column, which is
+  # the same draw order as generating one column at a time
+  ma <- matrix(rnorm(n * k), nrow = n, ncol = k)
+
+  sol <- t(f %*% t(ma))
+  data.frame(sol)
+}
diff --git a/R/recalib.R b/R/recalib.R
@@ -0,0 +1,13 @@
+# Rescale one variable to a new minimum and maximum.
+#
+# The observed range [cmin, cmax] of data[, var] is mapped linearly onto
+# [nmin, nmax]; all other columns are left as they are.
+#
+# Arguments:
+#   data  the dataset (data frame or matrix)
+#   var   column number or column name of the variable to rescale
+#   low   new minimum value
+#   high  new maximum value (if low > high the two are swapped)
+#
+# Returns `data` with the selected variable rescaled.
+# Stops when the variable is constant, because the linear map is undefined
+# (division by zero) in that case.
+recalib <- function(data, var, low, high) {
+  nmin <- min(low, high)
+  nmax <- max(low, high)
+
+  cmin <- min(data[, var])
+  cmax <- max(data[, var])
+  if (cmax - cmin == 0) {
+    stop("Constant value detected")
+  }
+
+  data[, var] <- (nmax - nmin) / (cmax - cmin) * (data[, var] - cmin) + nmin
+  data
+}
diff --git a/R/revcode.R b/R/revcode.R
@@ -0,0 +1,10 @@
+# Reverse code variables.
+#
+# Every selected variable is replaced by (max - value + min), where max and
+# min are the observed maximum and minimum of that variable, so the largest
+# value becomes the smallest and vice versa.
+#
+# Arguments:
+#   data  the dataset (data frame or matrix)
+#   vars  vector of column numbers or column names to reverse code
+#
+# Returns `data` with the selected variables reverse coded; an empty `vars`
+# returns `data` unchanged.
+revcode <- function(data, vars) {
+  # seq_along() instead of 1:length(vars) so that empty `vars` is a no-op
+  for (i in seq_along(vars)) {
+    v <- vars[i]
+    rng <- range(data[, v])
+    data[, v] <- rng[2] - data[, v] + rng[1]
+  }
+  data
+}
diff --git a/man/dtrans.Rd b/man/dtrans.Rd
@@ -0,0 +1,53 @@
+\name{dtrans}
+\alias{dtrans}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Data transform
+}
+\description{
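+Rescales the variables in a dataset so that each has a specified mean and standard deviation. Each variable is transformed as m + sd * x, so the input variables are expected to be standardized (z scores).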
+}
+\usage{
+dtrans(data, m, sd, rnd = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{data}{
+name of your dataset
+}
+  \item{m}{
+indicate a vector of desired means
+}
+  \item{sd}{
+indicate a vector of desired standard deviations
+}
+  \item{rnd}{
+Indicates whether to round the values to whole numbers (no decimals): TRUE or FALSE.
+}
+}
+\author{
+Francis Huang
+}
+
+\examples{
+
+sdata<-genmvnorm(cor=c(.7,.2,.3),k=3,n=500,seed=12345)
+cor(sdata)
+summary(sdata)
+#note: data are in z scores
+
+s2<-dtrans(sdata,c(0,100,50),c(1,15,10),rnd=FALSE)
+summary(s2)
+sd(s2[,2])
+sd(s2[,3])
+#note: variables X2 and X3 are now rescaled with the appropriate means and standard deviations.
+head(s2)
+
+s2<-dtrans(sdata,c(0,100,50),c(1,15,10),rnd=TRUE)
+#at times, you may want a dataset to not have decimals. use rnd=T.
+head(s2)
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{ transform }
+\keyword{ data }% __ONLY ONE__ keyword per line
diff --git a/man/gendata-package.Rd b/man/gendata-package.Rd
@@ -0,0 +1,48 @@
+\name{gendata-package}
+\alias{gendata-package}
+\alias{gendata}
+\docType{package}
+\title{
+Generate synthetic datasets
+}
+\description{
+Create synthetic datasets based on a correlation table. Additional functions can be used to rescale, transform, and reverse code variables.
+}
+\details{
+\tabular{ll}{
+Package: \tab gendata\cr
+Type: \tab Package\cr
+Version: \tab 1.0\cr
+Date: \tab 2014-05-07\cr
+License: \tab GPL-3\cr
+}
+
+The main function, genmvnorm, creates the dataset; the additional functions modify an existing dataset. \cr
+
+genmvnorm: 
+ creates the dataset (generates a multivariate normal dataset). \cr
+recalib : for rescaling the dataset \cr
+dtrans  : for giving a variable a new mean and standard deviation \cr
+revcode : for reverse coding a variable
+}
+\author{
+Francis Huang
+
+Maintainer: Francis Huang <flh3@hotmail.com>
+
+}
+\references{
+
+Fan, X., Sivo, S., & Keenan, S. (2002). SAS for Monte Carlo studies: A guide for quantitative researchers. SAS Institute.
+}
+
+\keyword{ package }
+\seealso{
+\link[gendata:genmvnorm]{genmvnorm}
+\link[gendata:revcode]{revcode}
+\link[gendata:dtrans]{dtrans}
+\link[gendata:recalib]{recalib}
+
+
+}
+
diff --git a/man/genmvnorm.Rd b/man/genmvnorm.Rd
@@ -0,0 +1,68 @@
+\name{genmvnorm}
+\alias{genmvnorm}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+genmvnorm
+}
+\description{
+Generates a multivariate normal dataset based on a specified correlation matrix. 
+}
+\usage{
+genmvnorm(cor, k, n, seed = F)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
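+  \item{cor}{Can be a correlation matrix-- e.g., data<-cor(xyz)-- or a vector holding the lower half
+  of a correlation matrix, e.g., for a 3 variable dataset, data<-c(.7,.3,.2)-- useful for creating datasets without having to specify both halves of the correlation matrix. The vector must contain k*(k-1)/2 values and is read into the lower half column by column (r21, r31, ..., then r32, ...).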
+
+}
+  \item{k}{
+Indicate the number of variables in your dataset.
+}
+  \item{n}{
+Indicate the number of observations in your new synthetic dataset.
+}
+  \item{seed}{
+For reproducibility of results, set a specific seed number.
+}
+}
+\details{
+For creating synthetic datasets. Based on the SAS chapter by Fan et al. (2002).
+}
+
+\references{Based on:
+
+Fan, X., Sivo, S., & Keenan, S. (2002). SAS for Monte Carlo studies: A guide for quantitative researchers. SAS Institute.
+}
+\author{
+Francis Huang
+}
+
+
+\seealso{
+\link[gendata:revcode]{revcode}
+\link[gendata:dtrans]{dtrans}
+\link[gendata:recalib]{recalib}
+
+}
+\examples{
+sdata<-genmvnorm(cor=c(.7,.2,.3),k=3,n=500,seed=12345)
+cor(sdata)
+#dataset above uses the lower half of a correlation table
+#     1  .7  .2
+#     .7  1  .3
+#     .2 .3   1
+# Can also use a correlation table
+
+data(iris)
+dat<-cor(iris[,1:3])
+dat
+sdata<-genmvnorm(cor=dat,k=3,n=100,seed=123)
+cor(sdata)
+
+#example above uses the IRIS dataset.
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{ montecarlo }
+\keyword{ synthetic }% __ONLY ONE__ keyword per line
diff --git a/man/recalib.Rd b/man/recalib.Rd
@@ -0,0 +1,58 @@
+\name{recalib}
+\alias{recalib}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Recalibrate (rescale) variables
+}
+\description{
+Rescale variables (one at a time) to have a new minimum and maximum value.
+}
+\usage{
+recalib(data, var, low, high)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{data}{the dataset to use.
+}
+  \item{var}{indicate the variable number (or variable name).}
+  \item{low}{
+Indicate the new minimum value.
+}
+  \item{high}{
+Indicate the new maximum value.
+}
+}
+\details{
+Specify the rescaling of variables one at a time.
+}
+
+
+\author{
+Francis Huang
+}
+
+
+\seealso{
+
+\link[gendata:genmvnorm]{genmvnorm}
+\link[gendata:revcode]{revcode}
+\link[gendata:dtrans]{dtrans}
+}
+\examples{
+sdata<-genmvnorm(cor=c(.7,.2,.3),k=3,n=500,seed=12345)
+cor(sdata)
+summary(sdata[,1])
+#note the min and max of variable X1
+#changes variable one to have a minimum of 10 and a maximum of 50
+#correlations remain the same
+
+s2<-recalib(sdata,1,10,50)
+cor(s2)
+summary(s2[,1])
+#note revised values of variable X1
+
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{ manip }
+\keyword{ utilities }% __ONLY ONE__ keyword per line
diff --git a/man/revcode.Rd b/man/revcode.Rd
@@ -0,0 +1,35 @@
+\name{revcode}
+\alias{revcode}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Reverse coding variables
+}
+\description{
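+Reverse codes variables: each value is replaced by (maximum + minimum - value), using the observed minimum and maximum of that variable, so the highest value becomes the lowest and vice versa.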
+}
+\usage{
+revcode(data, vars)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{data}{
+indicates your dataset.
+}
+  \item{vars}{
+indicates the variable number(s) or name(s) to reverse code; a vector may be given to reverse code several variables at once.
+}
+}
+
+\author{
+Francis Huang
+}
+
+
+\seealso{
+\link[gendata:genmvnorm]{genmvnorm}
+\link[gendata:dtrans]{dtrans}
+\link[gendata:recalib]{recalib}
+}
+
+\keyword{ reverse }
+\keyword{ recode }% __ONLY ONE__ keyword per line