version 1.0

cran · Aug 24, 2023 · 07b81d9 · 07b81d9
commit 07b81d9
Show file tree

Hide file tree

Showing 13 changed files with 579 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,32 @@
+Package: PreProcessRecordLinkage
+Type: Package
+Title: Preprocessing Record Linkage
+Version: 1.0
+Date: 2023-08-16
+Authors@R: c(
+      person(given = "Hossein",
+        family = "Hassani",
+        role = c("aut"),
+        email = "Hassani.stat@gmail.com"
+        ),
+      person(given = "Leila",
+        family = "Marvian Mashhad",
+        role = c("aut","cre"),
+        email = "Leila.marveian@gmail.com"
+        )
+    )	 
+Description: In this record linkage package, data preprocessing has been meticulously executed
+  to cover a wide range of datasets, ensuring that variable names are standardized using synonyms. 
+  This approach facilitates seamless data integration and analysis across various datasets. While users 
+  have the flexibility to modify variable names, the system intelligently ensures that 
+  changes are only permitted when they do not compromise data consistency or essential variable essence.
+License: GPL-3
+Depends: tm, syn, RecordLinkage
+Imports: stringr, data.table
+Maintainer: Leila Marvian Mashhad <Leila.marveian@gmail.com>
+NeedsCompilation: no
+Packaged: 2023-08-24 09:43:03 UTC; ne_da
+Author: Hossein Hassani [aut],
+  Leila Marvian Mashhad [aut, cre]
+Repository: CRAN
+Date/Publication: 2023-08-24 11:00:17 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,12 @@
+26c1e6d639a29ac4842abf25339f484d *DESCRIPTION
+60ac651a261b19855d9a33e3dc44fb07 *NAMESPACE
+69a5c8d719b1604e3059ee16ce77ecdf *R/chzInput.R
+e4c49075a3906fcf9cdb9a9387c6ea13 *R/create_new_data.R
+f9df0f89babcc954b83489af44ac6825 *R/preproc.R
+796b898ee61d83f65e373754b224d27a *R/preprocLinkage.R
+bd80a97c21abe0af03f7a327df3ada78 *R/selVar.R
+ec68c05bf87d3d4b3bab2af03877d18d *man/chzInput.Rd
+f33c15519bcce59f399a260fcc3e5aa9 *man/create_new_data.Rd
+c8d68eb8068b9e0d9d5092944f93ad5e *man/preproc.Rd
+96401d7d5ed08fbb357c0b2377dcbf5b *man/preprocLinkage.Rd
+b6da3159842a9cfce7a156420ef42916 *man/selVar.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,13 @@
+export("chzInput", 
+	"create_new_data", 
+	"preproc", 
+	"preprocLinkage", 
+	"selVar")
+
+import("tm",
+       "stringr",
+       "syn",
+       "RecordLinkage",
+       "data.table")
+
+importFrom("utils", "head")
diff --git a/R/chzInput.R b/R/chzInput.R
@@ -0,0 +1,14 @@
+chzInput <- function(d1, d2, chz = "NULL"){
+	pre_d <- preproc(d1, d2)
+	#ii <- pre_d$num_changed_var
+	cc <- pre_d$num_changed_var_d2
+
+	pre_d$name_d2[cc[chz]] <- pre_d$name_initial2[cc[chz]]
+
+	if (length(pre_d$name_initial2[cc[chz]]) == 0) {
+          message("No variable names have been changed.")
+	    return(pre_d$name_d2)
+     } else {
+      return(pre_d$name_d2)
+         }	
+}
diff --git a/R/create_new_data.R b/R/create_new_data.R
@@ -0,0 +1,39 @@
+create_new_data <-
+function(d1, d2, chz = "NULL"){
+
+pre_d <- preproc(d1, d2)
+
+names(d1) <- pre_d$name_d1
+names(d2) <- chzInput(d1, d2, chz = chz)
+
+d1 <- d1[sort(names(d1))]
+d2 <- d2[sort(names(d2))]
+
+data1 <- d1[intersect(names(d1),names(d2))] #Finding intersections columns
+data2 <- d2[intersect(names(d1),names(d2))] #Finding intersections columns
+
+data1 <- lapply(data1, tolower)
+data2 <- lapply(data2, tolower)
+
+if(sum(names(d1)=="gender")>0)
+{
+data1$gender[data1$gender=="male"] <- "m"
+data1$gender[data1$gender=="female"] <- "f"
+
+data2$gender[data2$gender=="male"] <- "m"
+data2$gender[data2$gender=="female"] <- "f"
+}
+
+if(sum(names(d1)=="sex")>0)
+{
+data1$sex[data1$sex=="male"] <- "m"
+data1$sex[data1$sex=="female"] <- "f"
+
+data2$sex[data2$sex=="male"] <- "m"
+data2$sex[data2$sex=="female"] <- "f"
+}
+
+data1 <- data.frame(data1)
+data2 <- data.frame(data2)
+return(list(data1 = data1,data2 = data2))
+}
diff --git a/R/preproc.R b/R/preproc.R
@@ -0,0 +1,108 @@
+preproc <- function(d1, d2)
+{
+      ## ----- Data Preparation ------
+	names(d1) <- tolower(names(d1))
+	names(d2) <- tolower(names(d2))
+
+	#Function for removing Punctuation mark without remove space between them
+	replacePunctuation <- content_transformer(function(x) {return (gsub("[[:punct:]]"," ", x))}) 
+
+	cnd3 <- Corpus(VectorSource(names(d1)))
+	cnd4 <- tm_map(cnd3, replacePunctuation )
+
+	names(d1) <- t(as.matrix(data.frame(cnames=get('content', cnd4))))
+
+	cnd <- Corpus(VectorSource(names(d2)))
+	cnd2 <- tm_map(cnd, replacePunctuation )
+
+	names(d2) <- t(as.matrix(data.frame(cnames=get('content', cnd2))))
+
+	##Removing possible empty spaces created at the beginning and end of variable names after removing punctuation marks
+	names(d1) <- trimws(names(d1))
+	names(d2) <- trimws(names(d2))
+
+	## ---- Names of variables before finding synonyms
+	name_initial1 <- names(d1)
+	name_initial2 <- names(d2)
+
+	## ---- Finding synonyms by syn package ----
+	ii = c()
+	cc = c()
+	for(i in 1:length(syns(names(d1)))){
+	if(sum(names(d2)[names(d2) %in% syns(names(d1))[[i]]] != names(d1)[i])!= 0 )
+	{
+	ii = c(ii,i) # Changes related to the first data set
+	cc1 = which(names(d2) %in% syns(names(d1))[[i]])
+	cc = c(cc,cc1) #Changes related to the second data set
+	names(d2)[names(d2) %in% syns(names(d1))[[i]]] <- names(d1)[i]
+	}	
+	} 
+
+	var_d2 <-  names(d2)[cc]     #Variables that have been changed after preprocessing
+	var_init_d2 <- name_initial2[cc] #Initial variables 
+
+	# Substituting a few well-known variables for assimilation	
+	names(d1) <- gsub("first name","name1",names(d1))
+	names(d1) <- gsub("last name","name2",names(d1))
+
+	names(d2) <- gsub("first name","name1",names(d2))
+	names(d2) <- gsub("last name","name2",names(d2))
+
+	#names(d1) <- gsub("firstname","name1",names(d1))
+	#names(d1) <- gsub("lastname","name2",names(d1))
+
+	#names(d2) <- gsub("firstname","name1",names(d2))
+	#names(d2) <- gsub("lastname","name2",names(d2))
+
+	names(d1) <- gsub("first","name1",names(d1))
+	names(d1) <- gsub("last","name2",names(d1))
+
+	names(d2) <- gsub("first","name1",names(d2))
+	names(d2) <- gsub("last","name2",names(d2))
+
+	names(d1) <- gsub("sex","gender",names(d1))
+	names(d2) <- gsub("sex","gender",names(d2))
+
+
+	# Remove the space between the variable name
+	names(d1) <- str_replace_all(names(d1),' ','')
+	names(d2) <- str_replace_all(names(d2),' ','')
+
+
+	mylist <- list(var_d2 = var_d2, cls_var_d2 = class(var_d2),
+			  var_init_d2 = var_init_d2, cls_var_init_d2 = class(var_init_d2), 
+			  head_changed_data1 = head(d1[ii]),
+			  head_changed_data2 = head(d2[cc]),
+			  num_changed_var_d1 = ii,
+			  num_changed_var_d2 = cc,
+			  name_d1 = names(d1), name_initial1 = name_initial1,
+			  name_d2 = names(d2), name_initial2 = name_initial2,
+			  d1 = d1, d2 = d2)
+		        #counter = 1:length(ii))
+
+	attr(mylist, "class") <- "explain"
+	return(mylist)
+}
+
+print.explain <-
+function(x, ...) 
+{
+	if (length(x$var2) == 0) {
+       message("No variable names have been changed.")
+       } else {
+	message("Changed variable's names are:")
+      message(paste(x$var2),'\n')  	   
+	message("Changed variable's classes are:")
+	message(paste(class(x$var_d2),'\n'))
+	message("Initial variable's names for changed variable's names are:")
+      message(paste(x$var_init_d2),'\n')  	
+	message("Initial variable's classes are:")
+	message(paste(class(x$var_init_d2), '\n')) 
+	message("A number of changed variable values for the first dataset are:\n")
+	print(head(x$d1[x$num_changed_var_d1]))
+	message("A number of changed variable values for the second dataset are:\n")
+	print(head(x$d2[x$num_changed_var_d2]))
+	message("Number of changed variable's names are:\n", x$num_changed_var_d1,"\n")
+	}
+}
+
diff --git a/R/preprocLinkage.R b/R/preprocLinkage.R
@@ -0,0 +1,103 @@
+preprocLinkage <- function(d1,d2,chz="NULL",var=c("age","sex"),
+threshold=0.9,res1="res1",res2="res2")
+{
+	dd <- create_new_data(d1, d2, chz = chz)
+	data1 <- dd$data1
+	data2 <- dd$data2
+
+	res <- RLBigDataLinkage(dataset1 =data1 ,dataset2 = data2,
+                              strcmp = TRUE, blockfld = var)
+
+	res1 <- epiWeights(res)
+
+	result <- epiClassify(rpairs = res1, threshold.upper = threshold)
+
+	finalres <- getPairs(result, min.weight=0.7, filter.link = "link")
+
+	finalres2 <- finalres[-seq(0,nrow(finalres), by = 3)[-1],] #Excluding empty rows 
+	rows <- 1:(nrow(finalres2)/2)
+	number_int <- rep(rows, each = 2)
+	number_int[seq(0,length(number_int),by=2)[-1]] = " "
+	if(nrow(finalres2)==0){return(message("No special Linkage has been done."))}
+
+	finalres2 <- cbind(number_int, finalres2)
+
+	dataset <- rep(c('dataset1','dataset2'),(length(rows)/2))#Computing dataset's names
+	finalres2 <- cbind(finalres2,dataset)
+
+	if(nrow(finalres2)<1000000)
+	{
+		fwrite(finalres2, file = paste(tempdir(),'\\',res1,'.csv', sep = ''))
+		message(paste(tempdir(), '\\', res1,'.csv',' created', sep='')) 
+	}
+
+	if(nrow(finalres2)>=1000000)
+	{
+		save(finalres2, file = paste(tempdir(),'\\', res1,'.rdata', sep=''))
+		message(paste(tempdir(), '\\', res1,'.RData',' created', sep=''))
+		message('To see the results in the created file, first call the data.table package')
+	}
+
+	dif1 <- setdiff(names(d1), names(data1))#Finding different variables
+	dif2 <- setdiff(names(d2), names(data2))
+
+	finalres3 <- finalres2[-length(finalres2)]   #Removing the last column(it has been shown dataset's name)
+
+	#Creating new data frames based on uncommon columns
+	data3 <- cbind(data1, d1[dif1])
+	data4 <- cbind(data2, d2[dif2])
+
+	#Creating NA matrix for uncommon columns usind dataset1
+	mat1 <- matrix(data=NA,ncol=length(setdiff(names(data4),names(data3))),nrow=nrow(data3))
+	colnames(mat1) <- setdiff(names(data4), names(data3))
+
+	#Creating data frames with uncommon columns
+	data5 <- cbind(data3, mat1)
+
+	#Creating NA matrix for uncommon columns usind dataset2
+	mat2 <- matrix(data=NA,ncol=length(setdiff(names(data3),names(data4))),nrow=nrow(data4))
+	colnames(mat2) <- setdiff(names(data3), names(data4))
+
+	#Creating data frames with uncommon columns
+	data6 <- cbind(data4,mat2)
+
+	d3 <- data5[seq(1,(length(finalres3$id))-1, by = 2),]
+	d4 <- data6[seq(0,length(finalres3$id), by = 2)[-1],]
+
+	mm <- c()
+	for(x in seq(1,nrow(finalres3), by = 2))
+	{
+		mm1 <- rbind(cbind(data5[finalres3[x,]$id,][dif1],data5[finalres3[x,]$id,][dif2]),
+		cbind(data6[finalres3[(x+1),]$id,][dif1],data6[finalres3[(x+1),]$id,][dif2]))
+		mm <- rbind(mm, mm1)
+	}
+
+	if (sum(dim(mm)) != 0){
+      finalres4 <- cbind(finalres3, mm)} else {
+	finalres4 = finalres3
+	}
+
+	mm <- c()
+
+	for(x in seq(1,nrow(finalres4), by = 2))
+	{
+		mm1 <- cbind(finalres4[x,],finalres4[(x+1),])
+		mm <- rbind(mm,mm1)
+	}
+
+	finalres4 <- mm
+
+	if(nrow(finalres4)<1000000)
+	{
+		fwrite(finalres4, file = paste(tempdir(),'\\',res2,'.csv', sep = ''))
+		message(paste(tempdir(), '\\', res2,'.csv',' created', sep='')) 
+	}
+
+	if(nrow(finalres4)>=1000000)
+	{
+		save(finalres4, file = paste(tempdir(),'\\', res2,'.rdata', sep=''))
+		message(paste(tempdir(), '\\', res1,'.RData',' created', sep=''))
+		message('To see the results in the created file, first call the data.table package')
+	}
+}
+
diff --git a/R/selVar.R b/R/selVar.R
@@ -0,0 +1,12 @@
+selVar<- function(d1, d2, chz = "NULL")
+{
+	message("please select variables from this function's output and set var1 arqument 
+in the preprocLinkage function\n")
+	dd <- create_new_data(d1, d2, chz)
+
+	data1 <- dd$data1
+	data2 <- dd$data2
+
+	name_int <- names(data1)  #Intersection's names
+	return(name_int)	
+}
diff --git a/man/chzInput.Rd b/man/chzInput.Rd
@@ -0,0 +1,44 @@
+\name{chzInput}
+\alias{chzInput}
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{
+Consulting User
+}
+\description{
+After the pre processing of the data sets by \code{preproc} function, a series of changes were made on the names of the two variables for uniformity. Sometimes these changes of names based on synonyms are not desired by the user. In this function, according to the output of the \code{preproc} function, the user is asked to tell the program that any change in the name of the variables that he does not want.
+}
+\usage{
+chzInput(d1, d2, chz = "NULL")
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{d1}{
+A data frame.
+}
+  \item{d2}{
+A data frame.
+}
+  \item{chz}{
+the number of the name of the variable that the user does not want to change based on the output of the \code{preproc} function.
+}
+}
+\details{
+For more details about this function, refer to \code{preproc} function manual.
+}
+\value{
+A vector of characters.
+It is a vector of characters that shows the names of the variables of the second data set based on the opinion of the user who said which variable name should not be changed.
+}
+
+\author{
+Hossein Hassani and and Leila Marvian Mashhad.
+}
+
+\seealso{
+\code{\link{preproc}}
+}
+\examples{
+  d1 = RLdata500
+  d2 = RLdata10000
+  chzInput(d1, d2)
+}