Skip to content

Commit

Permalink
version 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Leilamarvian authored and cran-robot committed Aug 24, 2023
0 parents commit 07b81d9
Show file tree
Hide file tree
Showing 13 changed files with 579 additions and 0 deletions.
32 changes: 32 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Package: PreProcessRecordLinkage
Type: Package
Title: Preprocessing Record Linkage
Version: 1.0
Date: 2023-08-16
Authors@R: c(
person(given = "Hossein",
family = "Hassani",
role = c("aut"),
email = "Hassani.stat@gmail.com"
),
person(given = "Leila",
family = "Marvian Mashhad",
role = c("aut","cre"),
email = "Leila.marveian@gmail.com"
)
)
Description: In this record linkage package, data preprocessing has been meticulously executed
to cover a wide range of datasets, ensuring that variable names are standardized using synonyms.
This approach facilitates seamless data integration and analysis across various datasets. While users
have the flexibility to modify variable names, the system intelligently ensures that
changes are only permitted when they do not compromise data consistency or essential variable essence.
License: GPL-3
Depends: tm, syn, RecordLinkage
Imports: stringr, data.table
Maintainer: Leila Marvian Mashhad <Leila.marveian@gmail.com>
NeedsCompilation: no
Packaged: 2023-08-24 09:43:03 UTC; ne_da
Author: Hossein Hassani [aut],
Leila Marvian Mashhad [aut, cre]
Repository: CRAN
Date/Publication: 2023-08-24 11:00:17 UTC
12 changes: 12 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
26c1e6d639a29ac4842abf25339f484d *DESCRIPTION
60ac651a261b19855d9a33e3dc44fb07 *NAMESPACE
69a5c8d719b1604e3059ee16ce77ecdf *R/chzInput.R
e4c49075a3906fcf9cdb9a9387c6ea13 *R/create_new_data.R
f9df0f89babcc954b83489af44ac6825 *R/preproc.R
796b898ee61d83f65e373754b224d27a *R/preprocLinkage.R
bd80a97c21abe0af03f7a327df3ada78 *R/selVar.R
ec68c05bf87d3d4b3bab2af03877d18d *man/chzInput.Rd
f33c15519bcce59f399a260fcc3e5aa9 *man/create_new_data.Rd
c8d68eb8068b9e0d9d5092944f93ad5e *man/preproc.Rd
96401d7d5ed08fbb357c0b2377dcbf5b *man/preprocLinkage.Rd
b6da3159842a9cfce7a156420ef42916 *man/selVar.Rd
13 changes: 13 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
export("chzInput",
"create_new_data",
"preproc",
"preprocLinkage",
"selVar")

import("tm",
"stringr",
"syn",
"RecordLinkage",
"data.table")

importFrom("utils", "head")
14 changes: 14 additions & 0 deletions R/chzInput.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
chzInput <- function(d1, d2, chz = "NULL"){
pre_d <- preproc(d1, d2)
#ii <- pre_d$num_changed_var
cc <- pre_d$num_changed_var_d2

pre_d$name_d2[cc[chz]] <- pre_d$name_initial2[cc[chz]]

if (length(pre_d$name_initial2[cc[chz]]) == 0) {
message("No variable names have been changed.")
return(pre_d$name_d2)
} else {
return(pre_d$name_d2)
}
}
39 changes: 39 additions & 0 deletions R/create_new_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
create_new_data <-
function(d1, d2, chz = "NULL"){

pre_d <- preproc(d1, d2)

names(d1) <- pre_d$name_d1
names(d2) <- chzInput(d1, d2, chz = chz)

d1 <- d1[sort(names(d1))]
d2 <- d2[sort(names(d2))]

data1 <- d1[intersect(names(d1),names(d2))] #Finding intersections columns
data2 <- d2[intersect(names(d1),names(d2))] #Finding intersections columns

data1 <- lapply(data1, tolower)
data2 <- lapply(data2, tolower)

if(sum(names(d1)=="gender")>0)
{
data1$gender[data1$gender=="male"] <- "m"
data1$gender[data1$gender=="female"] <- "f"

data2$gender[data2$gender=="male"] <- "m"
data2$gender[data2$gender=="female"] <- "f"
}

if(sum(names(d1)=="sex")>0)
{
data1$sex[data1$sex=="male"] <- "m"
data1$sex[data1$sex=="female"] <- "f"

data2$sex[data2$sex=="male"] <- "m"
data2$sex[data2$sex=="female"] <- "f"
}

data1 <- data.frame(data1)
data2 <- data.frame(data2)
return(list(data1 = data1,data2 = data2))
}
108 changes: 108 additions & 0 deletions R/preproc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
preproc <- function(d1, d2)
{
## ----- Data Preparation ------
names(d1) <- tolower(names(d1))
names(d2) <- tolower(names(d2))

#Function for removing Punctuation mark without remove space between them
replacePunctuation <- content_transformer(function(x) {return (gsub("[[:punct:]]"," ", x))})

cnd3 <- Corpus(VectorSource(names(d1)))
cnd4 <- tm_map(cnd3, replacePunctuation )

names(d1) <- t(as.matrix(data.frame(cnames=get('content', cnd4))))

cnd <- Corpus(VectorSource(names(d2)))
cnd2 <- tm_map(cnd, replacePunctuation )

names(d2) <- t(as.matrix(data.frame(cnames=get('content', cnd2))))

##Removing possible empty spaces created at the beginning and end of variable names after removing punctuation marks
names(d1) <- trimws(names(d1))
names(d2) <- trimws(names(d2))

## ---- Names of variables before finding synonyms
name_initial1 <- names(d1)
name_initial2 <- names(d2)

## ---- Finding synonyms by syn package ----
ii = c()
cc = c()
for(i in 1:length(syns(names(d1)))){
if(sum(names(d2)[names(d2) %in% syns(names(d1))[[i]]] != names(d1)[i])!= 0 )
{
ii = c(ii,i) # Changes related to the first data set
cc1 = which(names(d2) %in% syns(names(d1))[[i]])
cc = c(cc,cc1) #Changes related to the second data set
names(d2)[names(d2) %in% syns(names(d1))[[i]]] <- names(d1)[i]
}
}

var_d2 <- names(d2)[cc] #Variables that have been changed after preprocessing
var_init_d2 <- name_initial2[cc] #Initial variables

# Substituting a few well-known variables for assimilation
names(d1) <- gsub("first name","name1",names(d1))
names(d1) <- gsub("last name","name2",names(d1))

names(d2) <- gsub("first name","name1",names(d2))
names(d2) <- gsub("last name","name2",names(d2))

#names(d1) <- gsub("firstname","name1",names(d1))
#names(d1) <- gsub("lastname","name2",names(d1))

#names(d2) <- gsub("firstname","name1",names(d2))
#names(d2) <- gsub("lastname","name2",names(d2))

names(d1) <- gsub("first","name1",names(d1))
names(d1) <- gsub("last","name2",names(d1))

names(d2) <- gsub("first","name1",names(d2))
names(d2) <- gsub("last","name2",names(d2))

names(d1) <- gsub("sex","gender",names(d1))
names(d2) <- gsub("sex","gender",names(d2))


# Remove the space between the variable name
names(d1) <- str_replace_all(names(d1),' ','')
names(d2) <- str_replace_all(names(d2),' ','')


mylist <- list(var_d2 = var_d2, cls_var_d2 = class(var_d2),
var_init_d2 = var_init_d2, cls_var_init_d2 = class(var_init_d2),
head_changed_data1 = head(d1[ii]),
head_changed_data2 = head(d2[cc]),
num_changed_var_d1 = ii,
num_changed_var_d2 = cc,
name_d1 = names(d1), name_initial1 = name_initial1,
name_d2 = names(d2), name_initial2 = name_initial2,
d1 = d1, d2 = d2)
#counter = 1:length(ii))

attr(mylist, "class") <- "explain"
return(mylist)
}

print.explain <-
function(x, ...)
{
if (length(x$var2) == 0) {
message("No variable names have been changed.")
} else {
message("Changed variable's names are:")
message(paste(x$var2),'\n')
message("Changed variable's classes are:")
message(paste(class(x$var_d2),'\n'))
message("Initial variable's names for changed variable's names are:")
message(paste(x$var_init_d2),'\n')
message("Initial variable's classes are:")
message(paste(class(x$var_init_d2), '\n'))
message("A number of changed variable values for the first dataset are:\n")
print(head(x$d1[x$num_changed_var_d1]))
message("A number of changed variable values for the second dataset are:\n")
print(head(x$d2[x$num_changed_var_d2]))
message("Number of changed variable's names are:\n", x$num_changed_var_d1,"\n")
}
}

103 changes: 103 additions & 0 deletions R/preprocLinkage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
preprocLinkage <- function(d1,d2,chz="NULL",var=c("age","sex"),
threshold=0.9,res1="res1",res2="res2")
{
dd <- create_new_data(d1, d2, chz = chz)
data1 <- dd$data1
data2 <- dd$data2

res <- RLBigDataLinkage(dataset1 =data1 ,dataset2 = data2,
strcmp = TRUE, blockfld = var)

res1 <- epiWeights(res)

result <- epiClassify(rpairs = res1, threshold.upper = threshold)

finalres <- getPairs(result, min.weight=0.7, filter.link = "link")

finalres2 <- finalres[-seq(0,nrow(finalres), by = 3)[-1],] #Excluding empty rows
rows <- 1:(nrow(finalres2)/2)
number_int <- rep(rows, each = 2)
number_int[seq(0,length(number_int),by=2)[-1]] = " "
if(nrow(finalres2)==0){return(message("No special Linkage has been done."))}

finalres2 <- cbind(number_int, finalres2)

dataset <- rep(c('dataset1','dataset2'),(length(rows)/2))#Computing dataset's names
finalres2 <- cbind(finalres2,dataset)

if(nrow(finalres2)<1000000)
{
fwrite(finalres2, file = paste(tempdir(),'\\',res1,'.csv', sep = ''))
message(paste(tempdir(), '\\', res1,'.csv',' created', sep=''))
}

if(nrow(finalres2)>=1000000)
{
save(finalres2, file = paste(tempdir(),'\\', res1,'.rdata', sep=''))
message(paste(tempdir(), '\\', res1,'.RData',' created', sep=''))
message('To see the results in the created file, first call the data.table package')
}

dif1 <- setdiff(names(d1), names(data1))#Finding different variables
dif2 <- setdiff(names(d2), names(data2))

finalres3 <- finalres2[-length(finalres2)] #Removing the last column(it has been shown dataset's name)

#Creating new data frames based on uncommon columns
data3 <- cbind(data1, d1[dif1])
data4 <- cbind(data2, d2[dif2])

#Creating NA matrix for uncommon columns usind dataset1
mat1 <- matrix(data=NA,ncol=length(setdiff(names(data4),names(data3))),nrow=nrow(data3))
colnames(mat1) <- setdiff(names(data4), names(data3))

#Creating data frames with uncommon columns
data5 <- cbind(data3, mat1)

#Creating NA matrix for uncommon columns usind dataset2
mat2 <- matrix(data=NA,ncol=length(setdiff(names(data3),names(data4))),nrow=nrow(data4))
colnames(mat2) <- setdiff(names(data3), names(data4))

#Creating data frames with uncommon columns
data6 <- cbind(data4,mat2)

d3 <- data5[seq(1,(length(finalres3$id))-1, by = 2),]
d4 <- data6[seq(0,length(finalres3$id), by = 2)[-1],]

mm <- c()
for(x in seq(1,nrow(finalres3), by = 2))
{
mm1 <- rbind(cbind(data5[finalres3[x,]$id,][dif1],data5[finalres3[x,]$id,][dif2]),
cbind(data6[finalres3[(x+1),]$id,][dif1],data6[finalres3[(x+1),]$id,][dif2]))
mm <- rbind(mm, mm1)
}

if (sum(dim(mm)) != 0){
finalres4 <- cbind(finalres3, mm)} else {
finalres4 = finalres3
}

mm <- c()

for(x in seq(1,nrow(finalres4), by = 2))
{
mm1 <- cbind(finalres4[x,],finalres4[(x+1),])
mm <- rbind(mm,mm1)
}

finalres4 <- mm

if(nrow(finalres4)<1000000)
{
fwrite(finalres4, file = paste(tempdir(),'\\',res2,'.csv', sep = ''))
message(paste(tempdir(), '\\', res2,'.csv',' created', sep=''))
}

if(nrow(finalres4)>=1000000)
{
save(finalres4, file = paste(tempdir(),'\\', res2,'.rdata', sep=''))
message(paste(tempdir(), '\\', res1,'.RData',' created', sep=''))
message('To see the results in the created file, first call the data.table package')
}
}

12 changes: 12 additions & 0 deletions R/selVar.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
selVar<- function(d1, d2, chz = "NULL")
{
message("please select variables from this function's output and set var1 arqument
in the preprocLinkage function\n")
dd <- create_new_data(d1, d2, chz)

data1 <- dd$data1
data2 <- dd$data2

name_int <- names(data1) #Intersection's names
return(name_int)
}
44 changes: 44 additions & 0 deletions man/chzInput.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
\name{chzInput}
\alias{chzInput}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{
Consulting User
}
\description{
After the pre processing of the data sets by \code{preproc} function, a series of changes were made on the names of the two variables for uniformity. Sometimes these changes of names based on synonyms are not desired by the user. In this function, according to the output of the \code{preproc} function, the user is asked to tell the program that any change in the name of the variables that he does not want.
}
\usage{
chzInput(d1, d2, chz = "NULL")
}
%- maybe also 'usage' for other objects documented here.
\arguments{
\item{d1}{
A data frame.
}
\item{d2}{
A data frame.
}
\item{chz}{
the number of the name of the variable that the user does not want to change based on the output of the \code{preproc} function.
}
}
\details{
For more details about this function, refer to \code{preproc} function manual.
}
\value{
A vector of characters.
It is a vector of characters that shows the names of the variables of the second data set based on the opinion of the user who said which variable name should not be changed.
}

\author{
Hossein Hassani and and Leila Marvian Mashhad.
}

\seealso{
\code{\link{preproc}}
}
\examples{
d1 = RLdata500
d2 = RLdata10000
chzInput(d1, d2)
}

0 comments on commit 07b81d9

Please sign in to comment.