R/read.epiinfo.R

### This file is part of the 'foreign' package for R.

### R/read.epiinfo.R
### (c) 2002-4 Thomas Lumley
### Patches (c) 2002 Mark Myatt

#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  A copy of the GNU General Public License is available at
#  http://www.r-project.org/Licenses/

read.epiinfo <- function (file, read.deleted = FALSE,
                          guess.broken.dates = FALSE, thisyear = NULL,
                          lower.case.names = FALSE)
  {
    if (is.character(file)) {
        file <- file(file, "rt")
        on.exit(close(file))
    }
    if (!inherits(file, "connection"))
        stop("argument 'file' must be a character string or connection")
    if (!isOpen(file)) {
        open(file, "rt")
        on.exit(close(file))
    }
    line <- readLines(file, 1L, ok = TRUE)
    headerlength <- na.omit(sapply(strsplit(line, " ")[[1L]], as.numeric))[1L]
    if (headerlength <= 0L)
        stop("file has zero or fewer variables: probably not an EpiInfo file")
    headerlines <- readLines(file, n = headerlength)
    pushBack(headerlines, file)
    comments <- vapply(headerlines, function(s) substring(s, 46L, 46L + 80L), "")
    #
    # Added comment = ""  to fix '#' as entrychar being read as a comment
    #
    header <- scan(file, nlines = headerlength,
                   what = list(name = "", x = 0, y = 0, color = 0, x1 = 0,
                   y1 = 0, type = 0, len = 0, color = 0),
                   flush = TRUE, quiet = TRUE, comment.char = "")
    header <- as.data.frame(lapply(header, I))
    header$start <- cumsum(c(1L, header$len))[1L:headerlength]
    header$stop <- cumsum(header$len)
    multiline <- ceiling(max(header$stop) / 78L)
    really.variables <- header$len != 0
    header <- header[really.variables, ]
    entrychar <- substr(header$name, 1L, 1L)
    if (all(entrychar %in% c("#", "_")))
        header$name <- substr(header$name, 2L, 12L)
    comments <- comments[really.variables]
    #
    # Added support for EpiData introduced field types:
    #
    #   12  Automatic ID number fields (treated as numeric)
    #   16  European (i.e. dd/mm/yyyy) format automatic date
    #   17  SOUNDEX field
    #
    numbers <- (header$len > 0L) & ((header$type %in% c(0L, 6L, 12L)) | (header$type > 12L)) & !(header$type %in% c(16L, 17L))
    datalines <- scan(file, what = "", sep = "\n", quote = "", quiet = TRUE, blank.lines.skip = TRUE, comment.char = "")
    #
    # Added check for empty file
    #
    if (length(datalines) == 0L)
      stop("no records in file")
    if (length(datalines)%%multiline)
        warning("wrong number of records")
    datalines <- matrix(datalines, nrow = multiline)
    if (multiline > 1L)
        datalines[-multiline, ] <- substr(datalines[-multiline, ], 1L, 78L)
    datalines <- apply(datalines, 2L, paste, collapse = "")
    deleted <- substr(datalines, nchar(datalines), nchar(datalines)) == "?"
    nvars <- NROW(header)
    data <- as.data.frame(lapply(1L:nvars, function(i) I(substring(datalines, header$start[i], header$stop[i]))))
    names(data) <- header$name
    names(comments) <- header$name
    if (is.na(read.deleted))
        data[deleted, ] <- NA
    else if (!read.deleted)
        data <- data[!deleted, ]
    if (guess.broken.dates && is.null(thisyear))
        thisyear <- format(Sys.time(), format = "%Y")
    #
    # Added support for field types:
    #
    #   10  US (i.e. mm/dd/yyyy) format field (EpiInfo)
    #   12  Automatic ID number (treated as numeric)
    #   16  European (i.e. dd/mm/yyyy) format automatic date (EpiData)
    #   17  SOUNDEX field (EpiData)
    #
    for (i in 1L:nvars) {
        if (numbers[i])
            data[[i]] <- as.numeric(data[[i]])
        else if (header$type[i] == 5L)
            data[[i]] <- ifelse(data[[i]] %in% c("Y", "N"), data[[i]] == "Y", NA)
        else if (header$type[i] %in% c(11L, 16L) && header$len[i] == 5L && guess.broken.dates)
            data[[i]] <- as.Date(strptime(paste(data[[i]], thisyear, sep = "/"), format = "%d/%m/%Y"))
        else if (header$type[i] %in% c(11L, 16L) && header$len[i] == 8L && guess.broken.dates)
            data[[i]] <- as.Date(strptime(data[[i]], format = "%d/%m/%y"))
        else if (header$type[i] %in% c(11L, 16L) && header$len[i] == 10L)
            data[[i]] <- as.Date(strptime(data[[i]], format = "%d/%m/%Y"))
        else if (header$type[i] %in% c(2L, 10L) && header$len[i] == 5L && guess.broken.dates)
            data[[i]] <- as.Date(strptime(paste(data[[i]], thisyear, sep = "/"), format = "%m/%d/%Y"))
        else if (header$type[i] %in% c(2L, 10L) && header$len[i] == 8L && guess.broken.dates)
            data[[i]] <- as.Date(strptime(data[[i]], format = "%m/%d/%y"))
        else if (header$type[i] %in% c(2L, 10L) && header$len[i] == 10L)
            data[[i]] <- as.Date(strptime(data[[i]], format = "%m/%d/%Y"))
        #
        # SOUNDEX (type 17) fields
        #
        else if (header$type[i] == 17L) {
            data[[i]][substr(data[[i]], 1L, 1L) == " "] <- NA
            data[[i]] <- substr(data[[i]], 1L, 5L)
            }
        else {
            blanks <- grep("^[[:blank:]]*$", data[[i]])
            data[[i]][blanks] <- NA
        }
    }
    if (!is.na(read.deleted) && read.deleted)
        attr(data, "deleted") <- deleted
    attr(data, "prompts") <- comments
    #
    # Added parameter and code to specify lower case variable names
    #
    if (lower.case.names)
      names(data) <- tolower(names(data))
    data
}