Skip to content

Commit

Permalink
version 0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
BrianAronson authored and cran-robot committed Jul 1, 2019
0 parents commit be5c9c4
Show file tree
Hide file tree
Showing 34 changed files with 1,230 additions and 0 deletions.
18 changes: 18 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,18 @@
Package: easyPSID
Title: Reading, Formatting, and Organizing the Panel Study of Income
Dynamics (PSID)
Version: 0.1.0
Authors@R: person("Brian", "Aronson", email = "bdaronson@gmail.com", role = c("aut", "cre"))
Description: Provides various functions for reading and preparing the Panel Study of Income Dynamics (PSID) for longitudinal analysis, including functions that read the PSID's fixed width format files directly into R, rename all of the PSID's longitudinal variables so that recurring variables have consistent names across years, simplify assembling longitudinal datasets from cross sections of the PSID Family Files, and export the resulting PSID files into file formats common among other statistical programming languages ('SAS', 'STATA', and 'SPSS').
Depends: R (>= 3.0.1)
Imports: stringr (>= 1.0.0), LaF (>= 0.6.0), foreign (>= 0.8-67)
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0
NeedsCompilation: no
Packaged: 2019-06-27 17:14:05 UTC; bda13
Author: Brian Aronson [aut, cre]
Maintainer: Brian Aronson <bdaronson@gmail.com>
Repository: CRAN
Date/Publication: 2019-07-01 10:40:03 UTC
2 changes: 2 additions & 0 deletions LICENSE
@@ -0,0 +1,2 @@
YEAR: 2019
COPYRIGHT HOLDER: Brian Aronson
33 changes: 33 additions & 0 deletions MD5
@@ -0,0 +1,33 @@
48e637c1ec93b52fa6075a99e45a8455 *DESCRIPTION
02d1618577bd2d0466aa03f413aca193 *LICENSE
9506bff0123601879225c2af3647593c *NAMESPACE
504c78d1143d2eabbd7353bebcc55ee2 *R/10find_desc.R
cfc68a5cdddb81746aafa197f464b6ba *R/1unzip_all_files.R
062d276c9308d1224c420382ee0aaf2a *R/2convert_to_rds.R
1860d15929b916dee6838fc6f6248716 *R/3rename_fam_vars.R
f7c51e1c966398975dc7abbab7a58161 *R/4rename_ind_vars.R
cff7ac15d77e2fd53628e881d932b3d3 *R/5create_custom_panel.R
5eddd106681ec152066276a80931d8fa *R/6create_extract.R
d1b5d0e6a1acb4cc5341eef4f0745346 *R/7convert_from_rdata.R
6c1013c14d817547ce674d85fbd2f7ed *R/8find_name.R
220925bea5f68f355e628352b4e141db *R/9find_years.R
00ee728a84bf1ccda82983f101a23f94 *R/PSID_Package.Documentation_function.R
908cf2deb6da90fdac560e91e9265953 *R/sysdata.rda
90bc0e395f4bcf6dc42cd1070c944d62 *README.md
a3ec50e70b44fb699009d06d6d1c402e *inst/extdata/rds_dir/FAM1968.rds
a24bf4365b653a291af90f9d9fb457d7 *inst/extdata/rds_dir/FAM1969.rds
b775868c5cd850722782a8ce5297228d *inst/extdata/rds_dir/IND2015.rds
d9542fbbf2513e8559a8be7317c492be *inst/extdata/unzip_dir/FAM1984.do
6b31772af4073883c76d77aae74499cb *inst/extdata/unzip_dir/FAM1984.txt
903436bd03d13eee3bdaf84bcaa2456a *inst/extdata/zip_dir/fam1968.zip
afe3f0e80913ceebd730903077fb25b4 *man/convert_from_rds.Rd
dc0873bd81886c48b5df83555ea6e522 *man/convert_to_rds.Rd
8b87c53e7689dd30607c94331317fe78 *man/create_custom_panel.Rd
005e57ff5f826054a68df190e089465b *man/create_extract.Rd
8cca39d80dea448b4043ea520ede9e66 *man/easyPSID-package.Rd
a5fe95482fbcd9ed1354d3a884a7aef3 *man/find_description.Rd
a55e08b7c1a1c1a6e2f714c55e567c71 *man/find_name.Rd
81bae55cab97ac5a020621077ad19eba *man/find_years.Rd
3b7b2ddefacf1506ffd638a87f43f5b7 *man/rename_fam_vars.Rd
e4fb51302e862a2a7c6135eecfd9a30b *man/rename_ind_vars.Rd
342a6ed56d8ddd08f725e28d55e72e8d *man/unzip_all_files.Rd
19 changes: 19 additions & 0 deletions NAMESPACE
@@ -0,0 +1,19 @@
# Generated by roxygen2: do not edit by hand

export(convert_from_rds)
export(convert_to_rds)
export(create_custom_panel)
export(create_extract)
export(find_description)
export(find_name)
export(find_years)
export(rename_fam_vars)
export(rename_ind_vars)
export(unzip_all_files)
importFrom(LaF,laf_open_fwf)
importFrom(foreign,write.dta)
importFrom(foreign,write.foreign)
importFrom(stringr,str_extract)
importFrom(stringr,str_extract_all)
importFrom(utils,object.size)
importFrom(utils,unzip)
23 changes: 23 additions & 0 deletions R/10find_desc.R
@@ -0,0 +1,23 @@
#' Find description of PSID variable
#' @description Finds the descriptions of selected PSID variables.
#' @param variables Variable names to look up (as individual string or vector of strings)
#' @keywords PSID
#' @export
#' @examples find_description(variables=c("V2","V30"))

find_description<-function(variables){
#1) Make sure variable is in character format
variables<-as.character(variables)
#2) Find variables in reference dataset
varrow<-data$psid_desc[match(variables,data$psid_desc[,1]),]
#3) If variables is not in reference dataset, just show the variables name
if(all(is.na(varrow))){
return(variables)
}
#4) Otherwise, print variables name and description
return(varrow)
}




27 changes: 27 additions & 0 deletions R/1unzip_all_files.R
@@ -0,0 +1,27 @@
#' Unzip all PSID files
#' @description Unzips all .zip_files files in the specified directory.
#' @param in_direc Directory of .zip files to be unzipped
#' @param out_direc Directory for unzipped PSID files to be placed
#' @keywords PSID
#' @export
#' @importFrom utils object.size unzip
#' @examples
#' unzip_all_files(
#' in_direc=system.file("extdata","zip_dir", package = "easyPSID"),
#' out_direc=tempdir()
#' }

unzip_all_files<-function(in_direc,out_direc){
#1) Deprecated - Set directories to current directory if none supplied
# in_direc<-ifelse(is.null(in_direc),getwd(),in_direc)
# out_direc<-ifelse(is.null(out_direc),in_direc,out_direc)
#2) Find all zip_files files in folder
zip_files<-list.files(path=in_direc,pattern= c("*\\.zip$" ),full.names=T)
#3) Unzip_files files to specified directories
dir.create(out_direc,showWarnings = F)
for(i in 1:length(zip_files)){
unzip(zip_files[i],exdir=out_direc)
}
#4) Indicate that files are unzip_filesped
message("Files Unzipped")
}
74 changes: 74 additions & 0 deletions R/2convert_to_rds.R
@@ -0,0 +1,74 @@
#' Convert all PSID files from .txt format to .rds format
#' @description Converts all PSID fixed width format .txt files in a selected directory into .rds format. Importantly, this function assumes that all files contained in the original PSID .zip files (especially those ending in .do) are present in the same directory as the PSID .txt files, and that all files within that directory have the same names as when first unzipped.
#' @param in_direc Directory containing unzipped PSID .txt and .do files
#' @param out_direc Directory to place PSID .rds files into
#' @keywords PSID
#' @export
#' @importFrom stringr str_extract str_extract_all
#' @importFrom LaF laf_open_fwf
#' @examples
#' convert_to_rds(
#' in_direc=system.file("extdata","unzip_dir", package = "easyPSID"),
#' out_direc=tempdir()
#' )

convert_to_rds<-function(in_direc,out_direc){
#1 - Deprecated - Specify directory
# in_direc<-ifelse(is.null(in_direc),getwd(),in_direc)
# out_direc<-ifelse(is.null(out_direc),getwd(),out_direc)
#2 - Read do and txt files
do_files<-c(list.files(path=in_direc,pattern= c("*\\.do$" ),full.names=T))
txt_files<-c(list.files(path=in_direc,pattern= c("*\\.txt$" ),full.names=T))
txt_files2<-c(list.files(path=in_direc,pattern= c("*\\.txt$" )))
#3 - Convert Stata .do files to R syntax
for(i in 1:length(do_files)){
#a) Read stata code
stata_code <- readLines(do_files[i])
#b) Keep relevant lines
code_start <- min(grep('infix', stata_code))
code_end <- min(grep('using', stata_code))-1
stata_keep <- stata_code[code_start:code_end]
stata_keep<-stata_keep[-1] #(just contains "infix")
#c) Extract all variable names
variable_names <- unlist(str_extract_all(stata_keep, '[A-Z][a-zA-Z0-9_]+'))
#d) Extract all variable labels
#i) Find where relevant code starts
labs_start <- min(grep('label', stata_code))
labs_end <- max(grep('label', stata_code))
stata_labs <- stata_code[labs_start:labs_end]
#ii) Grab strings between quotes
var_descriptions<-vector(length=length(stata_labs))
for(j in 1:length(var_descriptions)){
temp<-str_extract_all(stata_labs[j],"\"(.+)\"")[[1]]
#Remove quotes
var_descriptions[j]<-gsub("\"","",temp)
#Remove extra spaces
var_descriptions[j]<-gsub(" ","",var_descriptions[j])
}
#iii) Extract all column indices
temp_columns<-unlist(str_extract_all(stata_keep,'[0-9]+ [/-] [0-9]+'))
#iv) Change column ranges to column counts
columns<-vector()
for(j in 1:length(temp_columns)){
a<-temp_columns[j]
b<-as.numeric(unlist(str_extract_all(a,'[0-9]+')))
columns[j]<-b[2]-b[1]+1
}
#4) Use converted stata code to read in txt files and use stata variable labels
temp_data <- laf_open_fwf(txt_files[i], column_widths = columns,column_types = rep("double",length(columns)))
temp_data <- temp_data[,]
names(temp_data) <- variable_names
attr(temp_data,"var.labels")<-var_descriptions
#5) Save files as "[Type] [Year].rds"
#a) Determine file name
temp_year<-str_extract(txt_files2[i],'[0-9]+')
temp_heading<-substr(txt_files2[i],1,3)
temp_name<-paste(temp_heading,temp_year,".rds",sep="")
#b) Save file
dir.create(out_direc,showWarnings = F)
saveRDS(object=temp_data,file=paste(out_direc,"/",temp_name,sep = ""))
#c) Notify user of progress
message(paste(temp_name,"Converted"))
}
}

47 changes: 47 additions & 0 deletions R/3rename_fam_vars.R
@@ -0,0 +1,47 @@
#' Rename longitudinal Family File variables
#' @description Renames all longitudinal variables in every PSID Family File of a given directory, such that variables are labeled with the variable name used when the variable was first made available in the PSID. For example, the "Release Number" variable was first recorded in the PSID dataset in 1968 as variable "V1" but its name in the 1969 family file is "V441". This program changes the "Release Number" variable name to "V1" in 1968 and all subsequent waves.
#' @param in_direc Directory of PSID .rds files to rename
#' @param out_direc Directory for renamed PSID .rds files to be saves to. Warning: If no directory specified, this function will overwrite the Family Files in the current directory.
#' @keywords PSID
#' @export
#' @examples
#' rename_fam_vars(
#' in_direc=system.file("extdata","rds_dir", package = "easyPSID"),
#' out_direc=tempdir()
#' )

rename_fam_vars<-function(in_direc,out_direc){
#1) Deprecate - Set Directory
# in_direc<-ifelse(is.null(in_direc),getwd(),in_direc)
# out_direc<-ifelse(is.null(out_direc),getwd(),out_direc)
#2) Create list of files to alter
file_names<-list.files(path=in_direc,pattern= c("^FAM.*\\.rds$" ),full.names=T) #only files beginning with title "FAM" and ending with ".rds"
file_names2<-list.files(path=in_direc,pattern= c("^FAM.*\\.rds$" ))
#3) For each file:
for(j in 1:length(file_names)){
#a) Load first file and rename to "temp_data"
temp_data<-readRDS(file_names[j])
#b) Create vector of variable names
temp_names<-names(temp_data)
#c) Determine which column to search
temp_column<-match(gsub('\\D+','', file_names2[j]),substring(names(data$psid_all),2))
#d) For each year:
for (i in 1:length(temp_names)){
#i) Find and list equivalent variables from other years
temp<-as.matrix(data$psid_all[data$psid_all[,temp_column]==temp_names[i],])
#ii) Remove years where that variable didn't exist
temp<-temp[temp!=""]
#iii) If variable not in list, skip
if (is.na(temp[1])){
next()}
temp_names[i]<-temp[1] # otherwise replace variable name to name of variable in earliest occurring dataset
}
#4) Change names of dataframes
names(temp_data)<-temp_names
#5) Save output into same directory and with same name as previously
dir.create(out_direc,showWarnings = F)
saveRDS(temp_data,file=paste(out_direc,"/",file_names2[j],sep = ""))
message(paste(file_names2[j],"renamed"))
rm(list=c("temp_data"))
}
}
123 changes: 123 additions & 0 deletions R/4rename_ind_vars.R
@@ -0,0 +1,123 @@
#' Renames longitudinal Cross-year Individual variables and saves in long format
#' @description Renames all repeated variables in the Cross-year Individual file so that matching variables across waves have the same name, and transforms the resulting dataset into long format. The longitudinal file does not include rows for respondents who were missing in a given wave, and cross-sectional variables are marked as NA during waves when they were not asked. In addition, the resulting file adds two variables for ease of use: "Year" and "fam_id_68".
#'
#' This function may require up to 8gb of RAM, and will likely throw "cannot allocate memory" errors to users with less RAM on their computer. Users with memory issues should implement the "only_long_vars" or "cust_vars" options.
#'
#' @param in_direc Directory of PSID Cross-year Individual file .rds file
#' @param out_direc Directory for renamed and transformed PSID Cross-year Individual file to be saved to
#' @param only_long_vars Keep only longitudinal variables in dataset
#' @param cust_vars Custom variables to keep in dataset (as character vector). Output will always include "ER30001", "fam_id_68", and "Year"
#' @keywords PSID
#' @export
#' @examples
#' rename_ind_vars(
#' only_long_vars=TRUE,
#' in_direc=system.file("extdata","rds_dir", package = "easyPSID"),
#' out_direc=tempdir()
#' )

rename_ind_vars<-function(in_direc,out_direc,only_long_vars=F,cust_vars=NULL){
#1) Deprecated - Set directory
# in_direc<-ifelse(is.null(in_direc),getwd(),in_direc)
# out_direc<-ifelse(is.null(out_direc),getwd(),out_direc)
#2) Load rds file
temp_file<-list.files(path=in_direc,pattern= c("IND.*\\.rds$" ),full.names=T)[1]
temp_file2<-list.files(path=in_direc,pattern= c("IND.*\\.rds$" ))[1]
df<-readRDS(temp_file)
#3) Identify dimensions of eventual dataframe
ncols<-dim(data$psid_ind)[1]
nyears<-dim(data$psid_ind)[2]
ninds<-nrow(df)
nrows<-nyears*ninds
#4) Create transposed version of data$psid_ind, sorted by time first in dataset
early_name<-apply(data$psid_ind,1,function(x) x[x!=""][1])
index<-data$psid_ind[order(early_name),]
index<-t(index)
#5) For convenience, create a column in data that is all NAs
df$colna<-NA
#6) Identify whether variable is cross-sectional
cross<-apply(index,2, function(x) length(x[x!=""]))
cross<-ifelse(cross==1,1,0)
#7) Identify whether observation was not present during a given wave
#ER30003 indicates whether non-response in given wave
ER30003<-which(index=="ER30003",arr.ind = T)[2]
#i) Find corresponding variable names
varnames<-as.vector(index[,ER30003])
#ii) Replace missing variable names with colna
varnames[varnames==""]<-"colna"
#iii) Pull out those column names and convert to vector
resp<-vector()
for(j in 1:length(varnames)){
resp<-c(resp,df[,varnames[j]])
}
#iv) Remove missing individuals
resp[resp>0]<-1
#8) Subset to custom variables or longitudinal variables
if(!is.null(cust_vars)){
if(!"ER30001" %in% cust_vars){
cust_vars<-c(cust_vars,"ER30001")
}
indcols<-sapply(cust_vars,function(x) which(index == x,arr.ind = T)[2])
index<-index[,indcols]
}else if(only_long_vars==T){
index<-index[,cross==0]
}
#9) For each unique variable in individual file, convert to long format
#i) Find corresponding variable names across waves
ind_longdf<-lapply(as.data.frame(index),as.vector)
#ii) Replace missing variable names with colna
ind_longdf<-lapply(ind_longdf,function(x) ifelse(x=="","colna",x))
#iii) Pull out those columns
ind_longdf<-lapply(ind_longdf,function(x) df[,x])
#iv) Convert columns to single vector
fastunlist<-function(x){
x<-c(as.matrix(x))
x<-x[resp==1] #(not usually part of function)
}
percent <- function(x, digits = 0, format = "f", ...) {
paste0(formatC(100 * x, format = format, digits = digits, ...), "%")
}
for(i in 1:length(ind_longdf)){
ind_longdf[[i]]<-fastunlist(ind_longdf[[i]])
if(i/10==floor(i/10)){
gc()
message(percent(i/length(ind_longdf)))
}
}
gc()
#v) Convert list to dataframe
ind_longdf<-do.call(cbind,ind_longdf)
gc()
ind_longdf<-as.data.frame(ind_longdf)
gc()
#10) Add metadata to dataframe
#a) Create proper names for columns
if(!is.null(cust_vars)){
early_name2<-cust_vars
}else if(only_long_vars==T){
early_name2<-apply(index,2,function(x) x[x!=""][1])
}else{
early_name2<-early_name[order(early_name)]
}
names(ind_longdf)<-early_name2
#b) Bring back variable descriptions
temp<-find_description(names(ind_longdf))
attr(ind_longdf,"var.labels")<-temp$Description
#c) Add a year variable
uyears<-gsub('\\D+','', row.names(index))
Year<-rep(uyears,each=ninds)
Year<-Year[resp==1]
ind_longdf$Year<-Year
#d) Add a family id variable
fam_id_68<-ind_longdf$ER30001
#11) Save file
dir.create(out_direc,showWarnings = F)
temp_file2<-gsub(".rds","",temp_file2)
saveRDS(ind_longdf,file=paste(out_direc,"/",temp_file2," - Long Format.rds",sep=""))
message(percent(1))
}





0 comments on commit be5c9c4

Please sign in to comment.