Skip to content

Commit

Permalink
refactor genotype data read: frequency format
Browse files Browse the repository at this point in the history
  • Loading branch information
hdbeukel committed May 9, 2016
1 parent dc7ccc6 commit 43081cd
Show file tree
Hide file tree
Showing 28 changed files with 152 additions and 176 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -299,31 +299,31 @@ public static GenotypeData readData(Path filePath, FileType type) throws IOExcep
*
* <p>For {@link GenotypeDataFormat#DEFAULT} the file contains one or more consecutive columns per marker in
* which the observed alleles are specified (by name/id/number). This format is suited for datasets with a fixed
* number of allele observations per combination of individual and marker. Common cases are those with one
* or two columns per marker, e.g. suited for fully homozygous and diploid datasets, respectively. Any
* (possibly varying) number of columns per marker is supported.
* number of allele observations per marker in each individual. Common cases are those with one
* or two columns per marker, e.g. suited for fully homozygous and diploid datasets, respectively.
* Any (possibly varying) number of columns per marker is supported.
* <p>
* Missing values are encoded as empty cells.
* <p>
* A required first header row and column are included to specify individual and marker names, respectively,
* identified with column/row header "NAME". If item names are not unique or defined for some but not all items,
* a second header column "ID" should be included to provide unique identifiers for at least those items whose
* name is undefined or not unique.
* A required first header row and column are included to specify unique item identifiers and marker names,
* respectively, identified with column/row header "ID". Optionally a second header column "NAME" can be
* included to provide (not necessarily unique) item names in addition to the unique identifiers.
* <p>
* Consecutive columns corresponding to the same marker should be tagged with the name of that marker, optionally
* followed by an arbitrary suffix starting with a dash, underscore or dot character. The latter allows the use of
* column names such as "M1-1" and "M1-2", "M1.a" and "M1.b" or "M1_1" and "M1_2" for a marker named "M1" (in case
* of two columns per marker). The marker name itself cannot contain any dash, underscore or dot characters,
* otherwise part of the name will be lost when loading the data. Marker names should be unique.
* otherwise part of the name will be lost when loading the data.
*
* <p>For {@link GenotypeDataFormat#FREQUENCY} the file contains allele frequencies following the
* requirements as described in the constructor {@link #SimpleGenotypeData(String, SimpleEntity[], String[],
* String[][], Double[][][])}. Missing frequencies are encoded as empty cells.
* The file starts with a compulsory header row from which (unique) marker names and allele counts are inferred.
* The file starts with a compulsory header row from which marker names and allele counts are inferred.
* All columns corresponding to the same marker occur consecutively in the file and are named after that marker.
* There is one required header column with item names, which is identified with column header "NAME".
* If the provided item names are not unique or defined for some but not all items, a second header column "ID"
* should be included to provide unique identifiers for at least those items whose name is undefined or not unique.
* Marker names are required to be unique.
* There is one compulsory header column "ID" containing unique item identifiers.
* Optionally a second header column "NAME" can be included to provide (not necessarily unique)
* item names in addition to the unique identifiers.
* Finally, an optional second header row can be included to define allele names per marker, identified with row
* header "ALLELE". Allele names need not be unique and can be undefined for some alleles by leaving the
* corresponding cells empty.
Expand Down Expand Up @@ -409,13 +409,12 @@ private static SimpleGenotypeData readFrequencyData(Path filePath, FileType type
reader.nextRow();
String[] markerNamesRow = reader.getRowCellsAsStringArray();
// check presence of header columns
boolean withNames = (markerNamesRow.length >= 1 && Objects.equals(markerNamesRow[0], NAMES_HEADER));
boolean withIds = (markerNamesRow.length >= 2 && Objects.equals(markerNamesRow[1], IDENTIFIERS_HEADER));
int numHeaderCols = 0;
if(withNames){
numHeaderCols++;
if(markerNamesRow.length < 1 || !Objects.equals(markerNamesRow[0], IDENTIFIERS_HEADER)){
throw new IOException("Missing header column ID.");
}
if(withIds){
boolean withNames = (markerNamesRow.length >= 2 && Objects.equals(markerNamesRow[1], NAMES_HEADER));
int numHeaderCols = 1;
if(withNames){
numHeaderCols++;
}
// infer and check number of data columns
Expand Down Expand Up @@ -485,10 +484,10 @@ private static SimpleGenotypeData readFrequencyData(Path filePath, FileType type
}

// check for allele names row
if(withNames && Objects.equals(row[0], ALLELE_NAMES_HEADER)){
if(Objects.equals(row[0], ALLELE_NAMES_HEADER)){
// verify: second row
if(r != 1){
throw new IOException("Allele names header should be second row in the file.");
throw new IOException("Allele names header should be the second row in the file.");
}
// extract allele names grouped per marker
int aglob = numHeaderCols;
Expand All @@ -504,12 +503,11 @@ private static SimpleGenotypeData readFrequencyData(Path filePath, FileType type

// process data row

// extract row headers, if any (name/identifier)
// extract unique item identifier
itemIdentifiers.add(StringUtils.unquote(row[0]));
// extract item name, if included
if(withNames){
itemNames.add(StringUtils.unquote(row[0]));
}
if(withIds){
itemIdentifiers.add(StringUtils.unquote(row[1]));
itemNames.add(StringUtils.unquote(row[1]));
}

// group frequencies per marker
Expand Down Expand Up @@ -549,18 +547,9 @@ private static SimpleGenotypeData readFrequencyData(Path filePath, FileType type
// combine names and identifiers in headers
SimpleEntity[] headers = new SimpleEntity[n];
for(int i = 0; i < n; i++){
String name = withNames ? itemNames.get(i) : null;
String identifier = withIds ? itemIdentifiers.get(i) : null;
if(name != null || identifier != null){
if(identifier == null){
headers[i] = new SimpleEntityPojo(name, name);
} else {
headers[i] = new SimpleEntityPojo(identifier, name);
}
}
}
if(Arrays.stream(headers).allMatch(Objects::isNull)){
headers = null;
String identifier = itemIdentifiers.get(i);
String name = withNames ? itemNames.get(i) : itemIdentifiers.get(i);
headers[i] = new SimpleEntityPojo(identifier, name);
}

// convert collections to arrays
Expand Down Expand Up @@ -878,11 +867,11 @@ public void writeData(Path filePath, FileType fileType, GenotypeDataFormat forma
throw new IOException("Can not create writer for file " + filePath + ".");
}

writer.writeCell(NAMES_HEADER);
writer.writeCell(IDENTIFIERS_HEADER);

writer.newColumn() ;

writer.writeCell(IDENTIFIERS_HEADER);
writer.writeCell(NAMES_HEADER);

for (int i = 0 ; i < alleleNames.length ; ++i) {
for (int j = 0 ; j < alleleNames[i].length ; ++j) {
Expand All @@ -908,11 +897,11 @@ public void writeData(Path filePath, FileType fileType, GenotypeDataFormat forma
writer.newRow();

header = getHeader(i);
writer.writeCell(header.getName());
writer.writeCell(header.getUniqueIdentifier());

writer.newColumn() ;

writer.writeCell(header.getUniqueIdentifier());
writer.writeCell(header.getName());

for (int j = 0; j < alleleFrequencies[i].length; ++j) {
writer.newColumn() ;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@
*/
public class SimpleGenotypeDataTest {

private static final String TXT_NAMES = "/frequency_genotypes/names.txt";
private static final String CSV_NAMES = "/frequency_genotypes/names.csv";
private static final String CSV_NAMES_IDS = "/frequency_genotypes/names-and-ids.csv";
private static final String TXT_IDS = "/frequency_genotypes/ids.txt";
private static final String CSV_IDS = "/frequency_genotypes/ids.csv";
private static final String CSV_IDS_NAMES = "/frequency_genotypes/ids-and-names.csv";
private static final String CSV_NO_ALLELE_NAMES = "/frequency_genotypes/no-allele-names.csv";

private static final String DIPLOID_TXT_NAMES = "/diploid_genotypes/names.txt";
Expand Down Expand Up @@ -94,9 +94,9 @@ public static void afterClass(){
System.out.println("Done");
}

/***********/
/* GENERAL */
/***********/
/*************/
/* FREQUENCY */
/*************/

@Test
public void inMemory() {
Expand All @@ -123,42 +123,41 @@ public void inMemoryWithName() {
}

@Test
public void fromTxtFileWithNames() throws IOException {
dataName = "names.txt";
public void fromTxtFileWithIds() throws IOException {
dataName = "ids.txt";
expectedHeaders = HEADERS_UNIQUE_NAMES;
expectedMarkerNames = MARKER_NAMES;
expectedAlleleNames = ALLELE_NAMES;
System.out.println(" |- File " + dataName);
testDataFrequencies(SimpleGenotypeData.readData(
Paths.get(SimpleGenotypeDataTest.class.getResource(TXT_NAMES).getPath()), FileType.TXT
Paths.get(SimpleGenotypeDataTest.class.getResource(TXT_IDS).getPath()), FileType.TXT
));
}

@Test
public void fromCsvFileWithNames() throws IOException {
dataName = "names.csv";
public void fromCsvFileWithIds() throws IOException {
dataName = "ids.csv";
expectedHeaders = HEADERS_UNIQUE_NAMES;
expectedMarkerNames = MARKER_NAMES;
expectedAlleleNames = ALLELE_NAMES;
System.out.println(" |- File " + dataName);
testDataFrequencies(SimpleGenotypeData.readData(
Paths.get(SimpleGenotypeDataTest.class.getResource(CSV_NAMES).getPath()), FileType.CSV
Paths.get(SimpleGenotypeDataTest.class.getResource(CSV_IDS).getPath()), FileType.CSV
));
}

@Test
public void fromCsvFileWithNamesAndIDs() throws IOException {
dataName = "names-and-ids.csv";
public void fromCsvFileWithIdsAndNames() throws IOException {
dataName = "ids-and-names.csv";
expectedHeaders = HEADERS_NON_UNIQUE_NAMES;
expectedMarkerNames = MARKER_NAMES;
expectedAlleleNames = ALLELE_NAMES;
System.out.println(" |- File " + dataName);
testDataFrequencies(SimpleGenotypeData.readData(
Paths.get(SimpleGenotypeDataTest.class.getResource(CSV_NAMES_IDS).getPath()), FileType.CSV
Paths.get(SimpleGenotypeDataTest.class.getResource(CSV_IDS_NAMES).getPath()), FileType.CSV
));
}

// TODO should not allele names be compulsory?
@Test
public void fromCsvFileWithoutAlleleNames() throws IOException {
dataName = "no-allele-names.csv";
Expand All @@ -172,8 +171,8 @@ public void fromCsvFileWithoutAlleleNames() throws IOException {
}

@Test
public void toTxtFileWithNames() throws IOException {
dataName = "names.txt";
public void toTxtFile() throws IOException {
dataName = "out.txt";
expectedHeaders = HEADERS_UNIQUE_NAMES;
expectedMarkerNames = MARKER_NAMES;
expectedAlleleNames = UNDEFINED_ALLELE_NAMES;
Expand All @@ -185,7 +184,7 @@ public void toTxtFileWithNames() throws IOException {

Files.createDirectories(path) ;

path = Files.createTempDirectory(path, "GenoFreqs-TxtFileWithNames") ;
path = Files.createTempDirectory(path, "GenoFreqs-Txt") ;

path = Paths.get(path.toString(), dataName) ;

Expand All @@ -199,8 +198,8 @@ public void toTxtFileWithNames() throws IOException {
}

@Test
public void toCsvFileWithNamesAndAlleleNames() throws IOException {
dataName = "allele-names.csv";
public void toCsvFileWithAlleleNames() throws IOException {
dataName = "out.csv";
expectedHeaders = HEADERS_UNIQUE_NAMES;
expectedMarkerNames = MARKER_NAMES;
expectedAlleleNames = ALLELE_NAMES;
Expand All @@ -212,7 +211,7 @@ public void toCsvFileWithNamesAndAlleleNames() throws IOException {

Files.createDirectories(path) ;

path = Files.createTempDirectory(path, "GenoFreqs-CsvFileWithNamesAndAlleleNames") ;
path = Files.createTempDirectory(path, "GenoFreqs-CsvAlleleNames") ;

path = Paths.get(path.toString(), dataName) ;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
NAME , mk1 , mk1 , mk1 , mk2 , mk2 , mk3 , mk3 , mk3 , mk4 , mk4 , mk4 , mk4 , mk5 , mk5 , mk5 , mk6 , mk6 , mk7 , mk7
ID , mk1 , mk1 , mk1 , mk2 , mk2 , mk3 , mk3 , mk3 , mk4 , mk4 , mk4 , mk4 , mk5 , mk5 , mk5 , mk6 , mk6 , mk7 , mk7
Alice , , , , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.00 , 0.00 , 0.50 , 0.50 , , , , 0.00 , 1.00 , 1.00 , 0.00
Dave , 1.00 , 0.00 , 0.00 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 1.00 , 0.00 , 0.00 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , 0.00 , 1.00
Bob , 0.60 , 0.00 , 0.40 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.25 , 0.25 , 0.25 , 0.25 , 0.00 , 0.50 , 0.50 , 0.00 , 1.00 , 1.00 , 0.00
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
NAME , ID , mk1 , mk1 , 'mk1', mk2 , mk2 , mk3 , mk3 , mk3 , mk4 , mk4 , mk4 , mk4 , mk5 , mk5 , mk5 , mk6 , mk6 , mk7 , mk7
ALLELE, , mk1-1, mk1-2, mk1-3, mk2-1, mk2-2, , mk3-2, " " , mk4-1, mk4-2, mk4-3, mk4-4, '' , mk5-2, mk5-3, mk6-1, mk6-2, mk7-1, "mk7-2"
Alice , acc-1 , , , , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.00 , 0.00 , 0.50 , 0.50 , , , , 0.00 , 1.00 , 1.00 , 0.00
, acc-2 , 1.00 , 0.00 , 0.00 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 1.00 , 0.00 , 0.00 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , 0.00 , 1.00
Bob , 'acc-3', 0.60 , 0.00 , 0.40 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.25 , 0.25 , 0.25 , 0.25 , 0.00 , 0.50 , 0.50 , 0.00 , 1.00 , 1.00 , 0.00
Bob , acc-4 , , , , 1.00 , 0.00 , , , , 0.00 , 0.00 , 1.00 , 0.00 , 0.33 , 0.33 , 0.33 , 0.00 , 1.00 , 1.00 , 0.00
, "acc-2", 0.33 , 0.33 , 0.33 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.50 , 0.00 , 0.50 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , ,
ID , mk1 , mk1 , 'mk1', mk2 , mk2 , mk3 , mk3 , mk3 , mk4 , mk4 , mk4 , mk4 , mk5 , mk5 , mk5 , mk6 , mk6 , mk7 , mk7
ALLELE , mk1-1, mk1-2, mk1-3, mk2-1, mk2-2, , mk3-2, " " , mk4-1, mk4-2, mk4-3, mk4-4, '' , mk5-2, mk5-3, mk6-1, mk6-2, mk7-1, "mk7-2"
acc-1 , , , , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.00 , 0.00 , 0.50 , 0.50 , , , , 0.00 , 1.00 , 1.00 , 0.00
acc-2 , 1.00 , 0.00 , 0.00 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 1.00 , 0.00 , 0.00 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , 0.00 , 1.00
'acc-3', 0.60 , 0.00 , 0.40 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.25 , 0.25 , 0.25 , 0.25 , 0.00 , 0.50 , 0.50 , 0.00 , 1.00 , 1.00 , 0.00
acc-4 , , , , 1.00 , 0.00 , , , , 0.00 , 0.00 , 1.00 , 0.00 , 0.33 , 0.33 , 0.33 , 0.00 , 1.00 , 1.00 , 0.00
"acc-2", 0.33 , 0.33 , 0.33 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.50 , 0.00 , 0.50 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , ,
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
NAME , ID , mk1 , mk1 , 'mk1', mk2 , mk2 , mk1 , mk1 , mk1 , mk4 , mk4 , mk4 , mk4 , mk5 , mk5 , mk5 , mk6 , mk6 , mk7 , mk7
ALLELE, , mk1-1, mk1-2, mk1-3, mk2-1, mk2-2, , mk1-2, " " , mk4-1, mk4-2, mk4-3, mk4-4, '' , mk5-2, mk5-3, mk6-1, mk6-2, mk7-1, "mk7-2"
Alice , acc-1 , , , , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.00 , 0.00 , 0.50 , 0.50 , , , , 0.00 , 1.00 , 1.00 , 0.00
, acc-2 , 1.00 , 0.00 , 0.00 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 1.00 , 0.00 , 0.00 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , 0.00 , 1.00
Bob , 'acc-3', 0.60 , 0.00 , 0.40 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.25 , 0.25 , 0.25 , 0.25 , 0.00 , 0.50 , 0.50 , 0.00 , 1.00 , 1.00 , 0.00
Bob , acc-4 , , , , 1.00 , 0.00 , , , , 0.00 , 0.00 , 1.00 , 0.00 , 0.33 , 0.33 , 0.33 , 0.00 , 1.00 , 1.00 , 0.00
, "acc-5", 0.33 , 0.33 , 0.33 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.50 , 0.00 , 0.50 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , ,
ID , mk1 , mk1 , 'mk1', mk2 , mk2 , mk1 , mk1 , mk1 , mk4 , mk4 , mk4 , mk4 , mk5 , mk5 , mk5 , mk6 , mk6 , mk7 , mk7
ALLELE , mk1-1, mk1-2, mk1-3, mk2-1, mk2-2, , mk1-2, " " , mk4-1, mk4-2, mk4-3, mk4-4, '' , mk5-2, mk5-3, mk6-1, mk6-2, mk7-1, "mk7-2"
acc-1 , , , , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.00 , 0.00 , 0.50 , 0.50 , , , , 0.00 , 1.00 , 1.00 , 0.00
acc-2 , 1.00 , 0.00 , 0.00 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 1.00 , 0.00 , 0.00 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , 0.00 , 1.00
'acc-3', 0.60 , 0.00 , 0.40 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.25 , 0.25 , 0.25 , 0.25 , 0.00 , 0.50 , 0.50 , 0.00 , 1.00 , 1.00 , 0.00
acc-4 , , , , 1.00 , 0.00 , , , , 0.00 , 0.00 , 1.00 , 0.00 , 0.33 , 0.33 , 0.33 , 0.00 , 1.00 , 1.00 , 0.00
"acc-5", 0.33 , 0.33 , 0.33 , 0.50 , 0.50 , 0.00 , 0.50 , 0.50 , 0.50 , 0.00 , 0.50 , 0.00 , 0.33 , 0.33 , 0.33 , 1.00 , 0.00 , ,

This file was deleted.

Loading

0 comments on commit 43081cd

Please sign in to comment.