Skip to content

Commit

Permalink
Task 47: [IO] File-based writers all behave slightly differently
Browse files Browse the repository at this point in the history
http://code.google.com/p/dkpro-core-asl/issues/detail?id=47
- Added JCasFileWriter_ImplBase and use it as the base class for TextWriter, XmiWriter and XmlWriterInline.
  • Loading branch information
reckart committed Jan 28, 2012
1 parent 39f1007 commit 002d1cd
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 170 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/*******************************************************************************
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.api.io;

import static org.apache.commons.io.FileUtils.forceMkdir;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.io.FilenameUtils;
import org.apache.uima.jcas.JCas;
import org.uimafit.component.JCasConsumer_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;

/**
 * Base class for CAS consumers that write one output file per CAS below a configurable target
 * folder. It derives the relative file name from the {@link DocumentMetaData} of the CAS,
 * builds the full target path (optionally with a ".gz" suffix), creates missing parent folders
 * and opens the output stream.
 *
 * @author Richard Eckart de Castilho
 */
public abstract class JCasFileWriter_ImplBase
    extends JCasConsumer_ImplBase
{
    /**
     * The folder to write the generated files to.
     */
    public static final String PARAM_PATH = ComponentParameters.PARAM_TARGET_LOCATION;
    @ConfigurationParameter(name=PARAM_PATH, mandatory=true)
    private File path;

    /**
     * Enabled/disable gzip compression. If this is set, all files will have the ".gz" ending.
     */
    public static final String PARAM_COMPRESS = "Compress";
    @ConfigurationParameter(name=PARAM_COMPRESS, mandatory=true, defaultValue="false")
    private boolean compress;

    /**
     * Remove the original extension. This only affects relative paths derived from the
     * document URI; a document ID is always used unchanged.
     */
    public static final String PARAM_STRIP_EXTENSION = "StripExtension";
    @ConfigurationParameter(name=PARAM_STRIP_EXTENSION, mandatory=true, defaultValue="false")
    private boolean stripExtension;

    /**
     * Use the document ID as file name even if a relative path information is present.
     */
    public static final String PARAM_USE_DOCUMENT_ID = "UseDocumentId";
    @ConfigurationParameter(name=PARAM_USE_DOCUMENT_ID, mandatory=true, defaultValue="false")
    private boolean useDocumentId;

    /**
     * @return the value of {@link #PARAM_COMPRESS}.
     */
    protected boolean isCompress()
    {
        return compress;
    }

    /**
     * @return the value of {@link #PARAM_STRIP_EXTENSION}.
     */
    protected boolean isStripExtension()
    {
        return stripExtension;
    }

    /**
     * @return the value of {@link #PARAM_USE_DOCUMENT_ID}.
     */
    protected boolean isUseDocumentId()
    {
        return useDocumentId;
    }

    /**
     * Get a stream writing to the target file derived from the given CAS and extension. See
     * {@link #getTargetPath(JCas, String)} for how the file is determined and
     * {@link #getOutputStream(File)} for how the stream is set up.
     *
     * @param aJCas a CAS.
     * @param aExtension the extension to append to the relative path, e.g. ".txt".
     * @return a stream to write to. The caller is responsible for closing it.
     * @throws IOException if the parent folder cannot be created or the file cannot be opened.
     */
    protected OutputStream getOutputStream(JCas aJCas, String aExtension) throws IOException
    {
        File outputFile = getTargetPath(aJCas, aExtension);
        return getOutputStream(outputFile);
    }

    /**
     * Make sure the target directory exists and get a stream writing to the specified file within.
     * If the file name ends in ".gz", the stream will be compressed.
     *
     * @param aFile the target file.
     * @return a stream to write to. The caller is responsible for closing it.
     * @throws IOException if the parent folder cannot be created, the file cannot be opened or
     *         the compressed stream cannot be initialized.
     */
    protected OutputStream getOutputStream(File aFile) throws IOException
    {
        // Create parent folders for output file and set up stream
        if (aFile.getParentFile() != null) {
            forceMkdir(aFile.getParentFile());
        }
        OutputStream os = new FileOutputStream(aFile);
        if (aFile.getName().endsWith(".gz")) {
            try {
                os = new GZIPOutputStream(os);
            }
            catch (IOException e) {
                // The GZIP constructor writes the stream header and may fail. The caller never
                // gets to see the raw file stream in that case, so it has to be released here.
                try {
                    os.close();
                }
                catch (IOException ignored) {
                    // The original failure is the one worth reporting.
                }
                throw e;
            }
        }
        return os;
    }

    /**
     * Get the relative path from the CAS. If the CAS does not contain relative path information or
     * if {@link #PARAM_USE_DOCUMENT_ID} is set, the document ID is used. If
     * {@link #PARAM_STRIP_EXTENSION} is set, the extension is removed from a path derived from
     * the document URI; the document ID is returned as it is.
     *
     * @param aJCas a CAS.
     * @return the relative target path.
     * @throws IllegalStateException if the base URI is not a prefix of the document URI, or if
     *         the document ID is needed but not set.
     */
    protected String getRelativePath(JCas aJCas)
    {
        DocumentMetaData meta = DocumentMetaData.get(aJCas);
        String baseUri = meta.getDocumentBaseUri();
        String docUri = meta.getDocumentUri();

        if (!useDocumentId && (baseUri != null)) {
            String relativeDocumentPath;
            if ((docUri == null) || !docUri.startsWith(baseUri)) {
                throw new IllegalStateException("Base URI [" + baseUri
                        + "] is not a prefix of document URI [" + docUri + "]");
            }
            // Whatever follows the base URI is the path relative to the collection root
            relativeDocumentPath = docUri.substring(baseUri.length());
            if (stripExtension) {
                relativeDocumentPath = FilenameUtils.removeExtension(relativeDocumentPath);
            }
            return relativeDocumentPath;
        }
        else {
            String relativeDocumentPath;
            if (meta.getDocumentId() == null) {
                throw new IllegalStateException("Neither base URI/document URI nor document ID set");
            }
            relativeDocumentPath = meta.getDocumentId();
            return relativeDocumentPath;
        }
    }

    /**
     * Get the full target path for the given CAS and extension. If the
     * {@link #PARAM_COMPRESS} is set, ".gz" is appended to the path.
     *
     * @param aJCas a CAS from which the relative path is obtained via
     *        {@link #getRelativePath(JCas)}.
     * @param aExtension the extension.
     * @return the full path.
     */
    protected File getTargetPath(JCas aJCas, String aExtension)
    {
        return getTargetPath(getRelativePath(aJCas), aExtension);
    }

    /**
     * Get the full target path for the given relative path and extension. If the
     * {@link #PARAM_COMPRESS} is set, ".gz" is appended to the path.
     *
     * @param aRelativePath the relative path.
     * @param aExtension the extension.
     * @return the full path.
     */
    protected File getTargetPath(String aRelativePath, String aExtension)
    {
        if (compress) {
            return new File(path, aRelativePath + aExtension + ".gz");
        }
        else {
            return new File(path, aRelativePath + aExtension);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,70 +19,30 @@

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.io.IOUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.uimafit.component.JCasConsumer_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;

/**
* UIMA CAS consumer writing the CAS document text as plain text file.
*
* @author Richard Eckart de Castilho
*/
public class TextWriter
extends JCasConsumer_ImplBase
extends JCasFileWriter_ImplBase
{
public static final String PARAM_PATH = ComponentParameters.PARAM_TARGET_LOCATION;
@ConfigurationParameter(name=PARAM_PATH, mandatory=true)
private File path;

public static final String PARAM_COMPRESS = "Compress";
@ConfigurationParameter(name=PARAM_COMPRESS, mandatory=true, defaultValue="false")
private boolean compress;


@Override
public void process(JCas aJCas)
throws AnalysisEngineProcessException
{
DocumentMetaData meta = DocumentMetaData.get(aJCas);
String baseUri = meta.getDocumentBaseUri();
String docUri = meta.getDocumentUri();

if (!docUri.startsWith(baseUri)) {
throw new IllegalStateException("Base URI [" + baseUri
+ "] is not a prefix of document URI [" + docUri + "]");
}

String relativeDocumentPath = docUri.substring(baseUri.length());
OutputStream docOS = null;
try {
File docOut;

if (compress) {
docOut = new File(path, relativeDocumentPath+".xmi.gz").getAbsoluteFile();
}
else {
docOut = new File(path, relativeDocumentPath+".xmi").getAbsoluteFile();
}

docOut.getParentFile().mkdirs();

docOS = new FileOutputStream(docOut);

if (compress) {
docOS = new GZIPOutputStream(docOS);
}

docOS = getOutputStream(aJCas, ".txt");

IOUtils.write(aJCas.getDocumentText(), docOS);
}
catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,37 +17,24 @@
******************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.io.xmi;

import static org.apache.commons.io.FileUtils.forceMkdir;
import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.TypeSystemUtil;
import org.uimafit.component.JCasConsumer_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;

/**
* @author Richard Eckart de Castilho
*/
public class XmiWriter
extends JCasConsumer_ImplBase
extends JCasFileWriter_ImplBase
{
/**
* The folder to write the generated XMI files to.
*/
public static final String PARAM_PATH = ComponentParameters.PARAM_TARGET_LOCATION;
@ConfigurationParameter(name=PARAM_PATH, mandatory=true)
private File path;

/**
* Location to write the type system to. If this is not set, a file called typesystem.xml will
* be written to the XMI output path. If this is set, it is expected to be a file relative
Expand All @@ -61,79 +48,22 @@ public class XmiWriter
@ConfigurationParameter(name=PARAM_TYPE_SYSTEM_FILE, mandatory=false)
private File typeSystemFile;

/**
* Enabled/disable gzip compression. If this is set, all files will have the ".gz" ending.
*/
public static final String PARAM_COMPRESS = "Compress";
@ConfigurationParameter(name=PARAM_COMPRESS, mandatory=true, defaultValue="false")
private boolean compress;


@Override
public void process(JCas aJCas)
throws AnalysisEngineProcessException
{
DocumentMetaData meta = DocumentMetaData.get(aJCas);
String baseUri = meta.getDocumentBaseUri();
String docUri = meta.getDocumentUri();

String relativeDocumentPath;
if (baseUri != null) {
if ((docUri == null) || !docUri.startsWith(baseUri)) {
throw new IllegalStateException("Base URI [" + baseUri
+ "] is not a prefix of document URI [" + docUri + "]");
}
relativeDocumentPath = docUri.substring(baseUri.length());
}
else {
if (meta.getDocumentId() == null) {
throw new IllegalStateException("Neither base URI/document URI nor document ID set");
}
relativeDocumentPath = meta.getDocumentId();
}

OutputStream docOS = null;
OutputStream typeOS = null;
try {
File docOut;
File typeOut;

// Set names accoring to whether compression is used or not
if (compress) {
docOut = new File(path, relativeDocumentPath+".xmi.gz").getAbsoluteFile();
typeOut = new File(path, "typesystem.xml.gz").getAbsoluteFile();
}
else {
docOut = new File(path, relativeDocumentPath+".xmi").getAbsoluteFile();
typeOut = new File(path, "typesystem.xml").getAbsoluteFile();
}


// Create parent folders for XMI file and set up stream
if (docOut.getParentFile() != null) {
forceMkdir(docOut.getParentFile());
}
docOS = new FileOutputStream(docOut);
if (compress) {
docOS = new GZIPOutputStream(docOS);
}

// Set up writing the type system
docOS = getOutputStream(aJCas, ".xmi");

if (typeSystemFile != null) {
if (typeSystemFile.getParentFile() != null) {
forceMkdir(typeSystemFile.getParentFile());
}
typeOut = typeSystemFile;
typeOS = new FileOutputStream(typeSystemFile);
typeOS = getOutputStream(typeSystemFile);
}
else {
typeOut.getParentFile().mkdirs();
typeOS = new FileOutputStream(typeOut);
typeOS = getOutputStream(getTargetPath("typesystem", ".xml"));
}
if (typeOut.getName().toLowerCase().endsWith(".gz")) {
typeOS = new GZIPOutputStream(typeOS);
}


XmiCasSerializer.serialize(aJCas.getCas(), docOS);
TypeSystemUtil.typeSystem2TypeSystemDescription(aJCas.getTypeSystem()).toXML(typeOS);
}
Expand Down
Loading

0 comments on commit 002d1cd

Please sign in to comment.