Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
zesch committed Nov 5, 2012
1 parent 91503e3 commit 1dea5d5
Show file tree
Hide file tree
Showing 3 changed files with 268 additions and 0 deletions.
72 changes: 72 additions & 0 deletions src/main/java/com/googlecode/jweb1t/JWeb1TIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*******************************************************************************
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.googlecode.jweb1t;

import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.Queue;

import com.googlecode.jweb1t.util.NGramIterator;

/**
 * Entry point for walking through every n-gram of one fixed size below a data directory.
 */
public class JWeb1TIterator
{
    private final String ngramLocation;
    private final int ngramSize;

    /**
     * Remembers where the data lives and which n-gram size to iterate; no files are touched here.
     *
     * @param aNGramLocation
     *            base directory of the n-gram data
     * @param aNGramSize
     *            size of the n-grams to iterate over
     * @throws IOException
     *             declared for callers; not thrown by this constructor itself
     */
    public JWeb1TIterator(final String aNGramLocation, final int aNGramSize)
        throws IOException
    {
        this.ngramLocation = aNGramLocation;
        this.ngramSize = aNGramSize;
    }

    /**
     * Collects the files found under {@code <location>/<size>gms/} and returns an iterator
     * over the n-grams they contain. No particular order of the n-grams is guaranteed.
     *
     * @return an iterator over the n-grams
     * @throws IOException
     *             if the n-gram iterator cannot be created
     */
    public NGramIterator getIterator()
        throws IOException
    {
        final Queue<File> collectedFiles = new LinkedList<File>();
        final File source = new File(ngramLocation + "/" + ngramSize + "gms/");

        if (!source.isFile()) {
            // A directory: let the scanner hand over its files, restricted by the index filter.
            final FolderScanner folderScanner = new FolderScanner(source);
            folderScanner.setFilter(new IndexFilter());

            while (folderScanner.hasNext()) {
                for (final File candidate : folderScanner.next()) {
                    collectedFiles.add(candidate);
                }
            }
        }
        else {
            // A single plain file is used as-is.
            collectedFiles.add(source);
        }

        return new NGramIterator(collectedFiles);
    }
}
96 changes: 96 additions & 0 deletions src/main/java/com/googlecode/jweb1t/util/NGramIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package com.googlecode.jweb1t.util;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Queue;

/**
 * Iterates over the n-grams stored in a queue of files, consuming one file after another.
 * Every line is split at tab characters and the text before the first tab is returned.
 */
public class NGramIterator
    implements Iterator<String>
{
    private final Queue<File> ngramFiles;
    // Reader of the file currently being consumed; null once the iteration has ended.
    private LineNumberReader reader;
    // N-gram prefetched by hasNext() that next() has not handed out yet; null if none is pending.
    private String storedNextString;

    /**
     * Opens the first file of the queue; the remaining files are opened on demand.
     *
     * @param ngramFiles
     *            files to read; files are removed from this queue as they are opened
     * @throws IOException
     *             if the queue is empty or the first file cannot be opened
     */
    public NGramIterator(Queue<File> ngramFiles) throws IOException {
        this.ngramFiles = ngramFiles;

        if (ngramFiles.size() > 0) {
            this.reader = new LineNumberReader(new FileReader(ngramFiles.poll()));
        }
        else {
            throw new IOException("Filelist is empty.");
        }
    }

    /**
     * Reports whether another n-gram is available. May be called repeatedly without
     * skipping elements, because the prefetched n-gram is kept until next() consumes it.
     *
     * @return true if next() will return an n-gram
     */
    @Override
    public boolean hasNext()
    {
        // prefetch the next string; an already stored one is handed back unchanged
        storedNextString = getNextString();
        return storedNextString != null;
    }

    /**
     * Returns the next n-gram.
     *
     * @return the next n-gram
     * @throws NoSuchElementException
     *             if there is no further n-gram
     */
    @Override
    public String next()
    {
        final String returnValue = getNextString();
        if (returnValue == null) {
            throw new NoSuchElementException();
        }

        // the pending element (if any) is consumed now
        storedNextString = null;
        return returnValue;
    }

    /**
     * Removal is not supported.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove()
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns the next string or null if there is no next string. An I/O error is printed
     * and reported as null, because hasNext() and next() cannot throw checked exceptions.
     */
    private String getNextString() {
        if (storedNextString != null) {
            return storedNextString;
        }

        try {
            // Loop instead of recursing so that any number of empty files can be skipped.
            while (reader != null) {
                final String line = reader.readLine();
                if (line != null) {
                    final String[] parts = line.split("\t");

                    if (parts.length != 2) {
                        System.err.println("Ill-formed line: " + line);
                    }

                    // A line made up only of tabs splits into an empty array.
                    return parts.length == 0 ? "" : parts[0];
                }

                // Current file is exhausted. Drop the reference before closing so that a
                // closed reader is never read from again on later calls.
                final LineNumberReader finished = reader;
                reader = null;
                finished.close();

                if (!ngramFiles.isEmpty()) {
                    reader = new LineNumberReader(new FileReader(ngramFiles.poll()));
                }
            }
        }
        catch (IOException e) {
            // fail gracefully as we cannot throw exception in hasNext() or next() anyway
            e.printStackTrace();
        }

        return null;
    }
}
100 changes: 100 additions & 0 deletions src/test/java/com/googlecode/jweb1t/JWeb1TIteratorTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*******************************************************************************
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.googlecode.jweb1t;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;

import org.junit.Test;

import com.googlecode.jweb1t.util.NGramIterator;

/**
 * Exercises {@link JWeb1TIterator} against the n-gram files below the test resources.
 */
public class JWeb1TIteratorTest
{
    private static final String DATA_DIR = "src/test/resources/";

    /** Walks over all unigrams and checks how many were produced. */
    @Test
    public void listUnigramsTest()
        throws IOException
    {
        final NGramIterator unigrams = new JWeb1TIterator(DATA_DIR, 1).getIterator();
        assertEquals(11, printAndCount(unigrams));
    }

    /** Walks over all bigrams and checks how many were produced. */
    @Test
    public void listBigramsTest()
        throws IOException
    {
        final NGramIterator bigrams = new JWeb1TIterator(DATA_DIR, 2).getIterator();
        assertEquals(21, printAndCount(bigrams));
    }

    /** Repeated hasNext() calls must not consume elements: next() still yields the first one. */
    @Test
    public void multipleHasNextTest()
        throws IOException
    {
        final NGramIterator unigrams = new JWeb1TIterator(DATA_DIR, 1).getIterator();

        for (int call = 0; call < 17; call++) {
            assertTrue(unigrams.hasNext());
        }

        assertEquals("!", unigrams.next());
    }

    /** next() must work even if hasNext() was never called before. */
    @Test
    public void nextWithoutHasNext()
        throws IOException
    {
        final NGramIterator unigrams = new JWeb1TIterator(DATA_DIR, 1).getIterator();
        assertEquals("!", unigrams.next());
    }

    /** Prints every remaining n-gram to standard output and returns how many were seen. */
    private static int printAndCount(final NGramIterator ngrams)
    {
        int seen = 0;
        while (ngrams.hasNext()) {
            System.out.println(ngrams.next());
            seen++;
        }
        return seen;
    }
}

0 comments on commit 1dea5d5

Please sign in to comment.