This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

BIN -669 KB pages/index.ser
Binary file not shown.
@@ -10,6 +10,12 @@

/**
* Created by VGDC_1 on 2/8/2016.
*
* Brett Lenz 76382638
Carl Pacheco 47911659
Derek Edrich 34363846
Khang Tran 47508988
*/
public class Indexer {
public HashMap<String, WordTFIDF> index;
@@ -74,7 +80,16 @@ private void read(String filename) {
}

public void print() {
System.out.println(index);
try {
PrintWriter writer = new PrintWriter("theIndex.txt", "UTF-8");
for (String k : index.keySet())
writer.println(k + " -> " + index.get(k).toString());

writer.close();
}
catch (Exception e) {

}
}

public String maxTFIDF(String word) {
@@ -100,6 +115,9 @@ public static void main(String[] args) {
wordCount += words.size();
List<Frequency> frequencies = WordFrequencyCounter.computeWordFrequencies(words);
indexer.addFrequencies(filename, frequencies);
System.out.println(filename);
//if(i > 31500)
// break;
}

String runtime = ((Long)(System.currentTimeMillis()-start)).toString();
@@ -10,6 +10,11 @@

/**
* Created by VGDC_1 on 2/18/2016.
* Brett Lenz 76382638
Carl Pacheco 47911659
Derek Edrich 34363846
Khang Tran 47508988
*/
public final class WordTFIDF implements Serializable {
public String word;
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,37 @@
package infx141asst3;
/**
 * Basic class for pairing a word/2-gram/palindrome with its frequency.
 *
 * DO NOT MODIFY THIS CLASS
 */
public final class Frequency {
    // The token being counted; immutable once constructed.
    private final String word;
    // Occurrence count; mutable only via incrementFrequency().
    private int frequency;

    // Starts the given word at a frequency of zero.
    public Frequency(String word) {
        this.word = word;
        this.frequency = 0;
    }

    // Pairs the word with an already-known frequency.
    public Frequency(String word, int frequency) {
        this.word = word;
        this.frequency = frequency;
    }

    public String getText() {
        return word;
    }

    public int getFrequency() {
        return frequency;
    }

    // Records one additional occurrence.
    public void incrementFrequency() {
        frequency++;
    }

    // Renders as "word:count".
    @Override
    public String toString() {
        return word + ":" + frequency;
    }
}
@@ -0,0 +1,140 @@
package infx141asst3;

import com.sun.org.apache.xpath.internal.SourceTree;
import javafx.util.Pair;

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Builds an inverted index over the documents in the "pages" directory,
 * mapping each token to per-document term-frequency data ({@link WordTFIDF}).
 * The index map can be serialized to disk and reloaded.
 *
 * Created by VGDC_1 on 2/8/2016.
 *
 * Brett Lenz 76382638
 * Carl Pacheco 47911659
 * Derek Edrich 34363846
 * Khang Tran 47508988
 */
public class Indexer {
    /** token -> per-document frequency/TF data. */
    public HashMap<String, WordTFIDF> index;
    /** Total number of tokens seen across every indexed document. */
    public int corpusSize;
    /** document filename -> number of tokens in that document. */
    public HashMap<String, Integer> docSizes;

    /** Creates an empty index. */
    public Indexer() {
        corpusSize = 0;
        index = new HashMap<String, WordTFIDF>();
        docSizes = new HashMap<String, Integer>();
    }

    /**
     * Loads a previously serialized index from {@code filename}.
     * Fix: {@code corpusSize} and {@code docSizes} were previously left
     * uninitialized (null) on this path, causing an NPE on first use of a
     * loaded index; save() only persists {@code index}, so they start empty.
     */
    public Indexer(String filename) {
        corpusSize = 0;
        index = new HashMap<String, WordTFIDF>();
        docSizes = new HashMap<String, Integer>();
        read(filename);
    }

    /**
     * Records one document's word frequencies into the index.
     *
     * @param filename    the document's path, used as its identifier
     * @param frequencies per-word counts for the document
     */
    public void addFrequencies(String filename, List<Frequency> frequencies) {
        // Total token count of this document (sum of all per-word counts).
        int numWords = 0;
        for (Frequency f : frequencies) {
            numWords += f.getFrequency();
        }

        corpusSize += numWords;
        docSizes.put(filename, numWords);

        for (Frequency f : frequencies) {
            String text = f.getText();
            if (!index.containsKey(text)) {
                index.put(text, new WordTFIDF(text));
            }
            // Store (filename -> (raw count, relative frequency)) for this word.
            index.get(text).add(new Pair<String, Pair<Integer, Double>>(filename,
                    new Pair<Integer, Double>(f.getFrequency(),
                            (double) f.getFrequency() / (double) numWords)));
        }
    }

    /**
     * Serializes the index map to pages/index.ser.
     *
     * @return true on success, false if any error occurred
     */
    public boolean save() {
        // try-with-resources guarantees the streams are closed on all paths.
        try (FileOutputStream fileOut = new FileOutputStream("pages/index.ser");
             ObjectOutputStream out = new ObjectOutputStream(fileOut)) {
            out.writeObject(index);
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /** Deserializes the index map from the given file into {@code index}. */
    @SuppressWarnings("unchecked")
    private void read(String filename) {
        try (FileInputStream fileIn = new FileInputStream(filename);
             ObjectInputStream in = new ObjectInputStream(fileIn)) {
            index = (HashMap<String, WordTFIDF>) in.readObject();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Writes a human-readable dump of the index to theIndex.txt (UTF-8). */
    public void print() {
        try (PrintWriter writer = new PrintWriter("theIndex.txt", "UTF-8")) {
            for (String k : index.keySet()) {
                writer.println(k + " -> " + index.get(k).toString());
            }
        } catch (Exception e) {
            // Fix: the exception was silently swallowed; report it instead.
            e.printStackTrace();
        }
    }

    /**
     * @return the filename where {@code word} scores the highest TF-IDF,
     *         or "" if the word is not in the index
     */
    public String maxTFIDF(String word) {
        if (index.containsKey(word)) {
            return index.get(word).maxTFIDF(corpusSize);
        }
        return "";
    }

    /** Indexes every file under pages/, prints stats, and saves the index. */
    public static void main(String[] args) {
        Indexer indexer = new Indexer();

        File folder = new File("pages");
        String[] files = folder.list();
        // Fix: list() returns null when "pages" is missing or not a directory,
        // which previously caused an NPE on files.length below.
        if (files == null) {
            System.out.println("Directory 'pages' not found.");
            return;
        }

        int wordCount = 0;
        long start = System.currentTimeMillis();

        for (int i = 0; i < files.length; i++) {
            String filename = "pages/" + files[i];
            List<String> words = Utilities.tokenizeFile(new File(filename));
            wordCount += words.size();
            List<Frequency> frequencies = WordFrequencyCounter.computeWordFrequencies(words);
            indexer.addFrequencies(filename, frequencies);
            System.out.println(filename);
        }

        String runtime = ((Long) (System.currentTimeMillis() - start)).toString();
        System.out.println("Runtime: " + runtime);

        System.out.println("Doc Wordcount: " + (indexer.docSizes).toString());

        System.out.println("Num Unique Words: " + indexer.index.keySet().size());

        System.out.println("Word Count: " + ((Integer) indexer.corpusSize).toString());

        indexer.print();

        indexer.save();

        // Test searching.
        System.out.println(indexer.maxTFIDF("pollution"));
    }
}
@@ -0,0 +1,126 @@


package infx141asst3;

import java.io.File;
import java.lang.reflect.Array;
import java.util.*;

/**
 * A collection of utility methods for text processing.
 */
public class Utilities
{
    /**
     * Reads the input text file and splits it into alphanumeric tokens,
     * normalized to lower case and ordered by occurrence in the file.
     * Non-alphanumeric characters delineate tokens and are discarded.
     * <p>
     * Example: given "An input string, this is! (or is it?)" the output is
     * ["an", "input", "string", "this", "is", "or", "is", "it"]
     *
     * @param input The file to read in and tokenize.
     * @return The list of tokens (words) from the input file, ordered by occurrence.
     */
    public static ArrayList<String> tokenizeFile(File input)
    {
        // List of tokens, in order of first reading.
        ArrayList<String> tokens = new ArrayList<String>();

        // Fix: the Scanner was never closed (resource leak); try-with-resources
        // releases it even if reading throws.
        try (Scanner file = new Scanner(input))
        {
            // Loop through the whitespace-delimited chunks of the file.
            while (file.hasNext())
            {
                // Split each chunk on runs of non-alphanumeric characters.
                for (String piece : file.next().split("[^a-zA-Z0-9]"))
                {
                    // split() can produce empty fragments; skip them.
                    if (!piece.isEmpty())
                        tokens.add(piece.toLowerCase());
                }
            }
        }
        catch (Exception e)
        {
            // Preserve best-effort behavior: report and return what was read.
            System.out.println(e.toString());
        }
        return tokens;
    }

    /**
     * Takes a list of {@link Frequency}s and prints it to standard out,
     * preceded by the total number of items and the number of unique items.
     * <p>
     * Example: given ["sentence:2", "the:1", "this:1", "repeats:1", "word:1"]
     * the output is
     * <p>
     * Total item count: 6
     * Unique item count: 5
     * <p>
     * sentence 2
     * the 1
     * this 1
     * repeats 1
     * word 1
     *
     * @param frequencies A list of frequencies.
     */
    public static void printFrequencies(List<Frequency> frequencies)
    {
        // Sum every frequency count for the total item count.
        int total = 0;
        for (Frequency frequency : frequencies)
            total += frequency.getFrequency();

        // Output the totals.
        System.out.println("Total item count: " + total);
        System.out.println("Unique item count: " + frequencies.size());
        System.out.println();

        // One "word count" line per entry, word left-justified in 7 columns.
        for (Frequency frequency : frequencies)
            System.out.format("%-7s %d\n", frequency.getText(), frequency.getFrequency());
    }
}
@@ -0,0 +1,88 @@
package infx141asst3;
import java.io.File;

import java.util.*;

//TO REMOVE

/**
 * Counts the total number of words and their frequencies in a text file.
 */
public final class WordFrequencyCounter {
    /**
     * This class should not be instantiated.
     */
    private WordFrequencyCounter() {}

    /**
     * Takes the input list of words and processes it, returning a list
     * of {@link Frequency}s — one per unique word, with each frequency equal
     * to the number of times that word occurs in the original list.
     *
     * This method expects a list of lowercase alphanumeric strings.
     * If the input list is null or empty, an empty list is returned.
     * The original list is not modified.
     *
     * The returned list is ordered by decreasing frequency, with tied words
     * sorted alphabetically.
     *
     * Example: ["this", "sentence", "repeats", "the", "word", "sentence"]
     * yields ["sentence:2", "the:1", "this:1", "repeats:1", "word:1"]
     *
     * @param words A list of words.
     * @return A list of word frequencies, ordered by decreasing frequency.
     */
    public static List<Frequency> computeWordFrequencies(List<String> words)
    {
        // Result list of frequency objects.
        List<Frequency> frequencies = new ArrayList<>();

        // Null or empty input -> empty list.
        if (words == null || words.isEmpty())
            return frequencies;

        // Fix: calling Collections.frequency() once per unique word was
        // O(n^2); a single counting pass over the list is O(n).
        Map<String, Integer> counts = new HashMap<>();
        for (String word : words)
            counts.merge(word, 1, Integer::sum);

        for (Map.Entry<String, Integer> entry : counts.entrySet())
            frequencies.add(new Frequency(entry.getKey(), entry.getValue()));

        // Fix: the documented ordering (decreasing frequency, ties sorted
        // alphabetically) was commented out, leaving nondeterministic hash
        // order. Restore it.
        frequencies.sort((Frequency a, Frequency b) -> {
            int byCount = Integer.compare(b.getFrequency(), a.getFrequency());
            return byCount != 0 ? byCount : a.getText().compareTo(b.getText());
        });

        return frequencies;
    }

    /**
     * Runs the word frequency counter. The input should be the path to a text file.
     *
     * @param args The first element should contain the path to a text file.
     */
    public static void main(String[] args)
    {
        // Guard against a missing argument instead of an opaque
        // ArrayIndexOutOfBoundsException.
        if (args.length == 0) {
            System.out.println("Usage: WordFrequencyCounter <file>");
            return;
        }
        File file = new File(args[0]);
        List<String> words = Utilities.tokenizeFile(file);
        List<Frequency> frequencies = computeWordFrequencies(words);
        Utilities.printFrequencies(frequencies);
    }
}
@@ -0,0 +1,62 @@
package infx141asst3;

import javafx.util.Pair;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Per-word TF-IDF bookkeeping: maps each document filename to this word's
 * raw count and relative frequency in that document.
 *
 * Created by VGDC_1 on 2/18/2016.
 * Brett Lenz 76382638
 * Carl Pacheco 47911659
 * Derek Edrich 34363846
 * Khang Tran 47508988
 */
public final class WordTFIDF implements Serializable {
    // The word this entry tracks.
    public String word;
    // NOTE(review): despite the name, add() accumulates the word's total raw
    // count across all documents here, not an inverse document frequency,
    // and this field is never read in this class — confirm intent.
    public int IDF;
    // filename -> (raw count in that document, count / document token total).
    public HashMap<String, Pair<Integer, Double>> TFs;

    public WordTFIDF(String word) {
        this.word = word;
        IDF = 0;
        TFs = new HashMap<String, Pair<Integer, Double>>();
    }

    // Registers one document's (raw count, relative frequency) pair.
    public void add(Pair<String, Pair<Integer, Double>> data) {
        TFs.put(data.getKey(), data.getValue());
        IDF += data.getValue().getKey();
    }

    // Log-scaled TF-IDF of this word in the given document; 0 if the word
    // does not occur there.
    // NOTE(review): the second factor divides corpusSize (total tokens) by
    // the word's raw count in THIS document rather than by the number of
    // documents containing the word — unconventional IDF; confirm intended.
    public double TFIDF(String filename, int corpusSize) {
        if (TFs.containsKey((filename))) {
            return (1 + Math.log(TFs.get(filename).getKey())) * Math.log((double) corpusSize / (double) TFs.get(filename).getKey());
        }
        else
            return 0;
    }

    // Returns the filename with the highest TF-IDF for this word, or "" if
    // the word appears in no documents.
    public String maxTFIDF(int corpusSize) {
        double max = -1;
        String maxFilename = "";
        for(String filename : TFs.keySet())
        {
            double testTFIDF = TFIDF(filename, corpusSize);
            if(testTFIDF > max)
            {
                max = testTFIDF;
                maxFilename = filename;
            }
        }
        return maxFilename;
    }

    // Dump of the per-document TF map.
    @Override
    public String toString() {
        return TFs.toString();
    }
}