| @@ -0,0 +1,37 @@ | ||
| package infx141asst3; | ||
| /** | ||
| * Basic class for pairing a word/2-gram/palindrome with its frequency. | ||
| * | ||
| * DO NOT MODIFY THIS CLASS | ||
| */ | ||
public final class Frequency {
    // The text (word, 2-gram, or palindrome) being counted; immutable once set.
    private final String word;
    // Number of occurrences observed so far.
    private int frequency;

    /**
     * Creates a Frequency for the given text with an occurrence count of zero.
     *
     * @param word the word/2-gram/palindrome to count
     */
    public Frequency(String word) {
        this.word = word;
        this.frequency = 0;
    }

    /**
     * Creates a Frequency for the given text with an initial occurrence count.
     *
     * @param word      the word/2-gram/palindrome to count
     * @param frequency the initial occurrence count
     */
    public Frequency(String word, int frequency) {
        this.word = word;
        this.frequency = frequency;
    }

    /** @return the text this frequency counts */
    public String getText() {
        return word;
    }

    /** @return the current occurrence count */
    public int getFrequency() {
        return frequency;
    }

    /** Increments the occurrence count by one. */
    public void incrementFrequency() {
        frequency++;
    }

    /** @return the pair formatted as {@code "text:count"}, e.g. {@code "sentence:2"} */
    @Override
    public String toString() {
        return word + ":" + frequency;
    }
}
| @@ -0,0 +1,140 @@ | ||
| package infx141asst3; | ||
|
|
||
| import com.sun.org.apache.xpath.internal.SourceTree; | ||
| import javafx.util.Pair; | ||
|
|
||
| import java.io.*; | ||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
|
|
||
| /** | ||
| * Created by VGDC_1 on 2/8/2016. | ||
| * | ||
| * Brett Lenz 76382638 | ||
| Carl Pacheco 47911659 | ||
| Derek Edrich 34363846 | ||
| Khang Tran 47508988 | ||
| */ | ||
| public class Indexer { | ||
| public HashMap<String, WordTFIDF> index; | ||
| public int corpusSize; | ||
| public HashMap<String, Integer> docSizes; | ||
|
|
||
| public Indexer() { | ||
| corpusSize = 0; | ||
| index = new HashMap<String, WordTFIDF>(); | ||
| docSizes = new HashMap<String, Integer>(); | ||
| } | ||
|
|
||
| public Indexer(String filename) { | ||
| read(filename); | ||
| } | ||
|
|
||
| public void addFrequencies(String filename, List<Frequency> frequencies) { | ||
| int numWords = 0; | ||
| for (int i = 0; i < frequencies.size(); i++) { | ||
| numWords += frequencies.get(i).getFrequency(); | ||
| } | ||
|
|
||
| corpusSize += numWords; | ||
|
|
||
| docSizes.put(filename, numWords); | ||
|
|
||
| for (int i = 0; i < frequencies.size(); i++) { | ||
| if (!index.containsKey(frequencies.get(i).getText())) | ||
| index.put(frequencies.get(i).getText(), new WordTFIDF(frequencies.get(i).getText())); | ||
|
|
||
| index.get(frequencies.get(i).getText()).add(new Pair<String, Pair<Integer, Double>>(filename, | ||
| new Pair<Integer, Double>(frequencies.get(i).getFrequency(), | ||
| (double) (frequencies.get(i).getFrequency()) / (double) numWords))); | ||
| } | ||
| } | ||
|
|
||
| public boolean save() { | ||
| try { | ||
| FileOutputStream fileOut = | ||
| new FileOutputStream("pages/index.ser"); | ||
| ObjectOutputStream out = new ObjectOutputStream(fileOut); | ||
| out.writeObject(index); | ||
| out.close(); | ||
| fileOut.close(); | ||
| } catch (Exception e) { | ||
| e.printStackTrace(); | ||
| return false; | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| private void read(String filename) { | ||
| try { | ||
| FileInputStream fileIn = new FileInputStream(filename); | ||
| ObjectInputStream in = new ObjectInputStream(fileIn); | ||
| index = (HashMap<String, WordTFIDF>) in.readObject(); | ||
| in.close(); | ||
| fileIn.close(); | ||
| } catch (Exception e) { | ||
| e.printStackTrace(); | ||
| } | ||
| } | ||
|
|
||
| public void print() { | ||
| try { | ||
| PrintWriter writer = new PrintWriter("theIndex.txt", "UTF-8"); | ||
| for (String k : index.keySet()) | ||
| writer.println(k + " -> " + index.get(k).toString()); | ||
|
|
||
| writer.close(); | ||
| } | ||
| catch (Exception e) { | ||
|
|
||
| } | ||
| } | ||
|
|
||
| public String maxTFIDF(String word) { | ||
| if(index.containsKey(word)) | ||
| return index.get(word).maxTFIDF(corpusSize); | ||
| else | ||
| return ""; | ||
| } | ||
|
|
||
| public static void main(String[] args) { | ||
| Indexer indexer = new Indexer(); | ||
|
|
||
| File folder = new File("pages"); | ||
| String[] files = folder.list(); | ||
|
|
||
| int wordCount = 0; | ||
| long start = System.currentTimeMillis(); | ||
|
|
||
| for(int i = 0; i < files.length; i++) | ||
| { | ||
| String filename = "pages/"+ files[i]; | ||
| List<String> words = Utilities.tokenizeFile(new File(filename)); | ||
| wordCount += words.size(); | ||
| List<Frequency> frequencies = WordFrequencyCounter.computeWordFrequencies(words); | ||
| indexer.addFrequencies(filename, frequencies); | ||
| System.out.println(filename); | ||
| //if(i > 31500) | ||
| // break; | ||
| } | ||
|
|
||
| String runtime = ((Long)(System.currentTimeMillis()-start)).toString(); | ||
| System.out.println("Runtime: " + runtime); | ||
|
|
||
| System.out.println("Doc Wordcount: " + (indexer.docSizes).toString()); | ||
|
|
||
| System.out.println("Num Unique Words: " + indexer.index.keySet().size()); | ||
|
|
||
| System.out.println("Word Count: " + ((Integer)indexer.corpusSize).toString()); | ||
|
|
||
| indexer.print(); | ||
|
|
||
| indexer.save(); | ||
|
|
||
| //test searching | ||
|
|
||
| System.out.println(indexer.maxTFIDF("pollution")); | ||
| } | ||
| } |
| @@ -0,0 +1,126 @@ | ||
|
|
||
|
|
||
| package infx141asst3; | ||
|
|
||
| import java.io.File; | ||
| import java.lang.reflect.Array; | ||
| import java.util.*; | ||
|
|
||
| /** | ||
| * A collection of utility methods for text processing. | ||
| */ | ||
| public class Utilities | ||
| { | ||
| /** | ||
| * Reads the input text file and splits it into alphanumeric tokens. | ||
| * Returns an ArrayList of these tokens, ordered according to their | ||
| * occurrence in the original text file. | ||
| * <p> | ||
| * Non-alphanumeric characters delineate tokens, and are discarded. | ||
| * <p> | ||
| * Words are also normalized to lower case. | ||
| * <p> | ||
| * Example: | ||
| * <p> | ||
| * Given this input string | ||
| * "An input string, this is! (or is it?)" | ||
| * <p> | ||
| * The output list of strings should be | ||
| * ["an", "input", "string", "this", "is", "or", "is", "it"] | ||
| * | ||
| * @param input The file to read in and tokenize. | ||
| * @return The list of tokens (words) from the input file, ordered by occurrence. | ||
| */ | ||
|
|
||
| public static ArrayList<String> tokenizeFile(File input) | ||
| { | ||
| // List of tokens | ||
| ArrayList<String> tokens = new ArrayList<String>(); | ||
| Scanner file; | ||
|
|
||
| try | ||
| { | ||
| // Scan the file for text | ||
| file = new Scanner(input); | ||
|
|
||
| // Loop through file | ||
| while (file.hasNext()) | ||
| { | ||
| // For each word, modify it to make lower case and alphanumeric | ||
| String[] word = file.next().split("[^a-zA-Z0-9]"); | ||
|
|
||
| // Place tokens into token list | ||
| for (String words: new ArrayList<String>(Arrays.asList(word))) | ||
| { | ||
| if (!words.equals("")) | ||
| tokens.add(words.toLowerCase()); | ||
| } | ||
| } | ||
| } | ||
| catch (Exception e) | ||
| { | ||
| System.out.println(e.toString()); | ||
| } | ||
| return tokens; | ||
| } | ||
|
|
||
| /** | ||
| * Takes a list of {@link Frequency}s and prints it to standard out. It also | ||
| * prints out the total number of items, and the total number of unique items. | ||
| * <p> | ||
| * Example one: | ||
| * <p> | ||
| * Given the input list of word frequencies | ||
| * ["sentence:2", "the:1", "this:1", "repeats:1", "word:1"] | ||
| * <p> | ||
| * The following should be printed to standard out | ||
| * <p> | ||
| * Total item count: 6 | ||
| * Unique item count: 5 | ||
| * <p> | ||
| * sentence 2 | ||
| * the 1 | ||
| * this 1 | ||
| * repeats 1 | ||
| * word 1 | ||
| * <p> | ||
| * <p> | ||
| * Example two: | ||
| * <p> | ||
| * Given the input list of 2-gram frequencies | ||
| * ["you think:2", "how you:1", "know how:1", "think you:1", "you know:1"] | ||
| * <p> | ||
| * The following should be printed to standard out | ||
| * <p> | ||
| * Total 2-gram count: 6 | ||
| * Unique 2-gram count: 5 | ||
| * <p> | ||
| * you think 2 | ||
| * how you 1 | ||
| * know how 1 | ||
| * think you 1 | ||
| * you know 1 | ||
| * | ||
| * @param frequencies A list of frequencies. | ||
| */ | ||
| public static void printFrequencies(List<Frequency> frequencies) | ||
| { | ||
| // TODO Write body! | ||
|
|
||
| // Instantiate total to 0 | ||
| int total = 0; | ||
|
|
||
| // Loop through the list and count the frequency counts | ||
| for (Frequency frequency: frequencies) | ||
| total += frequency.getFrequency(); | ||
|
|
||
| // Output the totals | ||
| System.out.println("Total item count: " + total); | ||
| System.out.println("Unique item count: " + frequencies.size()); | ||
| System.out.println(); | ||
|
|
||
| // Output the results | ||
| for (Frequency frequency: frequencies) | ||
| System.out.format("%-7s %d\n", frequency.getText(),frequency.getFrequency()); | ||
| } | ||
| } |
| @@ -0,0 +1,88 @@ | ||
| package infx141asst3; | ||
| import java.io.File; | ||
|
|
||
| import java.util.*; | ||
|
|
||
| //TO REMOVE | ||
|
|
||
| /** | ||
| * Counts the total number of words and their frequencies in a text file. | ||
| */ | ||
| public final class WordFrequencyCounter { | ||
| /** | ||
| * This class should not be instantiated. | ||
| */ | ||
| private WordFrequencyCounter() {} | ||
|
|
||
| /** | ||
| * Takes the input list of words and processes it, returning a list | ||
| * of {@link Frequency}s. | ||
| * | ||
| * This method expects a list of lowercase alphanumeric strings. | ||
| * If the input list is null, an empty list is returned. | ||
| * | ||
| * There is one frequency in the output list for every | ||
| * unique word in the original list. The frequency of each word | ||
| * is equal to the number of times that word occurs in the original list. | ||
| * | ||
| * The returned list is ordered by decreasing frequency, with tied words sorted | ||
| * alphabetically. | ||
| * | ||
| * The original list is not modified. | ||
| * | ||
| * Example: | ||
| * | ||
| * Given the input list of strings | ||
| * ["this", "sentence", "repeats", "the", "word", "sentence"] | ||
| * | ||
| * The output list of frequencies should be | ||
| * ["sentence:2", "the:1", "this:1", "repeats:1", "word:1"] | ||
| * | ||
| * @param words A list of words. | ||
| * @return A list of word frequencies, ordered by decreasing frequency. | ||
| */ | ||
|
|
||
|
|
||
| public static List<Frequency> computeWordFrequencies(List<String> words) | ||
| { | ||
| // Create arraylist to hold frequency objects | ||
| List<Frequency> frequencies = new ArrayList<>(); | ||
|
|
||
| // Check if word list is null - return empty list | ||
| if (words == null || words.isEmpty()) | ||
| return frequencies; | ||
|
|
||
| // Loop through a set of words - eliminating duplicates | ||
| for (String word : new HashSet<>(words)) | ||
| { | ||
| // Get the count of word occurennces in the word list | ||
| int count = Collections.frequency(words,word); | ||
|
|
||
| // Add the frequency to the list | ||
| frequencies.add(new Frequency(word,count)); | ||
| } | ||
|
|
||
| // // Sort by frequency count then alphabetically | ||
| // frequencies.sort(( Frequency a, Frequency b) -> { | ||
| // if (Integer.compare(b.getFrequency(), a.getFrequency()) == 0) | ||
| // return a.getText().compareTo(b.getText()); | ||
| // else | ||
| // return Integer.compare(b.getFrequency(), a.getFrequency()); | ||
| // }); | ||
|
|
||
| return frequencies; | ||
| } | ||
|
|
||
| /** | ||
| * Runs the word frequency counter. The input should be the path to a text file. | ||
| * | ||
| * @param args The first element should contain the path to a text file. | ||
| */ | ||
| public static void main(String[] args) | ||
| { | ||
| File file = new File(args[0]); | ||
| List<String> words = Utilities.tokenizeFile(file); | ||
| List<Frequency> frequencies = computeWordFrequencies(words); | ||
| Utilities.printFrequencies(frequencies); | ||
| } | ||
| } |
| @@ -0,0 +1,62 @@ | ||
| package infx141asst3; | ||
|
|
||
| import javafx.util.Pair; | ||
|
|
||
| import java.io.Serializable; | ||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
|
|
||
| /** | ||
| * Created by VGDC_1 on 2/18/2016. | ||
| * Brett Lenz 76382638 | ||
| Carl Pacheco 47911659 | ||
| Derek Edrich 34363846 | ||
| Khang Tran 47508988 | ||
| */ | ||
public final class WordTFIDF implements Serializable {
    // The word this record tracks across all documents.
    public String word;
    // NOTE(review): despite the name, this accumulates the word's TOTAL
    // occurrence count across all documents (sum of per-doc counts passed to
    // add()), not an inverse-document-frequency value — confirm intent.
    public int IDF;
    // filename -> (raw count in that document, count / document word total),
    // as built by Indexer.addFrequencies.
    public HashMap<String, Pair<Integer, Double>> TFs;

    // Starts an empty record for the given word.
    public WordTFIDF(String word) {
        this.word = word;
        IDF = 0;
        TFs = new HashMap<String, Pair<Integer, Double>>();
    }

    // Registers one document's (count, normalized TF) pair for this word and
    // folds its raw count into the running IDF total.
    public void add(Pair<String, Pair<Integer, Double>> data) {
        TFs.put(data.getKey(), data.getValue());
        IDF += data.getValue().getKey();
    }

    // Scores this word for one document: (1 + ln tf) * ln(corpusSize / tf),
    // or 0 when the word never appeared in that file.
    // NOTE(review): the second factor divides by the word's count IN THIS
    // DOCUMENT rather than a document frequency (e.g. TFs.size()), so this is
    // not the conventional IDF term — looks like a bug; confirm the intended
    // formula before relying on the ranking.
    public double TFIDF(String filename, int corpusSize) {
        if (TFs.containsKey((filename))) {
            return (1 + Math.log(TFs.get(filename).getKey())) * Math.log((double) corpusSize / (double) TFs.get(filename).getKey());
        }
        else
            return 0;
    }

    // Returns the filename whose TFIDF score for this word is highest, or ""
    // if no documents are recorded. Ties keep the first filename encountered
    // (HashMap iteration order, so effectively arbitrary).
    public String maxTFIDF(int corpusSize) {
        double max = -1;
        String maxFilename = "";
        for(String filename : TFs.keySet())
        {
            double testTFIDF = TFIDF(filename, corpusSize);
            if(testTFIDF > max)
            {
                max = testTFIDF;
                maxFilename = filename;
            }
        }
        return maxFilename;
    }

    // Renders the per-document map, e.g. {file=(count, tf), ...}.
    @Override
    public String toString() {
        return TFs.toString();
    }
}