Permalink
Fetching contributors…
Cannot retrieve contributors at this time
102 lines (94 sloc) 3.17 KB
// <Snippet12>
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
public class Example
{
public static void Main()
{
RegexUtilities util = new RegexUtilities();
string title = "Doyle - The Hound of the Baskervilles.txt";
try {
var info = util.GetWordData(title);
Console.WriteLine("Words: {0:N0}", info.Item1);
Console.WriteLine("Average Word Length: {0:N2} characters", info.Item2);
}
catch (IOException e) {
Console.WriteLine("IOException reading file '{0}'", title);
Console.WriteLine(e.Message);
}
catch (RegexMatchTimeoutException e) {
Console.WriteLine("The operation timed out after {0:N0} milliseconds",
e.MatchTimeout.TotalMilliseconds);
}
}
}
public class RegexUtilities
{
public Tuple<int, double> GetWordData(string filename)
{
const int MAX_TIMEOUT = 1000; // Maximum timeout interval in milliseconds.
const int INCREMENT = 350; // Milliseconds increment of timeout.
List<string> exclusions = new List<string>( new string[] { "a", "an", "the" });
int[] wordLengths = new int[29]; // Allocate an array of more than ample size.
string input = null;
StreamReader sr = null;
try {
sr = new StreamReader(filename);
input = sr.ReadToEnd();
}
catch (FileNotFoundException e) {
string msg = String.Format("Unable to find the file '{0}'", filename);
throw new IOException(msg, e);
}
catch (IOException e) {
throw new IOException(e.Message, e);
}
finally {
if (sr != null) sr.Close();
}
int timeoutInterval = INCREMENT;
bool init = false;
Regex rgx = null;
Match m = null;
int indexPos = 0;
do {
try {
if (! init) {
rgx = new Regex(@"\b\w+\b", RegexOptions.None,
TimeSpan.FromMilliseconds(timeoutInterval));
m = rgx.Match(input, indexPos);
init = true;
}
else {
m = m.NextMatch();
}
if (m.Success) {
if ( !exclusions.Contains(m.Value.ToLower()))
wordLengths[m.Value.Length]++;
indexPos += m.Length + 1;
}
}
catch (RegexMatchTimeoutException e) {
if (e.MatchTimeout.TotalMilliseconds < MAX_TIMEOUT) {
timeoutInterval += INCREMENT;
init = false;
}
else {
// Rethrow the exception.
throw;
}
}
} while (m.Success);
// If regex completed successfully, calculate number of words and average length.
int nWords = 0;
long totalLength = 0;
for (int ctr = wordLengths.GetLowerBound(0); ctr <= wordLengths.GetUpperBound(0); ctr++) {
nWords += wordLengths[ctr];
totalLength += ctr * wordLengths[ctr];
}
return new Tuple<int, double>(nWords, totalLength/nWords);
}
}
// </Snippet12>