-
Notifications
You must be signed in to change notification settings - Fork 2
/
wordstats3.py
84 lines (72 loc) · 2.92 KB
/
wordstats3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""Count words and distinct words in a file and print a report"""
# MCS 260 Fall 2021 Lecture 14
import sys
# Require the input filename as the first command-line argument;
# otherwise print a usage line and stop with exit status 1.
if len(sys.argv) < 2:
    print("Usage: {} INPUTFILE".format(sys.argv[0]))
    # sys.exit() rather than the bare exit(): the latter is a helper the
    # site module installs for interactive sessions and it is not
    # guaranteed to exist (for example under `python -S`).
    sys.exit(1)

fn = sys.argv[1]  # input filename

# Open explicitly as UTF-8 text so decoding does not depend on the
# platform's default encoding.  The handle is read line by line by the
# counting loop later in this script.
infile = open(fn, "r", encoding="UTF-8")
# Punctuation that can never occur inside a word.  Each of these
# characters is treated exactly like a space when a line is split into
# words.  Every character of the literal below is one list entry;
# "\ufeff" is U+FEFF, the byte order mark.
always_word_bdry = list(
    "“”-;"
    "_\ufeff(?."
    "\"!‘%#)"
    "*$/&,]"
    "—:["
)

# Characters that may legitimately sit in the middle of a word (the
# apostrophe in "don't") but are trimmed away when they appear at the
# beginning or end of a word.
remove_from_word_edges = "’'"
def split_words(s, *, boundaries=None, edge_chars=None):
    """Take a line of text and extract words, attempting to ignore punctuation and numbers.

    The line is stripped and lowercased, every boundary string is
    replaced by a space, and the result is split on whitespace.  Each
    piece then has `edge_chars` trimmed from both ends and is kept only
    if at least one alphabetic character remains, so pieces made purely
    of digits or leftover punctuation are dropped.

    Args:
        s: The line of text to split.
        boundaries: Iterable of strings that always separate words.
            Defaults to the module-level `always_word_bdry`.
        edge_chars: String of characters allowed inside a word but
            trimmed from its start and end.  Defaults to the
            module-level `remove_from_word_edges`.

    Returns:
        A list of the lowercased words found in `s`, in order of
        appearance (duplicates included).
    """
    # Resolve defaults at call time so the module-level settings are
    # used unless the caller overrides them.  `is None` (not truthiness)
    # lets a caller pass an empty collection to disable a step.
    if boundaries is None:
        boundaries = always_word_bdry
    if edge_chars is None:
        edge_chars = remove_from_word_edges

    s = s.strip().lower()
    # Convert punctuation that is never part of a word to spaces
    for sep in boundaries:
        s = s.replace(sep, " ")

    words = []
    # Split along whitespace and examine each piece
    for piece in s.split():
        # Strip any characters that are allowed inside but not at the
        # ends of words
        candidate = piece.strip(edge_chars)
        # Keep the piece only if an alphabetic character is left
        if any(ch.isalpha() for ch in candidate):
            words.append(candidate)
    return words
wordcount = 0  # total number of words seen
word_histogram = dict()  # keys are distinct words, values are counts
max_words_per_line = 0  # largest number of words found on a single line

# Using the file object as a context manager guarantees it is closed once
# the last line has been read, and also if processing a line raises.
with infile:
    for line in infile:  # handle one line at a time
        line_words = split_words(line)  # split this line into words
        # Track the longest line so far, measured in words
        max_words_per_line = max(max_words_per_line, len(line_words))
        # Every word on the line counts toward the total
        wordcount += len(line_words)
        # Tally each word; a word not seen before starts from 0
        for w in line_words:
            word_histogram[w] = word_histogram.get(w, 0) + 1
# Print the report as key=value lines
print("filename={}".format(fn))
print("words={}".format(wordcount))
print("distinct_words={}".format(len(word_histogram)))
print("max_words_per_line={}".format(max_words_per_line))

# An input with no words leaves the histogram empty: max() without a
# default would raise ValueError and the percentage would divide by zero.
# In that case report an empty word list, a count of 0 and 0.00%.
maxcount = max(word_histogram.values(), default=0)
most_frequent_words = [w for w in word_histogram
                       if word_histogram[w] == maxcount]
if wordcount:
    max_percent = 100.0 * maxcount / wordcount
else:
    max_percent = 0.0
print("most_used_words={}".format(",".join(most_frequent_words)))
print("most_used_words_freq={} ({:.2f}%)".format(maxcount, max_percent))
# Uncomment to print all the words
# print("distinct words:")
# for w in word_histogram:
#     print(w)