/
tf-02.py
executable file
·110 lines (98 loc) · 3.38 KB
/
tf-02.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
import sys, re, operator, string
#
# The data stack: the procedures below take their operands from
# here and leave their results here.
#
stack = list()
#
# The heap: a name -> value mapping used for named scratch
# variables (i.e. variables).
#
heap = dict()
#
# The new "words" (procedures) of our program
#
def read_file():
    """
    Takes a path to a file on the stack and places the entire
    contents of the file back on the stack.

    The contents are pushed wrapped in a one-element list, so the
    path on top of the stack is replaced by [text].
    Raises whatever open() raises if the path cannot be opened; the
    path has already been popped by then.
    """
    # The context manager closes the file even if read() raises,
    # which the previous explicit close() did not guarantee.
    with open(stack.pop()) as f:
        # Push the result onto the stack
        stack.append([f.read()])
def filter_chars():
    """
    Takes data on the stack (a one-element list holding a string)
    and places back a copy, again as a one-element list, in which
    every run of nonalphanumeric chars (underscore included) is
    replaced by a single space and the text is lower-cased.
    """
    # This is not in style. RE is too high-level, but using it
    # for doing this fast and short. Push the pattern onto stack.
    # Raw string: '\W' in a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    stack.append(re.compile(r'[\W_]+'))
    # The first pop yields the pattern, the second the data under it.
    # Push the result onto the stack
    stack.append([stack.pop().sub(' ', stack.pop()[0]).lower()])
def scan():
    """
    Pops a one-element list holding a string off the stack and
    pushes every whitespace-separated word of that string back,
    one stack entry per word, in their original order.
    """
    # split() is still too high-level for this style; doing the
    # scan by hand is left as an exercise.
    text = stack.pop()[0]
    for token in text.split():
        stack.append(token)
def remove_stop_words(stop_words_path='../stop_words.txt'):
    """
    Takes a list of words on the stack and removes stop words.

    The whole stack is consumed. The words that survive are pushed
    back in the order they were popped, i.e. reversed relative to
    their original stack order.

    stop_words_path -- comma-separated stop-word file; defaults to
                       the path the program has always used.

    Stop words are the entries of that file plus every single
    lowercase ASCII letter. Stack entries must be hashable (they
    are strings when produced by scan()).
    """
    # The context manager closes the file even if read() raises
    with open(stop_words_path) as f:
        stack.append(f.read().split(','))
    # add single-letter words
    stack[-1].extend(string.ascii_lowercase)
    # A set keeps each membership test below constant-time instead
    # of scanning the whole stop-word list for every word.
    heap['stop_words'] = set(stack.pop())
    # Again, this is too high-level for this style, but using it
    # for doing this fast and short. Left as exercise.
    heap['words'] = []
    while stack:
        if stack[-1] in heap['stop_words']:
            stack.pop()  # pop it and drop it
        else:
            heap['words'].append(stack.pop())  # pop it, store it
    stack.extend(heap['words'])  # Load the words onto the stack
    del heap['stop_words']; del heap['words']  # Not needed
def frequencies():
    """
    Consumes every word on the stack and pushes back a single
    dictionary that maps each word to its number of occurrences.
    """
    heap['word_freqs'] = {}
    # A little flavour of the real Forth style here...
    while stack:
        # ... although the dictionary lookup is not in style; the
        # naive implementation would be too slow
        if stack[-1] not in heap['word_freqs']:
            # First sighting of this word: its count starts at 1
            stack.append(1)
        else:
            # Increment the frequency, postfix style: f 1 +
            stack.append(heap['word_freqs'][stack[-1]])  # push f
            stack.append(1)  # push 1
            one = stack.pop()
            seen = stack.pop()
            stack.append(one + seen)  # add
        # The stack now ends with (word, count): take the count
        # first, then the word, and store the pair on the heap
        count = stack.pop()
        word = stack.pop()
        heap['word_freqs'][word] = count
    # Push the result onto the stack
    stack.append(heap['word_freqs'])
    del heap['word_freqs']  # This variable is no longer needed
def sort():
    """
    Pops a dictionary off the stack and pushes its (key, value)
    pairs back, one stack entry per pair, ordered by ascending
    value.
    """
    # Not in style, left as exercise
    pairs = list(stack.pop().items())
    pairs.sort(key=operator.itemgetter(1))
    stack.extend(pairs)
# The main function
#
# Push the input path, then run the pipeline one word at a time.
stack.append(sys.argv[1])
read_file()
filter_chars()
scan()
remove_stop_words()
frequencies()
sort()
# Counter of printed entries; it lives on top of the stack
stack.append(0)
# Stop after 25 entries, or once the counter is the only item
# left on the stack (fewer than 25 words were available)
while len(stack) > 1 and stack[-1] < 25:
    heap['i'] = stack.pop()
    w, f = stack.pop()
    print(w, ' - ', f)
    # Put the incremented counter back on top
    stack.append(1 + heap['i'])