/
hashtag_hyphenater.py
54 lines (47 loc) · 1.54 KB
/
hashtag_hyphenater.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
import simplejson

# Load the merged social-media posts and collect every hashtag they contain.
with open('MergedPosts.json') as f:
    j = simplejson.load(f)

# Raw string so the \w escape reaches the regex engine verbatim; a plain
# '#\w+' is an invalid string escape and warns (eventually errors) on
# modern Python.
HASHTAG_RE = re.compile(r'#\w+')

hashtags = []
for p in j:
    # NOTE(review): assumes every post object has a 'text' key — confirm
    # against the MergedPosts.json schema.
    text = p['text']
    hashtags.extend(HASHTAG_RE.findall(text))
# Word lists used to segment hashtags.  SallyWords.txt is project-local; the
# remaining paths point at a local ispell word-list install — adjust per machine.
dictionary_files = [
    'SallyWords.txt',
    '/Users/drcraig/Downloads/ispell-enwl-3.1.20/english.0',
    '/Users/drcraig/Downloads/ispell-enwl-3.1.20/english.1',
    '/Users/drcraig/Downloads/ispell-enwl-3.1.20/english.2',
    '/Users/drcraig/Downloads/ispell-enwl-3.1.20/english.3',
    # '/Users/drcraig/Downloads/ispell-enwl-3.1.20/american.0',
    # '/Users/drcraig/Downloads/ispell-enwl-3.1.20/american.1',
    # '/Users/drcraig/Downloads/ispell-enwl-3.1.20/american.2',
]

# A set gives O(1) membership tests.  find_longest_words probes this container
# once per candidate prefix per hashtag, so the original list made the whole
# run quadratic in the dictionary size; only membership is ever used, so the
# switch is behavior-neutral.
dictionary_words = set()
for df in dictionary_files:
    with open(df) as f:
        # One lowercase word per line; rstrip drops the trailing newline.
        dictionary_words.update(line.rstrip().lower() for line in f)

# Let the bare numbers 0-99 count as "words" inside hashtags (e.g. #top10).
dictionary_words.update(map(str, range(0, 100)))
def find_longest_words(text, words=None, dictionary=None):
    """Greedily split *text* into the longest dictionary words it starts with.

    Repeatedly strips the longest prefix of ``text`` found in ``dictionary``
    and recurses on the remainder.  A final remainder with no known prefix is
    appended verbatim so no characters are ever lost.

    Args:
        text: string to segment (typically a hashtag with its '#' removed).
        words: accumulator used by the recursion; defaults to a fresh list.
            (The original ``words=[]`` mutable default silently shared one
            list across every call that omitted the argument.)
        dictionary: any container supporting ``in``; defaults to the
            module-level ``dictionary_words``.

    Returns:
        List of segments, in order of appearance.
    """
    if words is None:
        words = []
    if dictionary is None:
        dictionary = dictionary_words

    word = ''
    remainder = text
    # Scan every prefix, remembering the longest one that is a known word.
    # (range replaces the Python-2-only xrange; behavior is identical.)
    for i in range(len(text) + 1):
        possible_word = text[:i]
        if possible_word in dictionary:
            word = possible_word
            remainder = text[i:]

    if word:
        words.append(word)
        return find_longest_words(remainder, words, dictionary)
    if remainder:
        # No dictionary word starts this chunk: keep it as-is.
        words.append(remainder)
    return words
# One entry per hashtag: '#foowords' -> '#' + its segments joined with a
# literal backslash-hyphen (r'\-').  NOTE(review): the backslash looks
# deliberate (an escaped hyphen for downstream consumption) — confirm it
# is not a typo for a plain '-'.
h = {
    tag: '#' + r'\-'.join(find_longest_words(tag.lstrip('#'), []))
    for tag in hashtags
}

# Persist the mapping for later lookup.
with open('hashtag_hyphenation.json', 'w') as f:
    simplejson.dump(h, f, indent=1)