/
build_frequency_lists.py
187 lines (160 loc) · 6.73 KB
/
build_frequency_lists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import os
import time
import codecs
import urllib
import simplejson
import urllib2
from pprint import pprint
SLEEP_TIME = 20 # seconds to wait between wiktionary page downloads (scrape-friendly rate limit)
def get_ranked_english():
    '''
    wikitionary has a list of ~40k English words, ranked by frequency of occurance in TV and movie transcripts.
    more details at:
    http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/explanation

    the list is separated into pages of 1000 or 2000 terms each.
    * the first 10k words are separated into pages of 1000 terms each.
    * the remainder is separated into pages of 2000 terms each:

    returns one flat list of terms, ordered by rank (most frequent first).
    '''
    URL_TMPL = 'http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/%s'
    # build the per-page rank ranges first, then fetch them in order
    page_ranges = []
    # ranks 1..10000: ten pages of 1000 terms
    for i in xrange(10):
        page_ranges.append("%d-%d" % (i * 1000 + 1, (i+1) * 1000))
    # ranks 10001..40000: fifteen pages of 2000 terms
    for i in xrange(0,15):
        page_ranges.append("%d-%d" % (10000 + 2 * i * 1000 + 1, 10000 + (2 * i + 2) * 1000))
    # final partial page
    page_ranges.append('40001-41284')

    ranked_terms = [] # ordered by rank, in decreasing frequency.
    for page_range in page_ranges:
        html, is_cached = wiki_download(URL_TMPL % page_range)
        # only throttle when we actually hit the network
        if not is_cached:
            time.sleep(SLEEP_TIME)
        ranked_terms.extend(parse_wiki_terms(html))
    return ranked_terms
def wiki_download(url):
'''
scrape friendly: sleep 20 seconds between each request, cache each result.
'''
DOWNLOAD_TMPL = '../data/tv_and_movie_freqlist%s.html'
freq_range = url[url.rindex('/')+1:]
tmp_path = DOWNLOAD_TMPL % freq_range
if os.path.exists(tmp_path):
print 'cached.......', url
with codecs.open(tmp_path, 'r', 'utf8') as f:
return f.read(), True
with codecs.open(tmp_path, 'w', 'utf8') as f:
print 'downloading...', url
req = urllib2.Request(url, headers={
'User-Agent': 'zxcvbn'
})
response = urllib2.urlopen(req)
result = response.read().decode('utf8')
f.write(result)
return result, False
def parse_wiki_terms(doc):
    '''who needs an html parser. fragile hax, but checks the result at the end

    scans for runs of three consecutive non-empty <td> cells (rank, linked
    term, count), skips the first such run as the table header, and returns
    the lowercased terms in page order.
    '''
    terms = []
    window = ['', '', '']          # sliding window of the last three stripped lines
    seen_header = False
    for raw_line in doc.split('\n'):
        window = window[1:] + [raw_line.strip()]
        is_data_row = True
        for cell in window:
            if not cell.startswith('<td>') or cell == '<td></td>':
                is_data_row = False
                break
        if not is_data_row:
            continue
        if not seen_header:
            # the first matching triple is the table header row
            seen_header = True
            continue
        # strip the td tags; also resets the window so these lines can't re-match
        window = [cell.replace('<td>', '').replace('</td>', '').strip() for cell in window]
        rank_str, term, count = window
        rank = int(rank_str.split()[0])
        term = term.replace('</a>', '')
        term = term[term.index('>')+1:].lower()
        terms.append(term)
    assert len(terms) in [1000, 2000, 1284] # early docs have 1k entries, later have 2k, last doc has 1284
    return terms
def get_ranked_census_names():
    '''
    takes name lists from the 2000 us census, prepares as a json array in order of frequency (most common names first).

    returns [surnames, male_first_names, female_first_names], each a list of
    lowercased names in file (frequency) order.

    more info:
    http://www.census.gov/genealogy/www/data/2000surnames/index.html

    files in data are downloaded copies of:
    http://www.census.gov/genealogy/names/dist.all.last
    http://www.census.gov/genealogy/names/dist.male.first
    http://www.census.gov/genealogy/names/dist.female.first
    '''
    FILE_TMPL = '../data/us_census_2000_%s.txt'
    SURNAME_CUTOFF_PERCENTILE = 85 # ie7 can't handle huge lists. cut surname list off at a certain percentile.
    lists = []
    for list_name in ['surnames', 'male_first', 'female_first']:
        lst = []
        # 'with' ensures the handle is closed (the original leaked it)
        with codecs.open(FILE_TMPL % list_name, 'r', 'utf8') as f:
            for line in f:
                fields = line.split()  # split once; empty for blank lines
                if not fields:
                    continue
                # fields[2] appears to be the cumulative percentile column -- TODO confirm against data files
                if list_name == 'surnames' and float(fields[2]) > SURNAME_CUTOFF_PERCENTILE:
                    break
                lst.append(fields[0].lower())
        lists.append(lst)
    return lists
def get_ranked_common_passwords():
    '''read the common-passwords data file: one password per line, ranked most frequent first.'''
    passwords = []
    for line in codecs.open('../data/common_passwords.txt', 'r', 'utf8'):
        password = line.strip()
        if password:
            passwords.append(password)
    return passwords
def to_ranked_dict(lst):
    '''map each word to its index (rank) in lst; on duplicates the last index wins.'''
    ranked = {}
    for rank, word in enumerate(lst):
        ranked[word] = rank
    return ranked
def filter_short(terms):
    '''
    only keep if brute-force possibilities are greater than this word's rank in the dictionary
    '''
    kept = []
    for rank, term in enumerate(terms):
        # 26**len(term) ~ brute-force guesses for a lowercase word of that length
        if 26 ** len(term) > rank:
            kept.append(term)
    return kept
def filter_dup(lst, lists):
    '''
    filters lst to only include terms that don't have lower rank in another list
    '''
    sentinel_rank = len(lst) + 1  # worse than any real rank in lst
    # rank lookup for lst and for every competing list (last index wins on dups)
    rank_of = dict((word, rank) for rank, word in enumerate(lst))
    competing = [dict((word, rank) for rank, word in enumerate(other)) for other in lists]
    kept = []
    for word in lst:
        beaten = False
        for ranks in competing:
            # ties go to the other list, same as the original strict '<' test
            if ranks.get(word, sentinel_rank) <= rank_of[word]:
                beaten = True
                break
        if not beaten:
            kept.append(word)
    return kept
def filter_ascii(lst):
    '''
    removes words with accent chars etc.
    (most accented words in the english lookup exist in the same table unaccented.)
    '''
    return [word for word in lst if not any(ord(ch) > 127 for ch in word)]
def to_js(lst, lst_name):
    '''serialize lst as a javascript var declaration named lst_name.'''
    return ''.join(['var ', lst_name, ' = ', simplejson.dumps(lst), ';\n\n'])
def main():
    '''
    builds every ranked list, filters and de-duplicates them, and writes the
    result to ../frequency_lists.js as javascript var declarations.
    '''
    english = get_ranked_english()
    surnames, male_names, female_names = get_ranked_census_names()
    passwords = get_ranked_common_passwords()

    # drop non-ascii terms, and terms so short that brute force beats a dictionary rank
    [english,
     surnames, male_names, female_names,
     passwords] = [filter_ascii(filter_short(lst)) for lst in (english,
                                                               surnames, male_names, female_names,
                                                               passwords)]

    # make dictionaries disjoint so that d1 & d2 == set() for any two dictionaries
    all_dicts = set(tuple(l) for l in [english, surnames, male_names, female_names, passwords])
    passwords = filter_dup(passwords, all_dicts - set([tuple(passwords)]))
    male_names = filter_dup(male_names, all_dicts - set([tuple(male_names)]))
    female_names = filter_dup(female_names, all_dicts - set([tuple(female_names)]))
    surnames = filter_dup(surnames, all_dicts - set([tuple(surnames)]))
    english = filter_dup(english, all_dicts - set([tuple(english)]))

    with open('../frequency_lists.js', 'w') as f: # words are all ascii at this point
        # NOTE: the locals() lookup couples each local list variable's name to
        # the emitted js var name -- renaming either side silently breaks this.
        lsts = locals()
        for lst_name in 'passwords male_names female_names surnames english'.split():
            lst = lsts[lst_name]
            f.write(to_js(lst, lst_name))

    print '\nall done! totals:\n'
    print 'passwords....', len(passwords)
    print 'male.........', len(male_names)
    print 'female.......', len(female_names)
    print 'surnames.....', len(surnames)
    print 'english......', len(english)
    print
if __name__ == '__main__':
    # all data/output paths above are relative to the scripts/ directory
    if os.path.basename(os.getcwd()) != 'scripts':
        print 'run this from the scripts directory'
        exit(1)
    main()