forked from Cybernetic1/DecodingLaw
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare-word-vector-dict.py
125 lines (107 loc) · 4.92 KB
/
prepare-word-vector-dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
"""
* find unique words
* look up word-vectors for all unique words
* save word-vector dictionary
@author: YKY
"""
import numpy as np
import os # for os.listdir
from nltk.corpus import stopwords
import re # for removing punctuations
import pickle
import sys # for sys.stdout.flush()
import time # for timing events
# Path to the pre-trained word-vector file (fastText wiki-news format:
# one "word v1 v2 ... v300" entry per line). Adjust to your local copy.
path_to_glove = "wiki-news-300d-1M.vec" # change to your path and filename
GLOVE_SIZE = 300 # dimension of word vectors in GloVe file
num_classes = 10
# 10 categories:
# Each name is also the sub-directory under "categories/" holding that
# category's pre-recorded text files.
categories = ["matrimonial-rights", "separation", "divorce", "after-divorce", "divorce-maintenance",
"property-on-divorce", "types-of-marriages", "battered-wife-and-children", "Harmony-House", "divorce-mediation"]
suffix = "" # to be added to sub-directory, not needed currently
print("***** This program takes about 15 minutes to run *****\n")
# ================== Find unique words in pre-recorded category files ========================
# Scan every file under categories/<category><suffix>/ and collect the unique
# non-stopword English tokens into unique_words (insertion order preserved,
# because the list is pickled later and indices may matter downstream).
unique_count = 0
total_count = 0
unique_words = []
# Hoisted loop invariants: stopwords.words('english') builds a fresh list on
# every call, and "word in <list>" is an O(n) scan -- doing both once per word
# made this loop quadratic.  A set gives O(1) membership tests.
stop_words = set(stopwords.words('english'))
unique_set = set(unique_words)  # O(1) membership mirror of unique_words
print("\n**** Finding unique words in pre-recorded category files....")
print(time.strftime("%Y-%m-%d %H:%M"))
for i, category in enumerate(categories):
	print("\nCategory: ", category)
	for j, filename in enumerate(os.listdir("categories/" + category + suffix)):
		# encoding given explicitly, for consistency with the case-law scan
		# (otherwise the platform default may fail on non-ASCII bytes)
		with open("categories/" + category + suffix + "/" + filename, encoding="utf-8") as f:
			for line in f:
				line = re.sub(r"[^\w-]", " ", line) # strip punctuations except hyphen
				line = re.sub(u"[\u4e00-\u9fff]", " ", line) # strip Chinese
				line = re.sub(r"\d", " ", line) # strip numbers
				line = re.sub(r"-+", "-", line) # reduce multiple --- to -
				for word in line.lower().split():
					total_count += 1
					if word not in stop_words and word not in unique_set:
						unique_set.add(word)
						unique_words.append(word)
						unique_count += 1
						#print(word, " ", end='\r')
print("\n**** unique words found == ", len(unique_words))
# =================== Find unique words in case-law files ========================
# Same tokenization as the category scan, applied to laws-TXT/family-laws/*.
# New unique words are appended to unique_words (order preserved for pickling).
print("\n**** Finding unique words in law-case files....")
print(time.strftime("%Y-%m-%d %H:%M"), " (total 3070 K words, ~9 minutes)")
total_count = 0
# Hoisted loop invariants: stopwords.words() rebuilds its list per call, and
# list membership is O(n) -- over ~3M tokens that dominated the "9 minutes".
stop_words = set(stopwords.words('english'))
unique_set = set(unique_words)  # O(1) membership mirror of unique_words
for filenames in os.listdir("laws-TXT/family-laws"):
	with open("laws-TXT/family-laws/" + filenames, encoding="utf-8") as fh:
		for line in fh:
			line = re.sub(r"[^\w-]", " ", line) # strip punctuations except hyphen
			line = re.sub(u"[\u4e00-\u9fff]", " ", line) # strip Chinese
			line = re.sub(r"\d", " ", line) # strip numbers
			line = re.sub(r"-+", "-", line) # reduce multiple --- to -
			for word in line.lower().split():
				total_count += 1
				if word not in stop_words and word not in unique_set:
					unique_set.add(word)
					unique_words.append(word)
					unique_count += 1
				# progress indicator once per 1000 tokens (carriage return
				# keeps it on one console line)
				if total_count % 1000 == 0:
					print(unique_count, "/", (total_count // 1000), "K : ", word, "        ", end='\r')
				#if count >= 20000:
				#	break
print("\n**** unique words found == ", len(unique_words))
# ====================== Create word-to-vector dictionary =========================
# Stream the 1M-line vector file once; for every entry whose word appears in
# unique_words, store its unit-normalized vector in word2vec_map.
print("\n**** Looking up word vectors (Ctrl-C to end sooner)....")
print(time.strftime("%Y-%m-%d %H:%M"), " (total 1000000 words, ~7 minutes)")
word2vec_map = {}
count_all_words = 0
entry_number = 0
# "word in <list>" would be an O(n) scan per vector-file line -- over 1M lines
# that was the bulk of the runtime.  A set makes each lookup O(1).
unique_word_set = set(unique_words)
# context managers guarantee both files are closed even on an unexpected error
with open(path_to_glove, "r", encoding="utf-8") as glove_file, \
     open("found-words.txt", "w") as f2:
	try:
		for word_entry in glove_file:
			vals = word_entry.split()
			word = str(vals[0])
			entry_number += 1
			if word in unique_word_set:
				print(count_all_words, word, file=f2)
				print(entry_number, count_all_words, word, "      ", end='\r')
				sys.stdout.flush()
				count_all_words += 1
				# normalize to unit length so later dot products are cosine similarities
				coefs = np.asarray(vals[1:], dtype='float32')
				coefs /= np.linalg.norm(coefs)
				word2vec_map[word] = coefs
				if count_all_words == len(unique_words):
					print("*** found all words ***")
					break
	# if it takes too long to look up the entire dictionary, we can break it short
	except KeyboardInterrupt:
		pass
print("\n**** vocabulary size = ", len(word2vec_map))
print(time.strftime("%Y-%m-%d %H:%M"))
folderName = ""  # optional output directory prefix ("" = current directory)
# ================== save word2vec map and unique words list to .pickle =============
# 'with' closes each file even if pickling raises; "wb" (write-only binary)
# is sufficient since we never read back from these handles.
with open(folderName + "word2vec-map.pickle", "wb") as pickling_on:
	pickle.dump(word2vec_map, pickling_on)
with open(folderName + "unique-words.pickle", "wb") as pickling_on:
	pickle.dump(unique_words, pickling_on)
print("SUCCESS")