forked from Cybernetic1/DecodingLaw
-
Notifications
You must be signed in to change notification settings - Fork 0
/
find-similar-paragraphs.py
128 lines (110 loc) · 4.17 KB
/
find-similar-paragraphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# -*- coding: utf-8 -*-
"""
Build training data from scraped case-law texts: sample fixed-length word
windows per category, build a vocabulary, look up unit-normalized word
vectors from a .vec file, and pickle the results.

@author: YKY
"""
import numpy as np
import os # for os.listdir
from nltk.corpus import stopwords
import re # for removing punctuations
import pickle
import sys # for sys.stdout.flush()
path_to_glove = "/data/wiki-news-300d-1M.vec" # change to your path and filename
GLOVE_SIZE = 300 # dimension of word vectors in GloVe file
num_classes = 10 # number of output classes; must equal len(categories) below
times_steps = 32 # this number should be same as fixed_seq_len below
# 10 categories:
categories = ["matrimonial-rights", "separation", "divorce", "after-divorce", "divorce-maintenance",
"property-on-divorce", "types-of-marriages", "battered-wife-and-children", "Harmony-House", "divorce-mediation"]
suffix = "" # to be added to sub-directory, not needed currently
# =================== Read case examples from file ======================
"""
0. for each category:
1. read all cases in folder
2. for each case generate N examples (consecutive word sequences of fixed length)
"""
labels = []          # one class index per example (1-hot encoded below)
data = []            # one space-joined word window per example
fixed_seq_len = times_steps  # For each case law, take N consecutive words from text
# Hoist the stop-word list into a set ONCE: the original called
# stopwords.words('english') for every word (re-reading the corpus each time)
# and tested membership against a list, which was extremely slow.
stop_words = set(stopwords.words('english'))
print("\n**** Preparing training data....")
for i, category in enumerate(categories):
    print("\nCategory: ", category)
    for j, filename in enumerate(os.listdir("../scraped-data/" + category + suffix)):
        stuff = []   # cleaned word sequence for this one case-law file
        with open("../scraped-data/" + category + suffix + "/" + filename) as f:
            for line in f:
                line = re.sub(r'[^\w\s-]', ' ', line)  # remove punctuations except hyphen
                for word in line.lower().split():      # convert to lowercase
                    # remove stop words, ignore numbers and dangling hyphens
                    if (word not in stop_words and
                            word[0] not in "0123456789-"):
                        stuff.append(word)
        print("Case-law #", j, " word count = ", len(stuff))
        if len(stuff) <= fixed_seq_len:
            # Too short to draw a full-length window from — skip instead of
            # crashing in np.random.choice on an empty range.
            continue
        for k in range(0, 10):  # number of examples per file (default: 500)
            # Randomly select a sequence of words (of fixed length) in stuff text
            rand_start = np.random.choice(range(0, len(stuff) - fixed_seq_len))
            data.append(" ".join(stuff[rand_start: rand_start + fixed_seq_len]))
            labels += [i]       # set label
# convert to 1-hot encoding for labels
for i in range(len(labels)):
    label = labels[i]
    one_hot_encoding = [0] * num_classes
    one_hot_encoding[label] = 1
    labels[i] = one_hot_encoding
num_examples = len(data)
print("\nData size = ", num_examples, " examples")
# ================ Find unique words ================
print("\n**** Finding unique words....")
word_list = []   # ordered list of distinct words appearing in case-text
# Companion set for O(1) membership tests; the original `word not in
# word_list` list scan made this loop O(n^2) over the corpus vocabulary.
# word_list itself keeps the original first-seen ordering and contents.
seen = set()
for sent in data:
    for word in sent.split():
        if word in seen:
            continue
        seen.add(word)  # words rejected below stay rejected, so cache them too
        # make sure no Chinese chars or numerals inside word
        if (re.search(u'[\u4e00-\u9fff]+', word) is None and
                re.search(r'\d', word) is None):
            word_list.append(word)
            print(word, " ", end='\r')
            sys.stdout.flush()
print(len(word_list), " unique words found")
# ============== Create word-to-vector dictionary ===========
print("\n**** Looking up word vectors....")
word2vec_map = {}        # word -> unit-normalized float32 vector
count_all_words = 0      # how many vocabulary words found so far
entry_number = 0         # how many lines of the vector file scanned
# Set lookup: `word in word_list` was an O(n) list scan per line of a
# ~1M-line vector file.
vocab = set(word_list)
try:
    # with-blocks guarantee both files close even on error/KeyboardInterrupt.
    with open(path_to_glove, "r") as glove_file, \
         open("found-words.txt", "w") as f2:
        for word_entry in glove_file:
            vals = word_entry.split()
            word = str(vals[0])
            entry_number += 1
            if word in vocab:
                print(count_all_words, word, file=f2)
                print(entry_number, count_all_words, word, " ", end='\r')
                sys.stdout.flush()
                count_all_words += 1
                coefs = np.asarray(vals[1:], dtype='float32')
                coefs /= np.linalg.norm(coefs)  # normalize to unit length
                word2vec_map[word] = coefs
                # Bug fix: original compared against len(word_list) - 1,
                # which stopped the scan one word short of full coverage.
                if count_all_words == len(word_list):
                    print("*** found all words ***")
                    break
# if it takes too long to look up the entire dictionary, we can break it short
except KeyboardInterrupt:
    pass
print("Vocabulary size = ", len(word2vec_map))
# ============== Persist results ===========
# One loop instead of four copy-pasted open/dump/close triples; the
# with-statement guarantees each file is flushed and closed.
for obj, pickle_name in ((data, "training-data-3.pickle"),
                         (labels, "training-labels-3.pickle"),
                         (word_list, "training-word-list-3.pickle"),
                         (word2vec_map, "training-word2vec-map-3.pickle")):
    with open(pickle_name, "wb") as pickling_on:
        pickle.dump(obj, pickling_on)