forked from Cybernetic1/DecodingLaw
-
Notifications
You must be signed in to change notification settings - Fork 0
/
similarity.py
199 lines (161 loc) · 7.23 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# -*- coding: utf-8 -*-
"""
For A2J Hackathon 2018
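Purpose: show, in a Tk window, the average cosine similarity between each
case-law sentence and the sentences of each legal category (the older code at
the end of the file that generates random training data is not reached)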
@author: Abeer Arora
"""
import numpy as np
import math
import os # for os.listdir
from nltk.corpus import stopwords
import re # for removing punctuations
import pickle
import sys # for sys.stdout.flush()
from collections import defaultdict # for default value of word-vector dictionary
import time
from tkinter import *
# Pre-trained word-vector file; change to your path and filename.
# NOTE(review): the "glove" naming is kept for compatibility, but the default
# file name looks like a fastText release rather than GloVe -- confirm.
path_to_glove = "wiki-news-300d-1M.vec"
GLOVE_SIZE = 300  # dimension of the word vectors in the file above
times_steps = 32  # this number should be same as fixed_seq_len below

# The categories that case-law sentences are compared against.  Each name is
# also the sub-directory of "categories/" that holds that category's text.
categories = [
    "matrimonial-rights",
    "separation",
    "divorce",
    "after-divorce",
    "divorce-maintenance",
    "property-on-divorce",
    "types-of-marriages",
    "battered-wife-and-children",
    "Harmony-House",
    "divorce-mediation",
]

# Derived from the list so the two can never drift apart (currently 10).
num_classes = len(categories)
# Build the window: a text box on top that shows the sentence being scored,
# followed by one row per category holding its name and a bar canvas.
root = Tk()
root.title("Similarity measure")

text0 = Text(root, height=10)
text0.grid(row=0, column=0, columnspan=2)
text0.insert(INSERT, "Case-law sentences")

labels = []    # Label widgets, one per category, in category order
canvases = []  # bar canvases, indexed like `categories`
for row, cat in enumerate(categories, start=1):  # row 0 is the text box
    label = Label(root, text=cat)
    label.grid(row=row, column=0)
    labels.append(label)

    canvas = Canvas(root, width=402, height=10)
    canvas.grid(row=row, column=1)
    canvas.create_rectangle(0, 0, 402, 10, fill="black")  # empty bar
    canvases.append(canvas)

# Draw the window now; the script drives Tk manually instead of mainloop().
root.update_idletasks()
root.update()
# ====================== load pre-trained word2vec dictionary ======================
# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file, so
# only load pickles that were produced locally by a trusted run.
# `with` closes each file as soon as it has been read.
with open("word2vec-map.pickle", "rb") as pickle_off:
    word2vec_map = pickle.load(pickle_off)
with open("unique-words.pickle", "rb") as pickle_off:
    # unique words for case-law as well as categories; held as a set because
    # this script only ever tests membership in it
    unique_words = set(pickle.load(pickle_off))

# set default value = zero vector, if word not found in dictionary
# Every missing word gets this same array object (and looking a missing word
# up stores it in the map), so the array must never be modified in place.
zero_vector = np.zeros(GLOVE_SIZE, dtype='float32')
word2vec_map = defaultdict(lambda: zero_vector, word2vec_map)
# ======================= Functions for calculating cosine similarity ==================
def sent_avg_vector(words):
    """Return the average word vector of a sentence.

    Only words present in the module-level ``unique_words`` contribute; their
    vectors are looked up in the module-level ``word2vec_map``.

    Args:
        words: iterable of word strings making up the sentence.

    Returns:
        A 1-D array of length ``GLOVE_SIZE``: the mean of the vectors of the
        known words, or the all-zero vector when no word is known.
    """
    # Sized from GLOVE_SIZE so it always agrees with the loaded vectors.
    total = np.zeros((GLOVE_SIZE,), dtype="float32")
    known = 0
    for word in words:
        if word in unique_words:
            known += 1
            total = np.add(total, word2vec_map[word])
    if known > 0:  # avoid dividing by zero for sentences with no known word
        total = np.divide(total, known)
    return total
def similarity(Vec1, Vec2):
    """Return the cosine similarity between two sentence vectors.

    Args:
        Vec1: first vector (1-D numeric array).
        Vec2: second vector, same length as ``Vec1``.

    Returns:
        ``dot(Vec1, Vec2) / (|Vec1| * |Vec2|)``.  When either vector has zero
        length the cosine is undefined; ``0.0`` is returned instead of NaN so
        callers can average and plot the result safely.
    """
    # Use the parameters themselves, not same-named module-level variables.
    norm1 = math.sqrt(np.dot(Vec1, Vec1))
    norm2 = math.sqrt(np.dot(Vec2, Vec2))
    denominator = norm1 * norm2
    if denominator == 0:
        return 0.0
    return np.dot(Vec1, Vec2) / denominator
# ==================== extract all sentences from categories =======================
# cats[i] holds the sentences of categories[i]; each sentence is a list of words.
cats = []
suffix = ""  # to be added to sub-directory, not needed currently

# Build the stop-word set once instead of calling stopwords.words() (and
# scanning the list it returns) for every single word.
stop_words = set(stopwords.words('english'))

for category in categories:
    print("\nCategory: ", category)
    category_dir = "categories/" + category + suffix
    catLines = []  # gathers the sentences of every file of this category
    for filename in os.listdir(category_dir):
        # utf-8 to match how the case-law files are read, independent of the
        # platform's default encoding
        with open(category_dir + "/" + filename, encoding="utf-8") as f:
            for line in f:
                line = re.sub(r"[^\w-]", " ", line)  # strip punctuations except hyphen
                line = re.sub(u"[\u4e00-\u9fff]", " ", line)  # strip Chinese
                line = re.sub(r"\d", " ", line)  # strip numbers
                line = re.sub(r"-+", "-", line)  # reduce multiple --- to -
                catWords = [word for word in line.lower().split()
                            if word not in stop_words]
                if catWords:  # skip empty lines
                    catLines.append(catWords)
    # Exactly one entry per category, so cats can be indexed like `categories`.
    cats.append(catLines)
# ===================== Scan case examples from file ============================
# For every line of every case-law file: show the line's words in the text
# box, then set each category's bar to the line's mean similarity (0-100)
# against that category's sentences.
labels = []
data = []
fixed_seq_len = times_steps  # For each case law, take N consecutive words from text

print("\n**** Calculating sentence similarity....")
print(time.strftime("%Y-%m-%d %H:%M"))

# Loop invariants, computed once instead of once per word / per case-law line.
stop_words = set(stopwords.words('english'))
category_vectors = [[sent_avg_vector(catLine) for catLine in catLines]
                    for catLines in cats]

for filename in os.listdir("laws-TXT/family-laws"):
    with open("laws-TXT/family-laws/" + filename, encoding="utf-8") as fh:
        for line in fh:
            # Per-category variance of the similarities; computed but not displayed.
            Var = np.empty([len(categories), 1])

            line = re.sub(r"[^\w-]", " ", line)  # strip punctuations except hyphen
            line = re.sub(u"[\u4e00-\u9fff]", " ", line)  # strip Chinese
            line = re.sub(r"\d", " ", line)  # strip numbers
            line = re.sub(r"-+", "-", line)  # reduce multiple --- to -
            words1 = [word for word in line.lower().split()
                      if word not in stop_words]

            text0.delete(1.0, END)
            if not words1:  # skip empty lines
                continue
            text0.insert(INSERT, "".join(word + " " for word in words1))

            vec1 = sent_avg_vector(words1)

            # ====== for each case-law line, find similarity against N categories
            for i, vectors in enumerate(category_vectors):
                sims = []
                for vec2 in vectors:
                    sims.append(similarity(vec1, vec2) * 100)

                if sims:
                    Var[i] = np.var(sims)
                    sim = float(np.mean(sims))
                else:  # category without any sentence: nothing to average
                    Var[i] = 0.0
                    sim = 0.0
                # Tk cannot take NaN/inf as a coordinate; keep the bar in 0..100.
                if not math.isfinite(sim):
                    sim = 0.0
                sim = min(max(sim, 0.0), 100.0)

                # Remove the previous bar first, otherwise rectangles pile up
                # on the canvas for every line processed.
                canvases[i].delete("all")
                canvases[i].create_rectangle(0, 0, 402, 10, fill="black")
                canvases[i].create_rectangle(1, 1, sim * 4, 9, fill="red")

            root.update_idletasks()
            root.update()

# sys.exit rather than the interactive-only exit() helper added by `site`.
sys.exit(0)
# NOTE(review): everything from here to the end of the file comes after the
# unconditional exit call that precedes it, so it does not run as the file
# stands.
# NOTE(review): `stuff` is not defined anywhere in this file; this loop would
# presumably raise NameError if it were ever reached -- confirm where the word
# sequence was meant to come from.
# Draw 5000 random windows of fixed_seq_len consecutive words from `stuff`
# and collect each window, joined by spaces, in `data`.
for k in range(0, 5000): # number of examples per file (default: 500)
#print(k, end = ": ")
# Randomly select a sequence of words (of fixed length) in stuff text
rand_start = np.random.choice(range(0, len(stuff) - fixed_seq_len))
word_list = " ".join(stuff[rand_start: rand_start + fixed_seq_len])
#print(word_list)
data.append(word_list)
#labels += [i] # set label for training
# convert to 1-hot encoding for labels
# NOTE(review): `labels` is reset to [] before the case-law scan and the only
# statement that adds to it is commented out just above, so this loop appears
# to have nothing to convert.
for i in range(len(labels)):
label = labels[i]
one_hot_encoding = [0] * num_classes
one_hot_encoding[label] = 1
labels[i] = one_hot_encoding
num_examples = len(data)
print("\nData size = ", num_examples, " examples")
# ========================== save prepared data to files ==============================
# Each object is written to its own pickle file under folderName.
folderName = "prepared-data/"
pickling_on = open(folderName + "training-data-3.pickle", "wb+")
pickle.dump(data, pickling_on)
pickling_on.close()
pickling_on = open(folderName + "training-labels-3.pickle", "wb+")
pickle.dump(labels, pickling_on)
pickling_on.close()
# word_list is the last joined window produced by the sampling loop (a string).
pickling_on = open(folderName + "training-word-list-3.pickle", "wb+")
pickle.dump(word_list, pickling_on)
pickling_on.close()
# NOTE(review): word2vec_map is a defaultdict whose default factory is a
# lambda; pickle cannot serialise lambdas, so this dump would presumably
# fail -- confirm before re-enabling this code.
pickling_on = open(folderName + "training-word2vec-map-3.pickle", "wb+")
pickle.dump(word2vec_map, pickling_on)
pickling_on.close()