-
Notifications
You must be signed in to change notification settings - Fork 2
/
chobj.py
126 lines (101 loc) · 4.41 KB
/
chobj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import pickle
import datetime
from time import sleep
import numpy as np
from WikiWho.utils import iter_rev_tokens
from .revision import Revision
from .utils import Timer
class Chobjer:
    """Iterate the change objects between consecutive revisions of an article.

    The revision and token data are read from a WikiWho pickle that is opened
    in the constructor.
    """

    def __init__(self, article, pickles_path, lang, context, starting_revid = -1):
        """Open the WikiWho pickle of ``article`` and remember the settings.

        Args:
            article: article identifier, forwarded to ``open_pickle`` and used
                to build the file name in :meth:`save`.
            pickles_path: forwarded to ``open_pickle`` as ``pickle_path``.
            lang: forwarded to ``open_pickle`` as ``lang``.
            context: forwarded unchanged to ``Revision.iter_chobs``.
            starting_revid: revisions with an id below this value are ignored.
        """
        # Imported locally, so the dependency is only needed on instantiation.
        from wikiwho import open_pickle
        self.ww_pickle = open_pickle(
            article, pickle_path=pickles_path, lang=lang)
        self.article = article
        self.context = context
        self.revisions = self.ww_pickle.revisions
        self.starting_revid = starting_revid

    def get_revisions_dict(self):
        """Return ``{rev_id: Revision}`` for every considered revision.

        The revision ids are taken, in order, from
        ``self.ww_pickle.ordered_revisions``; ids below ``self.starting_revid``
        are skipped.
        """
        return {
            rev_id: self.get_one_revision(rev_id)
            for rev_id in self.ww_pickle.ordered_revisions
            if rev_id >= self.starting_revid
        }

    def get_one_revision(self, rev_id):
        """Build the ``Revision`` object for ``rev_id``.

        The stored timestamp string is parsed with the format
        ``%Y-%m-%dT%H:%M:%SZ`` into a (naive) ``datetime``.
        """
        ww_revision = self.revisions[rev_id]
        timestamp = datetime.datetime.strptime(
            ww_revision.timestamp, r'%Y-%m-%dT%H:%M:%SZ')
        return Revision(rev_id, timestamp, ww_revision.editor)

    def __iter_rev_content(self, rev_id):
        """Yield ``(value, token_id)`` pairs of a revision.

        The pairs are wrapped between the start sentinel ``('{st@rt}', -1)``
        and the end sentinel ``('{$nd}', -2)``.
        """
        yield ('{st@rt}', -1)
        for word in iter_rev_tokens(self.revisions[rev_id]):
            yield (word.value, word.token_id)
        yield ('{$nd}', -2)

    def __get_token_ids(self, rev_id):
        """Yield the token ids of a revision, wrapped between -1 and -2."""
        yield -1
        for word in iter_rev_tokens(self.revisions[rev_id]):
            yield word.token_id
        yield -2

    def __get_values(self, rev_id):
        """Yield the token values of a revision.

        The values are wrapped between the sentinels ``'{st@rt}'`` and
        ``'{$nd}'``.
        """
        yield '{st@rt}'
        for word in iter_rev_tokens(self.revisions[rev_id]):
            yield word.value
        yield '{$nd}'

    def add_all_tokens(self, revisions, tokens):
        """Register each token on the revisions that added or removed it.

        A token id is appended to ``added`` of its origin revision and of
        every revision in ``token.inbound``, and to ``removed`` of every
        revision in ``token.outbound``. Revisions with an id below
        ``self.starting_revid`` are skipped.

        Args:
            revisions: mapping of revision id to an object with ``added`` and
                ``removed`` lists; it is modified in place.
            tokens: iterable of tokens exposing ``token_id``,
                ``origin_rev_id``, ``inbound`` and ``outbound``.
        """
        first_rev_id = self.starting_revid
        for token in tokens:
            token_id = token.token_id
            if token.origin_rev_id >= first_rev_id:
                revisions[token.origin_rev_id].added.append(token_id)

            for in_revision in token.inbound:
                if in_revision >= first_rev_id:
                    revisions[in_revision].added.append(token_id)

            for out_revision in token.outbound:
                if out_revision >= first_rev_id:
                    revisions[out_revision].removed.append(token_id)

    def iter_chobjs(self):
        """Yield the change objects between each pair of consecutive revisions.

        Nothing is yielded when fewer than two revisions are available.
        """
        # get all the revisions
        revs = self.get_revisions_dict()
        revs_iter = iter(revs.items())

        # prepare the first revision; a bare next() on an empty iterator would
        # raise StopIteration, which inside a generator becomes a RuntimeError
        try:
            from_rev_id, from_rev = next(revs_iter)
        except StopIteration:
            return
        from_rev.from_id = None

        # token ids are stored as a numpy array
        from_rev.tokens = np.fromiter(self.__get_token_ids(from_rev_id), int)
        # for the str values, plain python lists are the faster option
        from_rev.values = list(self.__get_values(from_rev_id))

        # adding the tokens to all revisions
        self.add_all_tokens(revs, self.ww_pickle.tokens)

        # adding content to all other revisions and finding the change
        # objects between them
        for to_rev_id, to_rev in revs_iter:
            # make the two compared revisions aware of each other's ids
            to_rev.from_id = from_rev_id
            from_rev.to_id = to_rev.id

            # token ids (numpy array) and values (list) of the next revision
            to_rev.tokens = np.fromiter(self.__get_token_ids(to_rev_id), int)
            to_rev.values = list(self.__get_values(to_rev_id))

            # complete the next revision
            to_rev.inserted_continuous_pos()

            yield from from_rev.iter_chobs(self.article, to_rev, self.context)

            # release memory: only the value is replaced, so the dictionary
            # size does not change while it is being iterated
            revs[from_rev_id] = None

            # the "to" revision becomes the "from" revision of the next round
            from_rev_id = to_rev_id
            from_rev = to_rev

    def save(self, save_dir):
        """Pickle ``self.wiki`` into ``<save_dir>/<article>_change.pkl``.

        NOTE(review): ``wiki`` is not assigned in ``__init__``; it has to be
        set on the instance before calling this method - confirm which object
        is meant to be stored.

        Raises:
            AttributeError: if ``self.wiki`` has not been set. The output file
                is not created or truncated in that case.
        """
        # Resolve the payload first, so that a missing attribute does not
        # leave an empty (or clobbered) file behind.
        payload = self.wiki
        save_filepath = os.path.join(
            save_dir, f"{self.article}_change.pkl")
        with open(save_filepath, "wb") as file:
            pickle.dump(payload, file)