-
Notifications
You must be signed in to change notification settings - Fork 2
/
lexsaob.py
177 lines (173 loc) · 6.54 KB
/
lexsaob.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python3
# Licensed under GPLv3+ i.e. GPL version 3 or later.
import logging
from urllib.parse import urlparse, parse_qsl
from pprint import pprint
from csv import reader
from wikibaseintegrator import wbi_core, wbi_login
import config
import loglevel
# Entity URI prefix; removed from SPARQL result values to obtain bare IDs.
wd_prefix = "http://www.wikidata.org/entity/"

# Log in once up front; the resulting login object is handed to the write
# call later in the script.
print("Logging in with Wikibase Integrator")
login_instance = wbi_login.Login(user=config.username, pwd=config.password)
# Download Swedish (wd:Q9027) lexemes that have no P8478 statement via SPARQL
# (~23000 Swedish lexemes as of 2021-04-05).
#
# lexemes_data: dictionary with the lemma as key and a list as value
#   list[0] = lid
#   list[1] = category Qid
# lexemes_list: the lemmas, one entry per returned row, in result order.
#
# NOTE: lexemes_data is keyed by lemma, so when several returned lexemes share
# a lemma only the last one is kept, while lexemes_list still receives one
# entry per row.
print("Fetching all lexemes")
lexemes_data = {}
lexemes_list = []
# NOTE: range(0, 10000, 10000) yields only offset 0, so exactly one page of at
# most 10000 rows is requested.
for i in range(0, 10000, 10000):
    print(i)
    # The query text is kept flush-left so the string sent to the endpoint is
    # unchanged; "{{" / "}}" are literal braces inside the f-string.
    results = wbi_core.ItemEngine.execute_sparql_query(f"""
select ?lexemeId ?lemma ?category
WHERE {{
#hint:Query hint:optimizer "None".
?lexemeId dct:language wd:Q9027;
wikibase:lemma ?lemma;
wikibase:lexicalCategory ?category.
MINUS{{
?lexemeId wdt:P8478 [].
}}
}}
limit 10000
offset {i}
""")
    # The response is a wrapper dict, so its own length says nothing about
    # whether any rows matched; the rows live under results -> bindings.
    bindings = results["results"]["bindings"] if results else []
    if not bindings:
        print("No lexeme found")
    else:
        print("adding lexemes to list")
        pprint(results.keys())
        pprint(results["results"].keys())
        pprint(len(bindings))
        for result in bindings:
            lemma = result["lemma"]["value"]
            # Values are full entity URIs; strip the prefix to get bare IDs.
            lid = result["lexemeId"]["value"].replace(wd_prefix, "")
            category = result["category"]["value"].replace(wd_prefix, "")
            lexemes_data[lemma] = [lid, category]
            lexemes_list.append(lemma)
print(f"{len(lexemes_list)} fetched")
# Load all SAOB words into a list that can be searched, and all SAOB lines
# into a dictionary keyed by the same running index, so that
# saob_wordlist[i] and saob_data[i] describe the same CSV row.
#
# saob_data value layout:
#   list[0] = saob_category
#   list[1] = number
#   list[2] = id
#   list[3] = word
print("Loading SAOB into memory")
saob_wordlist = []
saob_data = {}
# Decode explicitly as UTF-8 so non-ASCII letters are read the same way on
# every platform (assumes the export is UTF-8 -- TODO confirm). newline=""
# is what the csv module asks for when it is handed a file object.
with open('saob_2021-01-06.csv', 'r', encoding="utf-8", newline="") as read_obj:
    # Each row is a list of column strings; row[0] is not used.
    for count, row in enumerate(reader(read_obj)):
        word = row[1]
        saob_category = row[2]
        # An empty number column is stored as 0.
        number = 0 if row[3] == '' else int(row[3])
        # The SAOB id is the "id" query parameter of the URL in row[4].
        url = urlparse(row[4])
        saob_id = dict(parse_qsl(url.query))["id"]
        saob_data[count] = [saob_category, number, saob_id, word]
        saob_wordlist.append(word)
# Total number of rows read (also well-defined when the file is empty).
count = len(saob_wordlist)
print(f"loaded {count} saob lines into dictionary with length {len(saob_data)}")
print(f"loaded {count} saob lines into list with length {len(saob_wordlist)}")
# SAOB category label -> lexical-category QID that the lexeme must carry for
# the two entries to be considered the same word class.
saob_category_qids = {
    "verb": "Q24905",
    "subst": "Q1084",
    "adj": "Q34698",
    "adv": "Q380057",
}

# Build word -> [indexes into saob_wordlist / saob_data] once, so that every
# lexeme lookup below is a dict access instead of repeated scans of the list.
saob_index = {}
for count, value in enumerate(saob_wordlist):
    saob_index.setdefault(value, []).append(count)

# Go through all lexemes missing a SAOB identifier and upload the identifier
# for the first one that has exactly one unnumbered SAOB entry with a
# matching category.
for lexeme in lexemes_list:
    lexeme_data = lexemes_data[lexeme]  # [lid, category Qid]
    print(f"Working on {lexeme_data[0]}: {lexeme}")
    saob_indexes = saob_index.get(lexeme, [])
    if not saob_indexes:
        print(f"{lexeme} not found in SAOB wordlist")
        continue
    for count in saob_indexes:
        print(count, lexeme)
    if len(saob_indexes) > 1:
        print("Found more than 1 value = complex, skipping")
        continue

    saob_worddata = saob_data[saob_indexes[0]]
    saob_category = saob_worddata[0]
    number = saob_worddata[1]
    saob_id = saob_worddata[2]
    if number != 0:
        print("Found number > 0 = complex, skipping")
        continue
    print(f"found match: category: {saob_worddata[0]} id: {saob_worddata[2]}")

    # Check that the categories match before touching the lexeme.
    category = saob_category_qids.get(saob_category)
    if category is None:
        print(f"Did not recognize category {saob_category}, skipping")
        continue
    if category != lexeme_data[1]:
        print("Categories did not match :/ - skipping")
        continue

    print("Hooray categories match, uploading")
    lemma = lexeme
    lid = lexeme_data[0]
    print(f"Uploading id to {lid}: {lemma}")
    # TODO if numbered
    # - fetch lexeme using wbi
    # - present to user
    # - ask user which if one matches
    print(f"Adding {saob_id} to {lid}")
    saob_statement = wbi_core.ExternalID(
        prop_nr="P8478",
        value=saob_id,
    )
    described_by_source = wbi_core.ItemID(
        prop_nr="P1343",
        value="Q1935308",
    )
    # NOTE(review): append_value is not passed (it was commented out in the
    # original); confirm against the WikibaseIntegrator docs whether existing
    # P1343 statements on the lexeme are kept or replaced by this write.
    item = wbi_core.ItemEngine(
        data=[saob_statement, described_by_source],
        item_id=lid,
    )
    item.write(
        login_instance,
        edit_summary="Added SAOB identifier with [[Wikidata:Tools/LexSAOB]]",
    )
    print(f"{wd_prefix}{lid}")
    # The script deliberately stops after the first successful upload.
    # SystemExit is raised directly because the exit() helper is only
    # provided by the site module.
    raise SystemExit(0)