## Process data

In [17]:
import bz2
import csv
import gzip
import json
import os
import random

import fasttext

In [23]:
file_parts_dir = './file_parts/'
base_fasttext_fn = './fasttext/wd_2020_11_02.txt'
groundtruth_data = 'labeled_enwiki_with_topics_metadata.json.bz2'  # same labeled data as used by outlinks

train_prop = 0.9
val_prop = 0.02
test_prop = 0.08
assert train_prop + val_prop + test_prop == 1
train_fn = base_fasttext_fn.replace('.txt', '_train.txt')
train_metadata_fn = base_fasttext_fn.replace('.txt', '_train_metadata.txt')
val_fn = base_fasttext_fn.replace('.txt', '_val.txt')
val_metadata_fn = base_fasttext_fn.replace('.txt', '_val_metadata.txt')
test_fn = base_fasttext_fn.replace('.txt', '_test.txt')
test_metadata_fn = base_fasttext_fn.replace('.txt', '_test_metadata.txt')
nogroundtruth_fn = base_fasttext_fn.replace('.txt', '_nogt.txt')
nogroundtruth_metadata_fn = base_fasttext_fn.replace('.txt', '_nogt_metadata.txt')

In [3]:
!mkdir './fasttext'
!hdfs dfs -copyToLocal 'wikidata-statements-enwiki/part*' '{file_parts_dir}'

In [20]:
# quickly combine many CSV parts into a single CSV (with metadata removed for fastText)
fns = [fn for fn in os.listdir(file_parts_dir) if fn.endswith('.csv.gz')]
embeddings_output = 'wikidata_embeddings_enwiki.tsv.bz2'
print_every = 1
input_header = ['item_id', 'statements']
empty = 0
processed = 0
with bz2.open(embeddings_output, 'wt') as fout:
    for i, fn in enumerate(fns, start=1):
        with gzip.open(os.path.join(file_parts_dir, fn), 'rt') as fin:
            # the quote symbol " is somehow a valid username character...
            header = next(fin).strip().split('\t')
            assert header == input_header
            for line_no, line_str in enumerate(fin, start=1):
                line = line_str.strip().split('\t')
                assert len(line) == 2
                qid = line[0]
                prop_vals = line[1]
                if len(prop_vals) == 0:
                    empty += 1
                emb = ' '.join(['{0:.3f}'.format(d) for d in ft_model.get_sentence_vector(prop_vals)])
                fout.write('{0}\t{1}\n'.format(qid, emb))
                processed += 1
        if i % print_every == 0:
            print("{0} / {1} files processed. {2} of {3} empty.".format(i, len(fns), empty, processed))
            print_every = print_every * 2

1 / 200 files processed. 0 of 30483 empty.
2 / 200 files processed. 0 of 60741 empty.
4 / 200 files processed. 0 of 121591 empty.
8 / 200 files processed. 0 of 243789 empty.
16 / 200 files processed. 0 of 486460 empty.
32 / 200 files processed. 0 of 971618 empty.
64 / 200 files processed. 0 of 1943770 empty.
128 / 200 files processed. 0 of 3888819 empty.


## Prepare Training Data

In [9]:
def fasttextify(topic):
    """Translate articletopic labels into fastText format (prefixed with __label__ and no spaces)."""
    return '__label__{0}'.format(topic.replace(' ', '_'))

In [8]:
# load in groundtruth
qid_topics = {}
with bz2.open(groundtruth_data, 'rt') as fin:
    for line in fin:
        line = json.loads(line)
        qid = line.get('qid')
        topics = line.get('topics')
        if qid and topics:
            qid_topics[qid] = topics
print("{0} QIDs with topics.".format(len(qid_topics)))

5662388 QIDs with topics.


In [13]:
!zless ./file_parts/part-00000-53a7157b-9779-4919-843d-8ac5abaa9cb6-c000.csv.gz | head

item_id	statements
Q1000211	P18 P214 P1313 Q65191356 P6671 P17 Q142 P31 Q484170 P1566 P1616 P7938 Q574168 P373 P1082 P1082 P1082 P1082 P4212 P856 P646 P131 Q3093 P131 Q701410 P6766 P2046 P8422 P374 P47 Q325587 P47 Q988888 P47 Q872458 P47 Q378973 P47 Q868812 P47 Q867323 P625 P1448 P166 Q2727598 P281
Q1000919	P244 P571 P18 P214 P31 Q43229 P373 P856 P7859 P625 P112 Q741873 P1315 P213 P3500 P3057
Q100124689	P361 Q99766252 P361 Q6591877 P17 Q408 P31 Q1057954 P991 Q5128951 P1697 P5044 P1867 P1868 P541 Q19202748 P585 P155 Q100098032
Q100158694	P31 Q5 P21 Q6581072 P569 P106 Q43845 P106 Q483501
Q100159873	P31 Q5 P569 P106 Q12299841 P735 Q18336566 P21 Q6581097
Q100166237	P31 Q11173 P31 Q2612896 P662 P117
Q100168290	P31 Q5 P569 P106 Q937857 P21 Q6581097 P19 Q3808
Q1001788	P244 P571 P1454 Q134161 P214 P31 Q4830453 P31 Q6881511 P856 P7859 P3347 P452 Q44497 P646 P414 Q13677 P154 P159 Q2868
Q1001929	P571 P18 P454 P3518 P649 P214 P1435 Q19558910 P17 Q30 P31 Q12323 P1566 P373 P4661 Q35043122 P

In [24]:
train_written = 0
val_written = 0
test_written = 0
nogt_written = 0
i = 0
fns = [fn for fn in os.listdir(file_parts_dir) if fn.endswith('.csv.gz')]
qids_to_split = {}
input_header = ['item_id', 'statements']
with open(train_fn, 'w') as train_fout:
    with open(train_metadata_fn, 'w') as train_metadata_fout:
        with open(val_fn, 'w') as val_fout:
            with open(val_metadata_fn, 'w') as val_metadata_fout:
                with open(test_fn, 'w') as test_fout:
                    with open(test_metadata_fn, 'w') as test_metadata_fout:
                        with open(nogroundtruth_fn, 'w') as nogt_fout:
                            with open(nogroundtruth_metadata_fn, 'w') as nogt_metadata_fout:
                                for fidx, fn in enumerate(fns, start=1):
                                    with gzip.open(os.path.join(file_parts_dir, fn), 'rt') as fin:
                                        header = next(fin).strip().split('\t')
                                        assert header == input_header
                                        for i, line_str in enumerate(fin, start=1):
                                            line = line_str.strip().split('\t')
                                            assert len(line) == len(input_header)
                                            qid = line[0]
                                            prop_vals = line[1]
                                            if not prop_vals or not qid:
                                                continue
                                            topics = qid_topics.get(qid)
                                            if topics:
                                                if qid in qids_to_split:
                                                    r = qids_to_split[qid]
                                                else:
                                                    r = random.random()
                                                    qids_to_split[qid] = r
                                                if r <= train_prop:
                                                    data_fout = train_fout
                                                    metadata_fout = train_metadata_fout
                                                    train_written += 1
                                                elif r <= train_prop + val_prop:
                                                    data_fout = val_fout
                                                    metadata_fout = val_metadata_fout
                                                    val_written += 1
                                                else:
                                                    data_fout = test_fout
                                                    metadata_fout = test_metadata_fout
                                                    test_written += 1
                                            else:
                                                topics = []
                                                data_fout = nogt_fout
                                                metadata_fout = nogt_metadata_fout
                                                nogt_written += 1
                                            data_fout.write('{0} {1}\n'.format(' '.join([fasttextify(t) for t in topics]), prop_vals))
                                            metadata_fout.write('{0}\n'.format(qid))
                                    print("{0} of {1} processed: {2} train. {3} val. {4} test. {5} no groundtruth.".format(fidx, len(fns),
                                                                                                                           train_written,
                                                                                                                           val_written,
                                                                                                                           test_written,
                                                                                                                           nogt_written))

1 of 200 processed: 24549 train. 573 val. 2229 test. 3132 no groundtruth.
2 of 200 processed: 49172 train. 1159 val. 4385 test. 6025 no groundtruth.
3 of 200 processed: 73810 train. 1690 val. 6513 test. 9023 no groundtruth.
4 of 200 processed: 98573 train. 2240 val. 8728 test. 12050 no groundtruth.
5 of 200 processed: 123223 train. 2740 val. 10905 test. 15068 no groundtruth.
6 of 200 processed: 147925 train. 3216 val. 13201 test. 18093 no groundtruth.
7 of 200 processed: 172832 train. 3769 val. 15373 test. 21183 no groundtruth.
8 of 200 processed: 197707 train. 4357 val. 17595 test. 24130 no groundtruth.
9 of 200 processed: 222071 train. 4899 val. 19793 test. 27153 no groundtruth.
10 of 200 processed: 246552 train. 5456 val. 21956 test. 30178 no groundtruth.
11 of 200 processed: 271102 train. 6025 val. 24164 test. 33246 no groundtruth.
12 of 200 processed: 295652 train. 6541 val. 26331 test. 36271 no groundtruth.
13 of 200 processed: 320198 train. 7110 val. 28525 test. 39310 no groundt

102 of 200 processed: 2509688 train. 55897 val. 223545 test. 309778 no groundtruth.
103 of 200 processed: 2534266 train. 56437 val. 225778 test. 312841 no groundtruth.
104 of 200 processed: 2558990 train. 56932 val. 228011 test. 315879 no groundtruth.
105 of 200 processed: 2583666 train. 57479 val. 230154 test. 318909 no groundtruth.
106 of 200 processed: 2608263 train. 58002 val. 232367 test. 321927 no groundtruth.
107 of 200 processed: 2632783 train. 58522 val. 234465 test. 324926 no groundtruth.
108 of 200 processed: 2657369 train. 59058 val. 236699 test. 328028 no groundtruth.
109 of 200 processed: 2681856 train. 59654 val. 238787 test. 331099 no groundtruth.
110 of 200 processed: 2706644 train. 60151 val. 241023 test. 334152 no groundtruth.
111 of 200 processed: 2731213 train. 60703 val. 243189 test. 337120 no groundtruth.
112 of 200 processed: 2755912 train. 61272 val. 245454 test. 340149 no groundtruth.
113 of 200 processed: 2780832 train. 61848 val. 247611 test. 343153 no groun

200 of 200 processed: 4920405 train. 108861 val. 437645 test. 607671 no groundtruth.


In [28]:
!ls -lht /home/isaacj/notebooks/wikidata_embeddings/fasttext/

total 1.3G
-rw-r--r-- 1 isaacj wikidev 5.4M Jan 27 22:47 wd_2020_11_02_nogt_metadata.txt
-rw-r--r-- 1 isaacj wikidev  27M Jan 27 22:47 wd_2020_11_02_nogt.txt
-rw-r--r-- 1 isaacj wikidev 3.9M Jan 27 22:47 wd_2020_11_02_test_metadata.txt
-rw-r--r-- 1 isaacj wikidev  95M Jan 27 22:47 wd_2020_11_02_test.txt
-rw-r--r-- 1 isaacj wikidev  44M Jan 27 22:47 wd_2020_11_02_train_metadata.txt
-rw-r--r-- 1 isaacj wikidev 1.1G Jan 27 22:47 wd_2020_11_02_train.txt
-rw-r--r-- 1 isaacj wikidev 976K Jan 27 22:47 wd_2020_11_02_val_metadata.txt
-rw-r--r-- 1 isaacj wikidev  24M Jan 27 22:47 wd_2020_11_02_val.txt
