In [1]:
import os
import pickle
import shelve
import shutil

import spacy

# Output directory for the index. It is deleted and recreated further down in
# this notebook, then filled with 'assertions.shelve' and the two
# '*2assertions.pkl' files.
index_fn = "/data/assertion_shelve"
# Length cut-off used by process_line(): it is compared against the number of
# space characters in the assertion text, and texts at or above it are skipped.
max_len = 30

# Alternative load call kept for reference (presumably the newer spaCy
# `disable=[...]` API - TODO confirm against the installed spaCy version):
#nlp = spacy.load('en', disable=['parser', 'ner', 'textcat'])
# Pipeline used to tokenize assertion texts; only token offsets (`idx`) and
# lemmas (`lemma_`) are read from its output in this notebook.
nlp = spacy.load('en', parser=False, entity=False, matcher=False)

In [2]:
# In-memory buffer of assertion id (str) -> assertion text. process_source()
# periodically writes it to the shelve database and rebinds it to a fresh dict.
assertions = {}
# source name -> {lemmatized subject -> set of assertion ids (str)}
subject2assertions = {}
# source name -> {lemmatized object -> set of assertion ids (str)}
object2assertions = {}

def lemmatized(tokens, start, end):
    """Return the space-joined lemmas of the tokens starting inside [start, end).

    Args:
        tokens: iterable of token objects exposing ``idx`` (character offset of
            the token in the text) and ``lemma_`` (lemma string).
        start: inclusive lower bound on a token's ``idx``.
        end: exclusive upper bound on a token's ``idx``.

    Returns:
        The selected lemmas joined with single spaces, in token order; an empty
        string when no token starts inside the span.
    """
    selected = []
    for token in tokens:
        if start <= token.idx < end:
            selected.append(token.lemma_)
    return " ".join(selected)


# Running count of accepted assertions. process_line() increments it and uses
# str(ctr) as the assertion id; process_source() uses it to decide when to
# flush the buffered assertions to the shelve database.
ctr = 0

def process_line(l, source):
    """Parse one raw assertion line and register it in the in-memory indexes.

    A line has three tab-separated fields: the assertion text, the subject
    spans and the object spans. Spans are comma-separated ``start:end``
    character offsets into the text.

    Lines without a tab, texts containing ``max_len`` or more spaces, and
    assertions whose single subject equals their single object are ignored.
    Accepted assertions bump the global ``ctr``; ``str(ctr)`` becomes the
    assertion id stored in ``assertions`` and in the per-source entries of
    ``subject2assertions`` / ``object2assertions``.

    Args:
        l: raw line (surrounding whitespace is stripped).
        source: key of the per-source dicts; it must already exist in
            ``subject2assertions`` and ``object2assertions``.

    A ``ValueError`` (wrong number of fields, malformed span) is reported on
    stdout and the line is dropped.
    """
    global ctr
    # e.g.:  'pair of compasses synonym compass\t0:17\t26:33'
    try:
        l = l.strip()
        if "\t" not in l:
            return

        text, subject_spans, object_spans = l.split("\t")

        # Too long: silently skip.
        if text.count(" ") >= max_len:
            return

        tokens = nlp(text)

        subjects = []
        for span in subject_spans.split(","):
            colon = span.index(":")
            subjects.append(lemmatized(tokens, int(span[:colon]), int(span[colon + 1:])))

        objects = []
        for span in object_spans.split(","):
            colon = span.index(":")
            objects.append(lemmatized(tokens, int(span[:colon]), int(span[colon + 1:])))

        # Drop trivial "X relates to X" assertions.
        single_pair = len(subjects) == 1 and len(objects) == 1
        if single_pair and subjects[0] == objects[0]:
            return

        ctr += 1
        assertion_id = str(ctr)

        by_subject = subject2assertions[source]
        for subject in subjects:
            by_subject.setdefault(subject, set()).add(assertion_id)

        by_object = object2assertions[source]
        for obj in objects:
            by_object.setdefault(obj, set()).add(assertion_id)

        assertions[assertion_id] = text
    except ValueError:
        print("Could not process line: " + l)

In [3]:
# Start from an empty index directory: whatever a previous run left at
# `index_fn` is removed (destructive!) before the directory is recreated.
if os.path.exists(index_fn):
    shutil.rmtree(index_fn)

# makedirs instead of mkdir so that a missing parent directory does not abort
# the run with FileNotFoundError.
os.makedirs(index_fn, exist_ok=True)


def process_source(name, path, flush_every=100000):
    """Index every assertion line of one source file.

    Registers empty per-source entries in ``object2assertions`` and
    ``subject2assertions``, then feeds each line of ``path`` to
    ``process_line``. Each time the global assertion counter ``ctr`` reaches a
    multiple of ``flush_every``, the buffered ``assertions`` are written to the
    shelve database and the buffer is replaced by a fresh dict.

    Note: the database is read from the module-level name ``db``, so this must
    be called while a shelf is open under that name. Assertions buffered after
    the last checkpoint are NOT flushed here; the caller has to run
    ``db.update(assertions)`` once all sources are processed.

    Args:
        name: source label used as key in the per-source index dicts.
        path: path of the tab-separated assertion file.
        flush_every: checkpoint interval, in accepted assertions.
    """
    global assertions, ctr
    print("Processing %s assertions..." % name)
    object2assertions[name] = dict()
    subject2assertions[name] = dict()
    with open(path) as f:
        for l in f:
            ctr_before = ctr
            process_line(l, name)
            # Checkpoint only when this line was actually accepted. Otherwise
            # skipped lines following a checkpoint (or preceding the very first
            # assertion, while ctr is still 0) would each re-print the counter
            # and trigger another, empty, database update.
            if ctr != ctr_before and ctr % flush_every == 0:
                print("%d" % ctr)
                db.update(assertions)
                assertions = dict()

# (source name, assertion file) pairs to index, in processing order.
# Commented-out entries are sources that are currently disabled.
sources = [
    #("dbpedia_type", "/run/media/diwe01/Data3/wiki/dbpedia/type_assertions.txt"),
    #("microsoft_type", "/run/media/diwe01/Data2/corpora/concepts/concepts_microsoft/data-concept/type_assertions.txt"),
    #("simple_wikipedia_firstsent", "/run/media/diwe01/Data3/wiki/simple_abstracts/wiki_assertions.txt"),
    #("en_wikt", "/run/media/diwe01/Data3/wiki/en_wikt/meaning_assertions.txt"),
    #("wikipedia_firstsent", "/run/media/diwe01/Data3/wiki/en_abstracts/wiki_assertions.txt"),
    ("ppdb_L", "/run/media/diwe01/Data2/corpora/concepts/ppdb/assertions_pos.txt"),
    ("conceptnet", "/run/media/diwe01/Data2/corpora/concepts/conceptnet/assertions.txt"),
]

# process_source() looks the shelf up under the module-level name `db`, so it
# has to be bound to exactly that name here.
with shelve.open(os.path.join(index_fn, 'assertions.shelve')) as db:
    for source_name, source_path in sources:
        process_source(source_name, source_path)

    # Write whatever is still buffered after the last periodic checkpoint.
    db.update(assertions)

# Persist the subject/object -> assertion-id lookup tables next to the shelf.
for pickle_name, mapping in (
    ('subject2assertions.pkl', subject2assertions),
    ('object2assertions.pkl', object2assertions),
):
    with open(os.path.join(index_fn, pickle_name), "wb") as f:
        pickle.dump(mapping, f, protocol=pickle.HIGHEST_PROTOCOL)

Processing ppdb_L assertions...
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
Processing conceptnet assertions...
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
