# Fetching Documents From Solr

First, we need to fetch documents from Solr as a DataFrame.

In [1]:
# ZooKeeper address used by the Solr data source.
ZK_HOST = "localhost:9990"

# Each prompt falls back to its default when the answer is left empty.
index = input("Index: ") or "core17"
field = input("Field: ") or "contents"
query = input("Query: ") or "Obama"

# Solr query of the form "<field>:<term>".
solr_query = "{}:{}".format(field, query)

# Configure the "solr" reader and load the matching documents.
solr_reader = (
    spark.read.format("solr")
    .option("zkhost", ZK_HOST)
    .option("collection", index)
    .option("query", solr_query)
    .option("max_rows", "1000")
)
df = solr_reader.load()

df.printSchema()

Index: 
Field: 
Query: 
root
 |-- id: string (nullable = false)
 |-- raw: string (nullable = true)



# Extract Triples From Text
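Here, we extract triples from text using a (very) basic approach based on the dependency tree around each entity: for every entity, we take the head of its root token, look for prepositions attached to that head, and emit one triple per child of each preposition.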

In [2]:
import spacy

def extract_relations(doc):
    """Extract (entity, "head prep", object) triples from a parsed document.

    Every entity and noun chunk of ``doc`` is first merged into a single
    token. Then, for each entity, every child of the entity root's head whose
    dependency label is "prep" yields one triple per child of that
    preposition.

    Args:
        doc: a parsed document exposing ``ents`` and ``noun_chunks``
            (as produced by ``nlp(text)``). It is modified in place by the
            merge step.

    Returns:
        A list of ``(entity text, "<head> <prep>", child text)`` tuples,
        in entity order.
    """
    # Collapse multi-token entities / noun chunks so each reads as one token.
    # NOTE(review): Span.merge() is deprecated in newer spaCy releases in
    # favour of Doc.retokenize() -- confirm against the installed version.
    for chunk in [*doc.ents, *doc.noun_chunks]:
        chunk.merge()

    return [
        (entity.text, "{} {}".format(entity.root.head, preposition), target.text)
        for entity in doc.ents
        for preposition in entity.root.head.children
        if preposition.dep_ == "prep"
        for target in preposition.children
    ]

# Sample sentences used to try the extractor locally.
TEXTS = [
    'Barrack Obama was born in Hawaii in the year 1961.',
    'Apple bought Google for $1 million.'
]

# Load the English spaCy pipeline.
nlp = spacy.load("en")

# Print each sentence followed by the triples extracted from it.
for sentence in TEXTS:
    print("\n" + sentence)
    for subject, relation, target in extract_relations(nlp(sentence)):
        print('({}, {}, {})'.format(subject, relation, target))


Barrack Obama was born in Hawaii in the year 1961.
(Barrack Obama, born in, Hawaii)
(Barrack Obama, born in, the year 1961)

Apple bought Google for $1 million.
(Apple, bought for, $1 million)
(Google, bought for, $1 million)


We can also run the same extraction within Spark:

In [3]:
# Number of partitions the documents are spread over before extraction.
NUM_PARTITIONS = 44


def extract_partition(rows):
    """Yield the triples extracted from every row of one partition.

    The spaCy pipeline is loaded once per partition, inside the task, rather
    than capturing the driver's ``nlp`` object in the closure sent to Spark.

    Args:
        rows: iterator over the rows of one partition; each row is expected
            to expose the document text as ``raw``.

    Yields:
        The triples returned by ``extract_relations`` for each row with text.
    """
    partition_nlp = spacy.load("en")
    for row in rows:
        # `raw` is nullable in the schema: skip rows without text instead of
        # failing the whole job (an empty text would give no triples anyway).
        if row.raw:
            yield from extract_relations(partition_nlp(row.raw))


triples = df.rdd.repartition(NUM_PARTITIONS).mapPartitions(extract_partition)
triples.count()

KeyboardInterrupt: 