# Fetching Documents From Solr
First, we need to fetch documents from Solr as a DataFrame.

In [None]:
# Value passed to the Solr connector's "zkhost" option below.
ZK_HOST = "localhost:9990"

# Prompt for the search parameters; an empty answer falls back to the
# default on the right-hand side of `or`.
index = input("Index: ") or "core17"
field = input("Field: ") or "contents"
query = input("Query: ") or "Obama"

# Query string in "<field>:<term>" form, as handed to the "query" option.
solr_query = "{}:{}".format(field, query)

# Configure the Solr reader and load the matching documents as a DataFrame.
df = (
    spark.read.format("solr")
    .option("zkhost", ZK_HOST)
    .option("collection", index)
    .option("query", solr_query)
    .option("max_rows", "100")
    .load()
)

# Show which columns came back.
df.printSchema()

# CoreNLP Server
Next, we show how to extract OpenIE (subject, relation, object) triples from text using the CoreNLP server.

In [None]:
from stanfordnlp.server import CoreNLPClient

def extract(doc):
    """Return the OpenIE triples found in a single document.

    A new ``CoreNLPClient`` is created for every call and released when the
    ``with`` block exits.

    Args:
        doc: Text to annotate. ``None`` or an empty string yields no triples.

    Returns:
        A list of ``(subject, relation, object)`` tuples, one per OpenIE
        triple, in the order the sentences appear in the annotation.
    """
    # Nothing to annotate (e.g. a missing field): skip creating a client.
    if not doc:
        return []

    with CoreNLPClient(annotators=["openie"], memory='16G', threads=44) as client:
        return [
            (triple.subject, triple.relation, triple.object)
            for sent in client.annotate(doc).sentence
            for triple in sent.openieTriple
        ]
    
# Try extract() on two sample sentences and print each resulting triple list.
for sample in (
    "Barrack Obama was born in Hawaii in 1961.",
    "Apple bought Google for $1 billion.",
):
    print(extract(sample))

In [None]:
import time

# Time the one-client-per-document pipeline across 44 partitions.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by wall-clock adjustments the way time.time() can.
start = time.perf_counter()
count = df.rdd.repartition(44).flatMap(lambda doc: extract(doc.raw)).count()
end = time.perf_counter()

print("{} triples @ {} per/sec".format(count, count / (end - start)))

# CoreNLP Server (Batching)

In [None]:
import time

from stanfordnlp.server import CoreNLPClient

def extractPartition(docs):
    """Return the OpenIE triples for a whole partition using one request.

    The ``raw`` text of every row is joined with blank lines and annotated in
    a single call, so only one ``CoreNLPClient`` is created per partition.

    Args:
        docs: Iterable of rows exposing a ``raw`` text attribute. Rows whose
            ``raw`` is ``None`` or empty are skipped.

    Returns:
        A list of ``(subject, relation, object)`` tuples, one per OpenIE
        triple found in the batch; empty when there is no text to annotate.
    """
    # Drop rows without text: None would make str.join raise TypeError.
    texts = [doc.raw for doc in docs if doc.raw]

    # Empty partition: skip creating a client.
    if not texts:
        return []

    batch = "\n\n".join(texts)
    with CoreNLPClient(annotators=["openie"], memory='16G', threads=44, max_char_length=100000000, timeout=3000000) as client:
        return [
            (triple.subject, triple.relation, triple.object)
            for sent in client.annotate(batch).sentence
            for triple in sent.openieTriple
        ]
    
# Time the batched pipeline: one extractPartition() call per partition.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by wall-clock adjustments the way time.time() can.
start = time.perf_counter()
# mapPartitions passes each partition's row iterator straight to the function.
count = df.rdd.repartition(44).mapPartitions(extractPartition).count()
end = time.perf_counter()

print("{} triples @ {} per/sec".format(count, count / (end - start)))