In [None]:
import findspark
findspark.init()
import pyspark
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises a ValueError,
# which makes the notebook fail on anything but its very first run.
sc = pyspark.SparkContext.getOrCreate()
# SQLContext bound to the SparkContext above.
sqlContext = pyspark.sql.SQLContext(sc)

In [None]:
import os

In [None]:
from sift.corpora import wikipedia, wikidata
from sift.models import text, links
# Input location handed to ConllPrepare together with the split name.
Conll_path = '/data/conll2003/'
# Root of the Wikipedia data; the dump location is formed by plain string
# concatenation (base_path + 'dump'), so the trailing slash is required.
base_path = '/data/wikipedia/20151002/'

In [None]:
# Build the Wikipedia corpus from the dump under base_path, then derive the
# redirects from that corpus.
corpus = wikipedia.WikipediaCorpus()(sc, base_path + 'dump')
redirects = wikipedia.WikipediaRedirects()(corpus)

# NOTE(review): ConllPrepare is neither defined nor imported in any cell
# visible here; unless it is provided elsewhere this line raises NameError on
# a fresh kernel -- confirm its origin and import it explicitly.
docs = ConllPrepare(Conll_path, "train", redirects)

In [None]:
# Display one prepared document as a quick sanity check.
docs.take(1)

In [None]:
# Value passed as `filter_target` to both link-count models built below.
wikipedia_pfx = 'en.wikipedia.org/wiki/'

In [None]:
# Entity-count model: built from `docs` with min_count=5 and
# filter_target=wikipedia_pfx, with every item passed through
# EntityCounts.format_item.
ec_model = (
    links.EntityCounts(min_count=5, filter_target=wikipedia_pfx)
    .build(docs)
    .map(links.EntityCounts.format_item)
)

In [None]:
# Entity-name-count model: built from `docs` with lowercase=True and
# filter_target=wikipedia_pfx, keeping only (name, counts) items whose counts
# sum to more than 1, then passed through EntityNameCounts.format_item.
#
# The filter indexes the (name, counts) pair and calls .values() instead of
# relying on tuple-parameter unpacking and dict.itervalues(), both of which
# exist only in Python 2; this form parses and runs on Python 2 and 3 alike.
enc_model = (
    links.EntityNameCounts(lowercase=True, filter_target=wikipedia_pfx)
    .build(docs)
    .filter(lambda name_counts: sum(name_counts[1].values()) > 1)
    .map(links.EntityNameCounts.format_item)
)

In [None]:
# Display one formatted entity-count item as a quick sanity check.
ec_model.take(1)

In [None]:
from nel.model import data
from nel.model.store import file

In [None]:
# Set the NEL_DATASTORE_URI environment variable to a file:// location
# (presumably read by nel's data store when models are saved/loaded -- confirm).
# NOTE(review): this path is under /data0/ while the input paths in this
# notebook are under /data/ -- confirm the difference is intentional.
os.environ['NEL_DATASTORE_URI'] = 'file:///data0/nel/'

In [None]:
# If a model does not fit in driver memory, iterate over it with
# model.toLocalIterator() instead of materialising it with model.collect().

In [None]:
# Collect the entity-count model onto the driver and save all of its items
# into the 'models:ecounts[wikipedia]' object store.
ecounts_store = data.ObjectStore.Get('models:ecounts[wikipedia]')
ecounts_store.save_many(ec_model.collect())

In [None]:
# Collect the entity-name-count model onto the driver and save all of its
# items into the 'models:necounts[wikipedia]' object store.
necounts_store = data.ObjectStore.Get('models:necounts[wikipedia]')
necounts_store.save_many(enc_model.collect())

In [None]:
from nel.doc import Doc

In [None]:
from nel.harness.format import from_sift

In [None]:
from nel.process.pipeline import Pipeline
from nel.process.candidates import NameCounts
from nel.features.probability import EntityProbability, NameProbability

In [None]:
# Candidate-generation stage: a single NameCounts component for the
# 'wikipedia' model. The meaning of the second argument (10) is not visible
# here -- presumably a cap on candidates, TODO confirm.
candidate_generation = [NameCounts('wikipedia', 10)]

# Feature-extraction stage: entity- and name-probability components, both for
# the 'wikipedia' model. Their `.id` attributes are used later to select
# ranker features.
feature_extraction = [
    EntityProbability('wikipedia'),
    NameProbability('wikipedia'),
]

In [None]:
# Pipeline used to prepare training data: candidate generation followed by
# feature extraction (no linking stage).
training_pipeline = Pipeline(candidate_generation + feature_extraction)

In [None]:
# Sample 100 documents without replacement and convert each one with
# from_sift. The fixed seed keeps the training sample -- and therefore the
# ranker trained from it -- repeatable across notebook runs; without it every
# run draws a different sample.
training_docs = [from_sift(doc) for doc in docs.takeSample(False, 100, seed=42)]

In [None]:
# Run every sampled training document through the training pipeline.
train = [training_pipeline(doc) for doc in training_docs]

In [None]:
from nel.learn import ranking
from nel.features import meta
from nel.model import resolution
from nel.process import resolve

In [None]:
# Ids of the feature-extraction components; these are the features the ranker
# is configured with.
feature_ids = [feature.id for feature in feature_extraction]
# Configure a TrainLinearRanker named 'ranker' and apply it to the processed
# training documents; the result is kept as `ranker`.
ranker = ranking.TrainLinearRanker(name='ranker', features=feature_ids)(train)

In [None]:
# Linking stage: a ClassifierScore component wrapping the ranker, followed by
# a FeatureRankResolver keyed on that component's id.
classifier_feature = meta.ClassifierScore(ranker)
rank_resolver = resolve.FeatureRankResolver(classifier_feature.id)
linking = [classifier_feature, rank_resolver]

In [None]:
# Full pipeline: candidate generation, feature extraction, then the linking
# stage (classifier score + resolver).
linking_pipeline = Pipeline(candidate_generation + feature_extraction + linking)

In [None]:
# Sample 10 documents without replacement and convert each one with from_sift.
# The fixed seed makes the sample repeatable across notebook runs.
# NOTE(review): this is drawn from the same `docs` RDD as the training sample,
# so the two samples may overlap -- confirm that is acceptable for this check.
sample = [from_sift(doc) for doc in docs.takeSample(False, 10, seed=7)]

In [None]:
# Reset every existing resolution to None -- first on each chain, then on each
# of that chain's mentions -- so the sample carries no prior links.
for sample_doc in sample:
    for doc_chain in sample_doc.chains:
        doc_chain.resolution = None
        for chain_mention in doc_chain.mentions:
            chain_mention.resolution = None

In [None]:
# Run every sampled document through the full linking pipeline.
linked_sample = [linking_pipeline(doc) for doc in sample]

In [None]:
# Display the ids of the documents returned by the linking pipeline.
[d.id for d in linked_sample]

In [None]:
# Inspect the first chain of the first linked document. The value is read from
# the pipeline output (linked_sample) rather than from the input list, so the
# cell does not depend on the pipeline mutating its argument in place.
# A chain's resolution can be None (the value the links were reset to before
# linking), so guard before reading `.id` instead of raising AttributeError.
first_resolution = linked_sample[0].chains[0].resolution
first_resolution.id if first_resolution is not None else None