In [1]:
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()
sqlContext = pyspark.sql.SQLContext(sc)

ImportError: No module named findspark

In [2]:
import os

In [3]:
from sift.corpora import wikipedia, wikidata
from sift.models import text, links
wikipedia_base_path = '/data0/linking/wikipedia/dumps/20150901/'
wikidata_base_path = '/n/schwa11/data0/linking/wikidata/dumps/20150713'

In [4]:
wikipedia_corpus = wikipedia.WikipediaCorpus()(sc, wikipedia_base_path)
docs = wikipedia.WikipediaArticles()(wikipedia_corpus).cache()

In [5]:
docs.take(1)

[{'_id': 'en.wikipedia.org/wiki/Anarchism',
  'links': [{'start': 0,
    'stop': 9,
    'target': u'en.wikipedia.org/wiki/Anarchism'},
   {'start': 15,
    'stop': 35,
    'target': u'en.wikipedia.org/wiki/Political_philosophy'},
   {'start': 51,
    'stop': 70,
    'target': u'en.wikipedia.org/wiki/Stateless_society'},
   {'start': 88,
    'stop': 101,
    'target': u'en.wikipedia.org/wiki/Self-governance'},
   {'start': 207, 'stop': 219, 'target': u'en.wikipedia.org/wiki/Hierarchy'},
   {'start': 220,
    'stop': 237,
    'target': u'en.wikipedia.org/wiki/Free_association_(communism_and_anarchism)'},
   {'start': 259,
    'stop': 264,
    'target': u'en.wikipedia.org/wiki/State_(polity)'},
   {'start': 315,
    'stop': 327,
    'target': u'en.wikipedia.org/wiki/Anti-statism'},
   {'start': 367, 'stop': 376, 'target': u'en.wikipedia.org/wiki/Authority'},
   {'start': 380,
    'stop': 405,
    'target': u'en.wikipedia.org/wiki/Hierarchical_organisation'},
   {'start': 804,
    'stop': 

In [6]:
wikipedia_pfx = 'en.wikipedia.org/wiki/'

In [7]:
ec_model = links\
    .EntityCounts(min_count=5, filter_target=wikipedia_pfx)\
    .build(docs)\
    .map(links.EntityCounts.format_item)

In [8]:
enc_model = links\
    .EntityNameCounts(lowercase=True, filter_target=wikipedia_pfx)\
    .build(docs)\
    .filter(lambda (name, counts): sum(counts.itervalues()) > 1)\
    .map(links.EntityNameCounts.format_item)

In [9]:
ec_model.take(1)

[{'_id': u'en.wikipedia.org/wiki/Polar_class', 'count': 15}]

In [10]:
from nel.model import data
from nel.model.store import file

In [11]:
os.environ['NEL_DATASTORE_URI'] = 'file:///data0/nel/'

In [12]:
# we can use model.toLocalIterator if models don't fit in memory

In [13]:
data.ObjectStore\
    .Get('models:ecounts[wikipedia]')\
    .save_many(ec_model.collect())

DEBUG:nel:Using file object store for (models:ecounts[wikipedia])...
DEBUG:nel:Loading mmap store: /data0/nel/models/ecounts[wikipedia].index ...
DEBUG:nel:Loading mmap store: /data0/nel/models/ecounts[wikipedia].index ...


In [14]:
data.ObjectStore\
    .Get('models:necounts[wikipedia]')\
    .save_many(enc_model.collect())

DEBUG:nel:Using file object store for (models:necounts[wikipedia])...
DEBUG:nel:Loading mmap store: /data0/nel/models/necounts[wikipedia].index ...
DEBUG:nel:Loading mmap store: /data0/nel/models/necounts[wikipedia].index ...


In [15]:
from nel.doc import Doc

In [16]:
from nel.harness.format import from_sift

In [17]:
from nel.process.pipeline import Pipeline
from nel.process.candidates import NameCounts
from nel.features.probability import EntityProbability, NameProbability

In [18]:
candidate_generation = [
    NameCounts('wikipedia', 10)
]
feature_extraction = [
    EntityProbability('wikipedia'),
    NameProbability('wikipedia')
]

INFO:nel:Preparing name model candidate generator (model=wikipedia, limit=10)...
DEBUG:nel:Using file object store for (models:necounts[wikipedia])...
DEBUG:nel:Loading mmap store: /data0/nel/models/necounts[wikipedia].index ...
DEBUG:nel:Using file object store for (models:ecounts[wikipedia])...
DEBUG:nel:Loading mmap store: /data0/nel/models/ecounts[wikipedia].index ...
DEBUG:nel:Using file object store for (models:necounts[wikipedia])...
DEBUG:nel:Loading mmap store: /data0/nel/models/necounts[wikipedia].index ...


In [19]:
training_pipeline = Pipeline(candidate_generation + feature_extraction)

In [20]:
training_docs = [from_sift(doc) for doc in docs.takeSample(False, 100)]

In [21]:
train = [training_pipeline(doc) for doc in training_docs]

In [22]:
from nel.learn import ranking
from nel.features import meta
from nel.model import resolution
from nel.process import resolve

In [23]:
ranker = ranking.TrainLinearRanker(name='ranker', features=[f.id for f in feature_extraction])(train)

INFO:nel:Computing feature statistics over 100 documents...
INFO:nel:Building training set, feature mapping = PolynomialMapper...
INFO:nel:Fitting model over 9394 instances...
INFO:nel:Training set pairwise classification: 93.2% (8751/9394)
INFO:nel:Done.


In [24]:
classifier_feature = meta.ClassifierScore(ranker)
linking = [
    classifier_feature,
    resolve.FeatureRankResolver(classifier_feature.id)
]

In [25]:
linking_pipeline = Pipeline(candidate_generation + feature_extraction + linking)

In [26]:
sample = [from_sift(doc) for doc in docs.takeSample(False, 10)]

In [27]:
# clear existing links
for doc in sample:
    for chain in doc.chains:
        chain.resolution = None
        for mention in chain.mentions:
            mention.resolution = None

In [28]:
linked_sample = [linking_pipeline(doc) for doc in sample]

In [29]:
[d.id for d in linked_sample]

['en.wikipedia.org/wiki/Tino_rangatiratanga',
 'en.wikipedia.org/wiki/Leleupidia',
 'en.wikipedia.org/wiki/Relevance_feedback',
 'en.wikipedia.org/wiki/Hovannes_Adamian',
 'en.wikipedia.org/wiki/United_States_Nordic_Combined_Championships_2009',
 'en.wikipedia.org/wiki/Guarea_thompsonii',
 'en.wikipedia.org/wiki/Willett_(Columbus,_Georgia)',
 'en.wikipedia.org/wiki/Parchuk',
 'en.wikipedia.org/wiki/Ralph_Wien_Memorial_Airport',
 'en.wikipedia.org/wiki/Natalya_Popkova']

In [30]:
sample[0].chains[0].resolution.id

u'en.wikipedia.org/wiki/Auckland_Harbour_Bridge'