In [1]:
from mcgocr import Craft, GoData, CandidateFinder, SolidExtractor, CandidateFinder, Index, LabelMarker
from mcgocr import bulk_measurements, recover, evaluate, Evidence, Statement, Entity, Pattern, Constraint
from experiment import Report, MetaReport, Corpus, Sentence, Candidate

In [2]:
from sklearn.feature_extraction import FeatureHasher
from sklearn import svm

In [3]:
craft = Craft('data')
corpus = craft.get_corpus()
goldstandard = craft.get_goldstandard()
godata = GoData.load('data/go/go_20071128.pickle')

Reading the corpus...
Reading the goldstandard of GO...


In [4]:
import progressbar
from collections import defaultdict

bar = progressbar.ProgressBar()
pmid2key = dict()
ten_corpora = corpus.divide(10)
for i, corp in enumerate(ten_corpora):
    pmid2key.update({pmid: i for pmid in corp.doc_set()})
    
term_indeces = defaultdict(Index)
for pmid, goid, start, end, text in bar(goldstandard):
    if '...' in text:
        continue
    key = pmid2key[pmid]
    for statement in godata[goid].statements:
        terms = statement.terms()
        if len(terms) == 1:
            term = terms[0]
            if text != term.lemma:
                term_indeces[key][text].add(term)



100% (29447 of 29447) |###################| Elapsed Time: 0:00:02 Time: 0:00:02


In [5]:
# Dump
import pickle
with open('data/auxiliary_term_indeces.pickle', 'wb') as f:
    pickle.dump(term_indeces, f)

with open('data/ten_fold_corpora.pickle', 'wb') as f:
    pickle.dump(ten_corpora, f)

In [6]:
# Load
import pickle
with open('data/auxiliary_term_indeces.pickle', 'rb') as f:
    term_indeces = pickle.load(f)

with open('data/ten_fold_corpora.pickle', 'rb') as f:
    corpora = pickle.load(f)

In [9]:
def training_testing_auxililary_for(corpusid):
    training_corpus = Corpus.join('training', [c for i, c in enumerate(ten_corpora) if i != corpusid])
    testing_corpus = ten_corpora[corpusid]
    term_index = Index.join([ix for i, ix in term_indeces.items() if i != corpusid])
    auxiliary_extractor = SolidExtractor(term_index)
    return training_corpus, testing_corpus, auxiliary_extractor

In [10]:
#auxiliary_extractor = SolidExtractor(sum(term_indeces.values(), Index()))

In [11]:
training_corpus, testing_corpus, auxiliary_extractor = training_testing_auxililary_for(0)

In [14]:
training_corpus, testing_corpus, auxiliary_extractor.term_index

(Corpus<60 documents, 19362 sentences, 'training'>,
 Corpus<7 documents, 2284 sentences, 'CRAFT 1/10'>,
 Index<1909 key(s)>)

In [20]:
def exp(title, training_corpus, testing_corpus, auxiliary_extractor, random_state=0):
    finder = CandidateFinder(godata, auxiliary_extractor)
    hasher = FeatureHasher(n_features=1024)
    marker = LabelMarker(goldstandard)
    classifier = svm.LinearSVC(max_iter=1000, random_state=random_state) 
    
    training_candidates, testing_candidates = finder.bulk_findall(training_corpus), finder.bulk_findall(testing_corpus)
    training_measurements, testing_measurements = bulk_measurements(training_candidates, godata), bulk_measurements(testing_candidates, godata)
    training_X = hasher.fit_transform(training_measurements).toarray()
    training_y = marker.markall(training_candidates)
    
    #training
    classifier.fit(training_X, training_y)
    
    #testing
    testing_X = hasher.transform(testing_measurements).toarray()
    pred_y = classifier.predict(testing_X)
    system = recover(testing_candidates, pred_y)
    gold = {item for item in goldstandard if item[0] in testing_corpus.doc_set()}
    return evaluate(system, gold, title)

In [21]:
%%time
exp('0', training_corpus, testing_corpus, auxiliary_extractor)

CPU times: user 8min 15s, sys: 1min 13s, total: 9min 29s
Wall time: 9min 59s


Report<R56.84% P76.83% F65.34% '0'>

In [22]:
with open('data/candidates_from_grepc.pickle', 'rb') as f:
    candidates = pickle.load(f)

In [23]:
doc_set = testing_corpus.doc_set()

In [25]:
training_grepc = []
testing_grepc = []
for candidate in candidates:
    if candidate.sentence.docid in doc_set:
        testing_grepc.append(candidate)
    else:
        training_grepc.append(candidate)

In [29]:
finder = CandidateFinder(godata, auxiliary_extractor)
hasher = FeatureHasher(n_features=1024)
marker = LabelMarker(goldstandard)
classifier = svm.LinearSVC(max_iter=1000, random_state=1) 

training_measurements, testing_measurements = bulk_measurements(training_grepc, godata), bulk_measurements(testing_grepc, godata)
training_X = hasher.fit_transform(training_measurements).toarray()
training_y = marker.markall(training_grepc)

#training
classifier.fit(training_X, training_y)

#testing
testing_X = hasher.transform(testing_measurements).toarray()
pred_y = classifier.predict(testing_X)
system = recover(testing_grepc, pred_y)
gold = {item for item in goldstandard if item[0] in testing_corpus.doc_set()}
report = evaluate(system, gold, None)
report

NameError: name 'title' is not defined

In [33]:
godata.values()

GoData<24337 concepts, 30171 statements, on 2007/11/28>

In [42]:
import random
for concept in random.sample(list(godata.values()), 10):
    for statement in concept.statements:
        print(concept)
        print(statement)
        print()

Concept<GO:0000733 BP DNA strand renaturation>
Statement<GO:0000733%000 Entity<DNA strand renaturation@GO:0000733>>

Concept<GO:0021832 BP cell-cell adhesion involved in cerebral cortex tangential migration using cell-cell interactions>
Statement<GO:0021832%000 Entity<cell-cell adhesion@GO:0021813> Constraint<cerebral cortex tangential migration using cell-cell interactions@GO:0021832>>

Concept<GO:0048624 BP plantlet formation on parent plant>
Statement<GO:0048624%000 Entity<plantlet formation on parent plant@GO:0048624>>

Concept<GO:0002892 BP regulation of type II hypersensitivity>
Statement<GO:0002892%000 Pattern<regulate@annotator> Entity<type II hypersensitivity@GO:0002892>>

Concept<GO:0019961 MF interferon binding>
Statement<GO:0019961%000 Entity<interferon@GO:0019961> Pattern<bind@annotator>>

Concept<GO:0032290 BP myelin formation in the peripheral nervous system>
Statement<GO:0032290%000 Entity<myelin@GO:0032289> Pattern<form@annotator> Constraint<peripheral nervous system@G

In [30]:
report = evaluate(system, gold, None)
report

Report<R55.97% P81.57% F66.39% None>

In [31]:
finder.extractor.extractors[0].term_index

Index<47671 key(s)>

In [9]:
%%time
finder = CandidateFinder(godata, auxiliary_extractor)

CPU times: user 56 s, sys: 5.14 s, total: 1min 1s
Wall time: 1min 37s


In [10]:
%%time
candidates = finder.bulk_findall(corpus)
print(len(candidates))

734457
CPU times: user 49.4 s, sys: 1.72 s, total: 51.1 s
Wall time: 1min 2s


In [11]:
with open('data/candidates_from_grepc.pickle', 'rb') as f:
    grepc_candidates = pickle.load(f)

In [12]:
old_candidates = candidates

In [13]:
candidates = grepc_candidates

In [14]:
%%time 
measurements = bulk_measurements(candidates, godata)

CPU times: user 33.7 s, sys: 1.06 s, total: 34.7 s
Wall time: 38.6 s


In [11]:
with open('data/measurements.pickle', 'wb') as f:
    pickle.dump(measurements, f)

In [15]:
%%time
hasher = FeatureHasher(n_features=1024)
X = hasher.fit_transform(measurements).toarray()

CPU times: user 9.82 s, sys: 3.71 s, total: 13.5 s
Wall time: 18.7 s


In [16]:
%%time
marker = LabelMarker(goldstandard)
y = marker.markall(candidates)

CPU times: user 32.6 s, sys: 2.77 s, total: 35.4 s
Wall time: 40.6 s


In [17]:
%%time
seed = 1
classifier = svm.LinearSVC(max_iter=1000, random_state=seed) 
classifier.fit(X, y)

CPU times: user 2min 35s, sys: 8.07 s, total: 2min 43s
Wall time: 3min 50s


In [18]:
%%time
pred_y = classifier.predict(X)

CPU times: user 2.02 s, sys: 7.05 s, total: 9.07 s
Wall time: 21.2 s


In [19]:
system = recover(candidates, pred_y)
evaluate(system, goldstandard, None)

Report<R64.29% P81.52% F71.89% None>

In [17]:
evaluate(system, goldstandard, None)

CPU times: user 294 ms, sys: 49.7 ms, total: 344 ms
Wall time: 410 ms


Report<R68.70% P79.28% F73.61% None>

In [18]:
system_r = recover(candidates, [1]*len(candidates))

In [19]:
evaluate(system_r, goldstandard, None)

Report<R85.25% P4.06% F7.76% None>

# Break

In [100]:
text = 'make'
sentence = Sentence(text, 0, 'diagnosing')
grounds = finder.extractor.to_grounds(sentence)

In [101]:
grounds

Grounds(evidences=[Evidence(term=Entity(lemma='reproduction', ref='GO:0000003'), text='make', start=0, end=4)], sentence=Sentence(text='make', offset=0, docid='diagnosing'))

In [102]:
auxiliary_extractor.to_grounds(sentence)

Grounds(evidences=[Evidence(term=Entity(lemma='reproduction', ref='GO:0000003'), text='make', start=0, end=4)], sentence=Sentence(text='make', offset=0, docid='diagnosing'))

In [99]:
diagnose('make', 'GO:0009058')

Grounds(evidences=[Evidence(term=Entity(lemma='reproduction', ref='GO:0000003'), text='make', start=0, end=4)], sentence=Sentence(text='make', offset=0, docid='diagnosing'))
[Statement(statid='GO:0009058%000', evidences=[Evidence(term=Pattern(lemma='biosyntheprocess', ref='annotator'), text='biosyntheprocess', start=0, end=20)]), Statement(statid='GO:0009058%003', evidences=[Evidence(term=Pattern(lemma='form', ref='annotator'), text='form', start=0, end=9)])]


2

In [70]:
godata['GO:0022607'].statements

[Statement(statid='GO:0022607%000', evidences=[Evidence(term=Entity(lemma='cellular component', ref='GO:0022607'), text='cellular component', start=0, end=18), Evidence(term=Pattern(lemma='biosyntheprocess', ref='annotator'), text='biosyntheprocess', start=19, end=27)])]

In [29]:
measurements[15], candidates[15]

(OrderedDict([('GOID', 'GO:0032801'),
              ('STATID', 'GO:0032801%000'),
              ('NAMESPACE', 'biological_process'),
              ('LENGTH', 8),
              ('TEXT', 'receptor'),
              ('TEXT[:3]', 'rec'),
              ('TEXT[-3:]', 'tor'),
              ('OMIT=catabolprocess', True),
              ('SATURATION', 0.5)]),
 Candidate<GO:0032801%000 evidences=[Evidence(term=Entity(lemma='receptor', ref='GO:0032800'), text='receptor', start=909, end=917)] sentence=Sentence(text='Homozygosity for a null allele of the carbonic anhydrase II gene (Car2n) does not alter IOP while homozygosity for a mutation in the leptin receptor gene (Leprdb) that causes obesity and diabetes results in increased IOP. ', offset=769, docid='11532192')>)

In [26]:
import random
for item in random.sample(Out[19].fp, 10):
    pmid, goid, _, _, text = item
    print(text, godata[goid].statements[0])
    print()

induced Statement(statid='GO:0044006%000', evidences=[Evidence(term=Entity(lemma='induction', ref='GO:0052286'), text='induction', start=0, end=9), Evidence(term=Constraint(lemma='symbiont in host of tumor, nodule, or growth containing transformed cells', ref='GO:0044006'), text='symbiont in host of tumor, nodule, or growth containing transformed cells', start=13, end=86)])

promotes Statement(statid='GO:0052501%000', evidences=[Evidence(term=Pattern(lemma='positregulate', ref='annotator'), text='positregulate', start=0, end=19), Evidence(term=Constraint(lemma='organism of apoptosis in other organism during symbiotic interaction', ref='GO:0052501'), text='organism of apoptosis in other organism during symbiotic interaction', start=23, end=91)])

protein Statement(statid='GO:0030163%000', evidences=[Evidence(term=Entity(lemma='protein', ref='GO:0045732'), text='protein', start=0, end=7), Evidence(term=Pattern(lemma='catabolprocess', ref='annotator'), text='catabolprocess', start=8, end=

In [24]:
godata['GO:0009058']
sentence = Sentence('Generated', 0, 'fake')
finder.findall(sentence)

[Candidate<GO:0032502%001 evidences=[Evidence(term=Pattern(lemma='develop', ref='annotator'), text='Generated', start=0, end=9)] sentence=Sentence(text='Generated', offset=0, docid='fake')>,
 Candidate<GO:0009790%001 evidences=[Evidence(term=Pattern(lemma='develop', ref='annotator'), text='Generated', start=0, end=9)] sentence=Sentence(text='Generated', offset=0, docid='fake')>,
 Candidate<GO:0032502%000 evidences=[Evidence(term=Pattern(lemma='develop', ref='annotator'), text='Generated', start=0, end=9)] sentence=Sentence(text='Generated', offset=0, docid='fake')>]

In [27]:
term_indeces['generated']

Index<0 key(s)>

In [33]:
sentence = Sentence('cells are GluR cell', 0, '123')
Entity(lemma='cell', ref='GO:0048468') in finder.recognizer.stat_index

False

In [43]:
def has_entity(statement):
    if any([isinstance(term, Entity) for term in statement.terms()]):
        return True
    return False

In [44]:
statement = Statement(statid='GO:0005623%000', evidences=[Evidence(term=Entity(lemma='cell', ref='GO:0005623'), text='cell', start=0, end=4)])

In [47]:
for term in statement.terms():
    if has_entity(statement) and isinstance(term, Entity):
        print(term)
    elif not has_entity(statement):
        print(term)

Entity(lemma='cell', ref='GO:0005623')


In [None]:
Entity(lemma='cell', ref='GO:0048468')

In [52]:
sentence = Sentence('cells', 0, '123')

In [55]:
grounds = finder.extractor.to_grounds(sentence)
grounds

Grounds(evidences=[Evidence(term=Entity(lemma='cell', ref='GO:0048468'), text='cells', start=0, end=5)], sentence=Sentence(text='cells', offset=0, docid='123'))

In [53]:
finder.findall(sentence)

[]

In [48]:
finder.recognizer.stat_index[Entity(lemma='cell', ref='GO:0048468')]

set()

In [9]:
import IPython
IPython.display.Audio("data/xylophone-tone-26481.mp3")

In [97]:
system = recover(candidates, [1] * len(candidates))
evaluate(system, goldstandard, None)

Report<R75.63% P6.08% F11.25% None>

In [98]:
import random 
random.sample(Out[97].fn, 10)

[('15314659', 'GO:0002378', 18015, 18028, 'Ab production'),
 ('17447844', 'GO:0022607', 1021, 1027, 'formed'),
 ('16216087', 'GO:0007608', 43496, 43505, 'olfactory'),
 ('17696610', 'GO:0050896', 2805, 2813, 'responds'),
 ('17447844', 'GO:0045184', 26402666, 26542674, 'recruitment of ... proteins'),
 ('14737183', 'GO:0043473', 54678, 54690, 'pigmentation'),
 ('15588329', 'GO:0006897', 2522025253, 2522925261, 'Endocytic ... pathways'),
 ('14737183', 'GO:0065007', 3864, 3871, 'control'),
 ('15061865', 'GO:0048019', 16702, 16713, 'antagonists'),
 ('15760270', 'GO:0019432', 53811, 53833, 'Triglyceride synthesis')]

In [102]:
godata['GO:0065007'].statements

[Statement(statid='GO:0065007%000', evidences=[Evidence(term=Pattern(lemma='regulate', ref='annotator'), text='regulate', start=0, end=21)])]

In [105]:
for extractor in finder.extractor.extractors:
    

[<mcgocr.extractor.SolidExtractor at 0x103cec048>,
 <mcgocr.extractor.SoftExtractor at 0x103cec2b0>,
 <mcgocr.extractor.SolidExtractor at 0x103cec198>]

In [103]:
godata['GO:0065007']

Concept<GO:0065007 BP biological regulation>

In [100]:
s = Sentence('control', 0, 'TEST')
c = finder.findall(s)
c

[Candidate<GO:0002087%000 evidences=[Evidence(term=Entity(lemma='neurological control of breathing', ref='GO:0002087'), text='control', start=0, end=7)] sentence=Sentence(text='control', offset=0, docid='TEST')>]

In [34]:
hasher = FeatureHasher(n_features=1024)
marker = LabelMarker(goldstandard)
classifier = svm.LinearSVC(max_iter=1000, random_state=seed)

def experiment(message, training_corpus, testing_corpus):
    
    training_candidates = finder.bulk_findall(training_corpus)
    training_measurements = bulk_measurements(training_candidates, godata)
    training_X = hasher.fit_transform(training_measurements).toarray()
    training_y = marker.markall(training_candidates)
    
    #training
    classifier.fit(training_X, training_y)
    
    #testing
    testing_candidates = finder.bulk_findall(testing_corpus)
    testing_measurements = bulk_measurements(testing_candidates, godata)
    testing_X = hasher.fit_transform(testing_measurements).toarray()
    testing_y = marker.markall(testing_candidates)
    pred_y = classifier.predict(testing_X)
    
    #evaluate
    system = recover(testing_candidates, pred_y)
    docset = testing_corpus.doc_set()
    gold = [item for item in goldstandard if item[0] in docset]
    
    return evaluate(system, gold, message)

In [30]:
testing_corpus = ten_corpora[0]
training_corpus = sum(ten_corpora[1:], Corpus('training corpus'))

In [32]:
training_corpus

Corpus<60 documents, 19162 sentences, 'training corpus|CRAFT 2/10|CRAFT 3/10|CRAFT 4/10|CRAFT 5/10|CRAFT 6/10|CRAFT 7/10|CRAFT 8/10|CRAFT 9/10|CRAFT 10/10'>

In [36]:
experiment('go', training_corpus, testing_corpus, hasher, marker, classifier)

Report<R63.43% P79.38% F70.52% 'go'>

In [38]:
source_filenames = [
     '20170205-152215_Ten fold validation: 1 in 10',
     '20170205-153105_Ten fold validation: 2 in 10',
     '20170205-154019_Ten fold validation: 3 in 10',
     '20170205-154928_Ten fold validation: 4 in 10',
     '20170205-155728_Ten fold validation: 5 in 10',
     '20170205-160528_Ten fold validation: 6 in 10',
     '20170205-161314_Ten fold validation: 7 in 10',
     '20170205-162122_Ten fold validation: 8 in 10',
     '20170205-163106_Ten fold validation: 9 in 10',
     '20170205-164131_Ten fold validation: 10 in 10']

In [None]:

def meta_experiment(recipie, seed=1):
    title, feature_names = recipie
    print(title)
    reports = []
    for filename in source_filenames:
        subtitle = filename.partition('_')[2]
        filepath = '../grepc/data/history/experiments/{}/prefilter.pickle'.format(filename)
        feature_filter = partial(feature_remove_filter, feature_names=feature_names) 
        report = experiment(subtitle, filepath, feature_filter, seed)
        reports.append(report)
    metareport = MetaReport(title, reports)
    write_metareport(metareport)
    print(metareport)


In [39]:
for filename in source_filenames[:1]:
    subtitle = filename.partition('_')[2]
    filepath = '../grepc/data/history/experiments/{}/prefilter.pickle'.format(filename)

In [87]:
with open('data/candidates_from_grepc.pickle', 'rb') as f:
    grepc_candidates = pickle.load(f)

In [62]:
len(candidates)

497641

In [85]:
Candidate

experiment.corpus.Candidate

In [84]:
s0 = set(candidates)
s1 = set(grepc_candidates)
len(s0), len(s1), len(s0 & s1)

(497641, 397816, 0)

In [75]:
s = Sentence(text='\nLittle is known about genetic factors affecting intraocular pressure (IOP) in mice and other mammals. ', offset=101, docid='11532192')

In [80]:
for c in grepc_candidates:
    if c.sentence == s:
        a = c
        break

In [78]:
a = '''Candidate<GO:0001917%000 evidences=[Evidence(term=Entity(lemma='photoreceptor inner segment', ref='GO:0001917'), text='is', start=109, end=111)] sentence=Sentence(text='\nLittle is known about genetic factors affecting intraocular pressure (IOP) in mice and other mammals. ', offset=101, docid='11532192')>'''
b = '''Candidate<GO:0001917%000 evidences=[Evidence(term=Entity(lemma='photoreceptor inner segment', ref='GO:0001917'), text='is', start=109, end=111)] sentence=Sentence(text='\nLittle is known about genetic factors affecting intraocular pressure (IOP) in mice and other mammals. ', offset=101, docid='11532192')>'''
a == b

True

In [81]:
for c in grepc_candidates:
    if c.sentence == s:
        b = c

In [83]:
set([a, b])

{Candidate<GO:0001917%000 evidences=[Evidence(term=Entity(lemma='photoreceptor inner segment', ref='GO:0001917'), text='is', start=109, end=111)] sentence=Sentence(text='\nLittle is known about genetic factors affecting intraocular pressure (IOP) in mice and other mammals. ', offset=101, docid='11532192')>}

In [None]:
for c in candidates

In [67]:
len(s1-s0)

397816

In [63]:
len(grepc_candidates)

397816

In [48]:
import pickle
from collections import namedtuple
from mcgocr import Candidate
Basket = namedtuple('Basket', 'candidates more_info')
with open(filepath, 'rb') as f:
    thing = pickle.load(f)

ImportError: No module named 'grepc'

In [43]:
with open('data/measurements0.pickle', 'wb') as f:
    pickle.dump(measurements, f)

In [49]:
with open('../grepc/data/history/meta_experments/20170212-215431_Reasonable Best seed 1/metareport.pickle', 'rb') as f:
    metareport = pickle.load(f)

In [51]:
metareport.reports

[Report<R81.14% P79.23% F80.17% 'Ten fold validation: 1 in 10'>,
 Report<R73.57% P80.93% F77.08% 'Ten fold validation: 2 in 10'>,
 Report<R83.45% P82.37% F82.91% 'Ten fold validation: 3 in 10'>,
 Report<R76.79% P81.59% F79.12% 'Ten fold validation: 4 in 10'>,
 Report<R82.79% P83.44% F83.12% 'Ten fold validation: 5 in 10'>,
 Report<R79.72% P80.55% F80.14% 'Ten fold validation: 6 in 10'>,
 Report<R80.98% P82.71% F81.84% 'Ten fold validation: 7 in 10'>,
 Report<R82.77% P78.96% F80.82% 'Ten fold validation: 8 in 10'>,
 Report<R77.08% P78.67% F77.87% 'Ten fold validation: 9 in 10'>,
 Report<R76.84% P81.45% F79.07% 'Ten fold validation: 10 in 10'>]

In [22]:
len(y)

497641

In [26]:
pred_y = classifier.predict(X)

In [27]:
system = recover(candidates, pred_y)

In [31]:
evaluate(system, goldstandard, None)

Report<R72.26% P74.00% F73.12% None>

In [43]:
finder.extractor.extractors

[<mcgocr.extractor.SolidExtractor at 0x119507eb8>,
 <mcgocr.extractor.SoftExtractor at 0x119507a58>,
 <mcgocr.extractor.SolidExtractor at 0x11387b6d8>]

In [38]:
len(report.fn)

9062

In [41]:
list(report.fn)[:100]

[('17020410', 'GO:0000739', 9249, 9262, 'Hybridisation'),
 ('12925238', 'GO:0005509', 23612384, 23662391, 'Ca2+- ... binding'),
 ('15836427', 'GO:0065007', 3477, 3484, 'control'),
 ('15760270', 'GO:0051866', 59365962, 59495968, 'adaptation to ... stress'),
 ('15921521', 'GO:0000003', 26879, 26887, 'breeding'),
 ('16700629', 'GO:0000739', 19857, 19870, 'hybridization'),
 ('15876356', 'GO:0016534', 12408, 12429, 'activator of cyclin 5'),
 ('16216087', 'GO:0008066', 39440, 39444, 'GluR'),
 ('16110338', 'GO:0033592', 48145, 48158, 'hybridization'),
 ('16216087', 'GO:0008066', 1112, 1116, 'GluR'),
 ('15876356', 'GO:0051179', 6183, 6195, 'distribution'),
 ('15314659', 'GO:0005694', 39614, 39625, 'Chromosomes'),
 ('14611657', 'GO:0009058', 45371, 45379, 'produced'),
 ('16110338', 'GO:0048518', 5648256494, 5648456503, 'up ... regulated'),
 ('17425782', 'GO:0015925', 15610, 15613, 'gal'),
 ('15345036', 'GO:0048771', 41290, 41300, 'remodeling'),
 ('17194222', 'GO:0009058', 46459, 46469, 'Generat

In [40]:
report2 = report.evaluate(hrs, the_goldstandard, None)

AttributeError: 'Report' object has no attribute 'evaluate'

In [44]:
ds = corpus.doc_set()
goldstand = [item for item in the_goldstandard if item[0] in ds]
len(goldstand)

2918

In [45]:
candidates = []
for sentence in corpus:
    candidates.extend(finder.findall(sentence))
measurements = 
X = hasher.fit_transform(measurements).toarray()
pred_y = hclf.predict(X)
system = recover(candidates, pred_y)
from experiment import report
report.evaluate(system, goldstand)



ValueError: X has 56408 features per sample; expecting 1024

In [25]:
import pickle

with open('data/auxiliary_term_indeces.pickle', 'wb') as f:
    pickle.dump(term_indeces, f)

In [26]:
with open('data/ten_fold_corpora.pickle', 'wb') as f:
    pickle.dump(ten_corpus, f)

In [24]:
for key, values in term_indeces['CRAFT 2/10'].items():
    print(key, values)

alkaline phophatase {Entity(lemma='orthophosphoric-monoester phosphohydrolase (alkaline optimum)', ref='GO:0004035')}
hormones {Entity(lemma='glycopeptide hormone', ref='GO:0005179')}
β-Gal {Entity(lemma='lactose hydrolysis', ref='GO:0004565')}
G2 {Entity(lemma='G2 phase', ref='GO:0051319')}
transfected {Entity(lemma='DNA mediated transformation', ref='GO:0009294')}
X-Inactivation {Entity(lemma='chromosome inactivation', ref='GO:0009048')}
proliferative {Entity(lemma='cell proliferation', ref='GO:0008283')}
Expressed {Pattern(lemma='geneexpress', ref='annotator')}
implantation stage {Entity(lemma='embryo implantation', ref='GO:0007566')}
development {Pattern(lemma='develop', ref='annotator')}
death {Entity(lemma='death', ref='GO:0016265')}
cellular {Entity(lemma='cell', ref='GO:0050865')}
behavioural {Pattern(lemma='behaviorresponse', ref='annotator')}
breakdown {Pattern(lemma='catabolprocess', ref='annotator')}
scaffold {Entity(lemma='protein complex scaffold', ref='GO:0032947')}
neo 

Index<2740 key(s)>

In [32]:
new_term_text = []
for term, text in term_text:
    cluster = godata.clusterbook.index[term]
    texts = {t.lemma for t in cluster.terms}
    if text not in texts and '...' not in text:
        new_term_text.append((term, text))

In [33]:
from mcgocr.extractor import Index

In [34]:
term_index = Index()

In [35]:
for term, text in new_term_text:
    term_index[term].add(text)

In [37]:
term_index

Index<662 key(s)>

In [36]:
for key, value in term_index.items():
    print(key.lemma, value)

geneexpress {'Expression', 'gene expression', 'expressions', 'Expressed', 'genes', 'Gene expression', 'expressed', 'gene-expression', 'expression of', 'Gene Expression', 'expresses', 'express', 'expression', 'Expressing', 'expressing', 'expression of genes', "gene's expression"}
G2/M transition checkpoint {'G2/M', 'G2/M delay'}
globulin G {'Lyzs'}
pronucleus {'Pronuclear'}
binary fission {'fission'}
factor XIIIa {'transglutaminase'}
phospholipid efflux {'secretion'}
cell aging {'senescent', 'aging of cells', 'senescencing'}
RNA stabilization {'stabilize'}
blood circulation {'blood flow', 'Blood flow'}
acute-phase response {'acute phase', 'acute-phase'}
JAK-STAT cascade {'JAK-STAT pathway'}
sex chromosome {'sex chromosomes', 'sex-chromosome'}
cell migration {'cellular migration', 'migrates', 'migration of cells', 'cell migration events', 'migratory', 'migrated', 'migrate', 'migrating', 'migration'}
bicarbonate:chloride antiporter {'anion exchanger'}
olfactory learning {'Odor Learning', 

In [23]:
new_term_text[:100]

[(Entity(lemma='apoptosis', ref='GO:0006915'), 'apoptotic'),
 (Entity(lemma='Dp38', ref='GO:0004707'), 'mitogen-activated protein kinase'),
 (Entity(lemma='photoreceptor outer segment', ref='GO:0001750'),
  'outer segment'),
 (Entity(lemma='cell', ref='GO:0050865'), 'cells'),
 (Entity(lemma='reverse transcription', ref='GO:0006410'), 'RT'),
 (Entity(lemma='reverse transcription', ref='GO:0006410'),
  'reverse-transcribed'),
 (Pattern(lemma='geneexpress', ref='annotator'), 'expression'),
 (Entity(lemma='chromosome', ref='GO:0005694'), 'chr.'),
 (Entity(lemma='heterochromatin', ref='GO:0031507'),
  'heterochromatin domains'),
 (Entity(lemma='extracellular matrix', ref='GO:0031012'), 'matrix'),
 (Entity(lemma='translational termination', ref='GO:0006415'), 'stop'),
 (Pattern(lemma='geneexpress', ref='annotator'), 'expression'),
 (Entity(lemma='female sex determination', ref='GO:0030237'),
  'establishment of ... female program'),
 (Entity(lemma='adherens junction', ref='GO:0005912'), 'AJs

In [4]:
def only_has_pattern(statement):
    if set([term.__class__ for term in statement.terms()]) == set([Pattern]):
        return True
    return False

In [7]:
def dont_has_entity(statement):
    if goconcept.Entity not in set([term.__class__ for term in statement.terms()]):
        return True
    return False

In [5]:
stat_index = extractor.Index()
for goid, concept in godata.items():
    for statement in concept.statements:
        for term in statement.terms():
            if isinstance(term, Entity):
                stat_index[term].add(statement)
            elif isinstance(term, Constraint):
                stat_index[term].add(statement)
            elif only_has_pattern(statement):
                stat_index[term].add(statement)
                              
stat_index.use_default = False

In [58]:
term_index = extractor.Index()
for cluster in godata.clusterbook.clusters:
    for term in cluster.terms:
        term_index[term.lemma].add(cluster.primary_term)


In [59]:
solid_extractor = extractor.SolidExtractor(term_index)

In [57]:
import importlib
importlib.reload(extractor)

<module 'mcgocr.extractor' from '/Users/jeroyang/projects/mcgocr/mcgocr/extractor.py'>

Grounds(evidences=[Evidence(term=Entity(lemma='macrophage', ref='GO:0042116'), text='macrophage', start=37, end=47)], sentence=Sentence(text='The largest is the macrophage', offset=18, ref='None'))

In [34]:
from collections import namedtuple, defaultdict
Candidate = namedtuple('Candidate', 'statement evidences')

In [39]:
def nearest_evidences(current_position, wanted_terms, position_index):
    found_evidences = []
    for term in wanted_terms:
        positional_evidences = position_index[term]
        if len(positional_evidences) > 0:
            distance_evidence = [(abs(current_position - position), evidence) 
                             for position, evidence in positional_evidences]
            distance_evidence.sort(key=lambda it:it[0])
            found_evidences.append(distance_evidence[0])
    return found_evidences

    
def generate_candidates(grounds, stat_index):
    """
    This function looks so complex because I only want to report the nearest evidence
    Maybe there is a more elegant way, but I have no idea, currently. 
    """
    result_candidates = []
    
    positional_evidences = list(enumerate(grounds.evidences))
    
    #The first loop, build the positional_index
    position_index = defaultdict(list)
    for position, evidence in positional_evidences:
        position_index[evidence.term].append((position, evidence))
    
    #The second loop, gathering evidences
    for position, evidence in positional_evidences:
        statements = stat_index[evidence.term]
        for statement in statements:
            wanted_terms = statement.terms()
            found_evidences = nearest_evidences(position, wanted_terms, position_index)
            candidate = Candidate(statement, found_evidences)
            result_candidates.append(candidate)
    return result_candidates
        

In [60]:
from basics import corpus
sentence = Sentence('The largest is the macrophage', 18, 'None')
solid_extractor.findall(sentence)

[Evidence(term=Entity(lemma='macrophage', ref='GO:0042116'), text='macrophage', start=37, end=47)]

In [61]:
solid_extractor.to_grounds(sentence)

Grounds(evidences=[Evidence(term=Entity(lemma='macrophage', ref='GO:0042116'), text='macrophage', start=37, end=47)], sentence=Sentence(text='The largest is the macrophage', offset=18, ref='None'))

In [53]:
len(stat_index[Out[48]])

9

In [63]:
generate_candidates(Out[61], stat_index)

[Candidate(statement=Statement(statid='GO:0030225%000', evidences=[Evidence(term=Entity(lemma='macrophage', ref='GO:0042116'), text='macrophage', start=0, end=10), Evidence(term=Pattern(lemma='celldifferentiate', ref='annotator'), text='celldifferentiate', start=11, end=26)]), evidences=[(0, Evidence(term=Entity(lemma='macrophage', ref='GO:0042116'), text='macrophage', start=37, end=47))]),
 Candidate(statement=Statement(statid='GO:0045650%000', evidences=[Evidence(term=Pattern(lemma='negatregulate', ref='annotator'), text='negatregulate', start=0, end=19), Evidence(term=Entity(lemma='macrophage', ref='GO:0042116'), text='macrophage', start=23, end=33), Evidence(term=Pattern(lemma='celldifferentiate', ref='annotator'), text='celldifferentiate', start=34, end=49)]), evidences=[(0, Evidence(term=Entity(lemma='macrophage', ref='GO:0042116'), text='macrophage', start=37, end=47))]),
 Candidate(statement=Statement(statid='GO:0042116%000', evidences=[Evidence(term=Entity(lemma='macrophage', 

In [42]:
len(set(Out[40]))

TypeError: unhashable type: 'list'

In [151]:
goconcept.evidence_split('GO:testing', 'in utero embryonic development', regex_in)

[Evidence(term=Constraint(lemma='utero embryonic', ref='GO:testing'), text='utero embryonic', start=3, end=18)]

In [139]:
from mcgocr.pattern_regex import regex_in
concept.evidence_split('GO:testing', 'axon growth cone guidance', regex_in)

[Evidence(term=Entity(lemma='axon growth cone guidance', ref='GO:testing'), text='axon growth cone guidance', start=0, end=25)]

In [135]:
goid = 'GO:0007411'
print(godata[goid].labels)
print()
print(godata[goid].statements)

['axon guidance', 'axon growth cone guidance']

[Statement(statid='GO:0007411%000', evidences=[Evidence(term=Constraint(lemma='axon guidance', ref='GO:0048843'), text='axon guidance', start=0, end=13)])]


In [8]:
from mcgocr import craft
import importlib
importlib.reload(craft)

<module 'mcgocr.craft' from '/Users/jeroyang/projects/mcgocr/mcgocr/craft.py'>

In [11]:
c = craft.Craft('data2')

In [12]:
c.get_corpus()

Corpus<67 documents, 21646 sentences, 'CRAFT'>

In [13]:
c.get_goldstandard()

Bag<29447 items>

In [6]:
from mcgocr import craft

In [9]:
c = craft.Craft('data2')

In [10]:
c.get_corpus()

Reading the corpus...


Corpus<67 documents, 21646 sentences, 'CRAFT'>

In [11]:
c.get_goldstandard()

Reading the goldstandard of GO...


Bag<29447 items>

In [12]:
Out[10]

Corpus<67 documents, 21646 sentences, 'CRAFT'>

In [13]:
corpus = Out[10]

In [16]:
corpus[13]

Sentence(text=1219, offset='These IOP differences are likely due to interstrain genetic differences that create a powerful resource for studying the regulation of IOP. ', docid='11532192')

In [18]:
from mcgocr import concept

In [None]:
obopath = craft.godata
godata = concept.GoData()

In [5]:
for sentence in corpus:
    

KeyboardInterrupt: 

In [83]:
#term_index = extractor.TermIndex(set)
clusterbook = godata.clusterbook

for c in clusterbook.clusters:
    for term in c.terms:
        term_index[term.lemma].add(c.primary_term)
term_index

Index<47681 key(s)>

In [22]:
import argparse

description = """
 _______ _______  ______  _____  _______  ______
 |  |  | |       |  ____ |     | |       |_____/
 |  |  | |_____  |_____| |_____| |_____  |    \_
 
 Micro Concept Gene Ontology Concept Recognizer """
parser = argparse.ArgumentParser(description=description)
parser.add_argument('command', type=str, nargs=1,
                    help='command')
parser.add_argument('--output', dest='out_path',
                    help='the output file path')

args = parser.parse_args('hello --output'.split())

usage: __main__.py [-h] [--output OUT_PATH] command
__main__.py: error: argument --output: expected one argument


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [172]:
from experiment import corpus as c
importlib.reload(c)

<module 'experiment.corpus' from '/Users/jeroyang/projects/mcgocr/experiment/corpus.py'>

In [181]:
corpus_a = c.Corpus(title='testing_a')
corpus_b = c.Corpus(title='testing_b')
sentences = []
for i in range(100):
    sentence = c.Sentence(str(i), i, 'doc'+str(i%10))
    sentences.append(sentence)
    if i < 50:
        corpus_a.append(sentence)
    else:
        corpus_b.append(sentence)

In [183]:
corpus_list = corpus_a.divide(5)
result = sum(corpus_list, c.Corpus())
result.title = 'testing_a'
#wanted = self.corpus_a
#self.assertEqual(result, wanted)

In [184]:
result

Corpus<10 documents, 50 sentences, 'testing_a'>

In [187]:
[i for i in corpus_a]

[Sentence(text='0', offset=0, docid='doc0'),
 Sentence(text='1', offset=1, docid='doc1'),
 Sentence(text='2', offset=2, docid='doc2'),
 Sentence(text='3', offset=3, docid='doc3'),
 Sentence(text='4', offset=4, docid='doc4'),
 Sentence(text='5', offset=5, docid='doc5'),
 Sentence(text='6', offset=6, docid='doc6'),
 Sentence(text='7', offset=7, docid='doc7'),
 Sentence(text='8', offset=8, docid='doc8'),
 Sentence(text='9', offset=9, docid='doc9'),
 Sentence(text='10', offset=10, docid='doc0'),
 Sentence(text='11', offset=11, docid='doc1'),
 Sentence(text='12', offset=12, docid='doc2'),
 Sentence(text='13', offset=13, docid='doc3'),
 Sentence(text='14', offset=14, docid='doc4'),
 Sentence(text='15', offset=15, docid='doc5'),
 Sentence(text='16', offset=16, docid='doc6'),
 Sentence(text='17', offset=17, docid='doc7'),
 Sentence(text='18', offset=18, docid='doc8'),
 Sentence(text='19', offset=19, docid='doc9'),
 Sentence(text='20', offset=20, docid='doc0'),
 Sentence(text='21', offset=21, d

In [189]:
corpus_a == result

False

In [194]:
corpus_a[2], result[2]

(Sentence(text='2', offset=2, docid='doc2'),
 Sentence(text='10', offset=10, docid='doc0'))

In [196]:
[j for j in result]

[Sentence(text='0', offset=0, docid='doc0'),
 Sentence(text='1', offset=1, docid='doc1'),
 Sentence(text='10', offset=10, docid='doc0'),
 Sentence(text='11', offset=11, docid='doc1'),
 Sentence(text='20', offset=20, docid='doc0'),
 Sentence(text='21', offset=21, docid='doc1'),
 Sentence(text='30', offset=30, docid='doc0'),
 Sentence(text='31', offset=31, docid='doc1'),
 Sentence(text='40', offset=40, docid='doc0'),
 Sentence(text='41', offset=41, docid='doc1'),
 Sentence(text='3', offset=3, docid='doc3'),
 Sentence(text='4', offset=4, docid='doc4'),
 Sentence(text='13', offset=13, docid='doc3'),
 Sentence(text='14', offset=14, docid='doc4'),
 Sentence(text='23', offset=23, docid='doc3'),
 Sentence(text='24', offset=24, docid='doc4'),
 Sentence(text='33', offset=33, docid='doc3'),
 Sentence(text='34', offset=34, docid='doc4'),
 Sentence(text='43', offset=43, docid='doc3'),
 Sentence(text='44', offset=44, docid='doc4'),
 Sentence(text='5', offset=5, docid='doc5'),
 Sentence(text='9', off

In [197]:
from mcgocr import extractor as ex

In [203]:
ix = ex.Index()
ix.update({'a': {1,2,3}, 'b': {4,5}})
ix.use_default = False
