In [1]:
import pandas as pd
import numpy as np
import re
# from unidecode import unidecode
import spacy
from spacy.lang.en import English
from spacy import displacy
# Load the medium English spaCy model with every component named in `disable` switched off.
nlp = spacy.load(
    'en_core_web_md',
    disable=[
        "ner",
        "textcat",
        "entity_ruler",
        "merge_noun_chunks",
        "merge_entities",
        "merge_subtokens",
    ],
)

In [2]:
from IPython.display import Image
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [3]:
# Show up to 260 characters per DataFrame cell.
# The earlier `set_option('display.max_colwidth', -1)` call was removed: its value
# was overwritten by this setting on the very next line, and a negative width is
# deprecated (newer pandas releases reject it outright).
pd.set_option('display.max_colwidth', 260)

In [4]:
# Read the annotated sentence sample (JSON written with orient='table') and
# discard the two tag columns that are not used below.
dfExoplanetsNASAannot = pd.read_json(
    './data/dfExoplanetsNASAdetected100rand_v2.json', orient='table'
).drop(columns=['tagRootSent', 'tagDetected'])
dfExoplanetsNASAannot.head(2)

Unnamed: 0,sent,label
0,"We detected visual companions within 1'' for 5 stars, between 1'' and 2'' for 7 stars, and between 2'' and 4'' for 15 stars.",discovery
1,"Using these data and photometry from the Spitzer Space Telescope, we have identified members with infrared excess emission from circumstellar disks and have estimated the evolutionary stages of the detected disks, which include 31 new full disks and 16 new...",discovery


In [5]:
def satz_analytic2(satz):
    """Extract the main verb and its subject from a parsed sentence.

    Parameters
    ----------
    satz : iterable of tokens
        Each token must expose ``dep_``, ``text`` and ``head`` (for example a
        parsed spaCy ``Doc``). The sequence is traversed twice, so it must be
        re-iterable.

    Returns
    -------
    dict
        ``{"act": <text of the head of the ROOT token>}`` plus, when one is
        found, ``"subject": <lower-cased text of the nsubj attached to that
        verb>``. If several ROOT tokens or several matching subjects exist,
        the last one encountered wins. An input without a ROOT token yields
        an empty dict.
    """
    # Locate the main verb: the head of the token labelled ROOT.
    verb = None
    for token in satz:
        if token.dep_ == "ROOT":
            verb = token.head
    if verb is None:
        # No main verb means there is nothing to attach a subject to.
        return {}

    merkmal = {"act": verb.text}
    # Subject of the main verb. Heads are compared as tokens rather than by
    # their text, so a subject of another verb that merely shares the same
    # surface form is not mistaken for the main subject.
    for token in satz:
        if token.dep_ == "nsubj" and token.head == verb:
            merkmal["subject"] = token.text.lower()
    return merkmal

In [6]:
# Parse the sentence stored under index label 1 and inspect the extracted features.
satz = nlp(dfExoplanetsNASAannot.loc[1, "sent"])
satz_analytic2(satz)

{'act': 'identified', 'subject': 'we'}

In [7]:
# Parse every sentence and store its verb/subject feature dict in column "s".
dfExoplanetsNASAannot["s"] = dfExoplanetsNASAannot["sent"].map(
    lambda sentence: satz_analytic2(nlp(sentence))
)
dfExoplanetsNASAannot.head(2)

Unnamed: 0,sent,label,s
0,"We detected visual companions within 1'' for 5 stars, between 1'' and 2'' for 7 stars, and between 2'' and 4'' for 15 stars.",discovery,"{'act': 'detected', 'subject': 'we'}"
1,"Using these data and photometry from the Spitzer Space Telescope, we have identified members with infrared excess emission from circumstellar disks and have estimated the evolutionary stages of the detected disks, which include 31 new full disks and 16 new...",discovery,"{'act': 'identified', 'subject': 'we'}"


In [8]:
def transp(x):
    """Encode a label as an integer: 1 for the string "discovery", 0 for anything else."""
    return 1 if x == "discovery" else 0
# Replace the textual labels with 1 ("discovery") / 0 (everything else).
# NOTE: this overwrites the column in place, so running the cell a second time
# would turn every label into 0 (the values are no longer the string "discovery").
dfExoplanetsNASAannot["label"] = dfExoplanetsNASAannot["label"].apply(transp)

In [9]:
# Preview the first two rows to check the integer-encoded "label" column.
dfExoplanetsNASAannot.head(2)

Unnamed: 0,sent,label,s
0,"We detected visual companions within 1'' for 5 stars, between 1'' and 2'' for 7 stars, and between 2'' and 4'' for 15 stars.",1,"{'act': 'detected', 'subject': 'we'}"
1,"Using these data and photometry from the Spitzer Space Telescope, we have identified members with infrared excess emission from circumstellar disks and have estimated the evolutionary stages of the detected disks, which include 31 new full disks and 16 new...",1,"{'act': 'identified', 'subject': 'we'}"


In [10]:
# Feature dicts (one per sentence) as a plain Python list.
_all_xs = dfExoplanetsNASAannot["s"].tolist()

In [11]:
# Spot-check the feature dict of the second sentence.
_all_xs[1]

{'act': 'identified', 'subject': 'we'}

In [12]:
# Integer labels as a NumPy array, in the same row order as `_all_xs`.
_all_ys = np.asarray(dfExoplanetsNASAannot["label"].tolist())
_all_ys

array([1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1])

In [13]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [14]:
# Shuffle features and labels together (fixed seed, so the order is reproducible).
all_xs, all_ys = shuffle(_all_xs, _all_ys, random_state=0)
# Hold out 25% of the shuffled data as a test set, again with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    all_xs, all_ys, test_size=0.25, random_state=0)
# Report the dataset size and the share of positive (label == 1) items.
print('{} items total, {:.1%} true'.format(len(all_xs), np.mean(all_ys)))

100 items total, 40.0% true


In [15]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [16]:
from xgboost import XGBClassifier

# Classifier with default hyper-parameters.
clf = XGBClassifier()
# Turns the per-sentence feature dicts into a numeric feature matrix.
vec = DictVectorizer()
# Chain both steps; `vec` and `clf` stay accessible as globals for later inspection.
pipeline = make_pipeline(vec, clf)

def evaluate(_clf):
    """Print cross-validated accuracy for ``_clf`` and fit it on the training split.

    Accuracy is computed with 10-fold cross-validation over the module-level
    ``all_xs`` / ``all_ys`` and reported as ``mean ± 2 * standard deviation``.
    Afterwards ``_clf`` itself is fitted on ``X_train`` / ``y_train``.
    Returns nothing.
    """
    fold_accuracies = cross_val_score(_clf, all_xs, all_ys, cv=10, scoring='accuracy')
    mean_accuracy = np.mean(fold_accuracies)
    spread = 2 * np.std(fold_accuracies)
    print('Accuracy: {:.3f} ± {:.3f}'.format(mean_accuracy, spread))
    # Fit the object that was passed in, so the steps of the original pipeline
    # are fitted and can be inspected afterwards.
    _clf.fit(X_train, y_train)


evaluate(pipeline)

Accuracy: 0.680 ± 0.215


In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Print the first boosted tree with human-readable feature names.
booster = clf.get_booster()
original_feature_names = booster.feature_names
# `get_feature_names()` no longer exists in newer scikit-learn releases; use its
# replacement `get_feature_names_out()` when available and fall back otherwise.
if hasattr(vec, "get_feature_names_out"):
    readable_feature_names = [str(name) for name in vec.get_feature_names_out()]
else:
    readable_feature_names = list(vec.get_feature_names())
booster.feature_names = readable_feature_names
try:
    print(booster.get_dump()[0])
finally:
    # Recover the original feature names even if dumping the tree fails, so the
    # fitted booster is never left in a modified state.
    booster.feature_names = original_feature_names

0:[subject=we<-9.53674316e-07] yes=1,no=2,missing=1
	1:[act=detected<-9.53674316e-07] yes=3,no=4,missing=3
		3:leaf=-0.0926829278
		4:leaf=0.0105263162
	2:[act=detected<-9.53674316e-07] yes=5,no=6,missing=5
		5:leaf=0.0200000014
		6:leaf=0.127272725



In [19]:
from eli5 import show_weights
# Display the classifier's feature weights, labelled with the vectorizer's feature names.
show_weights(clf, vec=vec)

Weight,Feature
0.6351,subject=we
0.3649,act=detected
0,act=are
0,act=remains
0,act=reconstruct
0,act=provokes
0,act=present
0,act=possess
0,act=performed
0,act=measure


In [21]:
# Predict labels for the held-out test split; `pipeline` was fitted on the
# training split inside evaluate().
y_preds = pipeline.predict(X_test)