In [None]:
#Install spacy and library. Just Run Once
!pip3 install spacy
!python3 -m spacy download en_core_web_sm

In [12]:
# Smoke-test spaCy: load the English pipeline once and tokenize a sample sentence.
import spacy

# The listed pipeline components are disabled; only token texts are read below.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['vectors', 'textcat', 'tagger', 'parser', 'ner'],
)

sent = "<NULL> London bridge is falling down."
# Skip the leading "<NULL> " marker (7 characters) before tokenizing.
doc = nlp(sent[len("<NULL> "):])
tokens = [token.text for token in doc]  # tokenization
# Last expression of the cell: the whitespace-joined tokens are shown as the cell output.
' '.join(tokens)

'London bridge is falling down .'

In [6]:
# Reference
# https://github.com/explosion/spaCy/discussions/6093
# https://github.com/explosion/spaCy/discussions/3304
import json

# One sample record: the review text (prefixed with "<NULL> ") and a JSON string
# holding the labelled spans ("labels") and the links between them ("connections").
entry = {
    'content': '''<NULL> I love this game so much it's so fun to play. I play this game with my sibling .''',
    'first_relatives_result': '''{"connections":[{"fromId":0,"id":0,"tag":"Positive","toId":1},{"fromId":2,"id":1,"tag":"Positive","toId":3},{"fromId":5,"id":2,"tag":"Neutral","toId":4}],"labels":[{"content":"love","endIndex":13,"id":0,"startIndex":9,"tag":"Opinion"},{"content":"game","endIndex":23,"id":1,"startIndex":19,"tag":"Aspect"},{"content":"fun","endIndex":43,"id":2,"startIndex":40,"tag":"Opinion"},{"content":"play","endIndex":51,"id":3,"startIndex":47,"tag":"Aspect"},{"content":"game","endIndex":69,"id":4,"startIndex":65,"tag":"Aspect"},{"content":"\u003cNULL\u003e","endIndex":6,"id":5,"startIndex":0,"tag":"Opinion"}]}''',
}

NULL_LEN = len('<NULL> ')
content = entry['content']

# Strip the "<NULL> " prefix from the text and decode the annotation JSON.
review = content[NULL_LEN:]
annotation_result = json.loads(entry['first_relatives_result'])

arrows = annotation_result['connections']

# Shift every label's character offsets so they index into `review` instead of
# the prefixed `content`. The <NULL> label itself ends up with negative offsets.
terms = []
for label in annotation_result['labels']:
    for offset_key in ('startIndex', 'endIndex'):
        label[offset_key] -= NULL_LEN
    terms.append(label)

print('arrows:', arrows)
print('terms:', terms)



arrows: [{'fromId': 0, 'id': 0, 'tag': 'Positive', 'toId': 1}, {'fromId': 2, 'id': 1, 'tag': 'Positive', 'toId': 3}, {'fromId': 5, 'id': 2, 'tag': 'Neutral', 'toId': 4}]
terms: [{'content': 'love', 'endIndex': 6, 'id': 0, 'startIndex': 2, 'tag': 'Opinion'}, {'content': 'game', 'endIndex': 16, 'id': 1, 'startIndex': 12, 'tag': 'Aspect'}, {'content': 'fun', 'endIndex': 36, 'id': 2, 'startIndex': 33, 'tag': 'Opinion'}, {'content': 'play', 'endIndex': 44, 'id': 3, 'startIndex': 40, 'tag': 'Aspect'}, {'content': 'game', 'endIndex': 62, 'id': 4, 'startIndex': 58, 'tag': 'Aspect'}, {'content': '<NULL>', 'endIndex': -1, 'id': 5, 'startIndex': -7, 'tag': 'Opinion'}]


In [7]:
# Convert every term from character offsets to token offsets.
# Result: newterms maps term id -> (token_start, token_end, text, type),
# where token_end is exclusive and the <NULL> placeholder becomes (-1, -1, '<NULL>', 'Dummy').
print('review:', review + '\n')
doc = nlp(review)
tokens = [t.text for t in doc]
tokenized_doc = ' '.join(tokens)


newterms = {}
for term in terms:
    print('term:', term)
    term_idx = term['id']
    ch_s, ch_e = term['startIndex'], term['endIndex']

    if ch_s < 0 and ch_e < 0:  # <NULL>: its offsets went negative when the prefix was stripped
        s, e, term_str, term_type = -1, -1, '<NULL>', 'Dummy'
    else:
        # TODO: check if there is some annotation breaks it, and figure out how to resolve it
        assert review[ch_s:ch_e] == term['content'], \
            'annotation offsets do not match the term text: %r' % (term,)

        span = doc.char_span(ch_s, ch_e)
        if span is not None:
            s, e = span.start, span.end
        else:
            # char_span returns None when the character range does not line up
            # with token boundaries (e.g. the annotation covers part of a token).
            # Fall back to every token that overlaps the range.
            covered = [t.i for t in doc if t.idx < ch_e and t.idx + len(t.text) > ch_s]
            if not covered:
                raise ValueError('no token overlaps characters [%d, %d) for term %r' % (ch_s, ch_e, term))
            s, e = covered[0], covered[-1] + 1
            print('note: span is not token-aligned, expanded to tokens', (s, e))
        term_str = term['content']
        term_type = term['tag']

    new_term = (s, e, term_str, term_type)
    print('new_term:', new_term)

    newterms[term_idx] = new_term


review: I love this game so much it's so fun to play. I play this game with my sibling .

term: {'content': 'love', 'endIndex': 6, 'id': 0, 'startIndex': 2, 'tag': 'Opinion'}
new_term: (1, 2, 'love', 'Opinion')
term: {'content': 'game', 'endIndex': 16, 'id': 1, 'startIndex': 12, 'tag': 'Aspect'}
new_term: (3, 4, 'game', 'Aspect')
term: {'content': 'fun', 'endIndex': 36, 'id': 2, 'startIndex': 33, 'tag': 'Opinion'}
new_term: (9, 10, 'fun', 'Opinion')
term: {'content': 'play', 'endIndex': 44, 'id': 3, 'startIndex': 40, 'tag': 'Aspect'}
new_term: (11, 12, 'play', 'Aspect')
term: {'content': 'game', 'endIndex': 62, 'id': 4, 'startIndex': 58, 'tag': 'Aspect'}
new_term: (16, 17, 'game', 'Aspect')
term: {'content': '<NULL>', 'endIndex': -1, 'id': 5, 'startIndex': -7, 'tag': 'Opinion'}
new_term: (-1, -1, '<NULL>', 'Dummy')


In [10]:
# Build one triplet per arrow:
# ('NULL', (aspect token start, end), (opinion token start, end), sentiment)
triplets = []
for arrow in arrows:
    from_id = arrow['fromId']
    to_id = arrow['toId']
    sentiment = arrow['tag']

    # The arrow source is read as the opinion term and the target as the aspect term.
    opinion_term = newterms[from_id]
    aspect_term = newterms[to_id]

    # An arrow drawn the other way round is flipped back. Each end is checked on
    # its own so that a reversed arrow touching the <NULL> placeholder ('Dummy')
    # is flipped too, instead of tripping the assertions below.
    if opinion_term[-1] == 'Aspect' or aspect_term[-1] == 'Opinion':
        opinion_term, aspect_term = aspect_term, opinion_term

    # Still fails for pairs that cannot be oriented (e.g. Aspect -> Aspect).
    assert opinion_term[-1] in ('Opinion', 'Dummy'), \
        'arrow has no opinion end: %r' % (arrow,)
    assert aspect_term[-1] in ('Aspect', 'Dummy'), \
        'arrow has no aspect end: %r' % (arrow,)

    triplet = ('NULL', (aspect_term[0], aspect_term[1]), (opinion_term[0], opinion_term[1]), sentiment)
    print(triplet)
    triplets.append(triplet)



('NULL', (3, 4), (1, 2), 'Positive')
('NULL', (11, 12), (9, 10), 'Positive')
('NULL', (16, 17), (-1, -1), 'Neutral')


In [None]:

# final format
#
# 'tokenized_doc':tokenized_doc

# 'triplet list' : JSON format, e.g. [(NULL, (0, 2), (3, 4), Positive), (NULL, (-1, -1), (3, 4), Negative)]
# For each arrow we emit (aspect category = NULL, token offsets of the aspect term, token offsets of the opinion term, sentiment).
# If a single element has no arrow, the missing side is (-1, -1), e.g. an aspect term on its own: (NULL, (3, 4), (-1, -1), Sentiment)



