In [275]:
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

from scipy.special import expit

In [647]:
# Journal entry to analyse; the path is relative, so it resolves against the
# notebook's working directory.
journal_filename = "journals/journal-qualcomm.txt"

# Entity categories that get their own result bucket further down.
EVENT_TYPE = enums.Entity.Type.EVENT
PERSON_TYPE = enums.Entity.Type.PERSON
LOCATION_TYPE = enums.Entity.Type.LOCATION
OTHER_TYPE = enums.Entity.Type.OTHER

# Dependency-edge labels that let an OTHER-type entity through the filter in
# the bucketing loop. The two subject labels are deliberately left disabled.
ALLOWABLE_LABELS = [
    enums.DependencyEdge.Label.DOBJ,
    # enums.DependencyEdge.Label.NSUBJ,
    # enums.DependencyEdge.Label.NSUBJPASS,
    enums.DependencyEdge.Label.IOBJ,
    enums.DependencyEdge.Label.POBJ,
]

In [648]:
# Whitespace-separated word list; an entity whose name appears here is
# skipped when entities are bucketed.
generic_words = "generic_words.txt"

with open(generic_words) as words_file:
    DISALLOWED_WORDS = {token for token in words_file.read().split()}

In [649]:
# Client used for every Natural Language request in this notebook.
client = language.LanguageServiceClient()

# Read the whole journal entry into memory.
with open(journal_filename, mode='r') as journal_handle:
    content = journal_handle.read()

# Wrap the raw text as a plain-text document.
document = types.Document(content=content, type=enums.Document.Type.PLAIN_TEXT)

# Three analyses of the same document, all requested with UTF8 encoding so
# that their offsets are comparable with each other.
annotations = client.analyze_entity_sentiment(document=document, encoding_type="UTF8")
syntax = client.analyze_syntax(document=document, encoding_type="UTF8")
sentiment = client.analyze_sentiment(document=document, encoding_type="UTF8")

In [650]:
def sm_fun(score, magnitude, steepness=10):
    """Combine a sentiment score and magnitude into one signed value.

    The score is multiplied by ``steepness`` and passed through the logistic
    sigmoid (``scipy.special.expit``). The sigmoid output is shifted and
    doubled so that a score of 0 maps to 0 and the value stays within
    [-1, 1], and the result is then multiplied by ``magnitude``.

    Args:
        score: Sentiment score fed to the sigmoid.
        magnitude: Sentiment magnitude used as a plain multiplier.
        steepness: Factor applied to ``score`` before the sigmoid. Defaults
            to 10, the value that used to be hard-coded here.

    Returns:
        ``(expit(score * steepness) - 0.5) * 2 * magnitude``.
    """
    squashed = (expit(score * steepness) - 0.5) * 2
    return squashed * magnitude

In [651]:
# Look-up from a token's begin offset to that token's dependency-edge label.
offset_label_map = dict(
    (token.text.begin_offset, token.dependency_edge.label)
    for token in syntax.tokens
)

In [652]:
# Result buckets, one per entity category of interest. Each element is a
# (name, sentiment score, sentiment magnitude) tuple.
events = []
people = []
locations = []
other = []

for keyword in annotations.entities:
    # Entities whose name is on the generic-word list are dropped outright.
    if keyword.name in DISALLOWED_WORDS:
        continue

    entity_obj = (keyword.name, keyword.sentiment.score, keyword.sentiment.magnitude)

    if keyword.type == EVENT_TYPE:
        events.append(entity_obj)

    elif keyword.type == PERSON_TYPE:
        people.append(entity_obj)

    elif keyword.type == LOCATION_TYPE:
        locations.append(entity_obj)

    elif keyword.type == OTHER_TYPE:
        # An OTHER entity is kept only if some word of some mention sits at
        # an offset whose token carries one of the allowable labels.
        allowable = False

        for mention in keyword.mentions:
            # The analyses were requested with encoding_type="UTF8", so the
            # offsets are measured in UTF-8 bytes rather than characters.
            offset = mention.text.begin_offset

            for word in mention.text.content.split(" "):
                if offset in offset_label_map:
                    if offset_label_map[offset] in ALLOWABLE_LABELS:
                        allowable = True
                else:
                    # Best effort: the space-split word did not line up with
                    # a token offset; report it and carry on.
                    print("Missed word somehow")
                # Step over this word and the single space after it, counted
                # in UTF-8 bytes to stay in the same units as begin_offset.
                offset += len(word.encode("utf-8")) + 1

        if allowable:
            other.append(entity_obj)

In [653]:
def collapse_items(items):
    """Merge entries whose names share a word with an earlier group head.

    ``items`` is walked in order. Every entry that has not been absorbed yet
    becomes the head of a new group; each remaining unabsorbed entry whose
    space-split name has at least one word in common with the head's name is
    absorbed, and its score and magnitude are added to the group's totals.
    Only the head's own words are compared, so merging is not transitive.

    Args:
        items: Entries indexable as ``(name, score, magnitude)``.

    Returns:
        A new list of ``(head name, summed score, summed magnitude)`` tuples,
        ordered by the head's position in ``items``.
    """
    absorbed = set()
    merged = []

    for head_idx, head in enumerate(items):
        if head_idx in absorbed:
            continue
        absorbed.add(head_idx)

        head_words = set(head[0].split(" "))
        score_total, magnitude_total = head[1], head[2]

        for cand_idx, candidate in enumerate(items):
            if cand_idx in absorbed:
                continue
            # No word in common with the head: leave it for a later group.
            if head_words.isdisjoint(candidate[0].split(" ")):
                continue
            absorbed.add(cand_idx)
            score_total += candidate[1]
            magnitude_total += candidate[2]

        merged.append((head[0], score_total, magnitude_total))

    return merged

In [654]:
# Merge related entries inside each bucket, replacing the bucket with the
# collapsed list.
events, people, locations, other = [
    collapse_items(bucket) for bucket in (events, people, locations, other)
]

In [655]:
# Sort every bucket in place by weighted sentiment; the key is negated so the
# largest value comes first.
for bucket in (events, people, locations, other):
    bucket.sort(key=lambda entry: -sm_fun(entry[1], entry[2]))

In [656]:
# One line per event: its name and weighted sentiment.
for i in events:
    print(f'{i[0]}: {sm_fun(i[1], i[2])}')

In [657]:
# One line per person: their name and weighted sentiment.
for i in people:
    print(f'{i[0]}: {sm_fun(i[1], i[2])}')

guy: 0.0
Mason: 0.0


In [658]:
# One line per location: its name and weighted sentiment.
for i in locations:
    print(f'{i[0]}: {sm_fun(i[1], i[2])}')

San Diego: 0.5430889802299089


In [659]:
# One line per remaining (OTHER-type) entity: its name and weighted sentiment.
for i in other:
    print(f'{i[0]}: {sm_fun(i[1], i[2])}')

answering problems: 0.899777865874302
internship: 0.0


In [660]:
# Show the document-level sentiment from the analyze_sentiment response.
sentiment.document_sentiment

magnitude: 2.9000000953674316
score: 0.5

In [661]:
# Weighted sentiment for the whole document, using the same sm_fun that is
# applied to the individual entities.
sm_fun(sentiment.document_sentiment.score, sentiment.document_sentiment.magnitude)

2.8611815587300202