In [50]:
import re
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

class Geo_Entity(object):
    """A location mention paired with the sentence it was found in.

    Attributes:
        location: text of the location.
        sentence: text of the sentence the location belongs to.
        preference: starts as None; assigned later by callers.
    """

    def __init__(self, location, sentence):
        self.preference = None
        self.sentence = sentence
        self.location = location

    def display(self):
        """Print location, sentence and preference, one labelled line each."""
        rows = (
            ('location: ', self.location),
            ('sentence: ', self.sentence),
            ('preference: ', str(self.preference)),
        )
        for prefix, value in rows:
            print(prefix + value)
                
# Normalize locations for spacy geographic entity tagging
def norm_gpe_locations(full_memo):
    """Expand abbreviated place names in a memo to their full form.

    Only whole-word occurrences are expanded, so an abbreviation that
    merely appears inside a longer word (e.g. the 'NY' in 'ANY' or the
    'LA' in 'PLAN') is left alone.

    Args:
        full_memo: memo text (str).

    Returns:
        The memo text with every known abbreviation expanded.
    """
    gpe_norm_dict = {
        'NYC': 'New York City',
        'NY': 'New York',
        'LA': 'Los Angeles',
        'SF': 'San Francisco',
        'DC': 'Washington DC',
        'West Coast': 'California, Oregon, Washington',
    }
    # Try longer keys first so the alternation never prefers a shorter
    # abbreviation where a longer one also fits.
    alternatives = '|'.join(
        re.escape(key)
        for key in sorted(gpe_norm_dict, key=len, reverse=True)
    )
    abbrev_pat = re.compile(r'\b(?:' + alternatives + r')\b')
    # A single pass with a lookup callback: expansions are inserted
    # verbatim and are never themselves re-scanned.
    return abbrev_pat.sub(lambda m: gpe_norm_dict[m.group(0)], full_memo)
 
# collect the geographic (GPE) entities tagged in a parsed sentence
# returns a list of Geo_Entity objects, one per GPE entity found
def get_gpe_entities(sent_token):
    """Return a Geo_Entity for every entity in `sent_token` labelled 'GPE'.

    Each Geo_Entity carries the entity text and the text of `sent_token`.
    """
    return [
        Geo_Entity(ent.text, sent_token.text)
        for ent in sent_token.ents
        if ent.label_ == 'GPE'
    ]

# translates 'tri-state' term to actual location based
# on other location mentioned in the memo
def translate_tri_state(norm_memo):
    """Return the metro-area name that 'tri-state' refers to in this memo.

    Args:
        norm_memo: memo text, searched for a known place name.

    Returns:
        The area mapped to the first known place name found in the memo,
        or the default area when none of them is mentioned.
    """
    tri_state_dict = {'New York': 'New York Metro Area'}
    # Explicit fallback: previously the no-match result was whatever the
    # loop variable happened to hold after the last iteration.
    default_area = 'New York Metro Area'
    for place, area in tri_state_dict.items():
        if place in norm_memo:
            return area
    return default_area

# checks for 'tri-state' in sentence token and translates
# to actual location
def get_tri_entity(sent_token, norm_memo):
    """Build a Geo_Entity for a 'tri-state' / 'tristate' mention, if any.

    Args:
        sent_token: object whose `.text` is the sentence to inspect.
        norm_memo: full memo text, used to decide which area is meant.

    Returns:
        A Geo_Entity whose location is the translated area and whose
        sentence is the original sentence with the tri-state mention
        replaced by that area; None when the sentence has no mention.
    """
    # Raw string, case-insensitive: match on the original text so the
    # stored sentence keeps its casing instead of being lower-cased.
    tri_pat = re.compile(r'tri-?state', re.IGNORECASE)
    sent_text = sent_token.text
    if not tri_pat.search(sent_text):
        return None
    # get name of tri_state
    tri_state = translate_tri_state(norm_memo)
    # A callable replacement inserts the area name verbatim (no
    # backslash/group-reference processing of the replacement text).
    updated_sent = tri_pat.sub(lambda _match: tri_state, sent_text)
    return Geo_Entity(tri_state, updated_sent)

# check if a word indicating interest is in sentence 
def check_interest(sent_text):
    interest_words = ('focused', 'based', 'seeking', 'looking')
    interest = False
    if any([(item in sent_text.lower()) for item in interest_words]):
        interest = True
    return interest

# get a list of the geographic entities that contain words
# indicating interest
def get_interest_geo_entities(norm_memo):
    """Collect location entities from the sentences that express interest.

    Args:
        norm_memo: memo text to analyse.

    Returns:
        A list of Geo_Entity objects: for every sentence that passes
        check_interest, its GPE entities followed by its tri-state
        entity (when the sentence has one).
    """
    nlp = spacy.load('en_core_web_sm')
    geo_entities = []
    for sent in nlp(norm_memo).sents:
        # only sentences with words indicating interest are considered
        if not check_interest(sent.text):
            continue
        # the sentence is parsed on its own before entity extraction
        sent_nlp = nlp(sent.text)
        # get tagged geographic entities (extending by [] is a no-op)
        geo_entities.extend(get_gpe_entities(sent_nlp))
        # get tri-state entity; use the memo passed in, not a global
        tri_ent = get_tri_entity(sent_nlp, norm_memo)
        if tri_ent is not None:
            geo_entities.append(tri_ent)
    return geo_entities

# if 'but' comes after location then it negates the preference
def eval_but_negation(geo_entity):
    """Decide whether a location is still preferred given a 'but' clause.

    Args:
        geo_entity: object with `.location` and `.sentence` strings.

    Returns:
        False when the first standalone word 'but' in the sentence comes
        after the first occurrence of the location; True otherwise
        (including when the sentence has no standalone 'but').
    """
    loc_index = geo_entity.sentence.find(geo_entity.location)
    # Word boundaries keep words that merely contain "but"
    # (e.g. "contributing", "attribute") from counting as a negation.
    but_match = re.search(r'\bbut\b', geo_entity.sentence)
    if but_match is None:
        return True
    return but_match.start() <= loc_index

# assign a preference flag to every collected entity (in place)
def set_loc_preference(geo_entities):
    """Set `.preference` on each entity in `geo_entities`; returns None.

    An entity whose sentence contains 'but' gets the result of
    eval_but_negation; every other entity is marked True.
    """
    for entity in geo_entities:
        if 'but' in entity.sentence:
            entity.preference = eval_but_negation(entity)
        else:
            entity.preference = True
            
         

In [51]:
# Sample memo a1: uses the abbreviation 'NYC' followed by a 'but' clause,
# then names Chicago and the 'West Coast' in the same sentence.
full_memo_a1 = ("I recently caught up with XXX of XXX Group and he " 
                "indicated they are actively trying to grow their "
                "portfolio in the US. They have been focused in NYC, "
                "but with his recent addition to the team, are looking "
                "in Chicago and on the West Coast. Their main focus "
                "continues to be high-rise office in CBDs, but are "
                "also considering urban multi-housing, preferably with "
                "a value-add component."
                )

In [52]:
# Sample memo a2: names 'New York' in full, in a sentence containing
# the word 'based'; no abbreviations and no 'but' clause.
full_memo_a2 = ("I will be based in New York, mainly tasked with sourcing "
                "equity and debt investments in high-profile real estate "
                "assets in gateway markets with equity ticket $30M and up. "
                "My team and I will also look for JV and M&A opportunities "
                "of established real estate companies and platforms."
               )

In [53]:
# Sample memo a3: a 'focused on' sentence listing several markets, most
# of them abbreviated (NYC, DC, SF, LA) alongside Boston and Chicago.
full_memo_a3 = ("They intend to add a large subterranean retail complex as well as "
                "reposition the building after the major tenant moves out in 2 years. "
                "For the most part they are value-add to opportunistic driven. They "
                "are focused on the following markets: NYC, Boston, DC, Chicago, SF, "
                "and LA. They will look at office, MH, and retail. They are solving "
                "to mid-teen returns and they have no max or min on their investment "
                "size."
               )

In [54]:
# Sample memo a4: refers to the 'Tri-State region' in a 'seeking'
# sentence; 'SF' (after '70mm') and 'NYC' appear only in the last sentence.
full_memo_a4 = ("They are seeking large office/residential/retail deals in the "
                "Tri-State region. Looking for low teen return profile and 100% "
                "ownership (no operating partners). Typically looking for long "
                "term value and can handle temporary non cash flowing periods "
                "to help generate value. Currently not using much, if any, "
                "leverage on their deals. Global portfolio is 70mm SF, US "
                "portfolio is two NYC assets (room to grow)."
               )

In [55]:
# sample memos keyed by a short id, in the order they are processed
memos = {
    'a1': full_memo_a1,
    'a2': full_memo_a2,
    'a3': full_memo_a3,
    'a4': full_memo_a4,
}

In [60]:

# For every sample memo: expand abbreviations, collect the location
# entities from its interest sentences, assign their preference flags,
# and print one line per location.
for key, memo in memos.items():
    print ('Memo: {}'.format(key))
    city_norm_memo = norm_gpe_locations(memo)
    geo_entities = get_interest_geo_entities (city_norm_memo)
    set_loc_preference(geo_entities)
    for geo in geo_entities:
        # '{:.<20}' left-aligns the location and pads it with dots to 20 chars
        print ('  {:.<20} {}'.format(geo.location, geo.preference))

Memo: a1
  New York City....... False
  Chicago............. True
  California.......... True
  Oregon.............. True
  Washington.......... True
Memo: a2
  New York............ True
Memo: a3
  New York City....... True
  Boston.............. True
  Washington DC....... True
  Chicago............. True
  San Francisco....... True
  Los Angeles......... True
Memo: a4
  New York Metro Area. True


In [57]:
# Run the same pipeline on memo a4 alone and print each resulting
# entity in full (location, sentence, preference) via Geo_Entity.display.
city_norm_memo = norm_gpe_locations(full_memo_a4)
geo_entities = get_interest_geo_entities (city_norm_memo)
set_loc_preference(geo_entities)
for geo in geo_entities:
    geo.display()

location: New York Metro Area
sentence: they are seeking large office/residential/retail deals in the New York Metro Area region.
preference: True
