In [1]:
from nltk import sent_tokenize
import numpy as np
from bert_ner_utils import BERT_NER

In [2]:
# Importing models
import spacy
nlp = spacy.load("en_core_web_sm")

# Single place to change when the checkpoints live somewhere else.
MODEL_DIR = '/home/ether/Desktop/BERT_Similarity_experiments/models/'

# Three BERT-NER variants are compared throughout the notebook:
#   ner_model   - 'bert-ner/' checkpoint, cased=False, default label set
#   ner_model_u - 'bert-ner-uncased-2/' checkpoint, cased=False, binary labels O/E
#   ner_model_c - 'bert-ner-cased-2/' checkpoint, cased=True, binary labels O/E
# An alternative cased checkpoint with the default label set can be loaded with
# BERT_NER(MODEL_DIR + 'bert-ner-cased/', cased=True).
ner_model = BERT_NER(MODEL_DIR + 'bert-ner/', cased=False)
ner_model_u = BERT_NER(MODEL_DIR + 'bert-ner-uncased-2/', cased=False, labels=["O", "E"])
ner_model_c = BERT_NER(MODEL_DIR + 'bert-ner-cased-2/', cased=True, labels=["O", "E"])

In [3]:
# Quick sanity check of one model on a hand-written sentence.
# Change `model` to try another variant; every call below goes through it.
test_segment = "'gotham city needs a #Wash', said batman in his low, grumbly voice as he waved his wand to cast the spell aguamenti on the city."
model = ner_model

# With get_non_entities=True the call returns four values: entity tokens, their
# scores, non-entity tokens and their scores.
ents, sc, nents, nsc = model.get_entities(test_segment, get_non_entities=True)
print(">", test_segment, end="\n\n")
# Use the selected `model` (not a hard-coded one) so tokens are turned back into
# words by the same model that produced them.
print("Entities Detected:", model.wordize(ents, capitalize=True), end="\n\n")
print("Non Entity tokens, sorted by score:", end="\n\n")

# Ascending by score.
for ne, ns in sorted(zip(nents, nsc), key=lambda pair: pair[1]):
    print(ne, ns, sep="\t")

## Comparing models based on segment

In [5]:
# Run the three BERT-NER variants and spaCy over one transcript segment and
# print the entities each of them finds.
segment = "Handsome. Yeah, this is - I never trying to figure out how to get the either this time about setup. So yes the time account set up with an Enterprise type of organization very particular domain and we've set up for email address and evening and I ran it a command to create a bot with for the time. So that's how it is. None, so we're able to officially read an email address as a bot in able to spend now. What we do is in order for us to be able to connect it to through the UI accelerate away. We had to drive the box where to set up a Lambda function with subscribe to events and expose that endpoint for that Bart and then associate that Lambda to the specific body that they play Through a APA. But once that happens we can invite the board to a chat like a Google like a Channel or corresponding Channel thing in China and okay, once that happens then bar gets notified Lambda gets triggered thing, but we're invited to this Channel and mention Etc. And then it has the next steps are essentially to create another function and add all the quote there to other Bots person respond to the yeah, we sort of"
print(segment, end="\n\n")

# Entity tokens and their confidences, accumulated over all sentences.
entities, entities_c, entities_u = [], [], []
conf, conf_c, conf_u = [], [], []
for text in sent_tokenize(segment):
    # Only the entity tokens and their scores are used here; the two
    # non-entity return values are discarded.
    ent, con, _, _ = ner_model.get_entities(text, get_non_entities=True)
    ent_c, con_c, _, _ = ner_model_c.get_entities(text, get_non_entities=True)
    ent_u, con_u, _, _ = ner_model_u.get_entities(text, get_non_entities=True)
    # Extending with an empty result is a no-op, so no "all empty" guard is needed.
    entities.extend(ent)
    entities_c.extend(ent_c)
    entities_u.extend(ent_u)
    conf.extend(con)
    conf_c.extend(con_c)
    conf_u.extend(con_u)

print("UNCASED BERT-NER", end="\n\n")
entities_words = ner_model.wordize(entities)
print(entities_words, end="\n\n")

print("-" * 100)
print("UNCASED BERT-NER-BINARY", end="\n\n")
entities_words_u = ner_model_u.wordize(entities_u)
print(entities_words_u, end="\n\n")

print("-" * 100)
print("CASED BERT-NER-BINARY", end="\n\n")
entities_words_c = ner_model_c.wordize(entities_c)
print(entities_words_c, end="\n\n")

print("-" * 100)
print("SPACY")
doc = nlp(segment)
# Numeric entity types are not of interest for this comparison. The loop
# variable is named `span` so it does not overwrite `ent` from the BERT loop.
entities_spacy = [
    span.text for span in doc.ents if span.label_ not in ["CARDINAL", "ORDINAL"]
]
# Sort the de-duplicated entities so the printed line is the same on every run.
print(", ".join(sorted(set(entities_spacy))))

Handsome. Yeah, this is - I never trying to figure out how to get the either this time about setup. So yes the time account set up with an Enterprise type of organization very particular domain and we've set up for email address and evening and I ran it a command to create a bot with for the time. So that's how it is. None, so we're able to officially read an email address as a bot in able to spend now. What we do is in order for us to be able to connect it to through the UI accelerate away. We had to drive the box where to set up a Lambda function with subscribe to events and expose that endpoint for that Bart and then associate that Lambda to the specific body that they play Through a APA. But once that happens we can invite the board to a chat like a Google like a Channel or corresponding Channel thing in China and okay, once that happens then bar gets notified Lambda gets triggered thing, but we're invited to this Channel and mention Etc. And then it has the next steps are essentia

# Validation Tests

## Validation on entity-sentences set

In [6]:
import pickle

# Load the pickled object that the validation cells index by gold entity name
# to obtain that entity's sentences.
# NOTE(review): pickle can execute arbitrary code on load - only open trusted dumps.
ppn_sentences_path = "/home/venkat/knowledge_graphs/entity_graph_builder/graph_dumps/ppn_sentences.pkl"
with open(ppn_sentences_path, "rb") as pkl_file:
    sent_dict = pickle.load(pkl_file)

In [7]:
def _mean_gold_hits(detected_words, gold_words):
    """Return the mean, over ``gold_words``, of each word's occurrence count in ``detected_words``."""
    return np.mean([detected_words.count(word) for word in gold_words])


# For each gold entity, run every model over the sentences stored for that
# entity and report how often the entity's words were detected, per sentence.
ctr = 0
num_of_entities=10
for gold_entity in sent_dict:
    entities = []
    entities_c = []
    entities_u = []
    entities_spacy_u = []
    entities_spacy_c = []
    # Selecting entities based on starting letter
    # (startswith also copes with an empty key, unlike indexing [0]).
    if gold_entity.startswith("T"):
        continue
    gold_sentences = sent_dict[gold_entity][:500]
    # NOTE(review): this removes every "the " occurrence, not only a leading
    # article - confirm that is intended for keys containing "...the ".
    gold_entity = gold_entity.replace("the ","")
    if len(gold_sentences)<4:
        continue

    # Consider n-gram entities: score each word of the gold entity separately.
    gold_entity_list = gold_entity.upper().split()
    if not gold_entity_list:
        # Nothing left to match; np.mean of an empty list would yield nan.
        continue

    for text in gold_sentences:
        ent, con = ner_model.get_entities(text)
        ent_c, con_c = ner_model_c.get_entities(text)
        ent_u, con_u = ner_model_u.get_entities(text)

        # spaCy detections are stored as unique upper-cased words per sentence.
        doc_c = nlp(text)
        entities_spacy_c.extend(list(set([x for e in doc_c.ents for x in e.text.upper().split()])))
        doc_u = nlp(text.lower())
        entities_spacy_u.extend(list(set([x for e in doc_u.ents for x in e.text.upper().split()])))

        if len(ent)==0 and len(ent_c)==0 and len(ent_u)==0:
            continue
        # De-duplicate per sentence so each word counts at most once per sentence.
        entities.extend(list(set(ner_model.wordize(ent))))
        entities_c.extend(list(set(ner_model_c.wordize(ent_c))))
        entities_u.extend(list(set(ner_model_u.wordize(ent_u))))

    # Take mean score of all entity detections.
    counts_1 = _mean_gold_hits(entities, gold_entity_list)
    counts_2 = _mean_gold_hits(entities_c, gold_entity_list)
    counts_3 = _mean_gold_hits(entities_u, gold_entity_list)
    # The spaCy lists hold single words too, so they are scored per word like
    # the BERT ones; counting the whole multi-word gold string could never
    # match an n-gram entity.
    counts_ner_spacy_cased = _mean_gold_hits(entities_spacy_c, gold_entity_list)
    counts_ner_spacy_uncased = _mean_gold_hits(entities_spacy_u, gold_entity_list)

    print("#"*100,"\n",gold_entity,"| #Sentences:",len(gold_sentences),"\n")
    print("BERT NER UNCASED MUL: ", counts_1/len(gold_sentences))
    print("BERT NER CASED BIN: ", counts_2/len(gold_sentences))
    print("BERT NER UNCASED BIN: ", counts_3/len(gold_sentences))
    print("SPACY NER CASED: ", counts_ner_spacy_cased/len(gold_sentences))
    print("SPACY NER UNCASED: ", counts_ner_spacy_uncased/len(gold_sentences))

    # Stop after reporting on `num_of_entities` entities.
    ctr+=1
    if ctr==num_of_entities:
        break

#################################################################################################### 
 Document | #Sentences: 124 

BERT NER UNCASED MUL:  0.008064516129032258
BERT NER CASED BIN:  0.008064516129032258
BERT NER UNCASED BIN:  0.0
SPACY NER CASED:  0.008064516129032258
SPACY NER UNCASED:  0.0
#################################################################################################### 
 Bitbucket | #Sentences: 15 

BERT NER UNCASED MUL:  0.8
BERT NER CASED BIN:  0.9333333333333333
BERT NER UNCASED BIN:  0.9333333333333333
SPACY NER CASED:  0.6666666666666666
SPACY NER UNCASED:  0.0
#################################################################################################### 
 App Version | #Sentences: 6 

BERT NER UNCASED MUL:  0.5
BERT NER CASED BIN:  0.4166666666666667
BERT NER UNCASED BIN:  0.4166666666666667
SPACY NER CASED:  0.0
SPACY NER UNCASED:  0.0
#################################################################################################### 


In [8]:
# Display `gold_sentences` as left behind by the entity-sentence validation
# loop, i.e. the sentence list of the last gold entity that loop fetched.
# This relies on that cell having been run first.
gold_sentences

['It can be gathered from a couple of answers on Quora that around 25% to even 40% of employees at Google, Apple, Facebook and Amazon are software engineers — people that are technically capable of executing software projects, writing the program code theirselves.',
 'You can follow me on LinkedIn, Quora, Twitter, and Instagram where I answer questions related to Mobile Development, especially Android and Flutter.',
 'Check the lists of FP disadvantages on Quora and in the article by Alexander Alvin too.',
 'Then, moving on, I found the Quora programming community.',
 'These are technology companies that might be as young as a two-person startup and also those who have started fully maturing (as an example, Dropbox, Airbnb, and Quora were all at one time or another incubated by Y Combinator).']

In [16]:
# Spot-check one model on the first ten sentences stored for 'Atlassian':
# print each sentence followed by the entity words detected in it.
model = ner_model_u
atlassian_sample = sent_dict['Atlassian'][:10]
for text in atlassian_sample:
    print(text)
    # Only the first returned value (the entity tokens) is needed.
    ents, *_ = model.get_entities(text, get_non_entities=True)
    print(model.wordize(ents, capitalize=True), end="\n\n")

Heres a link to some documentation the great people over at Atlassian put together to help you understand what a Git workflow is as well as the different types of workflows you can integrate into each repository you work on.
['Atlassian', 'Git']

Some time ago, I read a piece on how Atlassian determined what to put in their backlog.
['Atlassian']

The speaker shared that at one of their offices, they had put up persona cards on the bathroom walls so that Atlassians don't risk a minute unfocused on the people they are building their products for.
['Atlassians']



## Validation on Meeting Segments

In [17]:
import json

# Load the meeting-segment validation set; the next cell iterates over it and
# reads the 'segments' and 'entities' fields of every item.
entity_validation_path = "/home/ether/Desktop/BERT_Similarity_experiments/data/entity_validation.json"
with open(entity_validation_path, "r") as json_file:
    entity_val_set = json.load(json_file)

In [18]:
# For every annotated meeting segment, print the text, the gold entities and
# what each BERT-NER variant and spaCy (cased / lower-cased input) detected.
for seg_chunk in entity_val_set:
    gold_entity_list = list(seg_chunk['entities'])
    entities, entities_c, entities_u = [], [], []
    entities_spacy_u, entities_spacy_c = [], []
    conf, conf_c, conf_u = [], [], []

    print(seg_chunk['segments'], end="\n\n")
    gold_sentences = sent_tokenize(seg_chunk['segments'])

    for text in gold_sentences:
        # Ablation for the cased BERT model: lower-case `text` here before tagging.
        for tagger, token_sink, score_sink in (
            (ner_model, entities, conf),
            (ner_model_c, entities_c, conf_c),
            (ner_model_u, entities_u, conf_u),
        ):
            found_tokens, found_scores = tagger.get_entities(text)
            token_sink.extend(found_tokens)
            score_sink.extend(found_scores)

        # spaCy is run on the sentence as-is and on its lower-cased form;
        # entity texts are de-duplicated per sentence.
        for spacy_input, span_sink in (
            (text, entities_spacy_c),
            (text.lower(), entities_spacy_u),
        ):
            span_sink.extend(set(span.text for span in nlp(spacy_input).ents))

    # Turn the accumulated tokens back into capitalised words, per model.
    entities = ner_model.wordize(entities, capitalize=True)
    entities_c = ner_model_c.wordize(entities_c, capitalize=True)
    entities_u = ner_model_u.wordize(entities_u, capitalize=True)

    print("\n", gold_entity_list, "No. of Sentences:", len(gold_sentences), "\n")
    for label, words, scores in (
        ("BERT NER UNCASED MUL: ", entities, conf),
        ("BERT NER CASED BIN: ", entities_c, conf_c),
        ("BERT NER UNCASED BIN: ", entities_u, conf_u),
    ):
        print(label, set(words), scores, sep="\n", end="\n\n")
    print("SPACY NER CASED", set(entities_spacy_c))
    print("SPACY NER UNCASED", set(entities_spacy_u))
    print("#" * 100)

We go into that issue because we read the from the Json field column tables. We we have their columns available, but we are not breaking from that column which is creating a problem but then you know, bring a that solution we should start needed it from that columns and only few places we've done ready need to do that the is the case. I think this not of code is the if you want to use the column portion of the values json log then there will be George changes setting. U a D then we start fields and all not so in case of meeting meeting and recording and markers. We have both Json fields and the call proper call call let let can that like that what I telling is will not do anything with that in the Json and value of d there right. We will replace the swing field with the Hyphen iPhone that's. The we are reading the ID from the Json. I'm not from the column. So in the days very while waiting from a database we need to change oriented B a force layer. I I will from the value where it is l


 ['Browser'] No. of Sentences: 17 

BERT NER UNCASED MUL: 
{'Web', 'Victory', 'Local', 'Dot', 'Electron'}
[0.4147031307220459, 0.8855326, 0.7797708, 0.12225759029388428, 0.33489859104156494, 0.21373093128204346]

BERT NER CASED BIN: 
{'M', 'I', 'P', 'C', 'Browser'}
[0.9614601, 0.91519994, 0.8662625, 0.94643325, 0.7769169, 0.7585058, 0.79917485]

BERT NER UNCASED BIN: 
{'E', 'Index', 'Electron', 'Web', 'Victory', 'Dot', 'I'}
[0.77422136, 0.4337450861930847, 0.7837314, 0.1875535249710083, 0.99972564, 0.6249454, 0.48362767696380615, 0.8940357, 0.05614018440246582]

SPACY NER CASED {'Browser', 'yesterday', 'first'}
SPACY NER UNCASED {'yesterday', 'first'}
####################################################################################################
Also as the experiment to test on staging to what time what I've done is calculated in phrase and ent*ties code with respect to segment. I and the overall medium code is also a noted right. So now what I do is I check if any of the key fo