<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/Project/NeuralCoref.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [3]:
# install packages
%%capture
!pip install spacy==2.2.4
!python -m spacy download en
!venv .env
!source .env/bin/activate
!git clone https://github.com/huggingface/neuralcoref.git
!pip install -r /content/neuralcoref/requirements.txt
!pip install -e /content/neuralcoref/

!pip install gender-guesser
!git clone https://github.com/darisoy/EE517_Sp21.git
!pip install bcubed
!pip install pyspotlight

In [2]:
import bcubed
import pandas as pd
import gender_guesser.detector as gender
from tqdm.notebook import tqdm
import spotlight

In [40]:
# load models
import spacy
# 'en' is the model name downloaded by `python -m spacy download en` in the setup cell.
nlp = spacy.load('en')

import neuralcoref
# restart runtime if this throws an error
# Build the coreference component on the pipeline's vocab and register it on `nlp`
# under the name 'neuralcoref'; get_coref later reads `doc._.coref_clusters`.
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
# `gender` is the gender_guesser.detector module imported above; get_gender calls
# genDec.get_gender() on the first word of a mention.
genDec = gender.Detector()

In [4]:
# Person mentions that are not names: pronouns and common nouns, grouped by gender.

# --- pronouns ---
fem_p = ['she', 'her', 'hers', 'herself']
male_p = ['he', 'him', 'his', 'himself']
personal_p = ['i', 'me', 'we', 'us', 'myself', 'ourself', 'ourselves']
other_p = ['they', 'them', 'their', 'you', 'themself', 'themselves']

# --- gender-neutral nouns ---
people = ['adult', 'adults', 'person', 'people', 'child', 'children']

# --- gendered nouns, singular ---
person_f_singular = [
    'girl', 'woman', 'mrs', 'ms', 'mother', 'mom', 'aunt', 'niece', 'sister', 'wife',
    'daughter', 'grandmother', 'grandma', 'grandmom', 'granddaughter', 'bride',
    'girlfriend', 'gal', 'madam', 'lady',
]
person_m_singular = [
    'boy', 'man', 'mr', 'father', 'dad', 'uncle', 'nephew', 'brother', 'husband', 'son',
    'grandfather', 'grandpa', 'granddad', 'grandson', 'groom', 'boyfriend', 'guy',
    'gentleman', 'bachelor',
]

# --- gendered nouns, plural ---
people_f_plural = [
    'girls', 'women', 'mothers', 'moms', 'aunts', 'nieces', 'sisters', 'wives',
    'daughters', 'grandmothers', 'grandmas', 'granddaughters', 'brides', 'girlfriends',
    'gals', 'ladies',
]
people_m_plural = [
    'boys', 'men', 'fathers', 'dads', 'uncles', 'nephews', 'brothers', 'husbands', 'sons',
    'grandfathers', 'grandpas', 'grandsons', 'grooms', 'boyfriends', 'guys', 'gentlemen',
    'bachelors',
]

# Singular forms first, then plural forms.
people_f = [*person_f_singular, *people_f_plural]
people_m = [*person_m_singular, *people_m_plural]

# --- combined lookups used by the helper functions ---
un_named_mentions = [*fem_p, *male_p, *personal_p, *other_p, *people, *people_f, *people_m]
f_un_named = [*fem_p, *people_f]
m_un_named = [*male_p, *people_m]
gendered_un_named = [*f_un_named, *m_un_named]
neutral_un_named = [*personal_p, *other_p, *people]

# Helper Functions

In [31]:
# run text through models

def get_coref(text, nlp):
    """Run `text` through `nlp` and return its coreference clusters and tokens.

    Args:
        text: raw text to analyse.
        nlp: callable pipeline; the document it returns must iterate over tokens
            that have a `.text` attribute and expose `._.coref_clusters`.

    Returns:
        A `(clusters, tokens)` tuple. `clusters` holds one list per coreference
        cluster, each made of inclusive `[first_token, last_token]` index pairs.
        `tokens` is the list of token strings that those indices refer to.
    """
    doc = nlp(text)
    tokens = [token.text for token in doc]
    # `coref_clusters` may be None when nothing was resolved for this text;
    # treat that as "no clusters" instead of failing on iteration.
    found_clusters = doc._.coref_clusters or []
    clusters = [
        # mention.end is exclusive, so subtract one to get an inclusive span
        [[mention.start, mention.end - 1] for mention in cluster.mentions]
        for cluster in found_clusters
    ]
    return clusters, tokens

def get_ner(text, nlp):
    """Return inclusive `[first_token, last_token]` spans of PERSON entities in `text`.

    `nlp` is called on `text`; only entities whose `label_` is 'PERSON' are kept.
    """
    person_spans = []
    for entity in nlp(text).ents:
        if entity.label_ != 'PERSON':
            continue
        # entity.end is exclusive; store the index of the last token instead
        person_spans.append([entity.start, entity.end - 1])
    return person_spans

def get_nel(text, threshold=0.95):
    """Return the surface forms of people that DBpedia Spotlight links in `text`.

    Args:
        text: raw text to annotate.
        threshold: confidence passed to the Spotlight service (default 0.95).

    Returns:
        List of `surfaceForm` strings for every annotation whose `types` value
        contains 'Person'. Empty when Spotlight reports no resources.
    """
    try:
        results = spotlight.annotate('https://api.dbpedia-spotlight.org/en/annotate', text, confidence=threshold)
    except spotlight.SpotlightException:
        # pyspotlight raises instead of returning an empty list when the response
        # contains no resources; a text without linked entities is not an error here.
        return []
    return [p['surfaceForm'] for p in results if 'Person' in p['types']]

def get_gender(person_mention):
    """Guess the gender of a person mention.

    Pronouns and gendered nouns are looked up in `f_un_named` / `m_un_named`
    (case-insensitively). Any other mention is treated as a name and its first
    word is passed to the `genDec` gender detector.

    Args:
        person_mention: the mention text, e.g. 'She' or 'Isaac Newton'.

    Returns:
        'F', 'M', or '-' when no gender could be determined (including empty
        or whitespace-only mentions).
    """
    mention = person_mention.lower()
    if mention in f_un_named:
        return 'F'
    if mention in m_un_named:
        return 'M'

    # split() yields nothing for '' and for whitespace-only text, so test the
    # result rather than the raw length to avoid indexing an empty list.
    name_parts = person_mention.split()
    if name_parts:
        gen_name = genDec.get_gender(name_parts[0].capitalize())
        # 'female' has to be tested first: 'male' is a substring of 'female'.
        if 'female' in gen_name:
            return 'F'
        if 'male' in gen_name:
            return 'M'
    return '-'

def get_pronouns(dic):
    """Return single-token `[i, i]` spans for every gendered un-named mention.

    A token matches when its lowercase form is in `gendered_un_named`, which
    covers gendered pronouns as well as gendered nouns.
    """
    return [
        [position, position]
        for position, token in enumerate(dic)
        if token.lower() in gendered_un_named
    ]

In [18]:
# converting spans to text

def span_to_string(span, dic):
    """Join the tokens of `dic` covered by the inclusive `[first, last]` span with spaces."""
    first, last = span
    covered_tokens = dic[first:last + 1]
    return " ".join(covered_tokens)

def array_to_text(array, dic):
    """Convert a list of token spans into the list of their text strings."""
    texts = []
    for span in array:
        texts.append(span_to_string(span, dic))
    return texts

def array2d_to_text(array2D, dic):
    """Convert a list of span lists (e.g. clusters) into a list of string lists."""
    rows = []
    for spans in array2D:
        rows.append(array_to_text(spans, dic))
    return rows

In [33]:
# split model results into historical and hypothetical clusters

def get_cluster(person, clusters):
    """Return the first cluster that contains `person`.

    When no cluster contains it, the mention forms a cluster of its own and
    `[person]` is returned.
    """
    matching = (cluster for cluster in clusters if person in cluster)
    return next(matching, [person])

def split_hist_hypo(clusters, person_mentions, famous_people, gender, dic):
    """Split the gendered person clusters into historical and hypothetical ones.

    Args:
        clusters: coreference clusters, each a list of `[first, last]` token spans.
        person_mentions: token spans of every detected person mention.
        famous_people: mention strings that the entity linker recognised.
        gender: one gender code per entry of `person_mentions` ('-' = unknown).
        dic: token list that the spans index into.

    Returns:
        `(historical, hist_genders, hypothetical, hypo_genders)`. A cluster is
        historical when one of `famous_people` is both a mention of that cluster
        and one of the person mentions; otherwise it is hypothetical. Each
        genders list holds the gender of the mention that introduced the cluster.
    """
    historical = []
    hypothetical = []
    hist_genders = []
    hypo_genders = []
    # Does not depend on the loop, so convert the mentions to text only once.
    mention_texts = array_to_text(person_mentions, dic)
    for i, person in enumerate(person_mentions):
        # skip if not gendered
        if gender[i] == '-':
            continue
        # skip if already in one of the lists
        in_historical = any(person in sublist for sublist in historical)
        in_hypothetical = any(person in sublist for sublist in hypothetical)
        if in_historical or in_hypothetical:
            continue
        # add the cluster to correct list
        person_set = get_cluster(person, clusters)
        cluster_texts = array_to_text(person_set, dic)
        if any(p in cluster_texts and p in mention_texts for p in famous_people):
            historical.append(person_set)
            hist_genders.append(gender[i])
        else:
            hypothetical.append(person_set)
            hypo_genders.append(gender[i])
    return historical, hist_genders, hypothetical, hypo_genders

In [48]:
# pipeline for a single given text
def get_hist_hypo_references(text, nlp, debug=False):
    """Run the full pipeline on one text and split its person clusters.

    Coreference clusters, person mentions (NER plus gendered pronouns/nouns),
    their genders and the linked famous people are computed from `text`, then
    handed to `split_hist_hypo`.

    Args:
        text: raw text to analyse.
        nlp: pipeline passed on to `get_coref` and `get_ner`.
        debug: when True and at least one person mention was found, print every
            intermediate result.

    Returns:
        `(hist, hist_g, hypo, hypo_g, dic, famous_people)`: historical clusters
        and their genders, hypothetical clusters and their genders, the token
        list the spans index into, and the entity linker output.
    """
    # The helpers parse `text` themselves, so it is not parsed again here.
    clusters, dic = get_coref(text, nlp)
    person_mentions = get_ner(text, nlp) + get_pronouns(dic)
    gender = [get_gender(span_to_string(p, dic)) for p in person_mentions]
    famous_people = get_nel(text)
    hist, hist_g, hypo, hypo_g = split_hist_hypo(clusters, person_mentions, famous_people, gender, dic)
    hist_str = array2d_to_text(hist, dic)
    hypo_str = array2d_to_text(hypo, dic)
    if debug and len(person_mentions) > 0:
        print('***NEW SENTENCE***\t\t %s' % text)
        print()
        print('Coreference Model Output\t %s' % array2d_to_text(clusters, dic))
        print('NER + Pronouns Output\t\t %s' % array_to_text(person_mentions, dic))
        print('Gender Guesser Output\t\t %s' % gender)
        print('NEL Output\t\t\t %s' % famous_people)
        print()
        print('Historical references\t\t %s' % hist_str)
        print('Historical genders\t\t %s' % hist_g)
        print('Hypothetical references\t\t %s' % hypo_str)
        print('Hypothetical genders\t\t %s' % hypo_g)
        print()
        print()
    return hist, hist_g, hypo, hypo_g, dic, famous_people

In [49]:
# dictionaries for evaluations

def get_pipe_dic(text, nlp):
    """Build the pipeline-side evaluation dictionary for one text.

    Args:
        text: raw text to analyse.
        nlp: pipeline passed on to `get_hist_hypo_references`.

    Returns:
        Dict mapping 'Person1', 'Person2', ... to the set of mention strings of
        each gendered cluster found by the pipeline.
    """
    # The notebook defines no `get_references`; the gendered clusters are taken
    # from get_hist_hypo_references instead.
    # NOTE(review): historical clusters are numbered before hypothetical ones;
    # confirm this matches the numbering used in the annotation files.
    hist, _, hypo, _, dic, _ = get_hist_hypo_references(text, nlp)
    gendered_clusters = hist + hypo
    pipe_dic = {}
    for i, cluster in enumerate(array2d_to_text(gendered_clusters, dic)):
        pipe_dic['Person%d' % (i+1)] = set(cluster)
    return pipe_dic

def get_ann_dic(file):
    """Build the annotation-side evaluation dictionary from annotation lines.

    Every line is split on tabs. The first word of the second field is the key
    and the third field (without its trailing newline) is the value. Lines with
    fewer than three fields, with an empty second field, or whose key is
    "InDatabase" are ignored.

    Args:
        file: iterable of lines, e.g. an open annotation file.

    Returns:
        Dict mapping each key to the set of values seen for it.
    """
    ann_dic = {}
    for line in file:
        tabs = line.split('\t')
        # Check the field count before indexing: blank lines have no tabs[1].
        if len(tabs) < 3:
            continue
        label_fields = tabs[1].split()
        if not label_fields or label_fields[0] == "InDatabase":
            continue
        key = label_fields[0]
        value = tabs[2]
        # endswith() is also safe when the value is empty
        if value.endswith('\n'):
            value = value[:-1]
        ann_dic.setdefault(key, set()).add(value)
    return ann_dic

In [52]:
# Smoke-test the pipeline on a short text; debug=True prints every intermediate
# model output (the printing only happens when a person mention is found).
sample = 'Isaac Newton invented the wheel. He didn\'t go to kindergarden but he was familiar with circles. When told this story, Jessica didn\'t believe it. She thought Newton was a lie.'
hist, hist_g, hypo, hypo_g, dic, database = get_hist_hypo_references(sample, nlp, debug=True)

***NEW SENTENCE***		 Isaac Newton invented the wheel. He didn't go to kindergarden but he was familiar with circles. When told this story, Jessica didn't believe it. She thought Newton was a lie.

Coreference Model Output	 [['Isaac Newton', 'He', 'he', 'Newton'], ['this story', 'it'], ['Jessica', 'She']]
NER + Pronouns Output		 ['Isaac Newton', 'Jessica', 'Newton', 'He', 'he', 'She']
Gender Guesser Output		 ['M', 'F', 'M', 'M', 'M', 'F']
NEL Output			 ['Isaac Newton', 'Newton']

Historical references		 [['Isaac Newton', 'He', 'he', 'Newton']]
Historical genders		 ['M']
Hypothetical references		 [['Jessica', 'She']]
Hypothetical genders		 ['F']




In [32]:
# Show the entity-linking result on its own for the sample text.
get_nel(sample)

['Isaac Newton', 'Newton']

## Coref evaluations

In [None]:
# Score the pipeline against each of the 300 annotated files with B-cubed metrics.
for i in tqdm(range(300)):
    text_file = "/content/EE517_Sp21/Project/Coref_NEL_mentions/Coref_NEL_%d.txt" % (i+1)
    ann_file = "/content/EE517_Sp21/Project/Coref_NEL_mentions/Coref_NEL_%d.ann" % (i+1)
    # `with` closes each file as soon as it has been read
    with open(text_file, 'r') as text_handle:
        text = text_handle.readline()

    pipe_dic = get_pipe_dic(text, nlp)
    with open(ann_file, 'r') as ann_handle:
        ann_dic = get_ann_dic(ann_handle)

    # Pad both dictionaries so that they share the same keys. Only missing keys
    # are added, so an existing cluster is never replaced by an empty set.
    # NOTE(review): bcubed presumably needs every item present in both inputs;
    # confirm against the bcubed documentation.
    for key in set(pipe_dic) | set(ann_dic):
        pipe_dic.setdefault(key, set())
        ann_dic.setdefault(key, set())

    print(len(pipe_dic))
    print(len(ann_dic))

    precision = bcubed.precision(pipe_dic, ann_dic)
    recall = bcubed.recall(pipe_dic, ann_dic)
    fscore = bcubed.fscore(precision, recall)
    print("Precision: %.3f\tRecall: %.3f\tFscore: %.3f" % (precision, recall, fscore))