<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/Project/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [265]:
# install packages
%%capture
!pip install allennlp
!pip install allennlp-models
!pip install gender-guesser
!git clone https://github.com/darisoy/EE517_Sp21.git
!pip install bcubed

import spacy
import bcubed
import pandas as pd
from allennlp.predictors.predictor import Predictor
import gender_guesser.detector as gender
from tqdm.notebook import tqdm

!python -m spacy download en_core_web_lg

In [None]:
%cd /content/EE517_Sp21/
!git pull
%cd

In [234]:
# load models
nlp = spacy.load('en_core_web_lg')
allen_model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz'
allen_predictor = Predictor.from_path(allen_model_url)  # load the model
nel = spacy.load('en_core_web_lg')
nel.add_pipe('dbpedia_spotlight', config={'types': 'DBpedia:Person'})
genDec = gender.Detector()




Plugin allennlp_models could not be loaded: No module named 'nltk.translate.meteor_score'


In [235]:
# list of person mentions that are not names
fem_p = ['she', 'her', 'hers', 'herself']
male_p = ['he', 'him', 'his', 'himself']
personal_p = ['i', 'me', 'we', 'us', 'myself', 'ourself', 'ourselves']
other_p = ['they', 'them', 'their', 'you', 'themself', 'themselves']
people = ['adult','adults', 'person','people','child','children']

person_f_singular = ['girl','woman','mrs','ms','mother','mom','aunt','niece','sister','wife','daughter','grandmother','grandma','grandmom','granddaughter','bride','girlfriend','gal','madam','lady']
person_m_singular = ['boy','man','mr','father','dad','uncle','nephew','brother','husband','son','grandfather','grandpa','granddad','grandson','groom','boyfriend','guy','gentleman','bachelor']
people_f_plural = ['girls','women','mothers','moms','aunts','nieces','sisters','wives','daughters','grandmothers','grandmas','granddaughters','brides','girlfriends','gals','ladies']
people_m_plural = ['boys','men','fathers','dads','uncles','nephews','brothers','husbands','sons','grandfathers','grandpas','grandsons','grooms','boyfriends','guys','gentlemen','bachelors']
people_f = person_f_singular + people_f_plural
people_m = person_m_singular + people_m_plural

un_named_mentions = fem_p + male_p + personal_p + other_p + people + people_f + people_m
f_un_named = fem_p + people_f
m_un_named = male_p + people_m
gendered_un_named = f_un_named + m_un_named
neutral_un_named = personal_p + other_p + people

In [236]:
# get data
train = pd.read_csv('/content/train.csv', delimiter= ",", low_memory=False, index_col=0)
test = pd.read_csv('/content/test.csv', delimiter= ",", low_memory=False, index_col=0)
data = pd.concat([train, test], axis = 0)
data = data.drop(['bool', 'book', 'text', 'level'], axis=1)
data = data.rename(columns={'text_org': 'text'})
data.fillna('[]',inplace = True)
data.reset_index(drop=True, inplace=True)
data[data.grade=='12'].head()

FileNotFoundError: ignored

# Helper Functions

In [255]:
# run text through models

def get_coref(text):
    prediction = allen_predictor.predict(document=text)
    return prediction['clusters'], prediction['document']

def get_ner(text, nlp):
    return [[ent.start, ent.end-1] for ent in nlp(text).ents if ent.label_ == 'PERSON']

def get_nel(text, nel):
    threshold = 0.95
    # print(nel(text).ents)
    # for ent in nel(text).ents:
    #     print(ent.text)
    return [[ent.start, ent.end-1] for ent in nel(text).ents if ent._.dbpedia_raw_result is not None and float(ent._.dbpedia_raw_result['@similarityScore']) >= threshold]

def get_gender(person_mention):
    if person_mention.lower() in gendered_un_named:
        if person_mention.lower() in f_un_named:
            return 'F'
        elif person_mention.lower() in m_un_named:
            return 'M'
    elif len(person_mention) > 0:
        first_name = person_mention.split()[0]
        gen_name = genDec.get_gender(first_name.capitalize())
        if 'female' in gen_name:
            return 'F'
        elif 'male' in gen_name:
            return 'M'
    return '-'

def get_pronouns(dic):
    result = []
    for i, word in enumerate(dic):
        if word.lower() in gendered_un_named:
            result.append([i, i])
    return result

In [249]:
# converting spans to text

def span_to_string(span, dic):
    [a, b] = span
    return " ".join(dic[a:b+1])

def array_to_text(array, dic):
    return [span_to_string(e, dic) for e in array]

def array2d_to_text(array2D, dic):
    return [array_to_text(arr, dic) for arr in array2D]

In [250]:
# get hypo & histo from model results

def get_cluster(person, clusters):
    for i, cluster in enumerate(clusters):
        if person in cluster:
            return clusters[i]
    return [person]

def split_hist_hypo(clusters, person_mentions, famous_people, gender):
    historical = []
    hypothetical = []
    hist_genders = []
    hypo_genders = []
    for i, person in enumerate(person_mentions):
        # skip if not gendered
        if gender[i] == '-':
            continue
        # skip if already in one of the lists
        in_historical = any(person in sublist for sublist in historical)
        in_hypothetical = any(person in sublist for sublist in hypothetical)
        if in_historical or in_hypothetical:
            continue
        # add the cluster to correct list
        person_set = get_cluster(person, clusters)
        if any(p in person_set and p in person_mentions for p in famous_people):
            historical.append(person_set)
            hist_genders.append(gender[i])
        else:
            hypothetical.append(person_set)
            hypo_genders.append(gender[i])
    return historical, hist_genders, hypothetical, hypo_genders

In [251]:
# pipline for a single given text
def get_hist_hypo_references(text, nlp, nel, debug=False):
    doc = nlp(text)
    clusters, dic = get_coref(text)
    person_mentions = get_ner(text, nlp) + get_pronouns(dic)
    gender = [get_gender(span_to_string(p, dic)) for p in person_mentions]
    famous_people = get_nel(text, nel)
    hist, hist_g, hypo, hypo_g = split_hist_hypo(clusters, person_mentions, famous_people, gender)
    hist_str = array2d_to_text(hist, dic)
    hypo_str = array2d_to_text(hypo, dic)
    if debug and len(person_mentions) > 0:
        print('***NEW SENTENCE***\t\t %s' % text)
        print()
        print('Coreference Model Output\t %s' % array2d_to_text(clusters, dic))
        print('NER + Pronouns Output\t\t %s' % array_to_text(person_mentions, dic))
        print('Gender Guesser Output\t\t %s' % gender)
        print('NEL Output\t\t\t %s' % array_to_text(famous_people, dic,))
        print()
        print('Historical references\t\t %s' % hist_str)
        print('Historical genders\t\t %s' % hist_g)
        print('Hypothetical references\t\t %s' % hypo_str)
        print('Hypothetical genders\t\t %s' % hypo_g)
        print()
        print()
    return hist, hist_g, hypo, hypo_g, dic, famous_people

In [252]:
def generate_ann_file(text, hist, hypo, dic, database, filename):
    f = open(filename, "w")
    database_str = array_to_text(database, dic)
    T_counter = 0
    for i, cluster in enumerate(hist + hypo):
        same_person = "*\tSame_Person"
        for word in cluster:
            T_counter += 1
            start_str = dic[word[0]]
            end_str = dic[word[1]]
            start = text.index(start_str)
            end = text.index(end_str) + len(end_str)
            string = text[start:end]
            f.write("T%d\tPerson%d %d %d\t%s\n" %(T_counter, i+1, start, end, string))
            same_person += (" T%d" % T_counter)
        f.write(same_person + "\n")
    for cluster in hist:
        for word in cluster:
            start_str = dic[word[0]]
            end_str = dic[word[1]]
            start = text.index(start_str)
            end = text.index(end_str) + len(end_str)
            string = text[start:end]
            if string in database_str:
                T_counter += 1
                f.write("T%d\tInDatabase %d %d\t%s\n" % (T_counter, start, end, string))

In [242]:
sample = 'Isaac Newton invented the wheel. He didn\'t go to kindergarden but he was familiar with circles. When told this story, Jessica didn\'t believe it. She thought Newton was a lie.'
hist, hist_g, hypo, hypo_g, dic, database = get_hist_hypo_references(sample, nlp, nel, debug=True)
generate_ann_file(sample, hist, hypo, dic, database, "/content/test.ann")

***NEW SENTENCE***		 Isaac Newton invented the wheel. He didn't go to kindergarden but he was familiar with circles. When told this story, Jessica didn't believe it. She thought Newton was a lie.

Coreference Model Output	 [['Isaac Newton', 'He', 'he', 'Newton'], ['this story', 'it'], ['Jessica', 'She']]
NER + Pronouns Output		 ['Isaac Newton', 'Jessica', 'Newton', 'He', 'he', 'She']
Gender Guesser Output		 ['M', 'F', 'M', 'M', 'M', 'F']
NEL Output			 ['Isaac Newton', 'Newton']

Historical references		 [['Isaac Newton', 'He', 'he', 'Newton']]
Historical genders		 ['M']
Hypothetical references		 [['Jessica', 'She']]
Hypothetical genders		 ['F']




# Pipeline

In [None]:
# Save only gendered sentences as csv file
data_genders = [False for _ in range(len(data))]
for i, row in tqdm(data.iterrows()):
    dic = row.text.split()
    person_mentions = get_ner(row.text, nlp) + get_pronouns(dic)
    genders = [get_gender(span_to_string(p, dic)) for p in person_mentions]
    male_mentions = [_ for g in genders if g == 'M']
    female_mentions = [_ for g in genders if g == 'F']
    data_genders[i] = len(male_mentions)>0 or len(female_mentions)>0
data['gendered'] = data_genders
gendered = data[data['gendered']]
gendered = gendered.drop(['gendered'], axis=1)
gendered.to_csv('/content/gendered.csv', index=False)

In [11]:
# Load previously saved gendered sentences
gendered_data = pd.read_csv('/content/gendered.csv', delimiter= ",", low_memory=False)

In [35]:
for i, row in gendered_data.iterrows():
    _, hist_g, _, hypo_g, _, _ = get_hist_hypo_references(row.text, nlp, debug=True)

***NEW SENTENCE***		  Look at the sky diver. Do you see him fall? We do not see gravity. It is what pulls him down.

Coreference Model Output	 [['the sky diver', 'him', 'him'], ['gravity', 'It']]
NER + Pronouns Output		 ['him', 'him']
Gender Guesser Output		 ['M', 'M']
NEL Output			 ['sky diver', 'gravity']

Historical references		 []
Historical genders		 []
Hypothetical references		 [['the sky diver', 'him', 'him']]
Hypothetical genders		 ['M']


***NEW SENTENCE***		  A driver is stopped at a stoplight. He waits for the light to turn green. The driver looks to his right. He sees an interesting store. Honk, honk, is the sound heard from behind. Why would another driver beep their horn?

Coreference Model Output	 [['A driver', 'He', 'The driver', 'his', 'He'], ['a stoplight', 'the light'], ['another driver', 'their']]
NER + Pronouns Output		 ['He', 'his', 'He']
Gender Guesser Output		 ['M', 'M', 'M']
NEL Output			 ['The driver']

Historical references		 []
Historical genders		 []
Hypoth

KeyboardInterrupt: ignored

# Evaluate Annotations

## Gender guesser evaluation

In [106]:
pipe_male = 0
pipe_female = 0
pipe_total = 0
ann_male = 0
ann_female = 0
ann_total = 0
pipe_ann_total = 0
pipe_ann_male = 0
pipe_ann_female = 0
for i in range(100):
    text_file = "/content/EE517_Sp21/Project/F_M_N_mentions_labeled/F_M_N_%d.txt" % (i+1)
    ann_file = "/content/EE517_Sp21/Project/F_M_N_mentions_labeled/F_M_N_%d.ann" % (i+1)
    
    text = open(text_file, 'r').readline()
    dic = text.split()
    person_mentions = get_ner(text, nlp) + get_pronouns(dic)
    genders = [get_gender(span_to_string(p, dic)) for p in person_mentions]
    pipe_male_mentions = [_ for g in genders if g == 'M']
    pipe_female_mentions = [_ for g in genders if g == 'F']
    pipe_total += len(person_mentions)
    pipe_male += len(pipe_male_mentions)
    pipe_female += len(pipe_female_mentions)

    pipe_mentions = array_to_text(person_mentions, dic)

    for line in open(ann_file, 'r'):
        info = line.split('\t')
        gender = info[1].split()[0]
        word = info[2][:-1]

        if gender == "Male":
            ann_male += 1
        if gender == "Female":
            ann_female += 1
        ann_total += 1

        if word in pipe_mentions:
            pipe_ann_total += 1
            index = pipe_mentions.index(word)
            if genders[index] == gender[0]:
                if gender == "Male":
                    pipe_ann_male += 1
                if gender == "Female":
                    pipe_ann_female += 1

print("Pipeline results:")
print("\tTotal person mentions: %i" % pipe_total)
print("\tMale mentions: %i" % pipe_male)
print("\tFemale mentions: %i" % pipe_female)
print()
print("Annotations results:")
print("\tTotal person mentions: %i" % ann_total)
print("\tMale mentions: %i" % ann_male)
print("\tFemale mentions: %i" % ann_female)
print()
print("In pipeline and annotations results:")
print("\tTotal person mentions: %i" % pipe_ann_total)
print("\tMale mentions: %i" % pipe_ann_male)
print("\tFemale mentions: %i" % pipe_ann_female)

Pipeline results:
	Total person mentions: 83
	Male mentions: 41
	Female mentions: 14

Annotations results:
	Total person mentions: 272
	Male mentions: 65
	Female mentions: 11

In pipeline and annotations results:
	Total person mentions: 43
	Male mentions: 37
	Female mentions: 5


In [107]:
print("Overall:")
overall_precision = (pipe_ann_total / pipe_total) * 100
print("\tPrecision: %.2f%%" % overall_precision)
overall_recall = (pipe_ann_total / ann_total) * 100
print("\tRecall: %.2f%%" % overall_recall)
overall_f = (2 * overall_precision * overall_recall) / (overall_precision + overall_recall)
print("\tF-score: %.2f%%" % overall_f)
print()
print("Male:")
male_precision = (pipe_ann_male / pipe_male) * 100
print("\tPrecision: %.2f%%" % male_precision)
male_recall = (pipe_ann_male / ann_male) * 100
print("\tRecall: %.2f%%" % male_recall)
male_f = (2 * male_precision * male_recall) / (male_precision + male_recall)
print("\tF-score: %.2f%%" % male_f)
print()
print("Female:")
female_precision = (pipe_ann_female / pipe_female) * 100
print("\tPrecision: %.2f%%" % female_precision)
female_recall = (pipe_ann_female / ann_female) * 100
print("\tRecall: %.2f%%" % female_recall)
female_f = (2 * female_precision * female_recall) / (female_precision + female_recall)
print("\tF-score: %.2f%%" % female_f)

Overall:
	Precision: 51.81%
	Recall: 15.81%
	F-score: 24.23%

Male:
	Precision: 90.24%
	Recall: 56.92%
	F-score: 69.81%

Female:
	Precision: 35.71%
	Recall: 45.45%
	F-score: 40.00%


## Coref and NEL evaluations (combined)

In [None]:
# generate and save pre-annotation .ann files
for i in tqdm(range(300)):
    text_file = "/content/EE517_Sp21/Project/Coref_NEL_mentions/Coref_NEL_%d.txt" % (i+1)
    ann_file = "/content/EE517_Sp21/Project/Coref_NEL_mentions/Coref_NEL_%d.ann" % (i+1)
    text = open(text_file, 'r').readline()
    hist, _, hypo, _, dic, database = get_hist_hypo_references(text, nlp, nel)
    generate_ann_file(text, hist, hypo, dic, database, ann_file)

In [309]:
def get_pipe_dic(text, nlp, nel):
    hist, _, hypo, _, dic, _ = get_hist_hypo_references(text, nlp, nel)
    pipe_dic = {}
    for i, cluster in enumerate(array2d_to_text(hist, dic) + array2d_to_text(hypo, dic)):
        pipe_dic['Person%d' % (i+1)] = set(cluster)
    return pipe_dic

In [310]:
def get_ann_dic(file):
    ann_dic = {}
    for line in file:
        tabs = line.split('\t')
        key = tabs[1].split()[0]
        if len(tabs) < 3 or key == 'InDatabase':
            continue
        value = tabs[2]
        if value[-1] == '\n':
            value = value[:-1]
        if key in ann_dic.keys():
            current_set = ann_dic[key]
            current_set.add(value)
            ann_dic.update({ key: current_set})
        else:
            ann_dic.update({ key: set([value])})
    return ann_dic

In [312]:
for i in tqdm(range(300)):
    text_file = "/content/EE517_Sp21/Project/Coref_NEL_mentions/Coref_NEL_%d.txt" % (i+1)
    ann_file = "/content/EE517_Sp21/Project/Coref_NEL_mentions/Coref_NEL_%d.ann" % (i+1)
    text = open(text_file, 'r').readline()
    pipe_dic = get_pipe_dic(text, nlp, nel)
    ann_dic = get_ann_dic(open(ann_file, 'r'))
    precision = bcubed.precision(pipe_dic, ann_dic)
    recall = bcubed.recall(pipe_dic, ann_dic)
    fscore = bcubed.fscore(precision, recall)
    print("Precision: %.3f\tRecall: %.3f\tFscore: %.3f" % (precision, recall, fscore))

HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))

Precision: 1.000	Recall: 1.000	Fscore: 1.000
Precision: 1.000	Recall: 1.000	Fscore: 1.000
Precision: 1.000	Recall: 1.000	Fscore: 1.000


KeyboardInterrupt: ignored