<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/Project/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [11]:
# install packages
%%capture
!pip install allennlp
!pip install allennlp-models
!pip install spacy-dbpedia-spotlight
!pip install gender-guesser
!git clone https://github.com/darisoy/EE517_Sp21.git

import spacy
import pandas as pd
from allennlp.predictors.predictor import Predictor
import spacy_dbpedia_spotlight
import gender_guesser.detector as gender
from tqdm.notebook import tqdm

In [2]:
# initialize models
%%capture
!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')
allen_model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz'
allen_predictor = Predictor.from_path(allen_model_url)  # load the model
nel = spacy_dbpedia_spotlight.create('en')
genDec = gender.Detector()

In [3]:
#list of person mentions that are not names
fem_p = ['she', 'her', 'hers', 'herself']
male_p = ['he', 'him', 'his', 'himself']
personal_p = ['i', 'me', 'we', 'us', 'myself', 'ourself', 'ourselves']
other_p = ['they', 'them', 'their', 'you', 'themself', 'themselves']
people = ['adult','adults', 'person','people','child','children']

person_f_singular = ['girl','woman','mrs','ms','mother','mom','aunt','niece','sister','wife','daughter','grandmother','grandma','grandmom','granddaughter','bride','girlfriend','gal','madam','lady']
person_m_singular = ['boy','man','mr','father','dad','uncle','nephew','brother','husband','son','grandfather','grandpa','granddad','grandson','groom','boyfriend','guy','gentleman','bachelor']
people_f_plural = ['girls','women','mothers','moms','aunts','nieces','sisters','wives','daughters','grandmothers','grandmas','granddaughters','brides','girlfriends','gals','ladies']
people_m_plural = ['boys','men','fathers','dads','uncles','nephews','brothers','husbands','sons','grandfathers','grandpas','grandsons','grooms','boyfriends','guys','gentlemen','bachelors']
people_f = person_f_singular + people_f_plural
people_m = person_m_singular + people_m_plural

un_named_mentions = fem_p + male_p + personal_p + other_p + people + people_f + people_m
f_un_named = fem_p + people_f
m_un_named = male_p + people_m
neutral_un_named = personal_p + other_p + people

In [59]:
# get data
train = pd.read_csv('/content/train.csv', delimiter= ",", low_memory=False, index_col=0)
test = pd.read_csv('/content/test.csv', delimiter= ",", low_memory=False, index_col=0)
data = pd.concat([train, test], axis = 0)
data = data.drop(['bool', 'book', 'text', 'level'], axis=1)
data = data.rename(columns={'text_org': 'text'})
data.fillna('[]',inplace = True)
data[data.grade=='12'].head()

Unnamed: 0,grade,science,text
5,12,1,SPONSOR This textbook was developed with corp...
6,12,1,"Well structured, impactful Corporate Social I..."
7,12,1,The merger between Metropolitan and Momentum ...
8,12,1,HIV/AIDS is becoming a manageable disease in ...
9,12,1,Momentum's focus on persons with disabilities...


# Functions

In [5]:
# run text through models

def get_coref(text):
    prediction = allen_predictor.predict(document=text)
    return prediction['clusters'], prediction['document']

def get_ner(text, nlp):
    return [[ent.start, ent.end-1] for ent in nlp(text).ents if ent.label_ == 'PERSON']

def get_nel(text, nel):
    threshold = 0.95
    return [[ent.start, ent.end-1] for ent in nel(text).ents if float(ent._.dbpedia_raw_result['@similarityScore']) >= threshold]

def get_gender(person_mention):
    if person_mention.lower() in un_named_mentions:
        if person_mention.lower() in f_un_named:
            return 'F'
        elif person_mention.lower() in m_un_named:
            return 'M'
    elif len(person_mention) > 0:
        first_name = person_mention.split()[0]
        gen_name = genDec.get_gender(first_name.capitalize())
        if 'female' in gen_name:
            return 'F'
        elif 'male' in gen_name:
            return 'M'
    return '-'

def get_pronouns(dic):
    result = []
    for i, word in enumerate(dic):
        if word.lower() in un_named_mentions:
            result.append([i, i])
    return result

In [38]:
# converting spans to text

def span_to_string(span, dic):
    [a, b] = span
    return " ".join(dic[a:b+1])

def array_to_text(array, dic):
    return [span_to_string(e, dic) for e in array]

def array2d_to_text(array2D, dic):
    return [array_to_text(arr, dic) for arr in array2D]

In [7]:
# get hypo & histo from model results

def get_cluster(person, clusters):
    for i, cluster in enumerate(clusters):
        if person in cluster:
            return clusters[i]
    return [person]

def split_hist_hypo(clusters, person_mentions, famous_people, gender):
    historical = []
    hypothetical = []
    for i, person in enumerate(person_mentions):
        # skip if not gendered
        if gender[i] == '-':
            continue
        # skip if already in one of the lists
        in_historical = any(person in sublist for sublist in historical)
        in_hypothetical = any(person in sublist for sublist in hypothetical)
        if in_historical or in_hypothetical:
            continue
        # add the cluster to correct list
        person_set = get_cluster(person, clusters)
        if any(p in person_set for p in famous_people):
            historical.append(person_set)
        else:
            hypothetical.append(person_set)
    return historical, hypothetical

In [8]:
# pipline main function
def get_hist_hypo_references(text, nlp, nel, debug=False):
    doc = nlp(text)
    clusters, dic = get_coref(text)
    person_mentions = get_ner(text, nlp) + get_pronouns(dic)
    gender = [get_gender(span_to_string(p, dic)) for p in person_mentions]
    famous_people = get_nel(text, nel)
    hist, hypo = split_hist_hypo(clusters, person_mentions, famous_people, gender)
    hist = array2d_to_text(hist, dic)
    hypo = array2d_to_text(hypo, dic)
    if debug and len(person_mentions) > 0:
        print('***NEW SENTENCE***\t\t %s' % text)
        print()
        print('Coreference Model Output\t %s' % array2d_to_text(clusters, dic))
        print('NER + Pronouns Output\t\t %s' % array_to_text(person_mentions, dic))
        print('Gender Guesser Output\t\t %s' % gender)
        print('NEL Output\t\t\t %s' % array_to_text(famous_people, dic,))
        print()
        print('Historical references\t\t %s' % hist)
        print('Hypothetical references\t\t %s' % hypo)
        print()
        print()
    return hist, hypo

In [9]:
sample = 'Albert Einstein didn\'t want his friend Jenny to feel lonely so they invited her to the party. Tom is happy.'
hist, hypo = get_hist_hypo_references(sample, nlp, nel, debug=True)

***NEW SENTENCE***		 Albert Einstein didn't want his friend Jenny to feel lonely so they invited her to the party. Tom is happy.

Coreference Model Output	 [['Albert Einstein', 'his'], ['his friend Jenny', 'her']]
NER + Pronouns Output		 ['Albert Einstein', 'Jenny', 'Tom', 'his', 'they', 'her']
Gender Guesser Output		 ['M', 'F', 'M', 'M', '-', 'F']
NEL Output			 ['Albert Einstein']

Historical references		 [['Albert Einstein', 'his']]
Hypothetical references		 [['Jenny'], ['Tom'], ['his friend Jenny', 'her']]




# Test Pipeline

In [None]:
for text in data[data.grade=='12'].text_org:
    hist, hypo = get_hist_hypo_references(text, nlp, nel, debug=True)

***NEW SENTENCE***		  Well structured, impactful Corporate Social Investment (CSI) has the ability to contribute positively to nation building and drive positive change in the communities. MMI's commitment to social investment means that we are constantly looking for ways in which we can assist some of South Africa's most vulnerable citizens to expand their horizons and gain greater access to life's opportunities. This means that we do not view social investment as a nice to have or as an exercise in marketing or sponsorship but rather as a critical part of our contribution to society.

Coreference Model Output	 [["some of South Africa 's most vulnerable citizens", 'their']]
NER + Pronouns Output		 ['we', 'we', 'their', 'we']
Gender Guesser Output		 ['-', '-', '-', '-']
NEL Output			 ['South Africa']

Historical references		 []
Hypothetical references		 []


***NEW SENTENCE***		  HIV/AIDS is becoming a manageable disease in many developed countries but in a country such as ours, it rem

ValueError: ignored

# Evaluate Annotations

## Gender guesser evaluation

In [None]:
data.head()

Unnamed: 0,grade,science,text,document,gender
1,K_1,1,We cannot see the wind but we can feel it. Th...,"[We, cannot, see, the, wind, but, we, can, fee...","[-, -, -, -]"
2,K_1,1,Yesterday it rained. There was a lot of wind....,"[Yesterday, it, rained., There, was, a, lot, o...",[]
3,K_1,1,Summer is here! July is sunny and hot. Summer...,"[Summer, is, here!, July, is, sunny, and, hot....",[-]
4,K_1,1,It's starting to cool down from the summer he...,"[It's, starting, to, cool, down, from, the, su...","[-, -]"
5,K_1,1,The wind pushes the kite high into the sky. W...,"[The, wind, pushes, the, kite, high, into, the...",[]


In [47]:
pipe_male = 0
pipe_female = 0
pipe_total = 0
ann_male = 0
ann_female = 0
ann_total = 0
pipe_ann_total = 0
pipe_ann_male = 0
pipe_ann_female = 0
for i in range(100):
    text_file = "/content/EE517_Sp21/Project/F_M_N_mentions_labeled/F_M_N_%d.txt" % (i+1)
    ann_file = "/content/EE517_Sp21/Project/F_M_N_mentions_labeled/F_M_N_%d.ann" % (i+1)
    
    text = open(text_file, 'r').readline()
    dic = text.split()
    person_mentions = get_ner(text, nlp) + get_pronouns(dic)
    genders = [get_gender(span_to_string(p, dic)) for p in person_mentions]
    pipe_male_mentions = [_ for g in genders if g == 'M']
    pipe_female_mentions = [_ for g in genders if g == 'F']
    pipe_total += len(person_mentions)
    pipe_male += len(pipe_male_mentions)
    pipe_female += len(pipe_female_mentions)

    pipe_mentions = array_to_text(person_mentions, dic)

    for line in open(ann_file, 'r'):
        info = line.split('\t')
        gender = info[1].split()[0]
        word = info[2][:-1]

        if gender == "Male":
            ann_male += 1
        if gender == "Female":
            ann_female += 1
        ann_total += 1

        if word in pipe_mentions:
            pipe_ann_total += 1
            index = pipe_mentions.index(word)
            if genders[index] == gender[0]:
                if gender == "Male":
                    pipe_ann_male += 1
                if gender == "Female":
                    pipe_ann_female += 1

print("Pipeline results:")
print("\tTotal person mentions: %i" % pipe_total)
print("\tMale mentions: %i" % pipe_male)
print("\tFemale mentions: %i" % pipe_female)
print()
print("Annotations results:")
print("\tTotal person mentions: %i" % ann_total)
print("\tMale mentions: %i" % ann_male)
print("\tFemale mentions: %i" % ann_female)
print()
print("In pipeline and annotations results:")
print("\tTotal person mentions: %i" % pipe_ann_total)
print("\tMale mentions: %i" % pipe_ann_male)
print("\tFemale mentions: %i" % pipe_ann_female)

Pipeline results:
	Total person mentions: 280
	Male mentions: 41
	Female mentions: 14

Annotations results:
	Total person mentions: 272
	Male mentions: 65
	Female mentions: 11

In pipeline and annotations results:
	Total person mentions: 182
	Male mentions: 37
	Female mentions: 5


In [58]:
print("Overall:")
overall_precision = (pipe_ann_total / pipe_total) * 100
print("\tPrecision: %.2f%%" % overall_precision)
overall_recall = (pipe_ann_total / ann_total) * 100
print("\tRecall: %.2f%%" % overall_recall)
overall_f = (2 * overall_precision * overall_recall) / (overall_precision + overall_recall)
print("\tF-score: %.2f%%" % overall_f)
print()
print("Male:")
male_precision = (pipe_ann_male / pipe_male) * 100
print("\tPrecision: %.2f%%" % male_precision)
male_recall = (pipe_ann_male / ann_male) * 100
print("\tRecall: %.2f%%" % male_recall)
male_f = (2 * male_precision * male_recall) / (male_precision + male_recall)
print("\tF-score: %.2f%%" % male_f)
print()
print("Female:")
female_precision = (pipe_ann_female / pipe_female) * 100
print("\tPrecision: %.2f%%" % female_precision)
female_recall = (pipe_ann_female / ann_female) * 100
print("\tRecall: %.2f%%" % female_recall)
female_f = (2 * female_precision * female_recall) / (female_precision + female_recall)
print("\tF-score: %.2f%%" % female_f)

Overall:
	Precision: 65.00%
	Recall: 66.91%
	F-score: 65.94%

Male:
	Precision: 90.24%
	Recall: 56.92%
	F-score: 69.81%

Female:
	Precision: 35.71%
	Recall: 45.45%
	F-score: 40.00%
