<a href="https://colab.research.google.com/github/darisoy/EE517_Sp21/blob/master/Project/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the environment

In [None]:
# install packages
%%capture
!pip install allennlp
!pip install allennlp-models
!pip install spacy-dbpedia-spotlight
!pip install gender-guesser

import spacy
from allennlp.predictors.predictor import Predictor
import spacy_dbpedia_spotlight
import gender_guesser.detector as gender

In [148]:
# initialize models
%%capture
!python -m spacy download en_core_web_lg
# spaCy English pipeline; passed as `nlp` to get_ner(), which keeps its PERSON entities.
nlp = spacy.load('en_core_web_lg')
# Archive of the AllenNLP coreference model that the predictor is built from.
allen_model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz'
allen_predictor = Predictor.from_path(allen_model_url)  # load the model
# DBpedia Spotlight pipeline created for 'en'; passed as `nel` to get_nel().
nel = spacy_dbpedia_spotlight.create('en')
# gender-guesser detector; get_gender() queries it for mentions that are not in the word lists.
genDec = gender.Detector()

In [None]:
# lists of person mentions that are not names
# Pronouns, grouped by the gender they signal (all lower-case; callers lower-case before lookup).
fem_p = 'she her hers herself'.split()
male_p = 'he him his himself'.split()
personal_p = 'i me we us myself ourself ourselves'.split()
other_p = 'they them their you themself themselves'.split()

# Generic, gender-neutral nouns for people.
people = 'adult adults person people child children'.split()

# Gendered common nouns and titles, singular forms.
person_f_singular = (
    'girl woman mrs ms mother mom aunt niece sister wife daughter '
    'grandmother grandma grandmom granddaughter bride girlfriend gal madam lady'
).split()
person_m_singular = (
    'boy man mr father dad uncle nephew brother husband son '
    'grandfather grandpa granddad grandson groom boyfriend guy gentleman bachelor'
).split()

# Gendered common nouns, plural forms.
people_f_plural = (
    'girls women mothers moms aunts nieces sisters wives daughters '
    'grandmothers grandmas granddaughters brides girlfriends gals ladies'
).split()
people_m_plural = (
    'boys men fathers dads uncles nephews brothers husbands sons '
    'grandfathers grandpas grandsons grooms boyfriends guys gentlemen bachelors'
).split()

# Singular + plural, per gender.
people_f = person_f_singular + people_f_plural
people_m = person_m_singular + people_m_plural

# Every non-name mention, followed by its feminine / masculine / neutral subsets.
un_named_mentions = fem_p + male_p + personal_p + other_p + people + people_f + people_m
f_un_named = fem_p + people_f
m_un_named = male_p + people_m
neutral_un_named = personal_p + other_p + people

# Functions

In [149]:
# run text through models

def get_coref(text, predictor):
    """Run the coreference predictor on ``text``.

    Args:
        text: raw text handed to ``predictor.predict`` as ``document``.
        predictor: object exposing ``predict(document=...)`` that returns a
            mapping with ``'clusters'`` and ``'document'`` entries.

    Returns:
        A ``(clusters, document)`` tuple taken verbatim from the prediction.
        Elsewhere in this notebook the cluster members are treated as
        inclusive ``[start, end]`` indices into ``document``.
    """
    output = predictor.predict(document=text)
    clusters = output['clusters']
    tokens = output['document']
    return clusters, tokens

def get_ner(text, nlp):
    """Return the spans of all entities labelled ``'PERSON'`` in ``text``.

    Args:
        text: raw text to analyse.
        nlp: callable pipeline; ``nlp(text).ents`` must yield entities with
            ``start``, ``end`` and ``label_`` attributes.

    Returns:
        A list of ``[start, end]`` pairs. ``end`` is made inclusive by
        subtracting one from the entity's own ``end``.
    """
    person_spans = []
    for entity in nlp(text).ents:
        if entity.label_ != 'PERSON':
            continue
        person_spans.append([entity.start, entity.end - 1])
    return person_spans

def get_nel(text, nel, threshold=0.95):
    """Return the spans of entities that were linked with enough confidence.

    Args:
        text: raw text to analyse.
        nel: callable entity-linking pipeline; ``nel(text).ents`` must yield
            entities exposing ``start``, ``end`` and
            ``_.dbpedia_raw_result['@similarityScore']``.
        threshold: minimum similarity score (inclusive) for an entity to be
            kept. Defaults to 0.95, the value that used to be hard-coded.

    Returns:
        A list of inclusive ``[start, end]`` pairs, one per retained entity.
        The entity type is not checked here, so non-person entities are
        returned as well.
    """
    linked_spans = []
    for ent in nel(text).ents:
        raw = ent._.dbpedia_raw_result
        # Skip entities that carry no raw DBpedia result instead of raising.
        if not raw:
            continue
        # The score is converted with float() before the comparison.
        if float(raw['@similarityScore']) >= threshold:
            linked_spans.append([ent.start, ent.end - 1])
    return linked_spans

def get_gender(person_mention):
    """Guess the gender signalled by a person mention.

    Mentions found in the module-level word lists (pronouns and common nouns)
    are classified from those lists. Any other mention is treated as a name
    and its first whitespace-separated token is looked up with ``genDec``.

    Only the first token is looked up because the previous implementation
    passed the whole mention through ``str.capitalize()``, which turns
    "Tom Hanks" into "Tom hanks"; multi-word names therefore never matched.

    Args:
        person_mention: the mention text, e.g. ``'she'``, ``'Tom'`` or
            ``'Tom Hanks'``.

    Returns:
        ``'F'``, ``'M'`` or ``'-'`` when no gender can be determined.
    """
    mention = person_mention.lower()
    if mention in un_named_mentions:
        if mention in f_un_named:
            return 'F'
        if mention in m_un_named:
            return 'M'
        # Known mention that is neither feminine nor masculine (e.g. 'they').
        return '-'

    words = person_mention.split()
    if not words:
        return '-'
    guess = genDec.get_gender(words[0].capitalize())
    # 'female' must be tested first: the string 'male' is a substring of it.
    # Substring tests also accept the 'mostly_female' / 'mostly_male' results.
    if 'female' in guess:
        return 'F'
    if 'male' in guess:
        return 'M'
    return '-'

def get_pronouns(dic):
    """Locate every token that is a non-name person mention.

    Args:
        dic: sequence of token strings.

    Returns:
        A list of single-token spans ``[i, i]``, one for each position whose
        lower-cased token appears in ``un_named_mentions``.
    """
    return [
        [position, position]
        for position, token in enumerate(dic)
        if token.lower() in un_named_mentions
    ]

In [150]:
# converting spans to text

def span_to_string(span, dic):
    """Render a token span as text.

    Args:
        span: two-element sequence ``(start, end)``; ``end`` is inclusive.
        dic: sequence of token strings the span indexes into.

    Returns:
        The tokens from ``start`` through ``end`` joined by single spaces.
    """
    first, last = span
    selected = dic[first:last + 1]
    return " ".join(selected)

def array_to_text(array, dic):
    """Render each span in ``array`` as text.

    Args:
        array: iterable of ``[start, end]`` spans (``end`` inclusive).
        dic: sequence of token strings the spans index into.

    Returns:
        A list with one string per span, in the same order.
    """
    texts = []
    for span in array:
        texts.append(span_to_string(span, dic))
    return texts

def array2d_to_text(array2D, dic):
    """Render a list of span lists (e.g. coreference clusters) as text.

    Args:
        array2D: iterable whose items are iterables of ``[start, end]`` spans.
        dic: sequence of token strings the spans index into.

    Returns:
        A list of lists of strings mirroring the nesting of ``array2D``.
    """
    rendered = []
    for spans in array2D:
        rendered.append(array_to_text(spans, dic))
    return rendered

In [157]:
# split person references into historical & hypothetical using the model results

def get_cluster(person, clusters):
    """Find the cluster that contains ``person``.

    Args:
        person: a span, compared by equality against the cluster members.
        clusters: list of clusters, each a list of spans.

    Returns:
        The first cluster (the same list object) that has ``person`` as a
        member, or the singleton list ``[person]`` when no cluster does.
    """
    matching = next((group for group in clusters if person in group), None)
    if matching is None:
        return [person]
    return matching

def split_hist_hypo(clusters, person_mentions, famous_people, gender):
    """Sort the clusters of gendered person mentions into two groups.

    Args:
        clusters: coreference clusters, each a list of spans.
        person_mentions: spans of the person mentions to consider, in order.
        famous_people: spans that were entity-linked; a mention whose span is
            in this list sends its cluster to the historical group.
        gender: gender label per mention, parallel to ``person_mentions``;
            mentions labelled ``'-'`` are ignored.

    Returns:
        ``(historical, hypothetical)``: two lists of clusters. A mention that
        already appears in a collected cluster is not processed again, so the
        first mention seen for a cluster decides which group it lands in.
    """
    historical, hypothetical = [], []
    for idx, mention in enumerate(person_mentions):
        # Ungendered mentions never start a cluster.
        if gender[idx] == '-':
            continue
        # Already covered by a cluster collected earlier.
        if any(mention in group for group in historical + hypothetical):
            continue
        target = historical if mention in famous_people else hypothetical
        target.append(get_cluster(mention, clusters))
    return historical, hypothetical

In [158]:
# pipeline main function
def get_hist_hypo_references(text, nlp, nel, debug=False):
    """Extract historical and hypothetical person references from ``text``.

    Steps: coreference resolution (with the module-level ``allen_predictor``),
    person-mention detection (NER plus the word-list lookup), gender guessing,
    entity linking, and finally the historical/hypothetical split.

    Args:
        text: raw text to analyse.
        nlp: pipeline handed to ``get_ner``.
        nel: entity-linking pipeline handed to ``get_nel``.
        debug: when true and at least one person mention was found, print the
            intermediate output of every stage.

    Returns:
        ``(hist, hypo)``: two lists of clusters, each cluster rendered as a
        list of strings.
    """
    # The pipeline is no longer run here: the old `doc = nlp(text)` result was
    # never used, and get_ner() runs `nlp` on the text itself.
    clusters, dic = get_coref(text, allen_predictor)
    # NOTE(review): the NER/NEL spans are token indices from the `nlp`/`nel`
    # pipelines but are rendered and matched against `dic`, the coreference
    # model's tokens. Presumably both tokenisations line up; confirm.
    person_mentions = get_ner(text, nlp) + get_pronouns(dic)
    gender = [get_gender(span_to_string(p, dic)) for p in person_mentions]
    famous_people = get_nel(text, nel)
    hist, hypo = split_hist_hypo(clusters, person_mentions, famous_people, gender)
    hist = array2d_to_text(hist, dic)
    hypo = array2d_to_text(hypo, dic)
    if debug and len(person_mentions) > 0:
        print('***NEW SENTENCE***\t\t %s' % text)
        print()
        print('Coreference Model Output\t %s' % array2d_to_text(clusters, dic))
        print('NER + Pronouns Output\t\t %s' % array_to_text(person_mentions, dic))
        print('Gender Guesser Output\t\t %s' % gender)
        print('NEL Output\t\t\t %s' % array_to_text(famous_people, dic))
        print()
        print('Historical references\t\t %s' % hist)
        print('Hypothetical references\t\t %s' % hypo)
        print()
        print()
        print()
    return hist, hypo

In [159]:
# Smoke test: run the pipeline on a short two-sentence example with debug printing enabled.
sample = 'Eva and Martha didn\'t want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.'
hist, hypo = get_hist_hypo_references(sample, nlp, nel, debug=True)

***NEW SENTENCE***		 Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.

Coreference Model Output	 [['Eva and Martha', 'their', 'they'], ['their friend Jenny', 'her']]
NER + Pronouns Output		 ['Eva', 'Martha', 'Jenny', 'Tom', 'their', 'they', 'her']
Gender Guesser Output		 ['F', 'F', 'F', 'M', '-', '-', 'F']
NEL Output			 []

Historical references		 []
Hypothetical references		 [['Eva'], ['Martha'], ['Jenny'], ['Tom'], ['their friend Jenny', 'her']]





# Test

In [161]:
import pandas as pd

# Read the two CSV splits, using the first column of each file as the index.
d_all1 = pd.read_csv('/content/train.csv', delimiter=",", low_memory=False, index_col=0)
d_all2 = pd.read_csv('/content/test.csv', delimiter=",", low_memory=False, index_col=0)

# Drop the 'bool' column from the train frame, then check both frames share the same columns.
d_all1 = d_all1.drop(columns=['bool'])
assert (d_all1.columns == d_all2.columns).all()

# Stack the splits and replace missing values with the string '[]'.
d = pd.concat([d_all1, d_all2], axis=0).fillna('[]')

# Preview the rows whose grade is the string '12'.
d.loc[d.grade == '12'].head()

Unnamed: 0,book,grade,level,science,text,text_org
5,Gr12_PhysicalSciences_Learner_Eng.txt3,12,3,1,sponsor this textbook was developed with corp...,SPONSOR This textbook was developed with corp...
6,Gr12_PhysicalSciences_Learner_Eng.txt3,12,3,1,"well structured , impactful corporate social ...","Well structured, impactful Corporate Social I..."
7,Gr12_PhysicalSciences_Learner_Eng.txt3,12,3,1,the merger between metropolitan and momentum ...,The merger between Metropolitan and Momentum ...
8,Gr12_PhysicalSciences_Learner_Eng.txt3,12,3,1,hiv/aids is becoming a manageable disease in ...,HIV/AIDS is becoming a manageable disease in ...
9,Gr12_PhysicalSciences_Learner_Eng.txt3,12,3,1,momentum 's focus on persons with disabilitie...,Momentum's focus on persons with disabilities...


In [160]:
# Run the pipeline, with debug printing, on the `text_org` value of every row whose grade is the string '12'.
for text in d[d.grade=='12'].text_org:
    # The results are rebound on every iteration; only the debug printout is kept.
    hist, hypo = get_hist_hypo_references(text, nlp, nel, debug=True)

***NEW SENTENCE***		  Well structured, impactful Corporate Social Investment (CSI) has the ability to contribute positively to nation building and drive positive change in the communities. MMI's commitment to social investment means that we are constantly looking for ways in which we can assist some of South Africa's most vulnerable citizens to expand their horizons and gain greater access to life's opportunities. This means that we do not view social investment as a nice to have or as an exercise in marketing or sponsorship but rather as a critical part of our contribution to society.

Coreference Model Output	 [["some of South Africa 's most vulnerable citizens", 'their']]
NER + Pronouns Output		 ['we', 'we', 'their', 'we']
Gender Guesser Output		 ['-', '-', '-', '-']
NEL Output			 ['South Africa']

Historical references		 []
Hypothetical references		 []



***NEW SENTENCE***		  HIV/AIDS is becoming a manageable disease in many developed countries but in a country such as ours, it re

KeyboardInterrupt: ignored