### Setting up Huggingface Neuralcoref and small example

In [4]:
# compile neuralcoref from scratch according to https://github.com/huggingface/neuralcoref

In [5]:
import spacy
print(spacy.__version__) 

3.0.6


In [6]:
# Load the large English spaCy model and register NeuralCoref on it, so that
# processed docs expose `doc._.coref_clusters` (used in the cells below).
import en_core_web_lg
# Add neural coref to SpaCy's pipe
import neuralcoref

nlp = en_core_web_lg.load()
# NOTE(review): the recorded output of this cell is a ValueError reporting a
# `spacy.strings.StringStore` size change (binary incompatibility), with
# spaCy 3.0.6 printed by the previous cell. Presumably the compiled
# neuralcoref extension does not match the installed spaCy build -- TODO
# confirm and rebuild / pin versions before relying on `nlp` for coreference.
neuralcoref.add_to_pipe(nlp)

ValueError: spacy.strings.StringStore size changed, may indicate binary incompatibility. Expected 88 from C header, got 64 from PyObject

In [89]:

doc = nlp("Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.")

In [40]:
#doc._.has_coref
print(doc._.coref_clusters)

[Eva and Martha: [Eva and Martha, their, they], Jenny: [Jenny, her]]


In [31]:
# Convert every NeuralCoref cluster of `doc` into a list of [start, end]
# token-index pairs, one pair per mention.
all_clusters = []
for cluster in doc._.coref_clusters:
    cluster_start_end = []
    for mention in cluster.mentions:
        # `end - 1` turns the end index into an inclusive one (the recorded
        # output shows [0, 2] for the three-token mention "Eva and Martha").
        cluster_start_end.append([mention.start, mention.end -1])
    all_clusters.append(cluster_start_end)
print(all_clusters)
print(doc._.coref_clusters)

[[[0, 2], [6, 6], [13, 13]], [[8, 8], [15, 15]]]
[Eva and Martha: [Eva and Martha, their, they], Jenny: [Jenny, her]]


### Setting up AllenNLP coreference resolution and small example

In [6]:
#!pip install allennlp
#!pip install allennlp-models

In [7]:
from allennlp.predictors.predictor import Predictor

In [12]:
# Archive of the AllenNLP SpanBERT-large coreference model to load.
model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz'
predictor = Predictor.from_path(model_url)  # load the model
# `get_cluster_allennlp` further down refers to this predictor as
# `allen_predictor`; bind that name here too so the notebook still works
# after Restart Kernel -> Run All.
allen_predictor = predictor

In [45]:
text = "Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.He says."
prediction = predictor.predict(document=text)  # get the prediction

In [46]:
def get_clusters_allen_nlp(prediction):
    """Turn an AllenNLP coreference prediction into readable mention groups.

    Each entry of ``prediction['clusters']`` is a list of ``[start, end]``
    token spans (end inclusive) into ``prediction['document']``.  Every
    cluster becomes a one-key dict: the key is the text of the cluster's first
    mention and the value is the list of all mention texts of that cluster,
    each built by joining the span's tokens with single spaces.

    The raw cluster and the resulting dict are printed while processing.

    Returns:
        list of the per-cluster dicts, in cluster order.
    """
    grouped = []
    for spans in prediction['clusters']:
        print("cluster", spans)
        head = ""
        by_head = {}
        for span in spans:
            begin, last = span[0], span[1]
            surface = " ".join(prediction['document'][begin:last + 1])
            if head != "":
                # A head mention already exists: file this mention under it.
                by_head[head] += [surface]
                continue
            # No head recorded yet, so this mention names the cluster.
            head = surface
            by_head[head] = [head]
        print(by_head)
        grouped.append(by_head)

    return grouped


In [47]:
clusters_allennlp = get_clusters_allen_nlp(prediction)
print(clusters_allennlp)

cluster [[0, 2], [6, 6], [13, 13]]
{'Eva and Martha': ['Eva and Martha', 'their', 'they']}
cluster [[6, 8], [15, 15]]
{'their friend Jenny': ['their friend Jenny', 'her']}
cluster [[20, 20], [24, 24]]
{'Tom': ['Tom', 'He']}
[{'Eva and Martha': ['Eva and Martha', 'their', 'they']}, {'their friend Jenny': ['their friend Jenny', 'her']}, {'Tom': ['Tom', 'He']}]


### NER and small example

In [53]:
doc = nlp("Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.")

In [54]:
# Keep only the text of the entities whose NER label is PERSON.
person_found = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

In [55]:
print(person_found)

['Eva', 'Martha', 'Jenny', 'Tom']


### Dealing with textbook data

In [1]:
import math
import sys, os, re
from IPython import embed
from pprint import pprint
import string
from random import shuffle
from collections import defaultdict, Counter

from termcolor import colored

import matplotlib.pyplot as plt
from util import *
import numpy as np
import pandas as pd
import random
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import spacy
import en_core_web_sm
import en_core_web_lg
import inflect
from subject_object_extraction import *
import gender_guesser.detector as gender

import textacy
from textacy.extract import subject_verb_object_triples as extractSVOs

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from tqdm import tqdm_notebook as tqdm
import ast
lemma = nltk.wordnet.WordNetLemmatizer()
  
d = gender.Detector()



In [2]:
# Load the train and test splits of the textbook data (first CSV column is the index).
d_all1 = pd.read_csv('textbook_data/new_chapter_train_set_gender.csv', delimiter= ",", low_memory=False, index_col=0)
d_all2 = pd.read_csv('textbook_data/new_chapter_test_set_gender.csv', delimiter= ",", low_memory=False, index_col=0)
# Drop the 'bool' column from the train split; the assert below then checks
# that both splits share the same columns before they are stacked.
d_all1 = d_all1.drop(['bool'], axis=1)
assert all(d_all1.columns == d_all2.columns)
# Stack the two splits row-wise into a single frame.
d_all = pd.concat([d_all1, d_all2], axis = 0)
# Missing values are replaced by the string '[]'.
d_all.fillna('[]',inplace = True)
len(d_all)

33575

In [97]:
d_all.head()

Unnamed: 0,book,grade,level,science,text,text_org
1,K_ck12.txt,K_1,0,1,we cannot see the wind but we can feel it . t...,We cannot see the wind but we can feel it. Th...
2,K_ck12.txt,K_1,0,1,yesterday it rained . there was a lot of wind...,Yesterday it rained. There was a lot of wind....
3,K_ck12.txt,K_1,0,1,summer is here ! july is sunny and hot . summ...,Summer is here! July is sunny and hot. Summer...
4,K_ck12.txt,K_1,0,1,it 's starting to cool down from the summer h...,It's starting to cool down from the summer he...
5,K_ck12.txt,K_1,0,1,the wind pushes the kite high into the sky . ...,The wind pushes the kite high into the sky. W...


In [98]:
d_all['text_org'].values

array([' We cannot see the wind but we can feel it. The wind makes the trees move. We can see the branches and leaves moving. We can hear the wind rustling, or moving the leaves.',
       ' Yesterday it rained. There was a lot of wind. It was not safe to go out and play. The rain was good for the Earth. Rain helps the trees and flowers grow.',
       ' Summer is here! July is sunny and hot. Summer can have a lot of rain too. Which type of weather do you like best?',
       ...,
       ' Calculate the average power output of this generator. (6 marks) [TOTAL: 12 marks] SOLUTION Question 1 1. Electrical (energy) to mechanical (kinetic) energy 2. Mechanical (kinetic) energy to electrical (energy) 3. Motor effect 4. Electromagnetic induction (4 marks) Question 2 BC (conductor) is parallel to the magnetic field.',
       ' Open switch, no current (2 marks) Question 3 Option 1: Pave = VrmsIrms Option 2: Vmax Vmax Imax Vrms = 2 = 2 2 (311)(21) = 2 = 219,91 V = 3265,5 W Pmax = VmaxImax = (311)(

In [135]:
import csv
# Randomly sample 100 row positions (without replacement) from the original
# texts; the fixed seed makes the sample reproducible.
random.seed(123)
to_annotate = random.sample(range(len(d_all['text_org'].values)), 100)
text_to_annotate = d_all['text_org'].values[to_annotate]
# Preview the first five sampled paragraphs.
print(text_to_annotate[:5])

[" 6 Gregor Mendel and the Foundations of Genetics Lesson Objectives Explain Mendel's law of segregation. Draw a Punnett square to make predictions about the traits of the offspring of a simple genetic cross."
 " 2. How did Jefferson's strict constructionist interpretation of the Constitution impede his plan to acquire additional U territory?"
 ' Further Reading / Supplemental Links Mark Pagel, ed. The Oxford Encyclopedia of Evolution. New York: Oxford University Press, 2002.'
 ' CHAPTER 20. THE HYDROSPHERE GRADE 10 20 1. Carbon dioxide Carbon dioxide reacts with water in the atmosphere to form carbonic acid (H2CO3).'
 ' 4. How do the rights and responsibilities expressed in the Constitution balance tensions between personal rights and responsibilities as well as individual rights and the common good?']


In [136]:
# Write every sampled paragraph to its own numbered text file (1-based) in the
# annotation directory.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
F_M_N_DIR = '/Users/yuling/desktop/brat-master/data/TEXTBOOK/F_M_N_mentions'
for i, text in enumerate(text_to_annotate, start=1):
    outfile_name = '%s/F_M_N_%d.txt' % (F_M_N_DIR, i)
    # Encoding is explicit so the output does not depend on the platform default.
    with open(outfile_name, 'w', newline='', encoding='utf-8') as f_output:
        f_output.write(text.strip() + "\n")

### Sample gendered mentions to label (for coref and nel)

In [9]:
# gendered
df_gendered = pd.read_csv('textbook_data/gendered.csv', delimiter= ",", low_memory=False, index_col=0)

In [10]:
# Randomly sample 300 row positions (without replacement) from the gendered
# texts; the fixed seed makes the sample reproducible.
random.seed(123)
to_annotate_gendered = random.sample(range(len(df_gendered['text'].values)), 300)
text_to_annotate_gendered = df_gendered['text'].values[to_annotate_gendered]
# Preview the first five sampled paragraphs.
print(text_to_annotate_gendered[:5])

[" Chapter 8 MS Prokaryotes The above image shows bacteria dyed with a fluorescent color. They look just like little cells. Well, that's exactly what they are. Bacteria are prokaryotic organisms. About 3 billion years ago, long before the first plants, people, or other animals appeared, prokaryotes were the first life forms on Earth. For at least a billion years, prokaryotes ruled the Earth as the only existing organisms. What do you think of when you think of bacteria? Germs? Diseases? Bacteria can be harmful, but they can also help you. How do you think bacteria can help humans and other organisms? Did you know that bacteria are not the only type of prokaryote? There is another type, called archaea, which we will explore in addition to the questions asked above."
 ' Online Interactive Activities Use the following interactive to explore half-life and to find the age of several objects:  This interactive allows you to look at how the number of radioactive isotopes present changes over 

In [11]:
# Write every sampled gendered paragraph to its own numbered text file
# (1-based) in the annotation directory.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
COREF_NEL_DIR = '/Users/yuling/desktop/brat-master/data/TEXTBOOK/Coref_NEL_mentions'
for i, text in enumerate(text_to_annotate_gendered, start=1):
    outfile_name = '%s/Coref_NEL_%d.txt' % (COREF_NEL_DIR, i)
    # Encoding is explicit so the output does not depend on the platform default.
    with open(outfile_name, 'w', newline='', encoding='utf-8') as f_output:
        f_output.write(text.strip() + "\n")

### B-cubed

In [97]:
# Toy gold-standard clustering: cluster name -> mentions in that cluster.
answer_dict = {
    "item1": ["A1", "A2", "A3", "A4", "A5", "A6"],
    "item2": ["B1", "B2"],
    "item3": ["C1", "C2"],
}

# Toy system clustering of the same ten mentions, in the same format.
system_out_dict = {
    "item1": ["A1", "A2", "A3", "B2"],
    "item2": ["A4", "A5", "A6"],
    "item3": ["B1"],
    "item4": ["C1"],
    "item5": ["C2"],
}


In [98]:
def get_all_links(cdict):
    """Enumerate every ordered mention pair inside each cluster of ``cdict``.

    ``cdict`` maps a cluster name to an iterable of mentions.  Within a
    cluster, each mention is paired with every mention of that cluster,
    itself included.

    Returns:
        (all_links, all_links_by_mention): the flat list of all pairs, and a
        list holding one sub-list of pairs per mention.
    """
    flat_pairs = []
    pairs_per_mention = []
    for name in cdict:
        members = list(cdict[name])
        for anchor in members:
            anchored = [(anchor, partner) for partner in members]
            flat_pairs.extend(anchored)
            pairs_per_mention.append(anchored)
    return flat_pairs, pairs_per_mention

In [99]:
links_answer, links_by_mention_answer = get_all_links(answer_dict)
links_output, links_by_mention_output = get_all_links(system_out_dict)

In [100]:
overlap = [link for link in links_answer if link in links_output]

In [101]:
def get_f_score(precision, recall):
    """Return the F1 score (harmonic mean) of ``precision`` and ``recall``.

    When both values are zero the harmonic mean is undefined; 0.0 is returned
    in that case instead of raising ``ZeroDivisionError``.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall) / denominator

In [102]:
print("B-cubed, average metric")
# Link-level scores: shared links divided by all system links (precision)
# and by all gold links (recall).
precision = len(overlap)/len(links_output)
recall = len(overlap)/len(links_answer)
print("Precision : ", precision)
print("Recall : ", recall)
print("F-score : ",  get_f_score(precision, recall))

B-cubed, average metric
Precision :  0.7857142857142857
Recall :  0.5
F-score :  0.6111111111111112


In [103]:
# Index each mention's list of links by the mention itself (the first element
# of its first link) so links can be looked up per mention.
answer_mention_links = {links[0][0]: links for links in links_by_mention_answer}
output_mention_links = {links[0][0]: links for links in links_by_mention_output}

In [104]:
def get_precision_recall_macro(answer_mention_links, output_mention_links, get_score):
    '''
    Mention-averaged B-cubed precision or recall.

    answer_mention_links : dict mapping each gold mention to its list of
        (mention, other_mention) links.
    output_mention_links : same structure for the system output.
    get_score : "precision" or "recall"

    For recall the gold mentions are scored against the system output; for
    precision the system mentions are scored against the gold annotation.
    A mention absent from the other side contributes 0.

    Returns (average score, summed per-mention score, number of mentions
    scored).  With no mentions to score, (0.0, 0, 0) is returned instead of
    dividing by zero.  For any other get_score a message is printed and None
    is returned.
    '''

    if get_score == "recall":
        links1 = answer_mention_links
        links2 = output_mention_links
    elif get_score == "precision":
        links1 = output_mention_links
        links2 = answer_mention_links
    else:
        print("Indicate precision or recall!")
        return

    # Nothing to average over: report zero rather than raise ZeroDivisionError.
    if len(links1) == 0:
        return 0.0, 0, 0

    score = 0
    for mention in links1:
        mention_links = links1[mention]
        if mention in links2:
            compare_links = links2[mention]
            correct_links = [link for link in mention_links if link in compare_links]
            # Fraction of this mention's links that the other side also has.
            score += len(correct_links)/len(mention_links)
    return score/len(links1), score, len(links1)

In [105]:
# get_precision_recall_macro returns a 3-tuple whose first element is the
# averaged score; take that element so a number (not a tuple) is printed and
# passed to get_f_score.
recall_macro = get_precision_recall_macro(answer_mention_links, output_mention_links, "recall")[0]
precision_macro = get_precision_recall_macro(answer_mention_links, output_mention_links, "precision")[0]
print("B-cubed, mention-based approach")
print("Precision : ", precision_macro)
print("Recall : ", recall_macro)
print("F-score : ",  get_f_score(precision_macro, recall_macro))

B-cubed, mention-based approach
Precision :  0.85
Recall :  0.5
F-score :  0.6296296296296295


# Updated May 31: Start looking from here:

### B-cubed with list format

In [166]:
answer_idx = [[[8, 8], [15, 15], [28, 28]], [[62, 62], [73, 74], [74, 74]], [[96, 96], [102, 102]]]
answer_str = [['bacteria', 'They', 'they'], ['Earth', 'the Earth', 'Earth'], ['Bacteria', 'they']]
output_idx = [[[8, 8], [28, 28]], [[62, 62], [73, 74], [74, 74]], [[96, 96]]]
output_str = [['bacteria', 'they'], ['Earth', 'the Earth', 'Earth'], ['Bacteria']]

In [167]:
def merge_idx_str(list_idx, list_str):
    """Pair every mention string with the span at the same position.

    ``list_str`` and ``list_idx`` are parallel lists of clusters: the first
    holds the mention texts, the second the matching spans.  The result has
    one list per cluster, made of ``(text, span)`` tuples.
    """
    merged = []
    for words, spans in zip(list_str, list_idx):
        paired = []
        for position, word in enumerate(words):
            paired.append((word, spans[position]))
        merged.append(paired)
    return merged


In [168]:
answer_cluster = merge_idx_str(answer_idx, answer_str)
output_cluster = merge_idx_str(output_idx, output_str)

In [173]:
# this is like what the functions below get from neuralcoref and allennlp
answer_cluster

[[('bacteria', [8, 8]), ('They', [15, 15]), ('they', [28, 28])],
 [('Earth', [62, 62]), ('the Earth', [73, 74]), ('Earth', [74, 74])],
 [('Bacteria', [96, 96]), ('they', [102, 102])]]

In [169]:
def get_all_links(cluster_list):
    """Enumerate every ordered mention pair inside each cluster.

    ``cluster_list`` is a list of clusters, each a sequence of mentions.
    Within a cluster, each mention is paired with every mention of that
    cluster, itself included.

    Returns:
        (all_links, all_links_by_mention): the flat list of all pairs, and a
        list holding one sub-list of pairs per mention.
    """
    flat_pairs = []
    pairs_per_mention = []
    for members in cluster_list:
        for anchor in members:
            anchored = [(anchor, partner) for partner in members]
            flat_pairs += anchored
            pairs_per_mention.append(anchored)
    return flat_pairs, pairs_per_mention

In [170]:
# Build all within-cluster links for the gold annotation and the system output.
links_answer, links_by_mention_answer = get_all_links(answer_cluster)
links_output, links_by_mention_output = get_all_links(output_cluster)
# Mentions here are (text, [start, end]) tuples, which contain a list and are
# therefore unhashable; their string form is used as the dict key instead.
answer_mention_links = {str(links[0][0]): links for links in links_by_mention_answer}
output_mention_links = {str(links[0][0]): links for links in links_by_mention_output}

In [171]:
def get_precision_recall_macro(answer_mention_links, output_mention_links, get_score):
    '''
    Mention-averaged B-cubed precision or recall, plus raw link counts.

    answer_mention_links : dict mapping each gold mention key to its list of
        (mention, other_mention) links.
    output_mention_links : same structure for the system output.
    get_score : "precision" or "recall"

    For recall the gold mentions are scored against the system output; for
    precision the system mentions are scored against the gold annotation.
    A mention absent from the other side contributes no correct links.

    Returns (average score, number of correct links, total number of links
    scored).  With no mentions to score, (0.0, 0, 0) is returned instead of
    dividing by zero.  For any other get_score a message is printed and None
    is returned.
    '''

    if get_score == "recall":
        links1 = answer_mention_links
        links2 = output_mention_links
    elif get_score == "precision":
        links1 = output_mention_links
        links2 = answer_mention_links
    else:
        print("Indicate precision or recall!")
        return

    # Nothing to average over: report zero rather than raise ZeroDivisionError.
    if len(links1) == 0:
        return 0.0, 0, 0

    score = 0
    correct_cnt = 0
    total_cnt = 0
    for mention in links1:
        mention_links = links1[mention]
        # Every link of the scored side counts towards the total, matched or not.
        total_cnt += len(mention_links)
        if mention in links2:
            compare_links = links2[mention]
            correct_links = [link for link in mention_links if link in compare_links]
            score += len(correct_links)/len(mention_links)
            correct_cnt += len(correct_links)
    return score/len(links1), correct_cnt, total_cnt

def get_f_score(precision, recall):
    """Return the F1 score (harmonic mean) of ``precision`` and ``recall``.

    When both values are zero the harmonic mean is undefined; 0.0 is returned
    in that case instead of raising ``ZeroDivisionError``.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall) / denominator

In [172]:
# Each call returns (average score, correct-link count, total-link count);
# the averages are reported below.
recall, tp, tp_fn = get_precision_recall_macro(answer_mention_links, output_mention_links, "recall")
precision, tp, tp_fp = get_precision_recall_macro(answer_mention_links, output_mention_links, "precision")
print("B-cubed, mention-based approach")
print("Precision : ", precision)
print("Recall : ", recall)
print("F-score : ",  get_f_score(precision, recall))

B-cubed, mention-based approach
Precision :  1.0
Recall :  0.6041666666666666
F-score :  0.7532467532467533


In [None]:
'''
macro_avg_numerator_precision = 0
macro_avg_numerator_recall = 0
macro_avg_denominator = 300 # just the num of para
micro_avg_numerator = 0
micro_avg_denominator_recall = 0
micro_avg_denominator_precision = 0
for annotation in 300 paras:
    parse system output and annotation as above
    recall, tp1, tp_fn = get_precision_recall_macro(answer_mention_links, output_mention_links, "recall")
    precision, tp2, tp_fp = get_precision_recall_macro(answer_mention_links, output_mention_links, "precision")
    assert tp1 == tp2
    # macro-avg
    macro_avg_numerator_precision += precision
    macro_avg_numerator_recall += recall
    
    # micro-avg
    micro_avg_numerator += tp1
    micro_avg_denominator_recall += tp_fn
    micro_avg_denominator_precision += tp_fp
    

macro_precision = macro_avg_numerator_precision/ macro_avg_denominator
macro_recall = macro_avg_numerator_recall/ macro_avg_denominator

micro_precision = micro_avg_numerator/micro_avg_denominator_precision
micro_recall = micro_avg_numerator/micro_avg_denominator_recall

'''

### NeuralCoref track character indices

In [135]:
text = "Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy."

In [None]:
# TODO: extend to multi-sentence input; right now it doesn't track "Tom is happy".
def get_cluster_neuralcoref(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and list its coref clusters.

    Each cluster is returned as a list of ``(mention_text, [start_char, end_char])``
    tuples, read from the ``text``, ``start_char`` and ``end_char`` attributes
    of the cluster's mentions.
    """
    return [
        [(span.text, [span.start_char, span.end_char]) for span in group.mentions]
        for group in nlp(text)._.coref_clusters
    ]

In [None]:
print(get_cluster_neuralcoref(text))

### AllenNLP track character indices

In [136]:
def get_cluster_allennlp(text):
    """Return AllenNLP coreference clusters of ``text`` with character offsets.

    The text is tokenised with the module-level ``nlp`` pipeline and the token
    strings are passed to the module-level ``allen_predictor``; both must be
    defined before this function is called.  Each predicted cluster is a list
    of ``[first_token, last_token]`` index pairs (last index inclusive), which
    are mapped back to character positions in ``text``.

    Returns:
        A list of clusters, each a list of
        ``(mention_text, [start_char, end_char])`` tuples, where
        ``mention_text == text[start_char:end_char]``.
    """
    # Tokenise once: the same tokens feed the predictor and the offset lookup,
    # so predicted token indices line up with the character positions.
    tokens = list(nlp(text))
    prediction = allen_predictor.predict_tokenized([token.text for token in tokens])
    all_clusters = []
    for cluster in prediction['clusters']:
        cluster_mentions = []
        for span in cluster:
            first_token = tokens[span[0]]
            last_token = tokens[span[1]]
            # Slice the original text so whitespace inside a multi-token
            # mention is kept exactly as written; a single-token span takes
            # the same path.
            start = first_token.idx
            end = last_token.idx + len(last_token)
            cluster_mentions.append((text[start:end], [start, end]))
        all_clusters.append(cluster_mentions)
    return all_clusters


In [None]:
print(get_cluster_allennlp(text))