### Setting up Hugging Face NeuralCoref and a small example

In [1]:
# Environment setup (left commented out; run once if the packages are missing):
# !pip install spacy==2.2.4
# Then compile neuralcoref from source, following https://github.com/huggingface/neuralcoref

In [2]:
import spacy
# Show the installed spaCy version (the setup notes above pin 2.2.4).
print(spacy.__version__) 

2.2.4


In [38]:
import en_core_web_lg
# neuralcoref provides the coreference component that is added to spaCy's pipe below
import neuralcoref

# Load the en_core_web_lg model, then register neuralcoref on its pipeline.
nlp = en_core_web_lg.load()
# As the last expression of the cell, the return value of add_to_pipe is echoed as output.
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fc82c047c88>

In [39]:

# Run the pipeline (with neuralcoref added) on a short two-sentence example.
doc = nlp("Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.")

In [40]:
# Disabled alternative check, kept for reference:
#doc._.has_coref
# Print the coreference clusters stored on the document's `_` extension namespace.
print(doc._.coref_clusters)

[Eva and Martha: [Eva and Martha, their, they], Jenny: [Jenny, her]]


In [31]:
# For every neuralcoref cluster, collect one [start, end] token-index pair per
# mention. `mention.end - 1` turns the span's end into an inclusive index, so a
# single-token mention has start == end.
all_clusters = [
    [[span.start, span.end - 1] for span in coref_cluster.mentions]
    for coref_cluster in doc._.coref_clusters
]
print(all_clusters)
print(doc._.coref_clusters)

[[[0, 2], [6, 6], [13, 13]], [[8, 8], [15, 15]]]
[Eva and Martha: [Eva and Martha, their, they], Jenny: [Jenny, her]]


### Setting up AllenNLP coreference resolution and a small example

In [6]:
# Install AllenNLP and the allennlp-models package (left commented out; run once if needed):
#!pip install allennlp
#!pip install allennlp-models

In [7]:
from allennlp.predictors.predictor import Predictor

In [12]:
# Location of the coreference model archive (coref-spanbert-large, dated 2020.02.27 in the file name).
model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz'
# NOTE(review): from_path is handed a URL here, so this presumably downloads the archive on first use - confirm.
predictor = Predictor.from_path(model_url)  # load the model

In [45]:
# Same example as the neuralcoref section, with "He says." appended directly after "happy."
# (there is no space between the two sentences in the literal).
text = "Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.He says."
# The result is indexed with 'clusters' and 'document' by get_clusters_allen_nlp.
prediction = predictor.predict(document=text)  # get the prediction

In [46]:
def get_clusters_allen_nlp(prediction, verbose=True):
    """Group the mentions of each coreference cluster under its first mention.

    Args:
        prediction: Mapping with a 'document' entry (sequence of token
            strings) and a 'clusters' entry (one list per cluster, each item
            being a [start, end] pair of token indices with `end` inclusive).
        verbose: When True (the default), print every raw cluster and the
            mapping built from it. Pass False to build the result silently.

    Returns:
        A list with one dict per cluster, in input order. Each dict has a
        single key - the text of the cluster's first mention - mapped to the
        list of all mention texts of that cluster, first mention included.
        A cluster without mentions yields an empty dict.
    """
    tokens = prediction['document']
    clusters = []
    for cluster in prediction['clusters']:
        if verbose:
            print("cluster", cluster)
        # `end` is inclusive, hence the +1 when slicing the token list.
        mentions = [" ".join(tokens[span[0]:span[1] + 1]) for span in cluster]
        # Key on the first mention by position rather than by an empty-string
        # sentinel, so a first mention that renders as "" cannot cause a later
        # mention to be mistaken for the first one.
        mention_ref = {mentions[0]: mentions} if mentions else {}
        if verbose:
            print(mention_ref)
        clusters.append(mention_ref)
    return clusters


In [47]:
# Convert the raw prediction into one {first mention: [all mentions]} dict per cluster;
# the function itself also prints each cluster while it runs.
clusters_allennlp = get_clusters_allen_nlp(prediction)
print(clusters_allennlp)

cluster [[0, 2], [6, 6], [13, 13]]
{'Eva and Martha': ['Eva and Martha', 'their', 'they']}
cluster [[6, 8], [15, 15]]
{'their friend Jenny': ['their friend Jenny', 'her']}
cluster [[20, 20], [24, 24]]
{'Tom': ['Tom', 'He']}
[{'Eva and Martha': ['Eva and Martha', 'their', 'they']}, {'their friend Jenny': ['their friend Jenny', 'her']}, {'Tom': ['Tom', 'He']}]


### Named entity recognition (NER) and a small example

In [53]:
# Re-run the spaCy pipeline on the example text; this rebinds `doc` from the coreference section.
doc = nlp("Eva and Martha didn't want their friend Jenny to feel lonely so they invited her to the party. Tom is happy.")

In [54]:
# Keep only the text of entities whose label is exactly "PERSON".
person_found = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

In [55]:
# Show the person names found in the example.
print(person_found)

['Eva', 'Martha', 'Jenny', 'Tom']


### Dealing with textbook data

In [33]:
import math
import sys, os, re
from IPython import embed
from pprint import pprint
import string
from random import shuffle
from collections import defaultdict, Counter

from termcolor import colored

import matplotlib.pyplot as plt
from util import *
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import spacy
import en_core_web_sm
import en_core_web_lg
import inflect
from subject_object_extraction import *
import gender_guesser.detector as gender

import textacy
from textacy.extract import subject_verb_object_triples as extractSVOs

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from tqdm import tqdm_notebook as tqdm
import ast
# Module-level WordNet lemmatizer instance.
lemma = nltk.wordnet.WordNetLemmatizer()
  
# Module-level detector instance from gender_guesser (imported above as `gender`).
d = gender.Detector()

In [64]:
# Train split: read the CSV and discard its 'bool' column so that the schema
# lines up with the test split (checked by the assert below).
d_all1 = pd.read_csv(
    'textbook_data/new_chapter_train_set_gender.csv',
    delimiter=",",
    low_memory=False,
    index_col=0,
).drop(columns=['bool'])

# Test split: read as-is.
d_all2 = pd.read_csv(
    'textbook_data/new_chapter_test_set_gender.csv',
    delimiter=",",
    low_memory=False,
    index_col=0,
)

# Both splits must expose the same columns in the same order before stacking.
assert all(d_all1.columns == d_all2.columns)

# Stack train on top of test and replace missing values with the string '[]'.
d_all = pd.concat([d_all1, d_all2], axis=0).fillna('[]')
len(d_all)

33575

In [66]:
# Preview the first rows of the combined frame.
d_all.head()

Unnamed: 0,book,grade,level,science,text,text_org
1,K_ck12.txt,K_1,0,1,we cannot see the wind but we can feel it . t...,We cannot see the wind but we can feel it. Th...
2,K_ck12.txt,K_1,0,1,yesterday it rained . there was a lot of wind...,Yesterday it rained. There was a lot of wind....
3,K_ck12.txt,K_1,0,1,summer is here ! july is sunny and hot . summ...,Summer is here! July is sunny and hot. Summer...
4,K_ck12.txt,K_1,0,1,it 's starting to cool down from the summer h...,It's starting to cool down from the summer he...
5,K_ck12.txt,K_1,0,1,the wind pushes the kite high into the sky . ...,The wind pushes the kite high into the sky. W...
