# Downstream Character Identification Pipeline

In [None]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
import json
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir("/content/drive/Shareddrives/6.864 806 Class Project/downstream")

!pwd

/content/drive/Shareddrives/6.864 806 Class Project/downstream


## Preprocess Dataset

In [None]:

lit_name = 'Harry Potter 1.txt'

with open(lit_name) as f:
    content = f.read()

print(re.split('\n{2,}', content))



In [None]:
# start index (remove headings, cover page, table of content, etc. from the beginning)
# end index (remove project gutenberg copyright info from the end)
start_idx = 0 #default val now
end_idx = -1

content_list = re.split('\n{2,}', content)[start_idx:end_idx] # remove nonsense from beginning & end
content_list = list(filter(None, content_list))
paragraphs = []

for c in content_list:
    paragraphs.append(c.replace('\n', ' '))


In [None]:
paragraphs

["Harry Potter and the Sorcerer's Stone",
 'CHAPTER ONE',
 'THE BOY WHO LIVED',
 "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.",
 'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.',
 "The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about

In [None]:
def preproc_str(st): # could expand the pattern to get rid of other things!
    st= re.sub(r'[_\[\]\*]','',st)
    return st

In [None]:
lit_df = pd.DataFrame(columns = ['paragraph_id', 'sentence_id', 'words'])

from nltk.tokenize import sent_tokenize, word_tokenize

p_id, s_id = 0, 0
pID_list, sID_list, w_list = [], [], []

for p in tqdm(paragraphs):
    sentences = sent_tokenize(p)
    sent_word_count = 0
    for s in sentences:
        words = word_tokenize(s)
        words = [preproc_str(w) for w in words] # added this preproc line to get rid of punctuations
        words = list(filter(None, words)) # drop empty strings (after filtering for punc)

        if words: # if not empty list
            w_list.extend(words)
            sID_list.extend([s_id] * len(words))

            sent_word_count += len(words)
            s_id += 1
    
    pID_list.extend([p_id] * sent_word_count)
    p_id += 1

lit_df['paragraph_id'] = pID_list
lit_df['sentence_id'] = sID_list
lit_df['words'] = w_list

  0%|          | 0/3031 [00:00<?, ?it/s]

In [None]:
lit_df

Unnamed: 0,paragraph_id,sentence_id,words
0,0,0,Harry
1,0,0,Potter
2,0,0,and
3,0,0,the
4,0,0,Sorcerer
...,...,...,...
98843,3030,6765,this
98844,3030,6765,summer
98845,3030,6765,...
98846,3030,6765,.


# NER

## Preprocessing

In [None]:
def get_sentences(df):
    
    sentences = []
    for id in tqdm(df.sentence_id.unique()):
        
        sentence = df[df.sentence_id == id]['words'].values.tolist()
        sentence = list(map(lambda x:str(x).lower(),sentence))
        
        sentences.append(sentence)
        
    return sentences

In [None]:
lit_sentences = get_sentences(lit_df)

  0%|          | 0/6766 [00:00<?, ?it/s]

In [None]:
lit_sentences[:5]

[['harry', 'potter', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['chapter', 'one'],
 ['the', 'boy', 'who', 'lived'],
 ['mr.',
  'and',
  'mrs.',
  'dursley',
  ',',
  'of',
  'number',
  'four',
  ',',
  'privet',
  'drive',
  ',',
  'were',
  'proud',
  'to',
  'say',
  'that',
  'they',
  'were',
  'perfectly',
  'normal',
  ',',
  'thank',
  'you',
  'very',
  'much',
  '.'],
 ['they',
  'were',
  'the',
  'last',
  'people',
  'you',
  "'d",
  'expect',
  'to',
  'be',
  'involved',
  'in',
  'anything',
  'strange',
  'or',
  'mysterious',
  ',',
  'because',
  'they',
  'just',
  'did',
  "n't",
  'hold',
  'with',
  'such',
  'nonsense',
  '.']]

In [None]:
import pickle

word2idx_file = open('ner_models/word2idx.txt', 'rb').read()
word2idx = pickle.loads(word2idx_file)

tag2idx_file = open('ner_models/tag2idx.txt', 'rb').read()
tag2idx = pickle.loads(tag2idx_file)

In [None]:
idx2word = {v: k for k, v in word2idx.items()}
idx2tag = {v: k for k, v in tag2idx.items()}

In [None]:
word_embeddings = np.loadtxt('ner_models/glove_embeddings.txt')
embedding_size = word_embeddings.shape[1]
word_embeddings.shape

(12490, 300)

## Model

In [None]:
import torch
from torch import nn

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
#@title BiLSTM_CRF_GloVe Model
class BiLSTM_CRF_GloVe(nn.Module):
    
    def __init__(self, vocab_size, tag2idx, emb_dim, hidden_dim, word_embeddings):
        
        super(BiLSTM_CRF_GloVe, self).__init__()
        
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag2idx = tag2idx
        self.tag_size = len(tag2idx)
    
        # BiLSTM
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.emb.weight = nn.Parameter(torch.FloatTensor(word_embeddings))
        self.lstm = nn.LSTM(emb_dim, hidden_dim // 2, num_layers = 1, bidirectional = True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tag_size)
        
        # CRF
        # transition matrix, (i,j) = score of transitioning to i from j
        self.transitions = nn.Parameter(torch.randn(self.tag_size, self.tag_size))
        # constraing: never transition to <START> and from <END>
        self.transitions.data[tag2idx['<START>'],:] = -10000
        self.transitions.data[tag2idx['<END>'],:] = -10000
        self.hidden = self.init_hidden()
    
    def init_hidden(self):
        
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device), torch.randn(2, 1, self.hidden_dim // 2).to(device))
    
    def _get_lstm_features(self, sentence):
        
        self.hidden = self.init_hidden()
        emb = self.emb(sentence).view(len(sentence),1,-1)
        lstm_output, self.hidden = self.lstm(emb, self.hidden)
        lstm_output = lstm_output.view(len(sentence), self.hidden_dim)
        
        return self.hidden2tag(lstm_output)
    
    def _forward_alg(self, features):
        
        init_alpha = torch.full((1, self.tag_size), -10000.).to(device)
        init_alpha[0][self.tag2idx['<START>']] = 0
        
        forward_var = init_alpha # wrap in variable to get automatic backpropagation
        
        # iterate through sentence
        for feature in features:
            alpha_t = [] # forward tensor for this timestamp
            for next_tag in range(self.tag_size):
                # emission score
                emit_score = feature[next_tag].view(1,-1).expand(1, self.tag_size)
                # score of transitioning to next_tag
                trans_score = self.transitions[next_tag].view(1,-1)
                # value for edge (i -> next_tag)
                next_tag_var = forward_var + trans_score + emit_score
                alpha_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alpha_t).view(1,-1)
        terminal_var = forward_var + self.transitions[self.tag2idx['<END>']]
        
        return log_sum_exp(terminal_var) # alpha
    
    def _score_sentence(self, features, tags):
        
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag2idx['<START>']], dtype=torch.long).to(device), tags])
        
        for i, feature in enumerate(features):
            score = score + self.transitions[tags[i+1], tags[i]] + feature[tags[i+1]]
        
        score = score + self.transitions[self.tag2idx['<END>'], tags[-1]]
        
        return score
        
    def _viterbi_decode(self, features):
        
        backpointers = []
        
        # initialize viterbi var
        init_vars = torch.full((1, self.tag_size), -10000.).to(device)
        init_vars[0][self.tag2idx['<START>']] = 0
        
        # forward_var[i] = viterbi var for step i-1
        forward_var = init_vars
        for feature in features:
            bptr_t = [] # backpointer for this timestamp
            viterbivars_t = [] # viterbi var for this timestamp
            
            for next_tag in range(self.tag_size):
                # next_tag_var[i] = viterbi var for tag i at previous step + 
                # transitioning score from tag i to next_tag
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptr_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
                
            # add emission score, assign forward_var to previously computed sets of viterbi vars
            forward_var = (torch.cat(viterbivars_t) + feature).view(1,-1).to(device)
            backpointers.append(bptr_t)
            
        # transition to <END>
        terminal_var = forward_var + self.transitions[self.tag2idx['<END>']]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]
        
        # decode best path using backpointers
        best_path = [best_tag_id]
        for bptr_t in reversed(backpointers):
            best_tag_id = bptr_t[best_tag_id]
            best_path.append(best_tag_id)
            
        # take out <START> tag
        start = best_path.pop()
        assert start == self.tag2idx['<START>'] # sanity check, make sure whatever is taken out is <START>
        best_path.reverse()
        
        return path_score, best_path
    
    def neg_log_likelihood(self, sentence, tags): # error function
        
        features = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(features)
        gold_score = self._score_sentence(features, tags)
        
        return forward_score - gold_score
    
    def forward(self, sentence):
        
        lstm_features = self._get_lstm_features(sentence) # emission scores
        score, tag_seq = self._viterbi_decode(lstm_features)
        
        return score, tag_seq
        
## helper codes

# return argmax as int
def argmax(vec):
    _, idx = torch.max(vec, dim=1)
    return idx.item()

# log sum exp for forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1,-1).expand(1,vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [None]:
model = BiLSTM_CRF_GloVe(vocab_size = len(word2idx), 
                         tag2idx = tag2idx, 
                         emb_dim = embedding_size, 
                         hidden_dim = 200, 
                         word_embeddings = word_embeddings)
model = model.to(device)
model.load_state_dict(torch.load('ner_models/BiLSTM_CRF_GloVe_model2.pth',map_location= 'cpu')) #don't forget to change it back

<All keys matched successfully>

## Predictions

In [None]:
# convert sequence (array) to idx
def prepare_sequence(seq, word2idx):
    # seq: array, word2idx: dict
    idxs = [word2idx[w] if w in word2idx else 0 for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [None]:
model.eval()

# evaluate on test data
preds = []
pos = []

for sentence in tqdm(lit_sentences):
    sentence_idx = prepare_sequence(sentence, word2idx)
    sentence_idx = sentence_idx.to(device)
    pred = model(sentence_idx)[1]

    ## POS tagging
    pos_tags = [tag for _,tag in nltk.pos_tag(sentence)]

    preds.append([idx2tag[idx] for idx in pred])
    pos.append(pos_tags)


  0%|          | 0/6766 [00:00<?, ?it/s]

In [None]:
flat_pred = [tag for pred in preds for tag in pred]
flat_pos = [tag for tags in pos for tag in tags]


tags = []
pronoun_tags = ['PRP', 'PRP$', 'WP', 'WP$']

for ner, pos in zip(flat_pred, flat_pos):
    if ner == 'O' and pos in pronoun_tags:
        tags.append('PRONOUN')
    else:
        tags.append(ner)

lit_df['tags'] = tags

In [None]:
lit_df

Unnamed: 0,paragraph_id,sentence_id,words,tags
0,0,0,Harry,B-PER
1,0,0,Potter,I-PER
2,0,0,and,O
3,0,0,the,O
4,0,0,Sorcerer,O
...,...,...,...,...
98843,3030,6765,this,O
98844,3030,6765,summer,O
98845,3030,6765,...,O
98846,3030,6765,.,O


### Filtering for PER tags

In [None]:
lit_df[(lit_df['tags'] == 'B-PER') | (lit_df['tags'] == 'I-PER')]

Unnamed: 0,paragraph_id,sentence_id,words,tags
0,0,0,Harry,B-PER
1,0,0,Potter,I-PER
9,2,2,THE,B-PER
10,2,2,BOY,I-PER
11,2,2,WHO,I-PER
...,...,...,...,...
98782,3029,6762,Uncle,B-PER
98783,3029,6762,Vernon,I-PER
98787,3029,6762,anyone,B-PER
98799,3030,6763,'',B-PER


In [None]:
with open('Harry_Potter_NER_Outputs.csv', 'w',newline='') as csv_file:
    lit_df.to_csv(path_or_buf=csv_file)

# NER Outputs to Coref Pipeline
## taking the from NER DF, prepare:
doc is a list of lists where sub-list 0 contains the words in sentence 0, etc. ents is a list of lists where sub-list 0 contains the start and end indices of the mentions in sentence 0. names is a list of lists where sub-list 0 contains the words in mention 0

In [None]:
lit_sentences[:2]

[['harry', 'potter', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['chapter', 'one']]

In [None]:
def get_doc(lit_sentences):
    '''input: a list of sentences (we've already had this processed before!)
    simply inserts the sep and cls tokens for formatting'''
    all_sent = []
    for sent in lit_sentences:
        new_sent = [w for w in sent]
        new_sent.insert(0,'[SEP]')
        new_sent.append('[CLS]')
        all_sent.append(new_sent)
    return all_sent


In [None]:
hp_doc = get_doc(lit_sentences)
len(hp_doc)

6766

In [None]:
lit_sentences[0]

['harry', 'potter', 'and', 'the', 'sorcerer', "'s", 'stone']

In [None]:
hp_doc[0]

['[SEP]', 'harry', 'potter', 'and', 'the', 'sorcerer', "'s", 'stone', '[CLS]']

In [None]:
sense_sensibility_doc = get_doc(lit_sentences)
len(sense_sensibility_doc)

6766

In [None]:
sense_sensibility_doc[1]

['[SEP]', 'chapter', 'one', '[CLS]']

In [None]:
def get_names(df,all_sent):
    '''names is a list of lists where sub-list 0 contains the words in mention 0
    input: NER proc df, a list of sent (without start/end tok)'''
    all_names = [[] for i in range(len(all_sent))]
    df_list = df.values.tolist()
    for i,token in enumerate(df_list):
        # if token[3] == 'B-PER':
        
        if token[3].startswith('B') or token[3] == 'PRONOUN':
            # new ent
            #print(token[3])
            new_name = [token[2]]
            all_names[token[1]].append(new_name)

        # elif token[3] == 'I-PER':
        elif token[3].startswith('I'):
            # continued mention, treat it as part of the last mention
            if all_names[token[1]]:
                all_names[token[1]][-1].append(token[2]) #assume there won't be I-PER to start
            # edge case
            else:
                all_names[token[1]].append([token[2]])
        
    return all_names


In [None]:
hp_names = get_names(lit_df,lit_sentences)
hp_names[10:20]

[[['They'], ['they'], ['it'], ['anyone']],
 [['Mrs.', 'Potter'],
  ['Mrs.'],
  ['sister'],
  ['they'],
  ['Mrs.', 'Dursley'],
  ['she'],
  ['a', 'sister'],
  ['her', 'sister'],
  ['her', 'good-for-nothing', 'husband'],
  ['it']],
 [['what'], ['the', 'neighbors'], ['the', 'Potters'], ['the', 'street']],
 [['The', 'Dursleys'],
  ['the', 'Potters'],
  ['a', 'small', 'son'],
  ['they'],
  ['him']],
 [['This', 'boy'], ['they'], ['a', 'child']],
 [['Mr.'], ['Mrs.', 'Dursley'], ['our'], ['the', 'country']],
 [['Mr.', 'Dursley'], ['he'], ['his'], ['Mrs.', 'Dursley'], ['she'], ['his']],
 [['them']],
 [['Mr.', 'Dursley'], ['his'], ['Mrs.'], ['his']],
 [['Mr.', 'Dursley'], ['he'], ['the', 'house']]]

In [None]:
all_names = get_names(lit_df,lit_sentences)

In [None]:
all_names[10:20]

[[['They'], ['they'], ['it'], ['anyone']],
 [['Mrs.', 'Potter'],
  ['Mrs.'],
  ['sister'],
  ['they'],
  ['Mrs.', 'Dursley'],
  ['she'],
  ['a', 'sister'],
  ['her', 'sister'],
  ['her', 'good-for-nothing', 'husband'],
  ['it']],
 [['what'], ['the', 'neighbors'], ['the', 'Potters'], ['the', 'street']],
 [['The', 'Dursleys'],
  ['the', 'Potters'],
  ['a', 'small', 'son'],
  ['they'],
  ['him']],
 [['This', 'boy'], ['they'], ['a', 'child']],
 [['Mr.'], ['Mrs.', 'Dursley'], ['our'], ['the', 'country']],
 [['Mr.', 'Dursley'], ['he'], ['his'], ['Mrs.', 'Dursley'], ['she'], ['his']],
 [['them']],
 [['Mr.', 'Dursley'], ['his'], ['Mrs.'], ['his']],
 [['Mr.', 'Dursley'], ['he'], ['the', 'house']]]

In [None]:
def get_ents(df,all_sent):
    '''ents is a list of lists where sub-list 0 contains the start and end 
    indices of the mentions in sentence 0
    e.g: [(10,11)]
    all_sent: preprocessed list of sentences with start/end tokens!'''
    
    all_ents = [[] for i in range(len(all_sent))]
    df_list = df.values.tolist()
    all_names = get_names(df,all_sent)
    
    for i,sent in enumerate(all_sent):
        # print(sent)
        if all_names[i]:
            # print("all names[i] exists ", all_names[i])
            for name in all_names[i]:
                if all_ents[i]:
                    start_range = all_ents[i][-1][-1] #avoid multiple occurences
                    idx_pair = (all_sent[i].index(name[0].lower(),start_range),all_sent[i].index(name[-1].lower(),start_range))
                    if idx_pair not in all_ents[i]:
                        all_ents[i].append((all_sent[i].index(name[0].lower(),start_range),all_sent[i].index(name[-1].lower(),start_range)))
                else:
                    idx_pair = (all_sent[i].index(name[0].lower()),all_sent[i].index(name[-1].lower()))
                    if idx_pair not in all_ents[i]:
                        all_ents[i].append((all_sent[i].index(name[0].lower()),all_sent[i].index(name[-1].lower())))
    return all_ents







In [None]:
hp_ents  = get_ents(lit_df,hp_doc)
hp_ents[:10]

[[(1, 2)],
 [],
 [(1, 4)],
 [(1, 1), (3, 4), (18, 18), (24, 24)],
 [(1, 1), (3, 7), (19, 19)],
 [(1, 2), (4, 10)],
 [(1, 1), (3, 11), (14, 14)],
 [(1, 2), (23, 23), (28, 28), (32, 32), (37, 38)],
 [(1, 2), (4, 6), (11, 11), (15, 17)],
 [(1, 2), (5, 5), (16, 16), (21, 21), (24, 24)]]

In [None]:
hp_names[:10]

[[['Harry', 'Potter']],
 [],
 [['THE', 'BOY', 'WHO', 'LIVED']],
 [['Mr.'], ['Mrs.', 'Dursley'], ['they'], ['you']],
 [['They'], ['the', 'last', 'people', 'you', "'d"], ['they']],
 [['Mr.', 'Dursley'],
  ['the', 'director', 'of', 'a', 'firm', 'called', 'Grunnings']],
 [['He'],
  ['a', 'big', ',', 'beefy', 'man', 'with', 'hardly', 'any', 'neck'],
  ['he']],
 [['Mrs.', 'Dursley'], ['she'], ['her'], ['garden'], ['the', 'neighbors']],
 [['The', 'Dursleys'],
  ['a', 'small', 'son'],
  ['their'],
  ['no', 'finer', 'boy']],
 [['The', 'Dursleys'], ['they'], ['they'], ['their'], ['somebody'], ['it']]]

In [None]:
ss_ents  = get_ents(lit_df,sense_sensibility_doc)
ss_ents[:10]

[[(1, 1)],
 [(2, 2)],
 [(1, 4)],
 [(1, 1), (3, 4), (18, 18), (24, 24)],
 [(1, 1), (3, 7), (19, 19)],
 [(1, 2), (4, 10)],
 [(1, 1), (3, 11), (14, 14)],
 [(1, 2), (23, 23), (28, 28), (32, 32), (37, 38)],
 [(1, 2), (4, 6), (11, 11), (15, 17)],
 [(1, 2), (5, 5), (16, 16), (21, 21), (24, 24)]]

In [None]:
# sanity check

rand_idx = 28
print(ss_ents[rand_idx])
print(lit_sentences[rand_idx])
print(all_names[rand_idx])


NameError: ignored

In [None]:
rand_idx = 28
print(hp_ents[rand_idx])
print(hp_doc[rand_idx])
print(hp_names[rand_idx])

[(2, 3), (6, 11), (13, 13), (18, 18)]
['[SEP]', 'as', 'mr.', 'dursley', 'drove', 'around', 'the', 'corner', 'and', 'up', 'the', 'road', ',', 'he', 'watched', 'the', 'cat', 'in', 'his', 'mirror', '.', '[CLS]']
[['Mr.', 'Dursley'], ['the', 'corner', 'and', 'up', 'the', 'road'], ['he'], ['his']]


In [None]:
names = []
for name_lst in hp_names:
  for name in name_lst:
    names.append(name)
names

[['Harry'],
 ['ONE'],
 ['THE', 'BOY', 'WHO', 'LIVED'],
 ['Mr.'],
 ['Mrs.', 'Dursley'],
 ['they'],
 ['you'],
 ['They'],
 ['the', 'last', 'people', 'you', "'d"],
 ['they'],
 ['Mr.', 'Dursley'],
 ['the', 'director', 'of', 'a', 'firm', 'called', 'Grunnings'],
 ['He'],
 ['a', 'big', ',', 'beefy', 'man', 'with', 'hardly', 'any', 'neck'],
 ['he'],
 ['Mrs.', 'Dursley'],
 ['she'],
 ['her'],
 ['garden'],
 ['the', 'neighbors'],
 ['The', 'Dursleys'],
 ['a', 'small', 'son'],
 ['their'],
 ['no', 'finer', 'boy'],
 ['The', 'Dursleys'],
 ['they'],
 ['they'],
 ['their'],
 ['somebody'],
 ['it'],
 ['They'],
 ['they'],
 ['it'],
 ['anyone'],
 ['Mrs.', 'Potter'],
 ['Mrs.'],
 ['sister'],
 ['they'],
 ['Mrs.', 'Dursley'],
 ['she'],
 ['a', 'sister'],
 ['her', 'sister'],
 ['her', 'good-for-nothing', 'husband'],
 ['it'],
 ['what'],
 ['the', 'neighbors'],
 ['the', 'Potters'],
 ['the', 'street'],
 ['The', 'Dursleys'],
 ['the', 'Potters'],
 ['a', 'small', 'son'],
 ['they'],
 ['him'],
 ['This', 'boy'],
 ['they'],
 ['a

In [None]:
def get_max_words(lit_sentences):
  max_words = 0
  for sent in lit_sentences:
    if len(sent) > max_words:
      max_words = len(sent)
  return max_words

max_words = get_max_words(sense_sensibility_doc)

NameError: ignored

In [None]:
def get_max_ents(ss_ents):
  max_ents = 0
  for sent in ss_ents:
    if len(sent) > max_ents:
      max_ents = len(sent)
  return max_ents

max_ents = get_max_ents(ss_ents)

In [None]:
# can ignore this cell

%%bash

pip3 install -r requirements.txt

Process is terminated.


In [None]:
# DON'T RUN THIS (No Need)
from bert_coref_linear_downstream import test_downstream

test_doc_name = ('sense_and_sensibility_2_brat', '0')
test_downstream('sense_and_sensibility_2.conll', '0_linear.model', sense_sensibility_doc, ss_ents, names, max_ents, max_words, test_doc_name)

100%|██████████| 404400730/404400730 [00:30<00:00, 13333584.46B/s]
100%|██████████| 213450/213450 [00:00<00:00, 290185.21B/s]


KeyboardInterrupt: ignored

In [None]:
# harry potter
from bert_coref_linear_downstream import test_downstream

test_doc_name = ('harry_potter_2_brat', '0')
test_downstream('harry_potter_2.conll', '0_linear.model', hp_doc, hp_ents, names, max_ents, max_words, test_doc_name)

ModuleNotFoundError: ignored

# Getting Coref Cluster Indices

In [None]:
import networkx as nx
import re
import numpy 
import matplotlib.pyplot as plt
import numpy as np
import itertools  
import pandas as pd

from collections import Counter

In [None]:
conll_file = "harry_potter_2.conll"

In [None]:
def proc_raw_conll(raw_file):
    '''Gets 1) the cluster dictionary (key = cluster, value = list of mentions [start token id, end token id, sentence id])
            2) like above, decoded to get the actual strings, value = list of tuples (strings, sent id)
            3) sentence dictionary (key = sentence id, value = sentence)
    '''
             
    with open(raw_file,encoding="utf-8") as file:
        cluster_idx_map = {key : [] for key in range(1000)}
        word_idx_map = {}
        phrase = ''
        word_idx = -1
        sent_id = 0 ###
        max_sent = 0
        sentence_tups = []

        for i, line in enumerate(file):

            if i == 0:
                continue
            line_lst = line.strip().split('\t')

            if len(line_lst) <= 1:
                sent_id+=1 ###
                continue
            sentence_tups.append((sent_id,line_lst[3]))
            #print(sent_id)

            word_idx += 1
            word_idx_map[word_idx] = line_lst[3]

            cluster_str = line_lst[-1]
            if cluster_str == '_':
                continue
            # print("************************")
            # print("i: ",i)
            # print("line: ",line)
            # print("************************")
            # iterate through cluster_str to find parens
            for j, c in enumerate(cluster_str):
                if c == '(':
                    print('found open')
                    print(cluster_str)
                    cluster_idx = int(re.findall(r'\d+', cluster_str[j+1:])[0])
                    if cluster_idx not in cluster_idx_map:#for dealing with a massive list
                      break
                    cluster_idx_map[cluster_idx].append([word_idx, None, None])
                elif c == ')':
                    cluster_idx = int(re.findall(r'\d+', cluster_str[:j])[-1])
                    if cluster_idx not in cluster_idx_map:#for dealing with a massive list
                      break
                    print("cluster_idx: ",cluster_idx)
                    print("cluster_idx_map[cluster_idx]",cluster_idx_map[cluster_idx])
                    if cluster_idx_map[cluster_idx]: #noisy cases: no opening paren before a closing one
                      cluster_idx_map[cluster_idx][-1][1] = word_idx + 1
                      cluster_idx_map[cluster_idx][-1][2] = sent_id

        cluster_phrase_map = {}
        if cluster_idx_map:
            for key in sorted(cluster_idx_map.keys()):
                cluster_phrase_map[key] = []
                for (start_idx, end_idx, sent_id) in cluster_idx_map[key]:
                    if start_idx and end_idx:
                        cluster_phrase_map[key].append(([word_idx_map[idx] for idx in range(start_idx, end_idx)], sent_id))
    max_sent = max([x[0] for x in sentence_tups])
    
    keys = range(max_sent+1)
    new_dict_sent = dict(zip(keys, ([] for _ in keys)))
    #print(new_dict_sent)
    for t in sentence_tups:
        #print(t)
        new_dict_sent[t[0]].append(t[1])
    for s in new_dict_sent.keys():
        new_dict_sent[s] = ' '.join(new_dict_sent[s])
    
    return cluster_idx_map, cluster_phrase_map, new_dict_sent,sentence_tups

In [None]:
cluster_idx_map, cluster_phrase_map, new_dict_sent,sentence_tups = proc_raw_conll(conll_file)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(161)
cluster_idx:  161
cluster_idx_map[cluster_idx] [[2958, 2959, 177], [2961, 2963, 177], [2974, 2975, 178], [2979, 2980, 179], [3003, 3004, 179], [3037, 3038, 183], [3039, 3040, 183], [3041, 3042, 183], [3043, 3044, 183], [3048, 3051, 185], [3064, 3065, 186], [3090, 3091, 188], [3106, 3107, 189], [3157, 3158, 193], [3227, 3228, 200], [3233, 3234, 200], [3256, 3257, 202], [3370, 3371, 208], [3389, 3390, 210], [3424, 3425, 214], [3459, 3462, 216], [3468, 3469, 216], [3503, 3504, 217], [3554, 3555, 221], [3559, 3561, 221], [3571, 3572, 222], [3577, 3578, 223], [3596, 3597, 224], [3598, 3599, 224], [3600, 3603, 224], [3614, 3615, 225], [3618, 3619, 226], [3648, 3649, 228], [3651, 3652, 228], [3681, 3682, 230], [3701, 3702, 233], [3710, 3711, 233], [3743, 3744, 233], [3756, 3757, 234], [3778, 3779, 234], [3883, 3884, 244], [3900, 3901, 247], [3906, 3907, 247], [3928, 3929, 249], [4152, 4153, 265], [4168, 4170, 266], [4177, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
found open
(3293)
found open
(3328)
found open
(3452)
found open
(3293)
found open
(3328)
found open
(3293)
found open
(3453)
found open
(3293)
found open
(3454
found open
(3293)
found open
(3328)
found open
(3455
found open
(3328)
found open
(3059)
found open
(3456)
found open
(3328)
found open
(3364)
found open
(3364)
found open
(3457)
found open
(3458)
found open
(3459)
found open
(3458)
found open
(3458)
found open
(3460
found open
(3059
found open
(3364)
found open
(3328)
found open
(3364)
found open
(3254)
found open
(3461)
found open
(3254)
found open
(3328)
found open
(3462)
found open
(3328)
found open
(3328)
found open
(3463
found open
(3464
found open
(3458)
found open
(3059)
found open
(3328)
found open
(3240)
found open
(3458)
found open
(3328)
found open
(3465)
found open
(3059)
found open
(3466)
found open
(3467)
found open
(3240)
found open
(3240)
found open
(3468
found open
(3469)
found open
(3419)
found 

In [None]:
def get_all_sentences_conll(sentence_tups):
  '''helper function to get all sentences in a list of list used with proc_raw_conll output'''
  sentences = [[] for _ in range(sentence_tups[-1][0]+1)] 
  for i in range(len(sentence_tups)):
      sentences[sentence_tups[i][0]].append(sentence_tups[i][1])
  return sentences

In [None]:
sentences = get_all_sentences_conll(sentence_tups)
sentences

[['harry', 'potter', 'and', 'the', 'sorcerer', "'s", 'stone'],
 ['chapter', 'one'],
 ['the', 'boy', 'who', 'lived'],
 ['mr.',
  'and',
  'mrs.',
  'dursley',
  ',',
  'of',
  'number',
  'four',
  ',',
  'privet',
  'drive',
  ',',
  'were',
  'proud',
  'to',
  'say',
  'that',
  'they',
  'were',
  'perfectly',
  'normal',
  ',',
  'thank',
  'you',
  'very',
  'much',
  '.'],
 ['they',
  'were',
  'the',
  'last',
  'people',
  'you',
  "'d",
  'expect',
  'to',
  'be',
  'involved',
  'in',
  'anything',
  'strange',
  'or',
  'mysterious',
  ',',
  'because',
  'they',
  'just',
  'did',
  "n't",
  'hold',
  'with',
  'such',
  'nonsense',
  '.'],
 ['mr.',
  'dursley',
  'was',
  'the',
  'director',
  'of',
  'a',
  'firm',
  'called',
  'grunnings',
  ',',
  'which',
  'made',
  'drills',
  '.'],
 ['he',
  'was',
  'a',
  'big',
  ',',
  'beefy',
  'man',
  'with',
  'hardly',
  'any',
  'neck',
  ',',
  'although',
  'he',
  'did',
  'have',
  'a',
  'very',
  'large',
  'musta

In [None]:
len(sentences)

6766

In [None]:
sentences[-1]

['i',
 "'m",
 'going',
 'to',
 'have',
 'a',
 'lot',
 'of',
 'fun',
 'with',
 'dudley',
 'this',
 'summer',
 '...',
 '.',
 "''"]

In [None]:
new_dict_sent[1578]

'how often had they complained how much harry cost them to keep ?'

In [None]:
cluster_idx_map

{0: [[0, 1, 0]],
 1: [[9, 13, 2]],
 2: [[13, 14, 3]],
 3: [[15, 17, 3], [233, 239, 11]],
 4: [[30, 31, 3],
  [40, 41, 4],
  [58, 59, 4],
  [152, 153, 8],
  [165, 166, 9],
  [176, 177, 9],
  [186, 187, 10],
  [757, 758, 38]],
 5: [[36, 37, 3]],
 6: [[42, 47, 4]],
 7: [[67, 69, 5],
  [82, 83, 6],
  [95, 96, 6],
  [286, 287, 13],
  [357, 358, 16],
  [360, 361, 16],
  [380, 381, 16],
  [458, 459, 20]],
 8: [[70, 77, 5]],
 9: [[84, 93, 6]],
 10: [[103, 105, 7],
  [125, 126, 7],
  [130, 131, 7],
  [222, 224, 11],
  [225, 226, 11],
  [315, 317, 15],
  [368, 370, 16],
  [374, 375, 16],
  [1271, 1272, 68],
  [1280, 1282, 68],
  [1287, 1288, 69],
  [1657, 1659, 93],
  [1660, 1662, 94],
  [1670, 1671, 95],
  [1683, 1685, 95],
  [1994, 1995, 116],
  [2014, 2016, 118],
  [2040, 2041, 120],
  [2051, 2052, 121],
  [2095, 2097, 126],
  [2117, 2118, 127],
  [2121, 2123, 128],
  [2124, 2125, 128],
  [2137, 2138, 129],
  [3317, 3318, 205],
  [3326, 3327, 205],
  [3434, 3435, 214],
  [3649, 3650, 228],
  

In [None]:
cluster_idx_map

{0: [[0, 1, 0]],
 1: [[9, 13, 2]],
 2: [[13, 14, 3]],
 3: [[15, 17, 3], [233, 239, 11]],
 4: [[30, 31, 3],
  [40, 41, 4],
  [58, 59, 4],
  [152, 153, 8],
  [165, 166, 9],
  [176, 177, 9],
  [186, 187, 10],
  [757, 758, 38]],
 5: [[36, 37, 3]],
 6: [[42, 47, 4]],
 7: [[67, 69, 5],
  [82, 83, 6],
  [95, 96, 6],
  [286, 287, 13],
  [357, 358, 16],
  [360, 361, 16],
  [380, 381, 16],
  [458, 459, 20]],
 8: [[70, 77, 5]],
 9: [[84, 93, 6]],
 10: [[103, 105, 7],
  [125, 126, 7],
  [130, 131, 7],
  [222, 224, 11],
  [225, 226, 11],
  [315, 317, 15],
  [368, 370, 16],
  [374, 375, 16],
  [1271, 1272, 68],
  [1280, 1282, 68],
  [1287, 1288, 69],
  [1657, 1659, 93],
  [1660, 1662, 94],
  [1670, 1671, 95],
  [1683, 1685, 95],
  [1994, 1995, 116],
  [2014, 2016, 118],
  [2040, 2041, 120],
  [2051, 2052, 121],
  [2095, 2097, 126],
  [2117, 2118, 127],
  [2121, 2123, 128],
  [2124, 2125, 128],
  [2137, 2138, 129],
  [3317, 3318, 205],
  [3326, 3327, 205],
  [3434, 3435, 214],
  [3649, 3650, 228],
  

In [None]:
sentence_tups[-1]

In [None]:
cluster_phrase_map

{0: [],
 1: [(['the', 'boy', 'who', 'lived'], 2)],
 2: [(['mr.'], 3)],
 3: [(['mrs.', 'dursley'], 3),
  (['her', 'sister', 'and', 'her', 'good-for-nothing', 'husband'], 11)],
 4: [(['they'], 3),
  (['they'], 4),
  (['they'], 4),
  (['their'], 8),
  (['they'], 9),
  (['their'], 9),
  (['they'], 10),
  (['they'], 38)],
 5: [(['you'], 3)],
 6: [(['the', 'last', 'people', 'you', "'d"], 4)],
 7: [(['mr.', 'dursley'], 5),
  (['he'], 6),
  (['he'], 6),
  (['him'], 13),
  (['he'], 16),
  (['his'], 16),
  (['his'], 16),
  (['his'], 20)],
 8: [(['the', 'director', 'of', 'a', 'firm', 'called', 'grunnings'], 5)],
 9: [(['a', 'big', ',', 'beefy', 'man', 'with', 'hardly', 'any', 'neck'], 6)],
 10: [(['mrs.', 'dursley'], 7),
  (['she'], 7),
  (['her'], 7),
  (['mrs.', 'dursley'], 11),
  (['she'], 11),
  (['mrs.', 'dursley'], 15),
  (['mrs.', 'dursley'], 16),
  (['she'], 16),
  (['she'], 68),
  (['her', 'sister'], 68),
  (['her'], 69),
  (['his', 'wife'], 93),
  (['mrs.', 'dursley'], 94),
  (['she'], 

# Heuristic-based Clustering Merging Algorithm

### Rules


1.  If the most popular non-pronoun name between 2 clusters are overlapping, then they should be merged
2.  Overlapping: 
* Simple Rule1: String matching
* Simple Rule2: based on empirical study - if the strings start with the same non-pronoun name, then they should be merged (i.e. many Harry-related and Hagrid-related clusters will be merged)




In [None]:
HE_SERIES =  ['he', 'his', 'him', 'himself']
SHE_SERIES = ['she', 'her', 'hers', 'herself']
THEY_SERIES = ['they', 'them', 'their','theirs', 'themself']
IT_SERIES = ['it', 'itself']
XE_SERIES = ['xe', 'xem', 'xyr', 'xyrs', 'xemself']
AE_SERIES = ['ae', 'aer', 'aers', 'aerself']
FAE_SERIES = ['fae', 'faer', 'faers', 'faerself']
EY_SERIES = ['ey', 'em', 'eir', 'eirs', 'eirself']
VE_SERIES = ['ve', 'ver', 'vis', 'verself']
PER_SERIES = ['per', 'pers', 'perself']
ZE_HIR_SERIES = ['ze', 'hir', 'hirs', 'hirself']
USER_DEFINED_SERIES = []

PRONOUN_COLLECTIONS = HE_SERIES + SHE_SERIES + THEY_SERIES + IT_SERIES + XE_SERIES + AE_SERIES + FAE_SERIES + EY_SERIES + VE_SERIES + PER_SERIES + ZE_HIR_SERIES +USER_DEFINED_SERIES

In [None]:
PRONOUN_COLLECTIONS

['he',
 'his',
 'him',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'they',
 'them',
 'their',
 'theirs',
 'themself',
 'it',
 'itself',
 'xe',
 'xem',
 'xyr',
 'xyrs',
 'xemself',
 'ae',
 'aer',
 'aers',
 'aerself',
 'fae',
 'faer',
 'faers',
 'faerself',
 'ey',
 'em',
 'eir',
 'eirs',
 'eirself',
 've',
 'ver',
 'vis',
 'verself',
 'per',
 'pers',
 'perself',
 'ze',
 'hir',
 'hirs',
 'hirself']

Hypothesis: If 2 clusters have the overlapping non-pronouns, then they should be the same

In [None]:
def get_most_popular_entity_name(single_cluster_phrase):
    '''returns a string of character name of the cluster that's the most popular'''
    if not single_cluster_phrase:
      return ''
    #most_popular_name = [] # a list of tokens
    pronouns = []
    non_pronouns = []
    # case 1: all expressions are pronouns
    for ent in single_cluster_phrase:
        if len(ent[0]) == 1 and ent[0][0] in PRONOUN_COLLECTIONS:
           pronouns.append(ent[0][0])
        else:
           non_pronouns.append(' '.join(ent[0]))
    #print(pronouns)
    #print(non_pronouns)
    if not pronouns and not non_pronouns:
        return ''
    if pronouns and not non_pronouns:
        #get most freq. pronouns
        freq_pronouns = sorted([[pronoun,pronouns.count(pronoun)] for pronoun in pronouns],key=lambda c:c[1],reverse=True)
        #print(freq_pronouns[0][0])
        return freq_pronouns[0][0]
    else:
        # get most freq non-pronouns
        freq_non_pronouns = sorted([[name,non_pronouns.count(name)] for name in non_pronouns],key=lambda c:c[1],reverse=True)
        #print(freq_non_pronouns[0][0])
        return freq_non_pronouns[0][0]   

    # case 2: at least 1 expression is non-pronoun
      # case 2a: all non-pronoun expressions are the same
      # case 2b: there exist different non-pronoun expressions
        # case 2b1: simple voting could do it
        # case 2b2: tie exists - have a tie-breaking system
    #return most_popular_name       

In [None]:
def merge_clusters_rev(char_phrase_map,char_index_map):
    
    adj_cluster_map = {} #rule2's simple assumption for now

    for phrase_cluster in char_phrase_map:
        name = get_most_popular_entity_name(cluster_phrase_map[phrase_cluster])
        if name not in adj_cluster_map:
           adj_cluster_map[name] = [phrase_cluster] # add cluster number
        else:
           adj_cluster_map[name].append(phrase_cluster)

    #merge
    merged_char_phrase_map = {key : [] for key in range(len(adj_cluster_map))}
    merged_char_index_map = {key : [] for key in range(len(adj_cluster_map))}
    adj_cluster_sorted = sorted(adj_cluster_map.items(),key=lambda x:x[1],reverse=True)
    #print(adj_cluster_sorted)

    for i,cluster_idx in enumerate(adj_cluster_sorted):
        for idx in cluster_idx[-1]:
            merged_char_phrase_map[i] += char_phrase_map[idx] # want a list not a list of list
            merged_char_index_map[i] += char_index_map[idx]
        #print(merged_char_index_map[i])
        #merged_char_index_map[i] = [] if not merged_char_index_map[i] else sorted(merged_char_index_map[i],key=lambda clu:clu[-1])
        merged_char_phrase_map[i] = sorted(merged_char_phrase_map[i],key=lambda clu:clu[-1])
    #print(merged_char_index_map)
    #print(merged_char_phrase_map)

    return merged_char_phrase_map,merged_char_index_map
    

In [None]:
# HARRY MERGED CLUSTERS!
harry_merged_char_phrase_map,harry_merged_char_index_map=merge_clusters_rev(CLUSTER_PHRASE_MAP,CLUSTER_IDX_MAP)

In [None]:
harry_merged_char_index_map[47]

[[18898, 18903, 1222]]

In [None]:
import csv
def merged_cluster_phrase_to_csv(char_cluster,output_file_name): # for cluster_phrase_map
    header = ['cluster_id','reference_name', 'sentence_id']
    all_clusters = [header]
    for cluster in char_cluster:
        single_cluster = char_cluster[cluster]
        if not single_cluster:
            continue
        
        for entry in single_cluster:
                new_row = [cluster,' '.join(entry[0]),entry[-1]]
                all_clusters.append(new_row)

    with open(output_file_name,'w',encoding='utf-8',newline='') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerows(all_clusters)
    return all_clusters

In [None]:
def merged_cluster_idx_to_csv(char_cluster,output_file_name): # for cluster_idx_map
    header = ['cluster_id','start_idx', 'end_idx','sentence_id']
    all_clusters = [header]
    for cluster in char_cluster:
        single_cluster = char_cluster[cluster]
        if not single_cluster:
            continue
        
        for entry in single_cluster:
                new_row = [cluster,entry[0],entry[1],entry[-1]]
                all_clusters.append(new_row)

    with open(output_file_name,'w',encoding='utf-8',newline='') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerows(all_clusters)
    return all_clusters

In [None]:
merged_cluster_phrase_map = merged_cluster_phrase_to_csv(harry_merged_char_phrase_map,'Harry_Merged_Clusters_Phrase_Map.csv')
merged_cluster_idx_map = merged_cluster_idx_to_csv(harry_merged_char_index_map,'Harry_Merged_Clusters_Index_Map.csv')

In [None]:
# output an unmerged character cluster for reference
harry_cluster_phrase_map = merged_cluster_phrase_to_csv(cluster_phrase_map,'Harry_Character_Clusters_Phrase_Map.csv')

In [None]:
CLUSTER_IDX_MAP = cluster_idx_map
CLUSTER_PHRASE_MAP = cluster_phrase_map

In [None]:
CLUSTER_PHRASE_MAP[7]

[(['mr.', 'dursley'], 5),
 (['he'], 6),
 (['he'], 6),
 (['him'], 13),
 (['he'], 16),
 (['his'], 16),
 (['his'], 16),
 (['his'], 20)]

In [None]:
popular_names = {}
for clu in CLUSTER_PHRASE_MAP:
    popular_names[get_most_popular_entity_name(CLUSTER_PHRASE_MAP[clu])] = popular_names.get(get_most_popular_entity_name(CLUSTER_PHRASE_MAP[clu]),0) + 1
#popular_names = sorted(popular_names.items(),key=lambda x:x[1],reverse=True)
popular_names

{'': 3,
 "''": 8,
 "'m": 1,
 "'s aunt": 1,
 '``': 1,
 '`` watching': 1,
 "`` well , ted , '' said the weatherman": 1,
 "`` where 's the cannon": 1,
 '`` yeh look a lot like yet dad': 1,
 'a -- a wizard': 1,
 'a baby': 2,
 "a baby , ''": 1,
 'a baby angel': 1,
 'a baby boy': 1,
 'a bearded giant , uncle': 1,
 "a beginners ' guide": 1,
 'a big , beefy man with hardly any neck': 1,
 'a braver man than vernon dursley': 1,
 'a broken-down escalator that led up to a bustling road lined with shops': 1,
 'a bus': 1,
 'a cat reading a map': 1,
 'a child': 1,
 'a cold , hard wall all day , for neither as a cat nor as a woman': 1,
 'a complete stranger': 1,
 'a couple of spiders': 1,
 'a couple of them': 1,
 'a cupboard as a bedroom': 1,
 'a deafening': 1,
 'a deafening crash landed flat on the floor': 1,
 'a famous place': 1,
 'a forest': 1,
 'a friend': 1,
 'a gloomy-looking hotel on the outskirts of a big city': 1,
 'a good beating': 1,
 "a group of them next to the baker 's": 1,
 'a guide': 1

In [None]:
import scipy
from scipy.spatial import distance

In [None]:
normalized_distance = distance.hamming(list('mrs. dursley'),list('mr.dursley'))
normalized_distance

ValueError: ignored

In [None]:
# just keep track of empirical results
import csv
with open('Harry Cluster Popular Names and Count for First Two Chapters.csv','w',newline='') as f:
    csvwriter = csv.writer(f)
    csvwriter.writerows(popular_names)


In [None]:
def merge_clusters(char_phrase_map,char_index_map):
    merged_cluster_phrase_map = {}
    merged_index_map = {}
    cluster_name_map = {}
    # iterative merge? -> merge 2 clusters first, and then merge the mega one with the 3rd
    # cluster_phrase_list = [[cluster_id,char_phrase] for char_ph]
    for char_cluster in char_phrase_map:
        cluster_name_map[char_cluster] = get_most_popular_entity_name(char_phrase_map[char_cluster])
    for char_cluster1 in char_phrase_map.items(): # defintiely could optimize!
        for char_cluster2 in char_phrase_map.items():
            if char_cluster1[0] != char_cluster2[0]:
                popular_name_cluster1 = cluster_name_map[char_cluster1[0]]
                popular_name_cluster2 = cluster_name_map[char_cluster2[0]]
                if popular_name_cluster1 == popular_name_cluster2:
                  # perform merge

                


        

    return merged_cluster_phrase_map,merged_index_map

IndentationError: ignored

In [None]:
def merge_clusters(char_phrase_map,char_index_map):
    merged_cluster_phrase_map = {}
    merged_index_map = {}
    cluster_name_map = {}
    # iterative merge? -> merge 2 clusters first, and then merge the mega one with the 3rd
    # cluster_phrase_list = [[cluster_id,char_phrase] for char_ph]
    for char_cluster in char_phrase_map:
        cluster_name_map[char_cluster] = get_most_popular_entity_name(char_phrase_map[char_cluster])
    to_merge_cluster_by_popular_name = {}
    for idx,name in enumerate(cluster_name_map):
      if name not in PRONOUN_COLLECTIONS:
        if name not in to_merge_cluster_by_popular_name:
          to_merge_cluster_by_popular_name[name] = [idx]
        else:
          to_merge_cluster_by_popular_name[name].append(idx)
    return to_merge_cluster_by_popular_name

### Appendix

In [None]:
# original code for reference


for token in corpus:
            if token.pos == spacy.symbols.VERB or token.pos == spacy.symbols.ADJ or token.pos == spacy.symbols.NOUN:
                for argument in token.children:
                    # resolve argument coreference entity
                    if argument._.in_coref: resolved = argument._.coref_clusters[0].main.text
                    else: resolved = argument.text
                    # dependency parsing: count the sharing dependencies
                    if argument.dep_ in {"nsubj", "nsubjpass"}:
                        subjects[token.lemma_.lower()][argument.text.lower()] += 1 # updating subject dict
                        ordered.append((token.lemma_, resolved.lower(), argument.dep_)) #ordered event chains?
                        total += 1
                    elif argument.dep_ in {"dobj", "iobj", "pobj", "obj"}:
                        objects[token.lemma_.lower()][argument.text.lower()] += 1
                        ordered.append((token.lemma_, resolved.lower(), argument.dep_))
                        total += 1
verbs = set(subjects.keys()) | set(objects.keys())

In [None]:
def get_probs_for_words(sent,forms):
    pre,target,post=sent.split('***')
    if 'mask' in target.lower():
        target=['[MASK]']
    else:
        target=tokenizer.tokenize(target)
    tokens=['[CLS]']+tokenizer.tokenize(pre)
    target_idx=len(tokens)
    #print(target_idx)
    tokens+=target
    if not only_prefix:
        tokens+=tokenizer.tokenize(post)
    tokens+=['[SEP]']
    input_ids=tokenizer.convert_tokens_to_ids(tokens)
    try:
        word_ids=tokenizer.convert_tokens_to_ids(forms)
    except KeyError:
        print("skipping",forms[0],"bad wins")
        return None
    tens=torch.LongTensor(input_ids).unsqueeze(0).to(device)
    with torch.no_grad():
        res=bert(tens)[0][0,target_idx]
    res=torch.nn.functional.log_softmax(res,-1)
    scores = res[word_ids]
    return [float(x.item()) for x in scores]

# Unsupervised Event Detection for Character Clusters

most simple hypothesis:
unsupervised event mining using Narrative Event Chains method; with an augmentation of character clusters


### Simple Hypothesis 1: events are verbs/Nouns/Adj and contain subject and object

want: event chains associated with each character

In [None]:
INPUT_FILE = "Harry Potter 1.txt"
TEXT = ""
INPUT_TEXT = [" ".join(sent) for sent in sentences] # a list of sent
for sent in INPUT_TEXT:
  TEXT += sent

In [None]:
TEXT



In [None]:
INPUT_TEXT[0].split()

['harry', 'potter', 'and', 'the', 'sorcerer', "'s", 'stone']

In [None]:
import spacy

In [None]:
def get_cluster_id_from_sent(sent_idx,char_idx_map):
  """helper fcn to use with token_idx_to_char_cluster_idx"""
  clu_id_list = set()
  for clu in char_idx_map:
      for phrase in char_idx_map[clu]:
          if phrase[-1] == sent_idx:
            clu_id_list.add(clu)
  return list(clu_id_list)


In [None]:
get_cluster_id_from_sent(6,cluster_idx_map)

[9, 7]

In [None]:
def token_idx_to_char_cluster_idx(sent_idx,global_word_idx,char_idx_map):
    '''locate the best char_idx cluster based on token index'''
    single_cluster_idx = -1
    clu_idx_list = get_cluster_id_from_sent(sent_idx,char_idx_map)
    for clu in clu_idx_list:
        phrases = char_idx_map[clu]
        for phrase in phrases:
            if type(phrase[0])==int and type(phrase[1])==int:
              if phrase[0] <= global_word_idx and global_word_idx < phrase[1]:
                  return clu


    return single_cluster_idx



In [None]:
token_idx_to_char_cluster_idx(3,16,cluster_idx_map)

3

In [None]:
# take instead the sentences from conll output from coref cluster from above - holistic pipeline

# with open(INPUT_FILE) as f:
    #text = " ".join(f.readlines())

# USE THESE MERGED CLUSTERS: harry_merged_char_phrase_map,harry_merged_char_index_map

spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
dependencies = set()
cluster_idx_map = harry_merged_char_index_map
char_cluster = harry_merged_char_index_map # from post-processing conll files
char_cluster_events = {clu:{"subj":[],"obj":[]} for clu in char_cluster}
global_word_idx = 0
accumulated_word_idx = 0 # accumulated word idx till the i-1 sentence before


for id,sent in enumerate(INPUT_TEXT):
  corpus = nlp(sent)
  sent_length = len(sent.split())
  for token in corpus:
    # verb/noun/adj are triggers
      if token.pos == spacy.symbols.VERB or token.pos == spacy.symbols.ADJ or token.pos == spacy.symbols.NOUN:
          for argument in token.children:
              global_word_idx = accumulated_word_idx + argument.i
              if argument.dep_ in {"nsubj","nsubjpass","csubj","csubjpass"}:
                  char_cluster_idx = token_idx_to_char_cluster_idx(id,global_word_idx,cluster_idx_map)   # needs change -> map idx in particular sentence into char_cluster 
                  if char_cluster_idx != -1:  
                        # subjects[token.lemma_.lower()][argument.text.lower()] += 1 # updating subject dict
                      event = (token, argument.text, argument.dep_,sent) # WITHOUT lemma to fit downstream
                      char_cluster_events[char_cluster_idx]["subj"].append(event)
                       
              elif argument.dep_ in {"dobj", "iobj", "pobj", "obj"}:
                  char_cluster_idx = token_idx_to_char_cluster_idx(id,global_word_idx,cluster_idx_map)   # needs change -> map idx in particular sentence into char_cluster 
                  if char_cluster_idx != -1:  
                        # subjects[token.lemma_.lower()][argument.text.lower()] += 1 # updating subject dict
                      event = (token, argument.text, argument.dep_,sent) # need: substitue for resolved
                      char_cluster_events[char_cluster_idx]["obj"].append(event)
  accumulated_word_idx += sent_length
                    
char_cluster_events





    

{0: {'obj': [], 'subj': []},
 1: {'obj': [], 'subj': []},
 2: {'obj': [(passed,
    'shops',
    'dobj',
    'they passed book shops and music stores , hamburger restaurants and cinemas , but nowhere that looked as if it could sell you a magic wand .')],
  'subj': []},
 3: {'obj': [(climbed,
    'escalator',
    'dobj',
    "`` i do n't know how the muggles manage without magic , '' he said as they climbed a broken-down escalator that led up to a bustling road lined with shops .")],
  'subj': [(led,
    'that',
    'nsubj',
    "`` i do n't know how the muggles manage without magic , '' he said as they climbed a broken-down escalator that led up to a bustling road lined with shops .")]},
 4: {'obj': [], 'subj': []},
 5: {'obj': [],
  'subj': [(bring,
    'students',
    'nsubj',
    'students may also bring an owl or a cat or a toad')]},
 6: {'obj': [], 'subj': []},
 7: {'obj': [], 'subj': []},
 8: {'obj': [], 'subj': []},
 9: {'obj': [], 'subj': []},
 10: {'obj': [], 'subj': []},
 11:

In [None]:
dependencies

set()

processing char cluster dict to a CSV for further processing

In [None]:
import csv
def char_cluster_to_csv(char_cluster_events,output_file_name):
    header = ['cluster_id','obj_or_subj', 'key_word','argument','dependency','sentence']
    all_cluster_events = [header]
    for cluster in char_cluster_events:
        char_events = char_cluster_events[cluster]
        if not char_events['obj'] and not char_events['subj']:
            continue
        if char_events['obj']:
            for entry in char_events['obj']:
                new_row = [cluster,'obj',entry[0],entry[1],entry[2],entry[3]]
                all_cluster_events.append(new_row)
        if char_events['subj']:
            for entry in char_events['subj']:
                new_row = [cluster,'subj',entry[0],entry[1],entry[2],entry[3]]
                all_cluster_events.append(new_row)
    with open(output_file_name,'w',encoding='utf-8',newline='') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerows(all_cluster_events)
    return all_cluster_events
  


In [None]:
all_harry_cluster_events = char_cluster_to_csv(char_cluster_events,'Harry_Cluster_Events_With_Merged_Clusters.csv')

In [None]:
len(all_harry_cluster_events)

1167

In [None]:
all_harry_cluster_events[1]

[1, 'subj', lived, 'who', 'nsubj', 'the boy who lived']