# HeadMatch and HeadMatchPro 

In [28]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pickle

import os
import os.path
import sys

In [29]:
def reading_data(folder_name):
    """
    Read every ``.csv`` file found (recursively) under ``./{folder_name}``.

    Files are loaded in sorted path order so that the position of a text in the
    returned list is the same on every run; os.walk on its own yields file names
    in arbitrary, filesystem-dependent order.

    folder_name: folder to search; it is appended to "./", i.e. resolved
                 relative to the current working directory
    returns: list of DataFrames, one per csv file, with all values read as
             strings and only the relevant feature columns kept
    raises: KeyError if a file lacks one of the relevant columns
    """
    relevant_columns = ['sent_id', 'uuid', 'form', 'lemma', 'upostag', 'gender',
                        'number', 'hero', 'head_uuid', 'pronoun_person']
    files = sorted(os.path.join(dirpath, filename)
                   for dirpath, _dirnames, filenames in os.walk(f"./{folder_name}")
                   for filename in filenames
                   if filename.endswith(".csv"))
    # returning list of dataframes and keeping only relevant features
    return [pd.read_csv(file, dtype='str')[relevant_columns] for file in files]

In [30]:
# Load one DataFrame per csv file found under the texts_udpipe data folder
# (reading_data prefixes the given folder name with "./").
udpipe_dfs = reading_data('/data/texts_udpipe')

About the constructed features:

**Hero**: a token is a hero if it is an animate noun or pronoun acting as a subject (its `deprel` value contains "nsubj");

**Deictic** (stored in `pronoun_person`): a pronoun whose Person equals 1 or 2; any other pronoun is marked as `Other`;

**uuid**: identification code of a token -- it is composed of the text id, the sentence id, and the word id within the sentence;

**head_uuid**: identification code of the token's head. It is composed in the same way as *uuid*, but for the head word of the token.

In [31]:
# Preview the first three rows of the first loaded DataFrame.
udpipe_dfs[0].head(3)

Unnamed: 0,sent_id,uuid,form,lemma,upostag,gender,number,hero,head_uuid,pronoun_person
0,1,11,Спасибо,спасИБО,CCONJ,,,0,10,
1,1,12,",",",",PUNCT,,,0,14,
2,1,13,что,ЧТО,SCONJ,,,0,14,


## Get list of all Noun Phrases

In [32]:
def get_all_nps(udpipe_dfs):
    """
    Build, for every text, a table of the noun phrases whose head token is a hero.

    Each returned DataFrame corresponds to one text (same order as udpipe_dfs).
    Each row pairs a hero head with one token that names it as its head.
    Columns ending with _x, together with the unsuffixed word characteristics,
    describe the NP head; columns ending with _y describe the dependent word.
    A hero without any dependent keeps a single row in which uuid_y and
    head_uuid_y fall back to the head's own uuid.
    np_id numbers the rows of each text starting from 0.
    """
    dependent_columns = ['uuid', 'head_uuid', 'form', 'lemma']
    all_nps = []
    for text in udpipe_dfs:
        heroes = text[text.hero == "1"]
        # left join: heroes on the left, the tokens pointing at them on the right
        nps = pd.merge(heroes, text[dependent_columns],
                       left_on='uuid', right_on='head_uuid', how='left')
        # a hero with no dependents has nothing matched on the right side,
        # so both right-hand identifiers are replaced with the hero's own uuid
        for column in ('uuid_y', 'head_uuid_y'):
            unmatched = pd.isna(nps[column])
            nps[column] = np.where(unmatched, nps['uuid_x'], nps[column])
        nps['np_id'] = list(range(len(nps)))
        all_nps.append(nps)
    return all_nps

In [33]:
# Build the per-text NP tables (one DataFrame per loaded text).
all_nps = get_all_nps(udpipe_dfs)

In [34]:
# Preview the first NP rows of the first text.
all_nps[0].head()

Unnamed: 0,sent_id,uuid_x,form_x,lemma_x,upostag,gender,number,hero,head_uuid_x,pronoun_person,uuid_y,head_uuid_y,form_y,lemma_y,np_id
0,1,113,книги,КНИГА,NOUN,Masc,Plur,1,14,,111,113,http://royallib.com,http://royallib.com,0
1,1,113,книги,КНИГА,NOUN,Masc,Plur,1,14,,112,113,Все,ВЕСЬ,1
2,1,113,книги,КНИГА,NOUN,Masc,Plur,1,14,,114,113,автора,автора,2
3,1,113,книги,КНИГА,NOUN,Masc,Plur,1,14,,112,113,",",",",3
4,8,81,Разбойники,Разбойники,NOUN,Masc,Plur,1,82,,81,81,,,4


## HeadMatch 

•	 HeadMatch: two NPs corefer if their heads are the same (only for nouns and
deictic pronouns);

•	 HeadMatchPro: like the previous one, only non-deictic pronouns are paired
with the nearest NP that agrees in gender and number

In [63]:
def get_headmatch_corefs(text_nps, window_size=3, pro=True):
    """
    takes a dataframe of NPs with their udpipe features of one text and sets a connection between two NPs if
    - both are nouns or deictic pronouns,
    - both belong to the same sentence (same sent_id), and
    - their head_uuid_y values are equal when converted to int

    text_nps: dataframe with at least the columns sent_id, np_id, upostag,
              pronoun_person and head_uuid_y; np_id is assumed to be unique per row
    window_size: reserved for HeadMatchPro (number of sentences to look at on each
                 side of an NP, i.e. [X-window_size : X+window_size]); currently unused
    pro: reserved for HeadMatchPro (pairing non-deictic pronouns with the nearest NP
         that agrees in gender and number); currently unused

    returns a list of (np_id_x, np_id_y) tuples; every connected pair appears in both
    directions, ordered by first appearance of the sentence and then by row order

    NOTE(review): the comparison uses head_uuid_y (a token identifier), not the head
    lemma -- confirm this is the intended notion of "same head".
    """
    # candidates for head matching: nouns and deictic pronouns
    is_candidate = (text_nps.upostag == "NOUN") | (text_nps.pronoun_person == "Deictic")
    candidates = text_nps.loc[is_candidate, ['sent_id', 'np_id', 'head_uuid_y']]

    # One pass over the candidates builds both lookups. This replaces calling
    # int() on a single-element Series for every pair, which is deprecated in
    # recent pandas and rescans the whole dataframe on each comparison.
    head_by_np = {}
    nps_by_sent = {}
    for sent, np_id, head in zip(candidates['sent_id'].tolist(),
                                 candidates['np_id'].tolist(),
                                 candidates['head_uuid_y'].tolist()):
        head_by_np[np_id] = int(head)
        nps_by_sent.setdefault(sent, []).append(np_id)

    connections = [(np_x, np_y)
                   for sent_nps in nps_by_sent.values()
                   for np_x in sent_nps
                   for np_y in sent_nps
                   if np_x != np_y and head_by_np[np_x] == head_by_np[np_y]]

    # TODO: HeadMatchPro -- use window_size / pro to pair non-deictic pronouns
    #window_nps = [(vals['sent_id']-window_size, vals['sent_id']+window_size) for ind, vals in all_nps.iterrows()]

    return connections

def get_corefs_for_all_texts(all_nps, window_size=3, method="headmatch", pro=True, folder_name="./data/texts_corefs"):
    """
    computes the coreference connections of every text and saves them to separate .pkl files

    all_nps: list of NP dataframes, one per text
    window_size, pro: forwarded unchanged to get_headmatch_corefs
    method: currently unused; get_headmatch_corefs is always called
    folder_name: output folder (created if missing); the result for the text at
                 position i of all_nps is written to {folder_name}/{i}.pkl
    """
    # create the output folder up front so open() does not fail when it is missing
    os.makedirs(folder_name, exist_ok=True)
    for text_id in tqdm(range(len(all_nps))):
        text_noun_phrases = all_nps[text_id]
        # pass the caller's settings through instead of hard-coded values
        connections = get_headmatch_corefs(text_nps=text_noun_phrases, window_size=window_size, pro=pro)
        with open(f'{folder_name}/{text_id}.pkl', 'wb') as f:
            pickle.dump(connections, f)

    print(f"Have saved all the {len(all_nps)} texts to the {folder_name} folder")

In [64]:
# Run with the default settings; one .pkl file per text is written to ./data/texts_corefs.
get_corefs_for_all_texts(all_nps=all_nps)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


Have saved all the 6 texts to the ./data/texts_corefs folder


In [65]:
# with open('./data/texts_corefs/0.pkl', 'rb') as f:
#     test_corefs = pickle.load(f)