# HeadMatch and HeadMatchPro 

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pickle

import os
import os.path
import sys

In [2]:
def reading_data(folder_name):
    files = [os.path.join(dirpath, filename)
             for dirpath, dirnames, filenames in os.walk(f"./{folder_name}") 
             for filename in [f for f in filenames if f.endswith(".csv")]]
    # returning list of dataframes and keeping only relevant features
    return [pd.read_csv(file, dtype='str')[['sent_id', 'uuid', 'form', 
                               'lemma', 'upostag', 'gender', 'number', 
                               'hero', 'head_uuid', 'pronoun_person']] 
            for file in files]

In [3]:
udpipe_dfs = reading_data('/data/texts_udpipe')

about constructed features:

**Hero** if animated and (pro)noun subject (deprel variable contains "nsubj");

**Deictic** if pronoun and Person equals either 1 or 2, if not Deictic and pronoun then Other;

**uuid** identification code of a token -- it is compose from text id, sentence id, and word id in the sentence

**head_uuid** identification code of a head token. composed the same as *uuid* but for the head word of a token

In [4]:
udpipe_dfs[0].head(3)

Unnamed: 0,sent_id,uuid,form,lemma,upostag,gender,number,hero,head_uuid,pronoun_person
0,1,11,Спасибо,спасибо,NOUN,,Sing,0,10,
1,1,12,",",",",PUNCT,,,0,14,
2,1,13,что,что,SCONJ,,,0,14,


## Get list of all Noun Phrases

In [5]:
def get_all_nps(udpipe_dfs):
    """
    returns list of dfs of all NPs where head is hero: 
    each df stands for a separate text
    each row is a NP where head token is a hero
    all columns ending with _x and all the word characteristics are referring to the NP head 
    and those that end with _y are referring to the dependant word 
    """
    # returns list of dfs of all NPs where head is hero 
    # each df stands for a separate text
    all_heros = [text[text.hero == "1"] for text in udpipe_dfs]
    all_nps = [pd.merge(all_heros[text_ind], udpipe_dfs[text_ind][['uuid', 'head_uuid', 'form', 'lemma']], 
                        left_on='uuid', right_on='head_uuid', how='left')
               for text_ind in range(len(udpipe_dfs))]
    
    for text_ind in range(len(udpipe_dfs)):
        all_nps[text_ind].uuid_y = np.where(pd.isna(all_nps[text_ind].uuid_y) == True, 
                                     all_nps[text_ind].uuid_x, 
                                     all_nps[text_ind].uuid_y) 
        
        all_nps[text_ind].head_uuid_y = np.where(pd.isna(all_nps[text_ind].head_uuid_y) == True, 
                                                 all_nps[text_ind].uuid_x, 
                                                 all_nps[text_ind].head_uuid_y)
        all_nps[text_ind]['np_id'] = list(range(len(all_nps[text_ind])))
    return all_nps

In [81]:
all_nps = get_all_nps(udpipe_dfs)

In [82]:
all_nps[0].head()

Unnamed: 0,sent_id,uuid_x,form_x,lemma_x,upostag,gender,number,hero,head_uuid_x,pronoun_person,uuid_y,head_uuid_y,form_y,lemma_y,np_id
0,8,81,Разбойники,Разбойник,NOUN,Masc,Plur,1,82,,81,81,,,0
1,19,192,Мальчики,мальчик,NOUN,Masc,Plur,1,191,,194,192,новоткацкой,новоткацкая,1
2,20,206,мальчики,мальчик,NOUN,Masc,Plur,1,207,,206,206,,,2
3,22,223,гости,гость,NOUN,Masc,Plur,1,222,,223,223,,,3
4,23,231,Разбойники,Разбойник,NOUN,Masc,Plur,1,232,,231,231,,,4


## HeadMatch 

•	 HeadMatch: two NPs corefer if their heads are the same (only for nouns and
deictic pronouns);

•	 HeadMatchPro: like the previous one, only non-deictic pronouns are paired
with the nearest NP that agrees in gender and number

In [199]:
def get_headmatch_corefs(text_nps, window_size=3, pro=True):
    """
    takes a dataframe of NPs with their udpipe features of one text and sets connection between two NPs if
    - they have the same head for nouns and deictic pronouns
    - if pro parameter is True: non-deictic pronouns are paired with the nearest NP with the same gender and number
    
    for pro=True: 
    window_size parameter is the number of sentences to look at while searching for coreferences
    if window_size=3 and sentence id of NP is X then we will look for closest NP in [X-3:X+3] sentences
    
    returns list of sets of NP indexes in text_nps that are coreferenced for each text
    """
    # get indexes of deictic NPs
    deictic_nps = text_nps[['sent_id', 'np_id']][(text_nps.upostag == "NOUN") | 
                                                 (text_nps.pronoun_person == "Deictic")]
    nps_by_sent = {}
    for sent in list(deictic_nps.sent_id.unique()):
        nps_by_sent[sent] = []
        deictic_nps_sent = deictic_nps[deictic_nps.sent_id == sent]
        nps_by_sent[sent].extend(deictic_nps_sent['np_id'])
        
    connections = [(np_x, np_y)
                   for sent in list(nps_by_sent.keys()) 
                   for np_x in nps_by_sent[sent] 
                   for np_y in nps_by_sent[sent] 
                   if ((int(text_nps['head_uuid_y'][text_nps.np_id == np_x]) == 
                        int(text_nps['head_uuid_y'][text_nps.np_id == np_y])) &
                       (np_x != np_y))]
    
    if pro:
        non_deic = text_nps.np_id[text_nps.pronoun_person=='Non-deictic'].tolist()
        windows_inds = [(int(vals['sent_id'])-window_size, int(vals['sent_id'])+window_size) 
                        for ind, vals in text_nps.iterrows() 
                        if vals.np_id in non_deic]
        windows_words = [text_nps[text_nps.sent_id.astype(int).isin(range(window[0], window[1]+1))] 
                         for window in windows_inds]
        nps_by_wind = [(vals_X['np_id'], (vals_X['np_id'], vals_Y['np_id']))
                       for window in windows_words
                       for ind_X, vals_X in window.iterrows()
                       for ind_Y, vals_Y in window.iterrows() 
                       if ((vals_X.np_id in non_deic) or vals_X.upostag == "NOUN") 
                       and (vals_X.gender == vals_Y.gender) 
                       and (vals_X.number == vals_Y.number) 
                       and (vals_X.np_id != vals_Y.np_id)]

        np_conns = {}
        for np in nps_by_wind:
            np_conns[np[0]] = []
            for connection in nps_by_wind:
                if np[0] == connection[0]:
                    np_conns[np[0]].extend(connection[1])
                    np_conns[np[0]].remove(np[0])
        
        # choosing the closests matching np
        connections_pro = []
        for i in list(np_conns.keys()):
            np_x = i
            np_y = min(np_conns[i], key=lambda x:abs(x-i))
            connections_pro.append((np_x, np_y))

        connections.extend(connections_pro)
        connections = list(set(connections))

    return connections

def get_corefs_for_all_texts(all_nps, window_size=3, method="headmatch", pro=True, folder_name="./data/texts_corefs"):
    """
    saves lists of sets (where each list of sets is list of NP coreferences for one text) to separate .pkl files 
    """
    iters = 0
    for text_id in tqdm(range(len(all_nps))):
        text_noun_phrases = all_nps[text_id]
        connections = get_headmatch_corefs(text_nps=text_noun_phrases, window_size=3, pro=pro)
        with open(f'{folder_name}/{text_id}_{method}_{pro}.pkl', 'wb') as f:
            pickle.dump(connections, f)
            
    print(f"Have saved all the {len(all_nps)} texts to the {folder_name} folder")

In [200]:
get_corefs_for_all_texts(all_nps, window_size=10)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


Have saved all the 6 texts to the ./data/texts_corefs folder


In [201]:
get_corefs_for_all_texts(all_nps, pro=False)

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


Have saved all the 6 texts to the ./data/texts_corefs folder


Checking how similar the headmatch and headmatchpro lists of coreferences are

In [202]:
import difflib

for text in range(6):
    with open(f'./data/texts_corefs/{text}_headmatch_True.pkl', 'rb') as f:
        head_true = pickle.load(f)

    with open(f'./data/texts_corefs/{text}_headmatch_False.pkl', 'rb') as f:
        head_false = pickle.load(f)

    sm=difflib.SequenceMatcher(None,head_true,head_false)
    print(sm.ratio())

0.013782542113323124
0.05037783375314862
0.007774538386783284
0.007774538386783284
0.021447721179624665
0.06535947712418301
