In [1]:
import json
import os
from collections import Counter
from glob import glob

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def get_df_PER(df):
    """Return the rows of ``df`` whose 'Tag' is 'B-PER' or 'S-PER'.

    The matching rows are returned as a new DataFrame, ordered by
    ascending index.
    """
    # One filtered frame per accepted tag, B-PER first, then S-PER.
    per_frames = [df[df['Tag'] == tag] for tag in ('B-PER', 'S-PER')]
    # Stack the per-tag frames and put the rows back in index order.
    return pd.concat(per_frames).sort_index(ascending=True)

In [3]:
def get_counter(list_tag, N):
    """Count the values of ``list_tag`` and return the ``N`` most frequent.

    Missing values are ignored before counting: both ``None`` and float NaN
    are dropped. pandas represents an empty cell as NaN, so filtering on
    ``None`` alone would let a missing value be counted as a mention.

    Parameters
    ----------
    list_tag : iterable
        Hashable values to count.
    N : int or None
        Number of (value, count) pairs to return; ``None`` returns them all.

    Returns
    -------
    list of tuple
        ``(value, count)`` pairs, most frequent first. Values with equal
        counts keep the order in which they were first seen.
    """
    def _is_missing(elem):
        # NaN is the only float that does not compare equal to itself.
        return elem is None or (isinstance(elem, float) and elem != elem)

    return Counter(elem for elem in list_tag if not _is_missing(elem)).most_common(N)

In [4]:
def get_N_surronding_tokens(index, df_courant, N):
    """Collect the rows found in a window of ``N`` rows around each mention.

    For every position in ``index`` the window is the half-open range
    ``[position - N, position + N)``, clipped to the first and last row of
    ``df_courant``. A mention close to the start therefore keeps its left
    context, and a window reaching past the end is simply truncated.
    Windows of neighbouring mentions may overlap; overlapping rows are
    emitted once per window.

    Positions are used as row numbers, so ``df_courant`` is expected to
    carry the default 0..n-1 index (what ``pd.read_csv`` produces when no
    index column is given).

    Parameters
    ----------
    index : iterable of int
        Row positions of the mentions.
    df_courant : pandas.DataFrame
        Table with 'Token', 'Tag' and 'Head_tag' columns.
    N : int
        Half-width of the window, in rows.

    Returns
    -------
    tuple of three lists
        Tokens, tags and head tags of the collected rows, aligned
        element by element.
    """
    list_token = []
    list_mention_token = []
    list_head_token = []

    # Read each column once instead of building a one-row DataFrame for
    # every single cell lookup.
    tokens = df_courant['Token'].values
    tags = df_courant['Tag'].values
    heads = df_courant['Head_tag'].values
    n_rows = len(df_courant)

    for index_mention in index:
        # Clip the window on both sides so it never leaves the table.
        start = max(index_mention - N, 0)
        stop = min(index_mention + N, n_rows)
        for i in range(start, stop):
            token = tokens[i]
            # Only None is skipped here; NaN tokens are kept.
            if token is not None:
                list_token.append(token)
                list_mention_token.append(tags[i])
                list_head_token.append(heads[i])

    return list_token, list_mention_token, list_head_token

In [5]:
def get_PER_surrounding(list_mention_token, list_head_token):
    """Return the head tags of the entries tagged 'B-PER' or 'S-PER'.

    ``list_mention_token`` and ``list_head_token`` are read as parallel
    lists: for each position whose tag is 'B-PER' or 'S-PER', the head tag
    at the same position is kept. Order is preserved.
    """
    per_tags = ('B-PER', 'S-PER')
    return [
        list_head_token[pos]
        for pos, tag in enumerate(list_mention_token)
        if tag in per_tags
    ]

In [14]:
def get_dict_surronding_PER(df_courant, N_PER, N_token):
    """Map each of the top ``N_PER`` mentions to the PER heads found around it.

    The most frequent 'Head_tag' values among the 'B-PER' / 'S-PER' rows
    are computed first. For each of them, every row of ``df_courant``
    carrying that head tag is located, the rows within ``N_token`` of it
    are collected, and the head tags of the 'B-PER' / 'S-PER' rows in
    those windows are stored.

    Returns
    -------
    tuple
        ``(dict_surronding_PER, df_mention_roman)``: the dict maps a
        mention to its list of surrounding PER head tags; the DataFrame
        has the columns 'mention' and 'count' for the top mentions.
    """
    # Frequency table of the head mentions carried by B-PER / S-PER rows.
    per_heads = list(get_df_PER(df_courant)['Head_tag'])
    top_mentions = get_counter(per_heads, N_PER)
    df_mention_roman = pd.DataFrame(top_mentions, columns=['mention', 'count'])

    dict_surronding_PER = {}
    # Mentions are unique (they come from a counter), so each one can be
    # used directly as a dictionary key.
    for mention in df_mention_roman['mention'].tolist():
        # Rows whose head mention is the current one, whatever their tag.
        rows_for_mention = df_courant[df_courant['Head_tag'] == mention]
        # Tokens are collected as well (kept for later topic modelling);
        # only the tags and head tags are needed below.
        _tokens, window_tags, window_heads = get_N_surronding_tokens(
            rows_for_mention.index, df_courant, N_token
        )
        # Keep the head mention of every B-PER / S-PER row in the windows.
        dict_surronding_PER[mention] = get_PER_surrounding(window_tags, window_heads)

    return dict_surronding_PER, df_mention_roman

In [15]:
def mouli_preds(path_name, N_PER, N_token):
    """Run the PER-context extraction on every file matching ``path_name``.

    Each matching file is read as a tab-separated table and passed to
    ``get_dict_surronding_PER``. Two CSV files are written per document,
    both named after the input file name up to its first dot:

    * ``res_mention_PER/<name>.csv``: the top mentions and their counts;
    * ``res_surronding_PER/<name>.csv``: one column per top mention,
      holding the PER head tags found around it (columns of unequal
      length are padded with missing values).

    Files are processed in sorted order and the output directories are
    created when missing.

    Parameters
    ----------
    path_name : str
        Glob pattern selecting the input files.
    N_PER : int
        Number of most frequent mentions kept per document.
    N_token : int
        Half-width, in rows, of the window explored around each mention.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_surronding_PER, df_mention_PER)`` of the LAST processed file
        only; results of earlier files are available on disk.

    Raises
    ------
    FileNotFoundError
        If no file matches ``path_name``.
    """
    # Sorted so that the processing order, and therefore which document's
    # frames are returned, does not depend on the file system.
    docs = sorted(glob(path_name))
    if not docs:
        raise FileNotFoundError('No input file matches pattern: ' + repr(path_name))

    dir_mention = 'res_mention_PER'
    dir_surronding = 'res_surronding_PER'
    os.makedirs(dir_mention, exist_ok=True)
    os.makedirs(dir_surronding, exist_ok=True)

    for doc in tqdm(docs):
        print(doc)
        df_courant = pd.read_csv(doc, sep='\t')
        dict_surronding_PER, df_mention_PER = get_dict_surronding_PER(df_courant, N_PER, N_token)

        # File name up to its first dot, independent of directory depth
        # and of the path separator used by the platform.
        temp = os.path.basename(doc).split('.')[0]

        df_mention_PER.to_csv(os.path.join(dir_mention, temp + '.csv'), encoding='utf-8', index=False)

        # One row per mention, then transposed to one column per mention.
        df_surronding_PER = pd.DataFrame.from_dict(dict_surronding_PER, orient='index').transpose()
        df_surronding_PER.to_csv(os.path.join(dir_surronding, temp + '.csv'), encoding='utf-8', index=False)

    return df_surronding_PER, df_mention_PER

In [27]:
# Glob patterns selecting the input files (full corpus / test sample).
path_name, path_test = "data/*.csv", "data_test/*.*"
# N_PER: number of most frequent PER mentions kept per document.
# N_token: half-width, in rows, of the window explored around each mention.
N_PER, N_token = 10, 50

In [28]:
# Process the test sample; the returned frames are those of the last file handled.
df_surronding_PER, df_mention_PER = mouli_preds(
    path_name=path_test,
    N_PER=N_PER,
    N_token=N_token,
)

  0%|          | 0/1 [00:00<?, ?it/s]

data_test/1869_Flaubert-Gustave_L-Education-sentimentale.tsv


In [29]:
# Show the surrounding-PER table returned by mouli_preds: one column per top
# mention, listing the PER head tags collected in the windows around it.
df_surronding_PER

Unnamed: 0,<Frédéric>,<Arnoux>,<Deslauriers>,<Mme Arnoux>,<Rosanette>,<Hussonnet>,<Pellerin>,<!Frédéric>,<Sénécal>,<Ils>
0,<un joueur de harpe en haillons>,<La petite fille>,"<Froissart , Commines , Pierre de l' Estoile>",<ils>,<les pareilles de Rosanette>,<un petit jeune homme qui>,<un homme de taille moyenne>,<M . Dambreuse>,<ils>,<des bourgeois>
1,<le capitaine>,<La petite fille>,"<Commines , Pierre de l' Estoile>",<ils>,<Rosanette>,<un petit jeune homme qui>,<un homme de taille moyenne>,<Deslauriers>,<Deslauriers>,<un garçon de café>
2,<Frédéric>,<Frédéric>,<Pierre de l' Estoile>,<ils>,<Frédéric>,<un petit jeune homme qui>,<un homme de taille moyenne>,<M . Dambreuse>,<ils>,<Frédéric>
3,<Frédéric>,<un jeune homme>,<Brantôme>,<Mme Arnoux>,<Frédéric>,<un petit jeune homme qui>,<Les vieux qui>,<M . Dambreuse>,<Deslauriers>,<Frédéric>
4,<Frédéric>,<Frédéric>,<Deslauriers qui>,<ils>,<un garçon d' esprit>,<un petit jeune homme qui>,<Les vieux qui>,<Deslauriers>,<ils>,<Frédéric>
...,...,...,...,...,...,...,...,...,...,...
21239,<Mme Arnoux>,,,,,,,,,
21240,<Rosanette>,,,,,,,,,
21241,<Rosanette>,,,,,,,,,
21242,<Rosanette>,,,,,,,,,


In [30]:
# Show the top mentions and their counts, as returned by mouli_preds.
df_mention_PER

Unnamed: 0,mention,count
0,<Frédéric>,1465
1,<Arnoux>,1126
2,<Deslauriers>,634
3,<Mme Arnoux>,629
4,<Rosanette>,570
5,<Hussonnet>,469
6,<Pellerin>,446
7,<!Frédéric>,360
8,<Sénécal>,329
9,<Ils>,319
