# Corpus Words Study

In [1]:
# imports a sister directory: ../datasets
import sys, os 
sys.path.append(os.path.join(os.path.dirname(sys.path[0]),'datasets')) 
sys.path.append('../datasets/')

import pandas as pd 
from data_vocabularies import vocab_word2vec

INPUT_DIR= '../datasets/inputs/00'
CSV_DIR= '../datasets/csvs'

INPUT_CORPUS= '{:}/{:}'.format(CSV_DIR, 'zhou.csv')
df = pd.read_csv(INPUT_CORPUS)
df.shape

(141730, 18)

### 1. Fetch LEMMA and Word2Vec model

Under propbankbr lemmas are transformed tokens ( column 'FORM' ).
* verbs, go to infinitive
* plural gets converted to singular
* multi word terms are joint by underscode '\_'. ( That comes from token )


In [2]:
df_lemma= df.drop_duplicates(subset='LEMMA')
lemmas=df_lemma['LEMMA'].values.tolist()
print('unique lemmas',len(lemmas))

unique lemmas 9071


In [3]:
word2vec= vocab_word2vec(dataset_name='glove_s50')

In [55]:
missing=[]
for lemma in lemmas:
    try:
        word2vec[lemma]
    except KeyError:
        missing.append(lemma)

print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(len(missing), 100*float(len(missing))/len(lemmas)))        

missing lemmas:3284	 missing(%):36.20%


### 2. Preprocessing 

Embeddings from:
>  http://nilc.icmc.usp.br/embeddings

have been trained with lower cased words.

#### 2.1 Lower case

In [9]:
lower_missing=[]
for lemma in lemmas:
    try:
        word2vec[lemma.lower()]
    except KeyError:
        lower_missing.append(lemma)

print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(len(lower_missing), 100*float(len(lower_missing))/len(lemmas)))                

missing lemmas:2134	 missing(%):23.53%


#### 2.2 Remove  punctuations

In [13]:
import string
import re
re_punctuation= re.compile(r'[{:}]'.format(string.punctuation), re.UNICODE)

no_punctuation_missing=[]
for lemma in lower_missing:
    try:
        lemma2= re_punctuation.sub('', lemma.lower())
        word2vec[lemma2]
    except KeyError:
        no_punctuation_missing.append(lemma)

print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(len(no_punctuation_missing), 100*float(len(no_punctuation_missing))/len(lemmas)))        

missing lemmas:1984	 missing(%):21.87%


#### 2.3 All numbers are mapped to zero.

In [15]:
re_number= re.compile(r'^\d+$')

no_number_missing=[]
for lemma in no_punctuation_missing:
    try:
        lemma2= re_punctuation.sub('', lemma.lower())
        lemma2= re_number.sub('0', lemma2)        
        word2vec[lemma2]
    except KeyError:
        no_number_missing.append(lemma)
print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(len(no_number_missing), 100*float(len(no_number_missing))/len(lemmas)))        

missing lemmas:1584	 missing(%):17.46%


#### 2.4 Separate NER like from unknowns

In [17]:
re_uppercase= re.compile(r'[A-Z]')
ner_candidates=[]
unknowns=[]
for lemma in no_number_missing:
        if re_uppercase.match(lemma):
            ner_candidates.append(lemma)    
        else:
            unknowns.append(lemma)
print('saving unknowns')
with open('corpus-words-missing-unknowns.txt','w+',encoding="utf-8") as f:
    for word in unknowns:
        f.write('{:}\n'.format(word))
print('unknown lemmas:{:d}\t missing(%):{:.2f}%'.format(len(unknowns), 100*float(len(unknowns))/len(lemmas)))        

with open('corpus-words-missing-ner.txt','w+',encoding="utf-8") as f:
    for word in ner_candidates:
        f.write('{:}\n'.format(word))
        
print(len(ner_candidates))        

print('ner candidates lemmas:{:d}\t missing(%):{:.2f}%'.format(len(ner_candidates), 100*float(len(ner_candidates))/len(lemmas)))        

unknown lemmas:280	 missing(%):3.09%
ner candidates lemmas:1304	 missing(%):14.38%


#### 3.Manual NER

Manually tagging NER entities using following pattern:
    
John  lives in New   York  and works for the European Union


B-PER O     O  B-LOC I-LOC O   O     O   O   B-ORG    I-ORG

Use PER: person, LOC: location, ORG: organization O: nothing


In [22]:
ner_tag=''
with open('corpus-words-missing-ner-tags.txt','w+',encoding="utf-8") as f:
    for word in ner_candidates:
        while not(ner_tag in ['PER', 'ORG', 'LOC', 'O']):
            ner_tag= input('{:} is ''PER'', ''ORG'', ''LOC'', or ''O''?\t'.format(word))
        f.write('{:}\t{:}\n'.format(word, ner_tag))
        ner_tag=''


Pesquisa_Datafolha is PER, ORG, LOC, or O?	O
Fernando_Henrique_Cardoso is PER, ORG, LOC, or O?	PER
Confissões_de_Adolescente is PER, ORG, LOC, or O?	O
TCF1 is PER, ORG, LOC, or O?	O
Câmera_Manchete is PER, ORG, LOC, or O?	O
Rede_Manchete is PER, ORG, LOC, or O?	ORG
Ronaldo_Rosas is PER, ORG, LOC, or O?	PER
Sônia_Pompeu is PER, ORG, LOC, or O?	{ER
Sônia_Pompeu is PER, ORG, LOC, or O?	PER
Ewaldo_Ruy is PER, ORG, LOC, or O?	PER
Primeiro_Mundo is PER, ORG, LOC, or O?	O
Free_shop is PER, ORG, LOC, or O?	O
Lx_810 is PER, ORG, LOC, or O?	O
Ministério_da_Fazenda is PER, ORG, LOC, or O?	ORG
Sérgio_Danese is PER, ORG, LOC, or O?	PER
Rubens_Ricúpero is PER, ORG, LOC, or O?	PER
Benedito_Vieira_Pereira is PER, ORG, LOC, or O?	PER
Suzuki_Swift is PER, ORG, LOC, or O?	O
Herbert_Berger is PER, ORG, LOC, or O?	PER
Honda_Civic is PER, ORG, LOC, or O?	O
Exterminador_do_Futuro_2_-_O_Julgamento_Final is PER, ORG, LOC, or O?	O
Alexandre_Cardoso is PER, ORG, LOC, or O?	PER
João_Havelange is PER, ORG, LOC, or

Bradesco_VISA is PER, ORG, LOC, or O?	ORG
Marco_Antonio_Nahum is PER, ORG, LOC, or O?	PER
Winfreys is PER, ORG, LOC, or O?	O
Taras_Liskevich is PER, ORG, LOC, or O?	PER
Rio_Branco is PER, ORG, LOC, or O?	O
Santo_André is PER, ORG, LOC, or O?	LOC
Bruno_José_Daniel is PER, ORG, LOC, or O?	PER
Paulo_Cesar is PER, ORG, LOC, or O?	PER
Ronald_Reagan is PER, ORG, LOC, or O?	PER
Diário_da_Justiça is PER, ORG, LOC, or O?	ORG
Free_Press is PER, ORG, LOC, or O?	O
PSDB-PFL-PTB is PER, ORG, LOC, or O?	ORG
Gustavo_Franco is PER, ORG, LOC, or O?	PER
Comissão_Justiça_e_Paz is PER, ORG, LOC, or O?	ORG
Regina_Casé is PER, ORG, LOC, or O?	PER
Luiz_Zerbini is PER, ORG, LOC, or O?	PER
Muir_Woods is PER, ORG, LOC, or O?	PER
Itamar_Borges is PER, ORG, LOC, or O?	PER
Santa_Fé_do_Sul is PER, ORG, LOC, or O?	LOC
José_Machado_de_Campos_Filho is PER, ORG, LOC, or O?	PER
Alberto_Goldman is PER, ORG, LOC, or O?	PER
Aurélio_Alonso is PER, ORG, LOC, or O?	PER
Folha_Norte is PER, ORG, LOC, or O?	O
Burle_Filho is PER, 

Sérgio_Nogueira is PER, ORG, LOC, or O?	PER
Associação_Nacional_de_Pós-graduação_em_Ciências_Sociais is PER, ORG, LOC, or O?	ORG
Buenos_Aires is PER, ORG, LOC, or O?	LOC
Othon_Bastos is PER, ORG, LOC, or O?	PER
Lambari_d'_Oeste is PER, ORG, LOC, or O?	O
Reinaldo_Lourenço is PER, ORG, LOC, or O?	PER
Walter_Rodrigues is PER, ORG, LOC, or O?	PER
Der_Haten is PER, ORG, LOC, or O?	O
Viva_Viva is PER, ORG, LOC, or O?	O
Sucumbe_a_Cólera is PER, ORG, LOC, or O?	O
Cia._do_Linho is PER, ORG, LOC, or O?	O
Special_K is PER, ORG, LOC, or O?	O
Lorenzo_Merlino is PER, ORG, LOC, or O?	PER
Anderson_Rubbo is PER, ORG, LOC, or O?	PER
Cacá_di_Guglielmo is PER, ORG, LOC, or O?	PRE
Cacá_di_Guglielmo is PER, ORG, LOC, or O?	PER
Alessandro_Tierni is PER, ORG, LOC, or O?	PER
Claudia_Guimarães is PER, ORG, LOC, or O?	PER
Folha_Imagem is PER, ORG, LOC, or O?	O
Bolsa_de_Valores_de_São_Paulo is PER, ORG, LOC, or O?	ORG
Conselho_Monetário_Nacional is PER, ORG, LOC, or O?	ORG
Plano_Verão is PER, ORG, LOC, or O?	O
Mi

Secretaria_de_Vias_Públicas is PER, ORG, LOC, or O?	ORG
Michael_Myers is PER, ORG, LOC, or O?	PER
Freddy_Krueger is PER, ORG, LOC, or O?	PER
Rubens_Ricupero is PER, ORG, LOC, or O?	PER
Assuntos_Internacionais is PER, ORG, LOC, or O?	O
Tab_Ramos is PER, ORG, LOC, or O?	PER
Meca_Vargas is PER, ORG, LOC, or O?	PER
Associação_Médica_Norte-americana is PER, ORG, LOC, or O?	ORG
Jos_Verstappen is PER, ORG, LOC, or O?	PER
Deborah_Caldas is PER, ORG, LOC, or O?	PER
Vera_Bononi is PER, ORG, LOC, or O?	PER
Asuapi is PER, ORG, LOC, or O?	O
Departamento_de_Estado is PER, ORG, LOC, or O?	ORG
Federal_Bureau_Investigation is PER, ORG, LOC, or O?	ORG
Agropecuária_América is PER, ORG, LOC, or O?	ORG
Citrovita_Agropecuária is PER, ORG, LOC, or O?	ORG
Congresso_Nacional is PER, ORG, LOC, or O?	ORG
Programa_de_Integração_Social is PER, ORG, LOC, or O?	O
Fundo_de_Participação_dos_Estados_e_municípios is PER, ORG, LOC, or O?	O
João_Alves is PER, ORG, LOC, or O?	PER
Caio_Gorentzvaig is PER, ORG, LOC, or O?	PE

Os_Imperdoáveis is PER, ORG, LOC, or O?	O
Um_Mundo_Perfeito is PER, ORG, LOC, or O?	O
Philip_Perry is PER, ORG, LOC, or O?	PER
T.J._Lowther is PER, ORG, LOC, or O?	PER
Butch_Haynes is PER, ORG, LOC, or O?	PER
Kevin_Costner is PER, ORG, LOC, or O?	PER
Neil_Armstrong is PER, ORG, LOC, or O?	PER
Rene_Degni-Segui is PER, ORG, LOC, or O?	PER
Costa_do_Marfim is PER, ORG, LOC, or O?	LOC
Nova_Força is PER, ORG, LOC, or O?	
Nova_Força is PER, ORG, LOC, or O?	O
Djalma_Falcão is PER, ORG, LOC, or O?	PER
Congresso_Internacional_de_Direitos_Autorais is PER, ORG, LOC, or O?	ORG
Luiz_Roberto_Nascimento_Silva is PER, ORG, LOC, or O?	PER
Conselho_Nacional_de_Direito_Autoral is PER, ORG, LOC, or O?	ORG
Espiridião_Amin is PER, ORG, LOC, or O?	PER
Ronan_Tito is PER, ORG, LOC, or O?	PER
Gilberto_Miranda is PER, ORG, LOC, or O?	PER
Renata_Lourenção is PER, ORG, LOC, or O?	PER
Secretaria_Estadual_da_Saúde is PER, ORG, LOC, or O?	ORG
Centro_de_Vigilância_Epidemiológica is PER, ORG, LOC, or O?	ORG
Wagner_Costa

Irlanda_do_Norte is PER, ORG, LOC, or O?	LOC
Frank_Kerr is PER, ORG, LOC, or O?	PER
Jardim_Mutinga is PER, ORG, LOC, or O?	O
Cia._City_de_Desenvolvimento is PER, ORG, LOC, or O?	O
Sérgio_Ueta is PER, ORG, LOC, or O?	PER
Ueta_Indústria_e_Comércio_de_Aparelhos_Eletrônicos is PER, ORG, LOC, or O?	ORG
Empresarial_Jaraguá is PER, ORG, LOC, or O?	O
Martin_Luther_King is PER, ORG, LOC, or O?	PER
Humberto_Lucena is PER, ORG, LOC, or O?	PER
Adylson_Motta is PER, ORG, LOC, or O?	PER
Ambito_Financiero is PER, ORG, LOC, or O?	O
Donna_Karan is PER, ORG, LOC, or O?	PER
Bryant_Park is PER, ORG, LOC, or O?	PER
Biblioteca_Nacional is PER, ORG, LOC, or O?	O
Benito_Gama is PER, ORG, LOC, or O?	PER
John_Paul_Getty is PER, ORG, LOC, or O?	PER
John_Paul_Getty_2º is PER, ORG, LOC, or O?	PER
Antonio_Canova is PER, ORG, LOC, or O?	PER
Grande_Prêmio is PER, ORG, LOC, or O?	O
Senna_erra_e_Schumacher_vence is PER, ORG, LOC, or O?	O
Sean_Connery is PER, ORG, LOC, or O?	PER
Kate_Moss is PER, ORG, LOC, or O?	PER
Ben

James_Brown is PER, ORG, LOC, or O?	PER
Dave_Bruce is PER, ORG, LOC, or O?	PER
Adroaldo_Streck is PER, ORG, LOC, or O?	PER
Barra_Funda is PER, ORG, LOC, or O?	O
G._Love_and_Special_Sauce is PER, ORG, LOC, or O?	O
Tribunal_Superior_do_Trabalho is PER, ORG, LOC, or O?	ORG
Ordem_dos_Advogados_do_Brasil is PER, ORG, LOC, or O?	ORG
Serviço_de_Apoio_à_Pequena_e_Média_Empresa is PER, ORG, LOC, or O?	ORG
Conflitos_no_Campo_Brasil_1993 is PER, ORG, LOC, or O?	O
Associação_Brasileira_de_Imprensa is PER, ORG, LOC, or O?	ORG
Comissão_Pastoral_da_Terra is PER, ORG, LOC, or O?	ORG
Andrew_Duncan is PER, ORG, LOC, or O?	PER
José_Nazar is PER, ORG, LOC, or O?	PER
Miguel_Reale_Jr. is PER, ORG, LOC, or O?	PER
Anna_Lee-Feldstein is PER, ORG, LOC, or O?	PER
Universidade_da_Califórnia is PER, ORG, LOC, or O?	ORG
Marcus_Vinícius is PER, ORG, LOC, or O?	PER
Anderson_Leão is PER, ORG, LOC, or O?	PER
Silvio_Santos is PER, ORG, LOC, or O?	PER
Esplanada_Grill is PER, ORG, LOC, or O?	ORG
Rubinho_Gimenes is PER, OR

Virginia_Slims_Masters is PER, ORG, LOC, or O?	PER
Madison_Square_Garden is PER, ORG, LOC, or O?	LOC
Osny_Silveira_Neto is PER, ORG, LOC, or O?	PER
Guilherme_Silveira is PER, ORG, LOC, or O?	PER
Marilena_Chaui is PER, ORG, LOC, or O?	PER
Gloria_Kalil is PER, ORG, LOC, or O?	PER
Jorge_da_Cunha_Lima is PER, ORG, LOC, or O?	PER
Claude_Lefort is PER, ORG, LOC, or O?	PER
Sérgio_Cardoso is PER, ORG, LOC, or O?	PER
Cláudia_Morgado is PER, ORG, LOC, or O?	PER
Dick_Advocaat is PER, ORG, LOC, or O?	PER
Manoel_Salviano is PER, ORG, LOC, or O?	PER
Renata_Queiróz is PER, ORG, LOC, or O?	PER
Patrícia_Gomes is PER, ORG, LOC, or O?	PER
Ciro_Gomes is PER, ORG, LOC, or O?	PER
Ruth_Cardoso is PER, ORG, LOC, or O?	PER
ColorEdge is PER, ORG, LOC, or O?	O
Lono_Island is PER, ORG, LOC, or O?	O
Southern_Methodist_University is PER, ORG, LOC, or O?	ORG
Manuel_Tavares_da_Graça is PER, ORG, LOC, or O?	PER
José_Tavares_da_Graça is PER, ORG, LOC, or O?	PER
Floriano_Graça is PER, ORG, LOC, or O?	PER
John_Cassavette

#### 3.1 Estimate distribution

In [27]:
df_ner= pd.read_csv('corpus-words-missing-ner-tags.txt', sep='\t', index_col=False)
df_ner.sort_values('TOKEN', axis=0, inplace=True)
table= pd.pivot_table(data=df_ner, index='NER_TAG', values='TOKEN', aggfunc=len)
table



Unnamed: 0_level_0,TOKEN
NER_TAG,Unnamed: 1_level_1
LOC,78
O,332
ORG,265
PER,629


In [9]:
df_orgs= df_ner[df_ner['NER_TAG']=='ORG']
df_orgs.sort_values('TOKEN', axis=0, inplace=True)
df_orgs.to_csv('corpus-word-missing-orgs.txt', sep='\t', index=False)



In [10]:
df_pers= df_ner[df_ner['NER_TAG']=='PER']
df_pers.sort_values('TOKEN', axis=0, inplace=True)
df_pers.to_csv('corpus-word-missing-pers.txt', sep='\t', index=False)

In [None]:
df_locs= df_ner[df_ner['NER_TAG']=='LOC']
df_locs.sort_values('TOKEN', axis=0, inplace=True)
df_locs.to_csv('corpus-word-missing-locs.txt', sep='\t', index=False)

In [26]:
df_o= df_ner[df_ner['NER_TAG']=='O']
df_o.sort_values('TOKEN', axis=0, inplace=True)
df_o.to_csv('corpus-word-missing-o.txt', sep='\t')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
