# Corpus Words Study

In [1]:
# make the sibling directory ../datasets importable
import sys, os 
sys.path.append(os.path.join(os.path.dirname(sys.path[0]),'datasets')) 
sys.path.append('../datasets/')

import pandas as pd 
from data_vocabularies import vocab_word2vec

# Dataset locations, given as relative paths.
INPUT_DIR = '../datasets/inputs/00'
CSV_DIR = '../datasets/csvs'

# Path of the corpus CSV; it is read into the DataFrame `df` used by the later cells.
INPUT_CORPUS = '/'.join((CSV_DIR, 'zhou.csv'))
df = pd.read_csv(INPUT_CORPUS)

# Last expression of the cell: displays (rows, columns).
df.shape

(141730, 18)

### 1. Fetch LEMMA and Word2Vec model

In propbankbr, lemmas are transformed versions of the tokens (column 'FORM'):
* verbs go to the infinitive
* plurals are converted to the singular
* multi-word terms are joined by an underscore '\_' (this comes from the token)


In [2]:
# Keep one row per distinct LEMMA value, then extract that column as a plain list.
df_lemma = df.drop_duplicates(subset=['LEMMA'])
lemmas = df_lemma['LEMMA'].tolist()
print('unique lemmas', len(lemmas))

unique lemmas 9071


In [3]:
# Obtain the embedding lookup for the 'glove_s50' dataset from the project helper.
# The cells below index it by word (word2vec[token]) and treat KeyError as "no embedding".
# NOTE(review): the variable is called word2vec but the dataset name says GloVe - confirm naming.
word2vec= vocab_word2vec(dataset_name='glove_s50')

In [55]:
# Baseline: look every lemma up exactly as it appears in the corpus (case-sensitive)
# and collect the ones for which the lookup raises KeyError.
missing = []
for lemma in lemmas:
    try:
        word2vec[lemma]
    except KeyError:
        missing.append(lemma)

# Report the absolute count and its share of all unique lemmas.
missing_count = len(missing)
missing_pct = 100 * float(missing_count) / len(lemmas)
print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(missing_count, missing_pct))

missing lemmas:3284	 missing(%):36.20%


### 2. Preprocessing 

The embeddings from:
>  http://nilc.icmc.usp.br/embeddings

were trained on lowercased words, so lemmas must be lowercased before they are looked up.

#### 2.1 Lower case

In [9]:
# Retry the lookup with the lower-cased lemma; the ORIGINAL lemma is what gets
# recorded when the lower-cased form still raises KeyError.
lower_missing = []
for lemma in lemmas:
    lowered = lemma.lower()
    try:
        word2vec[lowered]
    except KeyError:
        lower_missing.append(lemma)

# Percentages are always relative to the full list of unique lemmas.
lower_missing_count = len(lower_missing)
lower_missing_pct = 100 * float(lower_missing_count) / len(lemmas)
print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(lower_missing_count, lower_missing_pct))

missing lemmas:2134	 missing(%):23.53%


#### 2.2 Remove punctuation

In [13]:
import string
import re
# Character class matching any single ASCII punctuation character.
# string.punctuation contains regex metacharacters (backslash, ']', '^', '-'), so it
# has to be escaped before being embedded in the class: left unescaped, its backslash
# is parsed as an escape for the following ']' and is therefore never matched itself.
re_punctuation= re.compile(r'[{:}]'.format(re.escape(string.punctuation)), re.UNICODE)

# Only the lemmas that were still missing after lower-casing are retried here,
# this time with every character matched by re_punctuation stripped out as well.
no_punctuation_missing = []
for lemma in lower_missing:
    # Normalisation cannot raise KeyError, so it is done outside the try block.
    lemma2 = re_punctuation.sub('', lemma.lower())
    try:
        word2vec[lemma2]
    except KeyError:
        no_punctuation_missing.append(lemma)

# Share is still computed against the full list of unique lemmas.
no_punctuation_count = len(no_punctuation_missing)
no_punctuation_pct = 100 * float(no_punctuation_count) / len(lemmas)
print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(no_punctuation_count, no_punctuation_pct))

missing lemmas:1984	 missing(%):21.87%


#### 2.3 All numbers are mapped to zero.

In [15]:
# Matches a token that consists of digits only.
re_number = re.compile(r'^\d+$')

# Retry the lemmas left over from the punctuation step: lower-case, strip
# punctuation, and replace a digits-only result by the single token '0'.
no_number_missing = []
for lemma in no_punctuation_missing:
    lemma2 = re_number.sub('0', re_punctuation.sub('', lemma.lower()))
    try:
        word2vec[lemma2]
    except KeyError:
        no_number_missing.append(lemma)

# Share is still computed against the full list of unique lemmas.
no_number_count = len(no_number_missing)
no_number_pct = 100 * float(no_number_count) / len(lemmas)
print('missing lemmas:{:d}\t missing(%):{:.2f}%'.format(no_number_count, no_number_pct))

missing lemmas:1584	 missing(%):17.46%


In [17]:
# Matches one upper-case letter: ASCII A-Z plus the Latin-1 capitals U+00C0-U+00D6 and
# U+00D8-U+00DE (accented letters such as 'Á', 'É', 'Ó'; U+00D7 is the multiplication
# sign and is deliberately skipped). An ASCII-only [A-Z] class would send lemmas that
# start with an accented capital to `unknowns` instead of `ner_candidates`.
re_uppercase= re.compile(r'[A-Z\u00C0-\u00D6\u00D8-\u00DE]')

# Split the lemmas that are still unresolved: .match() anchors at the start of the
# string, so only the FIRST character decides. Capitalised lemmas are treated as
# named-entity (NER) candidates, everything else as plain unknowns.
ner_candidates=[]
unknowns=[]
for lemma in no_number_missing:
    if re_uppercase.match(lemma):
        ner_candidates.append(lemma)
    else:
        unknowns.append(lemma)

# Both shares are relative to the full list of unique lemmas.
print('unknown lemmas:{:d}\t missing(%):{:.2f}%'.format(len(unknowns), 100*float(len(unknowns))/len(lemmas)))
print('ner candidates lemmas:{:d}\t missing(%):{:.2f}%'.format(len(ner_candidates), 100*float(len(ner_candidates))/len(lemmas)))

unknown lemmas:280	 missing(%):3.09%
ner candidates lemmas:1304	 missing(%):14.38%


In [18]:
# Dump the NER-candidate lemmas to a UTF-8 text file, one lemma per line.
with open('corpus-words-missing-ner.txt', 'w+', encoding="utf-8") as f:
    f.writelines('{:}\n'.format(word) for word in ner_candidates)

# Number of lemmas written.
print(len(ner_candidates))

1304


In [20]:
# Dump the remaining unknown lemmas to a UTF-8 text file, one lemma per line.
with open('corpus-words-missing-unknowns.txt', 'w+', encoding="utf-8") as f:
    f.writelines('{:}\n'.format(word) for word in unknowns)

# Number of lemmas written.
print(len(unknowns))

280
