In [1]:
import numpy as np
import pandas as pd
import nltk
import gzip
import sys
import pickle as pkl

In [2]:
np.random.seed(1337)  # for reproducibility
DATA_PATH = "bi-lstm-data/"
EMBEDDINGS_PATH = DATA_PATH+'embeddings.pkl.gz'
TRAIN_SET_PATH = DATA_PATH+'train_set.pkl.gz'
TEST_SET_PATH = DATA_PATH+'test_set.csv'

# Data preprocessing
Read NLTK Spanish corpus.

https://github.com/teropa/nlp/tree/master/resources/corpora/conll2002

In [3]:
nltk.download('conll2002')
nltk.corpus.conll2002.fileids()

[nltk_data] Downloading package conll2002 to /Users/dk/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [4]:
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
dev_sents = list(nltk.corpus.conll2002.iob_sents('esp.testa'))

CPU times: user 2.77 s, sys: 131 ms, total: 2.9 s
Wall time: 2.91 s


In [5]:
all_data = train_sents + test_sents + dev_sents
len(all_data)

11755

In [6]:
all_data[5]

[('Por', 'SP', 'O'),
 ('su', 'DP', 'O'),
 ('parte', 'NC', 'O'),
 (',', 'Fc', 'O'),
 ('el', 'DA', 'O'),
 ('Abogado', 'NC', 'B-PER'),
 ('General', 'AQ', 'I-PER'),
 ('de', 'SP', 'O'),
 ('Victoria', 'NC', 'B-LOC'),
 (',', 'Fc', 'O'),
 ('Rob', 'NC', 'B-PER'),
 ('Hulls', 'AQ', 'I-PER'),
 (',', 'Fc', 'O'),
 ('indicó', 'VMI', 'O'),
 ('que', 'CS', 'O'),
 ('no', 'RN', 'O'),
 ('hay', 'VAI', 'O'),
 ('nadie', 'PI', 'O'),
 ('que', 'PR', 'O'),
 ('controle', 'VMS', 'O'),
 ('que', 'CS', 'O'),
 ('las', 'DA', 'O'),
 ('informaciones', 'NC', 'O'),
 ('contenidas', 'AQ', 'O'),
 ('en', 'SP', 'O'),
 ('CrimeNet', 'NC', 'B-MISC'),
 ('son', 'VSI', 'O'),
 ('veraces', 'AQ', 'O'),
 ('.', 'Fp', 'O')]

In [7]:
def read_sentences(sentences):
    ''' Read sentences. For each sentence there is one token per line, 
    each with its part-of-speech tag and its named entity tag.
    Remove POS tag and leave only NE tag'''
    
    sentence_list = []
    for sentence in sentences:
        sentence_words = []
        for word in sentence:
            word_list = [word[0], word[2]]
            sentence_words.append(word_list)
        sentence_list.append(sentence_words)
    print(len(sentence_list), 'sentences')
    return(sentence_list)

In [8]:
data = read_sentences(all_data)

11755 sentences


In [9]:
data[5]

[['Por', 'O'],
 ['su', 'O'],
 ['parte', 'O'],
 [',', 'O'],
 ['el', 'O'],
 ['Abogado', 'B-PER'],
 ['General', 'I-PER'],
 ['de', 'O'],
 ['Victoria', 'B-LOC'],
 [',', 'O'],
 ['Rob', 'B-PER'],
 ['Hulls', 'I-PER'],
 [',', 'O'],
 ['indicó', 'O'],
 ['que', 'O'],
 ['no', 'O'],
 ['hay', 'O'],
 ['nadie', 'O'],
 ['que', 'O'],
 ['controle', 'O'],
 ['que', 'O'],
 ['las', 'O'],
 ['informaciones', 'O'],
 ['contenidas', 'O'],
 ['en', 'O'],
 ['CrimeNet', 'B-MISC'],
 ['son', 'O'],
 ['veraces', 'O'],
 ['.', 'O']]

In [10]:
# Count location words
b_cnt = 0
i_cnt = 0
loc = set()
for lst in data:
    for x in lst:
        if x[1] == 'B-LOC':
            loc.add(str(x))
            b_cnt += 1
        elif x[1] == 'I-LOC':
            loc.add(str(x))
            i_cnt += 1
            #print(x[0])    
print("**** Location counts B-LOC: ", b_cnt, " I-LOC: ", i_cnt)

**** Location counts B-LOC:  6981  I-LOC:  2553


In [11]:
# Create case embeddings with hard coded case lookup
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')
len(case2Idx)

8

In [12]:
caseEmbeddings

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [13]:
def getCasing(word, caseLookup):   
    casing = 'other'
    
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
            
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
      
    return caseLookup[casing]

In [14]:
# Collect words and labels from conll2002
labelSet = set() 
words = {}
for sentence in data:
    for token, label in sentence:
        labelSet.add(label)
        words[token.lower()] = True

In [15]:
# Read events
inputFilePath = "./data/Corpus.csv"
df_raw = pd.read_csv(inputFilePath)#, sep=';')
df_raw.info()
df_raw.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2419 entries, 0 to 2418
Data columns (total 10 columns):
Folio         2419 non-null int64
Fecha         2419 non-null object
Nivel_1       2419 non-null object
Nivel_2       2419 non-null object
Nivel_3       2416 non-null object
Titulo        2419 non-null object
Sintesis      2255 non-null object
Texto         2415 non-null object
Unnamed: 8    198 non-null object
Unnamed: 9    5 non-null object
dtypes: int64(1), object(9)
memory usage: 189.1+ KB


Unnamed: 0,Folio,Fecha,Nivel_1,Nivel_2,Nivel_3,Titulo,Sintesis,Texto,Unnamed: 8,Unnamed: 9
0,1,08/20/2017,ActivismoSocial,Marcha,Sindicato,Exigen a TransCanada transparentar proyecto de...,Presidentes de las sociedades cooperativas pes...,Presidentes de las sociedades cooperativas pes...,,
1,2,08/18/2017,ActivismoSocial,Marcha,Sindicato,iVan a cerrar la carretera Tuxpan-Tamiahua!,,"Tamaulipas, Ver.- Por los danos que ocasionan ...",,
2,3,08/18/2017,ActivismoSocial,Marcha,Sindicato,iVan a cerrar la carretera Tuxpan-Tamiahua!,,"Tamaulipas, Ver.- Por los danos que ocasionan ...",,


In [16]:
# Select news 
df = pd.DataFrame() 
df = df_raw[['Titulo', 'Sintesis', 'Texto']]#.copy()
df = df.dropna()
df = df.reset_index()
df = df.drop(['index'], axis = 1)
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2251 entries, 0 to 2250
Data columns (total 3 columns):
Titulo      2251 non-null object
Sintesis    2251 non-null object
Texto       2251 non-null object
dtypes: object(3)
memory usage: 52.9+ KB


Unnamed: 0,Titulo,Sintesis,Texto
0,Exigen a TransCanada transparentar proyecto de...,Presidentes de las sociedades cooperativas pes...,Presidentes de las sociedades cooperativas pes...
1,Violento fin de semana en dos estados de Mexic...,Los casos ocurrieron en Guanajuato y Chihuahua...,Los casos ocurrieron en Guanajuato y Chihuahua...
2,Expulsan a 3 personas en Pahuatlan por trabaja...,Los empleados de la TransCanada regresaban de ...,Los empleados de la TransCanada regresaban de ...


In [17]:
df['Sintesis'][8]

'Cuatro personas fueron ejecutadas esta noche en la Ciudad de Chihuahua, informaron autoridades.'

In [18]:
def textTokens(text, words_dict) :
    """Split text into a list of tokens. Add toknes to the external 'words' dictionary. 
    Return a list of tokens.
    """
    text_tokens = []    
    for sent in nltk.sent_tokenize(text):
        sent_tokens = []
        for token in nltk.word_tokenize(sent):
            #print(token)
            sent_tokens.append(token)          
            words_dict[token.lower()] = True
        text_tokens.append(sent_tokens)
    return (text_tokens)
            

In [19]:
len(words)

28382

In [20]:
# Add words from 'Titulo' column to words dictionary
df['titulo_tokens'] = df['Titulo'].apply(lambda text: textTokens(text, words))
len(words)

29986

In [21]:
# Add words from 'Sintesis' column to words dictionary
df['sintesis_tokens'] = df['Sintesis'].apply(lambda text: textTokens(text, words))
len(words)

33187

In [22]:
# Add words from 'Texto' column to words dictionary
df['texto_tokens'] = df['Texto'].apply(lambda text: textTokens(text, words))
len(words)

43221

In [23]:
df.head()

Unnamed: 0,Titulo,Sintesis,Texto,titulo_tokens,sintesis_tokens,texto_tokens
0,Exigen a TransCanada transparentar proyecto de...,Presidentes de las sociedades cooperativas pes...,Presidentes de las sociedades cooperativas pes...,"[[Exigen, a, TransCanada, transparentar, proye...","[[Presidentes, de, las, sociedades, cooperativ...","[[Presidentes, de, las, sociedades, cooperativ..."
1,Violento fin de semana en dos estados de Mexic...,Los casos ocurrieron en Guanajuato y Chihuahua...,Los casos ocurrieron en Guanajuato y Chihuahua...,"[[Violento, fin, de, semana, en, dos, estados,...","[[Los, casos, ocurrieron, en, Guanajuato, y, C...","[[Los, casos, ocurrieron, en, Guanajuato, y, C..."
2,Expulsan a 3 personas en Pahuatlan por trabaja...,Los empleados de la TransCanada regresaban de ...,Los empleados de la TransCanada regresaban de ...,"[[Expulsan, a, 3, personas, en, Pahuatlan, por...","[[Los, empleados, de, la, TransCanada, regresa...","[[Los, empleados, de, la, TransCanada, regresa..."
3,Enfrentamiento entre civiles y militares deja ...,Los hechos se registraron la madrugada de este...,se reporta una persona detenida.,"[[Enfrentamiento, entre, civiles, y, militares...","[[Los, hechos, se, registraron, la, madrugada,...","[[se, reporta, una, persona, detenida, .]]"
4,Identifican a joven localizada sin vida en Gua...,La joven madre fue identificada como Rosario J...,La joven madre fue identificada como Rosario J...,"[[Identifican, a, joven, localizada, sin, vida...","[[La, joven, madre, fue, identificada, como, R...","[[La, joven, madre, fue, identificada, como, R..."


In [24]:
# Create a mapping for the labels 
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)

In [25]:
label2Idx

{'B-MISC': 0,
 'B-LOC': 1,
 'I-PER': 2,
 'I-ORG': 3,
 'O': 4,
 'I-MISC': 5,
 'B-PER': 6,
 'I-LOC': 7,
 'B-ORG': 8}

# Word embeddings
Read in embeddings from 'Spanish Billion Words Corpus and Embeddings linguistic resource'.
https://crscardellino.github.io/SBWCE/

In [26]:
word2Idx = {}
wordEmbeddings = []
embeddingsPath = "./word2vec-spanish-vectors/SBW-vectors-300-min5.txt.gz"
# Load the pre-trained embeddings file 
fEmbeddings = gzip.open(
    embeddingsPath, "r") if embeddingsPath.endswith('.gz') else open(
    embeddingsPath, encoding="utf8")

fEmbeddings.readline() #skip first line

b'1000653 300\n'

In [27]:
%%time
# Find embedding vectors that match words from our corpus and events
for line in fEmbeddings:
    split = line.decode("utf-8").strip().split(" ")
    word = split[0]
    
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector vor 'PADDING' word
        wordEmbeddings.append(vector)
        
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)

CPU times: user 45.6 s, sys: 391 ms, total: 46 s
Wall time: 46.1 s


In [28]:
wordEmbeddings = np.array(wordEmbeddings)
print("Embeddings shape: ", wordEmbeddings.shape)

Embeddings shape:  (75272, 300)


In [29]:
# Dictionary to store embeddings
embeddings = {'wordEmbeddings': wordEmbeddings, 'word2Idx': word2Idx,
              'caseEmbeddings': caseEmbeddings, 'case2Idx': case2Idx,
              'label2Idx': label2Idx}

In [30]:
print(label2Idx, '\n---\n',case2Idx)

{'B-MISC': 0, 'B-LOC': 1, 'I-PER': 2, 'I-ORG': 3, 'O': 4, 'I-MISC': 5, 'B-PER': 6, 'I-LOC': 7, 'B-ORG': 8} 
---
 {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3, 'other': 4, 'mainly_numeric': 5, 'contains_digit': 6, 'PADDING_TOKEN': 7}


In [31]:
# Save embeddings
f = gzip.open(EMBEDDINGS_PATH, 'wb')
pkl.dump(embeddings, f, -1)
f.close()

# Train Set

In [32]:
# Create train set matrices
def createMatrices(sentences, word2Idx, label2Idx, case2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    #paddingIdx = word2Idx['PADDING_TOKEN']    
        
    dataset = []
    
    wordCount = 0
    unknownWordCount = 0
    
    for sentence in sentences:
        wordIndices = []    
        caseIndices = []
        labelIndices = []
        
        for word, label in sentence:  
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
            
            #Get the label and map to int            
            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx))
            labelIndices.append(label2Idx[label])
           
        dataset.append([wordIndices, caseIndices, labelIndices]) 
        
        
    return dataset


In [33]:
train_set = createMatrices(data, word2Idx,  label2Idx, case2Idx)

In [34]:
train_set[0]

[[12558, 1, 1445, 1, 1, 1, 18468, 1, 6566, 1, 1],
 [3, 4, 3, 4, 4, 0, 1, 4, 2, 4, 4],
 [1, 4, 1, 4, 4, 4, 4, 4, 8, 4, 4]]

In [35]:
len(train_set)

11755

In [36]:
# Save train set
f = gzip.open(TRAIN_SET_PATH, 'wb')
pkl.dump(train_set, f)
f.close()

# Test Set

In [37]:
def createTestMatrices(text, word2Idx, case2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
        
    dataset = []  
    wordCount = 0
    unknownWordCount = 0
    text_wordIndices = []    
    text_caseIndices = []
    
    for sentence in text:
        sent_wordIndices = []    
        sent_caseIndices = []
        for word in sentence:  
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
                        
            sent_wordIndices.append(wordIdx)
            sent_caseIndices.append(getCasing(word, case2Idx))        
        
        text_wordIndices.append(sent_wordIndices)
        text_caseIndices.append(sent_caseIndices)
        
    return (text_wordIndices, text_caseIndices)


In [38]:
# Add word and case indexes to dataframe
df[['titulo_word_inds','titulo_case_inds']] = df.apply(lambda x: pd.Series(createTestMatrices(x['titulo_tokens'], word2Idx, case2Idx)), axis=1)
#df.head()

In [39]:
df[['sintesis_word_inds','sintesis_case_inds']] = df.apply(lambda x: pd.Series(createTestMatrices(x['sintesis_tokens'], word2Idx, case2Idx)), axis=1)
#df.head()

In [40]:
df[['texto_word_inds','texto_case_inds']] = df.apply(lambda x: pd.Series(createTestMatrices(x['texto_tokens'], word2Idx, case2Idx)), axis=1)
df.head()

Unnamed: 0,Titulo,Sintesis,Texto,titulo_tokens,sintesis_tokens,texto_tokens,titulo_word_inds,titulo_case_inds,sintesis_word_inds,sintesis_case_inds,texto_word_inds,texto_case_inds
0,Exigen a TransCanada transparentar proyecto de...,Presidentes de las sociedades cooperativas pes...,Presidentes de las sociedades cooperativas pes...,"[[Exigen, a, TransCanada, transparentar, proye...","[[Presidentes, de, las, sociedades, cooperativ...","[[Presidentes, de, las, sociedades, cooperativ...","[[23151, 8, 56126, 25835, 128, 2, 20125]]","[[3, 1, 3, 1, 1, 1, 1]]","[[8276, 2, 11, 2597, 8975, 13703, 2, 47087, 6,...","[[3, 1, 1, 1, 1, 1, 1, 3, 1, 3, 4, 1, 1, 1, 3,...","[[8276, 2, 11, 2597, 8975, 13703, 2, 47087, 6,...","[[3, 1, 1, 1, 1, 1, 1, 3, 1, 3, 4, 1, 1, 1, 3,..."
1,Violento fin de semana en dos estados de Mexic...,Los casos ocurrieron en Guanajuato y Chihuahua...,Los casos ocurrieron en Guanajuato y Chihuahua...,"[[Violento, fin, de, semana, en, dos, estados,...","[[Los, casos, ocurrieron, en, Guanajuato, y, C...","[[Los, casos, ocurrieron, en, Guanajuato, y, C...","[[42177, 127, 2, 435, 4, 38, 1425, 2, 16336, 1...","[[3, 1, 1, 1, 1, 1, 1, 1, 3, 4, 0, 1]]","[[35, 301, 8038, 4, 5840, 6, 6319, 1], [5285, ...","[[3, 1, 1, 1, 3, 1, 3, 4], [3, 1, 5, 1, 1, 1, ...","[[35, 301, 8038, 4, 5840, 6, 6319, 1], [5285, ...","[[3, 1, 1, 1, 3, 1, 3, 4], [3, 1, 5, 1, 1, 1, ..."
2,Expulsan a 3 personas en Pahuatlan por trabaja...,Los empleados de la TransCanada regresaban de ...,Los empleados de la TransCanada regresaban de ...,"[[Expulsan, a, 3, personas, en, Pahuatlan, por...","[[Los, empleados, de, la, TransCanada, regresa...","[[Los, empleados, de, la, TransCanada, regresa...","[[48750, 8, 1, 71, 4, 66437, 13, 926, 4, 20125]]","[[3, 1, 0, 1, 1, 3, 1, 1, 1, 1]]","[[35, 1659, 2, 3, 56126, 21451, 2, 855, 103, 2...","[[3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[35, 1659, 2, 3, 56126, 21451, 2, 855, 103, 2...","[[3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
3,Enfrentamiento entre civiles y militares deja ...,Los hechos se registraron la madrugada de este...,se reporta una persona detenida.,"[[Enfrentamiento, entre, civiles, y, militares...","[[Los, hechos, se, registraron, la, madrugada,...","[[se, reporta, una, persona, detenida, .]]","[[32119, 33, 1148, 6, 872, 1877, 1, 1752, 4, 6...","[[3, 1, 1, 1, 1, 1, 0, 1, 1, 3]]","[[35, 834, 12, 4628, 3, 3223, 2, 34, 1199, 4, ...","[[3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3]]","[[12, 12611, 17, 429, 8479, 1]]","[[1, 1, 1, 1, 1, 4]]"
4,Identifican a joven localizada sin vida en Gua...,La joven madre fue identificada como Rosario J...,La joven madre fue identificada como Rosario J...,"[[Identifican, a, joven, localizada, sin, vida...","[[La, joven, madre, fue, identificada, como, R...","[[La, joven, madre, fue, identificada, como, R...","[[29293, 8, 779, 7394, 54, 133, 4, 57330]]","[[3, 1, 1, 1, 1, 1, 1, 3]]","[[24, 779, 842, 32, 9692, 23, 3764, 17412, 1, ...","[[3, 1, 1, 1, 1, 1, 3, 3, 2, 2, 4, 1, 1, 3, 4,...","[[24, 779, 842, 32, 9692, 23, 3764, 17412, 1, ...","[[3, 1, 1, 1, 1, 1, 3, 3, 2, 2, 4, 1, 1, 3, 4,..."


In [41]:
# Save test set
df.to_csv(TEST_SET_PATH, sep=';')