# Modelo Ocultos de Markov

El presente notebook implementa un HMM para Named Entity Recognition en Español. Se utilizo el dataset conll2002(esp) de nltk. Para ello primeramente se preproceso el dataset, se calculo los parametros del modelo mediante el conteo de frecuencias y por ultimo se implemento el algoritmo de Viterbi para la decodificacion de oraciones.

## 1. Preprocesamiento del Dataset

In [1]:

from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn

import pandas as pd
import numpy as np

#import pycrfsuite

In [2]:
type(nltk.corpus.conll2002.fileids())
from nltk.corpus import conll2002


In [3]:
type(conll2002.chunked_sents('esp.train'))

nltk.collections.LazyMap

In [4]:
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))

In [5]:
#numero de oraciones del dataset
N_sentences =len(train_sents)

In [6]:
##definicion de funciones para preprocesamiento del dataset
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


##Obtener tags
def sent2labels(sent):
    return [label for token, postag, label in sent]
##tokenizar cadena de texto
def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [7]:
#test
sent2labels(train_sents[0])

['B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']

In [8]:
#test
sent2tokens(train_sents[2])

['El',
 'Abogado',
 'General',
 'del',
 'Estado',
 ',',
 'Daryl',
 'Williams',
 ',',
 'subrayó',
 'hoy',
 'la',
 'necesidad',
 'de',
 'tomar',
 'medidas',
 'para',
 'proteger',
 'al',
 'sistema',
 'judicial',
 'australiano',
 'frente',
 'a',
 'una',
 'página',
 'de',
 'internet',
 'que',
 'imposibilita',
 'el',
 'cumplimiento',
 'de',
 'los',
 'principios',
 'básicos',
 'de',
 'la',
 'Ley',
 '.']

## 2. Calculo de parametros

In [9]:
##Funcion que cuenta el numero de veces que se usa un tag  en el dataset
def find_states_counts(n_sentences):
    tags = []
    #concatenate all tags
    for i in range(n_sentences):
        tags = tags + sent2labels(train_sents[i])

    #{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}
    dic = dict()
    states = set(tags) #set
    states = dic.fromkeys(states,0)
    
    #count frecuency by tag
    for f in range(n_sentences):
        tags = sent2labels(train_sents[f])
        for t in tags:
            states[t] = 1 + states[t]
        
        #ids = [i for i in range(len(states))]
        #states = dict(zip(states.keys(), ids))
    return states


In [10]:
states = find_states_counts(N_sentences)                            
N_STATES = len(states)
states

{'I-PER': 3903,
 'B-ORG': 7390,
 'B-LOC': 4913,
 'I-ORG': 4992,
 'I-MISC': 3212,
 'I-LOC': 1891,
 'O': 231920,
 'B-PER': 4321,
 'B-MISC': 2173}

### Probabilidades iniciales

In [11]:

def calc_start_prob(dic, total_sentences):
    start_probabilities = {}
    start_probabilities = start_probabilities.fromkeys(dic.keys(),0)
    for i in range(N_sentences):
        tag = sent2labels(train_sents[i])[0]
        start_probabilities[tag] = 1+ start_probabilities[tag]
    
    total = sum(start_probabilities.values())
    print(total)
    
    assert total == total_sentences, "Houston tenemos un problema: not equal sum..."
    
    for i in start_probabilities:
        start_probabilities[i] = float(start_probabilities[i]/total)
    
    assert 1.0 == sum(start_probabilities.values()), "Houston tenemos un problema: not equal to 1.0..."
    
    return start_probabilities


In [12]:
initial_prob = calc_start_prob(states,N_sentences)
initial_prob

8323


{'I-PER': 0.0,
 'B-ORG': 0.03556409948335937,
 'B-LOC': 0.07605430734110297,
 'I-ORG': 0.0,
 'I-MISC': 0.0,
 'I-LOC': 0.00012014898474107894,
 'O': 0.7998317914213625,
 'B-PER': 0.07232968881412952,
 'B-MISC': 0.016099963955304577}

### Matriz de trancision

In [13]:
def calc_transitions(states, n_states, n_sentences):
    data = np.zeros([n_states, n_states], dtype = float)
    transitions = pd.DataFrame(data, index= states.keys(),columns=states.keys())
#    print('initial transitions: ',transitions)
    for s in range(n_sentences):
        tags = sent2labels(train_sents[s])
        for j in range(len(tags)-1):
            if tags[j]!=tags[j+1]:
                transitions.at[tags[j],tags[j+1]] += 1
                #print('->',tags[j],tags[j+1])
    
 #   print('sums transitions: ',transitions)
    for idx in transitions.index:
        for  col in transitions.columns:
            transitions[col][idx] /=  states[idx]    
    return transitions

In [14]:
transitions = calc_transitions(states, N_STATES,N_sentences)
transitions

Unnamed: 0,I-PER,B-ORG,B-LOC,I-ORG,I-MISC,I-LOC,O,B-PER,B-MISC
I-PER,0.0,0.0,0.005893,0.0,0.0,0.0,0.729439,0.001537,0.0
B-ORG,0.0,0.0,0.00406,0.303383,0.0,0.0,0.686333,0.005954,0.000135
B-LOC,0.0,0.0,0.0,0.0,0.0,0.216975,0.774883,0.002646,0.0
I-ORG,0.0,0.000601,0.000401,0.0,0.0,0.0,0.44391,0.004006,0.0002
I-MISC,0.0,0.000623,0.000934,0.0,0.0,0.0,0.383562,0.002491,0.002179
I-LOC,0.0,0.0,0.001058,0.0,0.0,0.0,0.563194,0.0,0.0
O,0.0,0.030515,0.018045,0.0,0.0,0.0,0.0,0.015609,0.008753
B-PER,0.665587,0.000231,0.000694,0.0,0.0,0.0,0.333256,0.0,0.0
B-MISC,0.0,0.004602,0.002761,0.0,0.577543,0.0,0.411873,0.003221,0.0


### Matriz de emision

In [15]:
len(sent2tokens(train_sents[2]))
all_sentences = []
for s in range(N_sentences):
    all_sentences += sent2tokens(train_sents[s])
    
len(all_sentences)
#dic = dict()
all_sentences = set(all_sentences) #set
all_sentences = list(all_sentences)
N_WORDS = len(all_sentences)

data = np.zeros([N_STATES, N_WORDS], dtype = float)
emissions = pd.DataFrame(data, index= states.keys(),columns=all_sentences)


#states = dic.fromkeys(states,0)

In [16]:

def cal_emissions():
    for index in range(N_sentences):
        sentence = sent2tokens(train_sents[index])
        tags = sent2labels(train_sents[index])

        for j in range(len(tags)):
            emissions[sentence[j]][tags[j]] += 1
    
    for idx in emissions.index:
        for col in emissions.columns:
            if states[idx] != 0:
                emissions[col][idx] /= states[idx]
            #print(emissions[sentence[j]][tags[j]])

In [17]:
cal_emissions()
emissions

Unnamed: 0,14.95,graneles,jersey,"7,9",Cuarenta,incurrir,navío,enunciar,dedicarán,derechas,...,14.48,cuentos,capos,agresores,argumentó,MIA,amplias,blanquiverde,realizadas,Gallardon
I-PER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000256
B-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000135,0.0,0.0,0.0,0.0
B-LOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-ORG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-MISC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-LOC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O,9e-06,1.3e-05,4e-06,1.3e-05,4e-06,4e-06,9e-06,4e-06,9e-06,4e-06,...,4e-06,9e-06,4e-06,9e-06,9e-06,0.0,4e-06,1.3e-05,4.3e-05,0.0
B-PER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-MISC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def transition(ei_row,ef_col):
    return transitions[ef_col][ei_row]

def emission(ei_row,ef_col):
    return emissions[ef_col][ei_row]

def max_p(f1,f2):
    maximo = f1 if f1 >= f2 else f2
    return maximo

In [19]:
#test
transition('I-LOC','O')

0.5631940772078265

## Algoritmo Viterbi

In [20]:
def viterbi(sequence):
    tags_rpta = []
    #probabilidades iniciales
    for st in states:
        dv_table.at[st,0] = initial_prob[st] + emission(st,sequence[0]) 
        for i in range(1,len(sequence),1):
            for st in states:
                #probabilidades del estado anterior
                col_dv_table = dv_table.loc[:,i-1] #probabilidades P(t-1)
                col_dv_table = np.array(col_dv_table)

                 #probabilidades de transicion S(t-1)->S(t)
                row_transition = transitions.loc[st,:]
                row_transition = np.array(row_transition)

                m = col_dv_table + row_transition
                p = emission(st,sequence[i]) + np.max(m)
                dv_table.at[st,i] = p
    
    for i in range(len(sequence)):
        col = dv_table.loc[dv_table[i].idxmax()]
        tags_rpta.append(col.name)
    return tags_rpta


## Test

In [21]:
idx=2
#sequence = sent2tokens(train_sents[idx])
sequence = ['El', 'Abogado', 'General', 'del', 'Estado']
l_seq = len(sequence)
data = np.zeros([N_STATES,l_seq])
dv_table = pd.DataFrame(data,index=states,columns=np.arange(l_seq))

#print(sequence)
#print(sent2labels(train_sents[idx]))

In [22]:
tags = viterbi(sequence)
tags


['O', 'B-LOC', 'B-PER', 'B-LOC', 'B-LOC']

In [23]:
#tabla dinamica generada
dv_table

Unnamed: 0,0,1,2,3,4
I-PER,0.000512,1.534738,1.587356,2.33689,2.960931
B-ORG,0.036782,1.491632,1.585054,2.28456,2.926528
B-LOC,0.080329,1.580182,1.585469,2.37311,3.005606
I-ORG,0.002204,1.249209,1.58479,2.230573,2.677438
I-MISC,0.002491,1.188861,1.583607,2.230213,2.616153
I-LOC,0.00012,1.368493,1.581769,2.230468,2.793917
O,0.805299,0.805299,1.598227,2.230723,2.391159
B-PER,0.073255,1.13925,2.200325,2.252943,3.002476
B-MISC,0.020702,1.217172,1.766403,2.203546,2.808676
