# Hidden Markov Model for Named Entity Recognition

Obtained data from https://www.clips.uantwerpen.be/conll2002/ner/

## Preprocessing data

Read data downloaded and parsing data

In [1]:
def parsingData(text):
    words = []
    labels = []
    
    for line in text:
        if line != "":
            word, label = line.split(" ")
            words.append(word)
            labels.append(label)
    
    return words, labels

In [2]:
data = open("data/esp.train", "r", encoding='utf8',errors="ignore")

In [3]:
text  = data.read().split("\n")

In [4]:
x_train, y_train = parsingData(text)

## Trainning

Create probability distribution

In [5]:
def distribution(x_train, y_train):
    res = {}
    
    for i in range(len(y_train)):
        if x_train[i] not in res:
            res[x_train[i]] = {}
        
        if y_train[i] not in res[x_train[i]]:
                res[x_train[i]][y_train[i]]  = 1
                
        else:
            res[x_train[i]][y_train[i]] += 1 
    
    
    for key, val in res.items():
        sumLabels = sum(val.values())
        for k, v in val.items():
            res[key][k] = v / sumLabels
            
    return res

In [6]:
prob_dist = distribution(x_train, y_train)

Obtención de la etiqueta que hace máxima a la distribución de probabilidad dada la palabra

In [7]:
def predict(word, prob_dist):
    if word in prob_dist:
        maxVal = max(prob_dist[word].values())
        for k, v in prob_dist[word].items():
            if v == maxVal:
                return k
    else:
        return '0'

In [8]:
predict('CARLOS', prob_dist)

'B-PER'

## Testing

calculate correct predict labels

In [9]:
def testing(input_test, label_test, prob_dist):
    n   = len(input_test)
    acc = 0.
    
    for i in range(n):
        if predict(input_test[i], prob_dist) == label_test[i]:
            acc += 1
            
    return acc/n

### Dataset A

Using dataset esp.testa

In [10]:
data = open("data/esp.testa", "r", encoding='utf8',errors="ignore")
text_a  = data.read().split("\n")
data.close()

In [11]:
x_test, y_test = parsingData(text_a)

In [12]:
acc_a = testing(x_test, y_test, prob_dist)
print("Accuracy for 'esp.testa' : {:4.3f}".format(acc_a))

Accuracy for 'esp.testa' : 0.870


### Dataset B

Using dataset esp.testb

In [13]:
data = open("data/esp.testb", "r", encoding='utf8',errors="ignore")
text_b  = data.read().split("\n")
data.close()

In [14]:
x_test, y_test = parsingData(text_b)

In [15]:
acc_b = testing(x_test, y_test, prob_dist)
print("Accuracy for 'esp.testa' : {:4.3f}".format(acc_b))

Accuracy for 'esp.testa' : 0.898
