# **POS** tagger by using **Hidden Markov Models**

# Step 1: Data Exploration

In [1]:
# Importing all the libraries here
import os
import conllu

In [2]:
# file Path to the multiple .conllu datasets
data_paths = ['Data/Practice2/en_gum-ud-dev.conllu', 'Data/Practice2/en_gum-ud-train.conllu', 'Data/Practice2/en_gum-ud-test.conllu']

In [3]:
#Initializing the Global Variables
datas = []
sentence_dataset = []

# Opening the .conllu file
for data_path in data_paths:
    with open(data_path, 'r', encoding='utf-8') as file:
        datas.append(file.read())

# Parsing the data using conllu library
for data in datas:
    sentence_dataset.append(conllu.parse(data))

In [4]:
# Testing the Data
print(f"Number of Sentence Datsets: {len(sentence_dataset)}")

for idx, sentences in enumerate(sentence_dataset):
    print(f"Sentences in Dataset {idx+1}:  {len(sentences)}")

first_sentence = sentence_dataset[2][1]
    
for token in first_sentence:
    print(f"Length of the First Sentence: {len(first_sentence)}")
    print(token.keys())
    print(token.values())
    break

Number of Sentence Datsets: 3
Sentences in Dataset 1:  1117
Sentences in Dataset 2:  8548
Sentences in Dataset 3:  1096
Length of the First Sentence: 8
dict_keys(['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'])
dict_values([1, 'Results', 'result', 'NOUN', 'NNS', {'Number': 'Plur'}, 0, 'root', [('root', 0)], {'Discourse': 'elaboration-additional:2->1:0', 'Entity': '(5-abstract-new-cf3-1-sgl'}])


**Now we will store these sentences into a (token, u-pos) tulpe within our list**

In [5]:
# Initializing Global Variables
total_tokens = []
POS_tags = set()

# Adding the Word and its tag to the Token and POS_tags variable
for sentences in sentence_dataset:
    for sentence in sentences:
        for token in sentence:
            total_tokens.append((token['form'], token['upos']))
            POS_tags.add(token['upos'])

In [6]:
# Testing out our data 
print(f"Number of Tokens: {len(total_tokens)}")
print("The First Five Tokens: ", total_tokens[:5])

print(f"\nNumber of Tags: {len(POS_tags)}")
print("Types of Tags: ", POS_tags)

Number of Tokens: 190278
The First Five Tokens:  [('Introduction', 'NOUN'), ('Research', 'NOUN'), ('on', 'ADP'), ('adult', 'NOUN'), ('-', 'PUNCT')]

Number of Tags: 18
Types of Tags:  {'SCONJ', 'DET', 'SYM', 'PUNCT', 'NUM', 'ADV', 'ADJ', '_', 'ADP', 'PROPN', 'PART', 'CCONJ', 'PRON', 'VERB', 'NOUN', 'X', 'INTJ', 'AUX'}


**Now we will split our tokens into Test and Training Datasets**

In [7]:
# Initializing our Global Variables
train_tokens = []
test_tokens = []

# finding the index form which we will split
train_idx = int(len(total_tokens)*0.8)

#Splitting and assignment
train_tokens = total_tokens[:train_idx]
test_tokens = total_tokens[train_idx:]

print(f"Length of Training Tokens: {len(train_tokens)}")
print(f"Length of Testing Tokens: {len(test_tokens)}")

Length of Training Tokens: 152222
Length of Testing Tokens: 38056


# Step 2: Implementing HMM and Viterbi