In [17]:
# Load the POS-tagged corpus from which the HMM transition and emission
# matrices are estimated in the following cells.
#
# A Hidden Markov Model (HMM) is a statistical model for sequential data such as
# text or speech. The system moves through hidden states over time, and the
# observer only sees outputs that depend on those hidden states.
# HMMs are particularly useful for part-of-speech tagging, speech recognition,
# and machine translation.
#
# Expected file format: one sentence per line, tokens separated by whitespace,
# each token written as `word/TAG`.

# Explicit encoding so the corpus decodes the same way on every platform
# instead of depending on the locale default.
with open('tagged_sentences.txt', 'r', encoding='utf-8') as f:
    content = f.read()

sentences = content.strip().split('\n')

tagged_sentences = []
for sentence in sentences:
    word_pos_pairs = sentence.split()
    if not word_pos_pairs:
        # Blank line (or an empty file): not a sentence, so do not record an
        # empty entry that would inflate the sentence count.
        continue
    tagged_sentence = []
    for pair in word_pos_pairs:
        # Split on the LAST slash so words that themselves contain '/' keep it.
        word, separator, pos = pair.rpartition('/')
        if not separator:
            raise ValueError(
                f"Token {pair!r} is not in 'word/TAG' form (in sentence: {sentence!r})"
            )
        tagged_sentence.append((word, pos))
    tagged_sentences.append(tagged_sentence)

print(f"Loaded {len(tagged_sentences)} tagged sentences.")
if tagged_sentences:
    print("First sentence example:", tagged_sentences[0])

Loaded 2 tagged sentences.
First sentence example: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('dummy', 'JJ'), ('sentence', 'NN'), ('.', '.')]


In [18]:
# Count tag bigrams: transition_counts[tag][following_tag] is the number of
# times following_tag comes directly after tag inside a sentence. Pairs are
# formed per sentence, so nothing is counted across sentence boundaries.
transition_counts = {}

for tagged in tagged_sentences:
    # Pair each token with its successor; zip stops at the shorter slice, so
    # empty and single-token sentences contribute no pairs.
    for left, right in zip(tagged, tagged[1:]):
        tag_here, tag_after = left[1], right[1]
        successors = transition_counts.setdefault(tag_here, {})
        successors[tag_after] = successors.get(tag_after, 0) + 1

print("Tag transition counts:")
print(transition_counts)

Tag transition counts:
{'DT': {'VBZ': 1, 'JJ': 1, 'NN': 1}, 'VBZ': {'DT': 1}, 'JJ': {'NN': 1}, 'NN': {'.': 1, 'RB': 1}, 'RB': {'.': 1}}


In [19]:
# Turn the raw bigram counts into relative frequencies: each row is divided by
# the total number of transitions leaving that tag. Only observed transitions
# get an entry.
transition_probabilities = {}

for source_tag, successor_counts in transition_counts.items():
    row_total = sum(successor_counts.values())
    transition_probabilities[source_tag] = {
        target_tag: occurrences / row_total
        for target_tag, occurrences in successor_counts.items()
    }

print("\nTag transition probabilities:")
print(transition_probabilities)


Tag transition probabilities:
{'DT': {'VBZ': 0.3333333333333333, 'JJ': 0.3333333333333333, 'NN': 0.3333333333333333}, 'VBZ': {'DT': 1.0}, 'JJ': {'NN': 1.0}, 'NN': {'.': 0.5, 'RB': 0.5}, 'RB': {'.': 1.0}}


In [20]:
# Count emissions: emission_counts[tag][word] is the number of times `word`
# appears with `tag` in the corpus. Words are used exactly as stored, with no
# case folding or other normalization.
emission_counts = {}

for tagged in tagged_sentences:
    for token, label in tagged:
        words_for_label = emission_counts.setdefault(label, {})
        words_for_label[token] = words_for_label.get(token, 0) + 1

print("Word-tag emission counts:")
print(emission_counts)

Word-tag emission counts:
{'DT': {'This': 1, 'a': 1, 'Another': 1}, 'VBZ': {'is': 1}, 'JJ': {'dummy': 1}, 'NN': {'sentence': 1, 'example': 1}, '.': {'.': 2}, 'RB': {'here': 1}}


In [21]:
# Turn the raw emission counts into relative frequencies: each word count is
# divided by the total number of occurrences of its tag. Only observed
# (tag, word) pairs get an entry.
emission_probabilities = {}

for label, counts_by_word in emission_counts.items():
    label_total = sum(counts_by_word.values())
    emission_probabilities[label] = {
        token: occurrences / label_total
        for token, occurrences in counts_by_word.items()
    }

print("\nWord-tag emission probabilities:")
print(emission_probabilities)


Word-tag emission probabilities:
{'DT': {'This': 0.3333333333333333, 'a': 0.3333333333333333, 'Another': 0.3333333333333333}, 'VBZ': {'is': 1.0}, 'JJ': {'dummy': 1.0}, 'NN': {'sentence': 0.5, 'example': 0.5}, '.': {'.': 1.0}, 'RB': {'here': 1.0}}
