# Part 1

In [59]:
import data_helpers
import hmm

import os
from sklearn.metrics import confusion_matrix, accuracy_score
import pandas as pd
from collections import Counter, defaultdict
import math
from tqdm import tqdm


In [60]:
# Fetch the Czech (cs_pdtc) and English (en_gum) UD treebank files that the
# rest of this notebook reads, via the project helper.
data_helpers.download_all_datasets()


✓ Already exists: cs_pdtc-ud-train-la.conllu
✓ Already exists: cs_pdtc-ud-train-ca.conllu
✓ Already exists: cs_pdtc-ud-dev.conllu
✓ Already exists: cs_pdtc-ud-test.conllu
✓ Already exists: en_gum-ud-train.conllu
✓ Already exists: en_gum-ud-dev.conllu
✓ Already exists: en_gum-ud-test.conllu

Total: 4/4 Czech files, 3/3 English files


In [61]:
# Reference line counts per treebank file; compared further down against
# data_helpers.count_valid_lines on the filtered copies.
expected_counts = dict([
    ('en_gum-ud-train.conllu', 177410),
    ('cs_pdtc-ud-dev.conllu', 384431),
    ('cs_pdtc-ud-test.conllu', 305808),
])

In [62]:
# Write a filtered copy of each treebank file into ud_data_filtered/ and
# report how the valid-line count changed, so the effect of filtering is
# visible in the notebook output.
os.makedirs('ud_data_filtered', exist_ok=True)
files_to_filter = [
    'en_gum-ud-train.conllu',
    'en_gum-ud-dev.conllu',
    'en_gum-ud-test.conllu',
    'cs_pdtc-ud-train-la.conllu',
    'cs_pdtc-ud-train-ca.conllu',
    'cs_pdtc-ud-dev.conllu',
    'cs_pdtc-ud-test.conllu'
]
for file in files_to_filter:
    input_path = os.path.join('ud_data', file)
    output_path = os.path.join('ud_data_filtered', file)

    if not os.path.exists(input_path):
        # Surface missing inputs instead of skipping them silently: later
        # cells read these files from ud_data_filtered/.
        print(f"File not found: {input_path}")
        continue

    original_count = data_helpers.count_valid_lines(input_path)
    data_helpers.filter_valid_lines(input_path, output_path)
    filtered_count = data_helpers.count_valid_lines(output_path)
    removed = original_count - filtered_count
    print(f"{file}: {original_count:,} -> {filtered_count:,} valid lines ({removed:,} removed)")


In [51]:
# Compare the valid-line count of every filtered file with its expected value.
# Each report names the file and flags mismatches, in the same format as the
# word/tag pair check later in this notebook.
for file, expected in expected_counts.items():
    file_path = os.path.join('ud_data_filtered', file)
    if not os.path.exists(file_path):
        # A missing file is reported rather than silently skipped.
        print(f"File not found: {file_path}")
        continue
    actual = data_helpers.count_valid_lines(file_path)
    status = "✓ MATCH" if actual == expected else "✗ MISMATCH"
    print(f"{status} {file}")
    print(f"  Expected: {expected:,}")
    print(f"  Actual:   {actual:,}")


  Expected: 177,410
  Actual:   177,410
  Expected: 384,431
  Actual:   21,752
  Expected: 305,808
  Actual:   22,844


In [52]:
# Truncate the filtered Czech test set to 1500 sentences.
# The helper writes into a temporary sibling file, which then replaces the
# original so the file name used by later cells stays the same.
cs_test_input = os.path.join('ud_data_filtered', 'cs_pdtc-ud-test.conllu')
cs_test_output = cs_test_input + '.tmp'

data_helpers.truncate_to_n_sentences(cs_test_input, cs_test_output, 1500)
os.replace(cs_test_output, cs_test_input)

✓ Czech test data truncated to 1500 sentences


In [53]:
# Truncate the filtered Czech dev set to 1500 sentences.
# The helper writes into a temporary sibling file, which then replaces the
# original so the file name used by later cells stays the same.
cs_dev_input = os.path.join('ud_data_filtered', 'cs_pdtc-ud-dev.conllu')
cs_dev_output = cs_dev_input + '.tmp'

data_helpers.truncate_to_n_sentences(cs_dev_input, cs_dev_output, 1500)
os.replace(cs_dev_output, cs_dev_input)

✓ Czech dev data truncated to 1500 sentences


In [54]:
# Expected number of (word, tag) pairs in each filtered file.
test_files_expected = dict([
    ('en_gum-ud-test.conllu', 28397),
    ('cs_pdtc-ud-test.conllu', 22844),
    ('cs_pdtc-ud-train-la.conllu', 218409),
    ('en_gum-ud-train.conllu', 177410),
])

# Report, per file, whether the counted pairs agree with the expectation.
for file, expected in test_files_expected.items():
    file_path = os.path.join('ud_data_filtered', file)
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    actual = data_helpers.count_word_tag_pairs(file_path)
    if actual == expected:
        status = "✓ MATCH"
    else:
        status = "✗ MISMATCH"
    print(f"{status} {file}: Expected {expected:,}, Actual {actual:,}")


✓ MATCH en_gum-ud-test.conllu: Expected 28,397, Actual 28,397
✓ MATCH cs_pdtc-ud-test.conllu: Expected 22,844, Actual 22,844
✓ MATCH cs_pdtc-ud-train-la.conllu: Expected 218,409, Actual 218,409
✓ MATCH en_gum-ud-train.conllu: Expected 177,410, Actual 177,410


In [9]:
import stanza

# Fetch the Stanza model packages for English and Czech.
for _stanza_lang in ('en', 'cs'):
    stanza.download(_stanza_lang)

def tag_with_stanza(sentences, lang_code):
    """Tag pre-tokenized sentences with Stanza and pair predictions with gold tags.

    Args:
        sentences: iterable of sentences; each sentence is a sequence of dicts
            providing at least the keys 'word' and 'gold_tag'.
        lang_code: Stanza language code, e.g. 'en' or 'cs'.

    Returns:
        A tuple ``(gold_tags, pred_tags)`` of two flat lists of equal length,
        aligned token by token across all sentences. ``pred_tags`` holds the
        ``upos`` value Stanza assigned to each word.

    Raises:
        ValueError: if Stanza returns a different number of words than the
            sentence has tokens, since gold and predicted tags could then no
            longer be aligned.
    """
    # tokenize_pretokenized=True makes Stanza keep the given token boundaries.
    nlp = stanza.Pipeline(lang_code, processors='tokenize,pos',
                         tokenize_pretokenized=True)
    gold_tags = []
    pred_tags = []

    print(f"\nTagging {lang_code.upper()} sentences...")
    for sentence in tqdm(sentences):
        if not sentence:
            # Nothing to tag; an empty sentence contributes no tokens.
            continue

        tokens = [token['word'] for token in sentence]
        doc = nlp([tokens])
        words = doc.sentences[0].words

        # Fail loudly instead of silently dropping or misaligning gold tags.
        if len(words) != len(sentence):
            raise ValueError(
                f"Stanza returned {len(words)} words for a sentence of "
                f"{len(sentence)} tokens; gold and predicted tags cannot be aligned"
            )

        gold_tags.extend(token['gold_tag'] for token in sentence)
        pred_tags.extend(word.upos for word in words)
    return gold_tags, pred_tags

# Load the filtered English and Czech test sets.
en_sentences = data_helpers.read_conllu_sentences('ud_data_filtered/en_gum-ud-test.conllu')
cs_sentences = data_helpers.read_conllu_sentences('ud_data_filtered/cs_pdtc-ud-test.conllu')

# Size of each test set in sentences and in tokens.
en_token_count = sum(map(len, en_sentences))
cs_token_count = sum(map(len, cs_sentences))

print(f"English sentences: {len(en_sentences)}\nCzech sentences: {len(cs_sentences)}")
print(f"English tokens: {en_token_count}\nCzech tokens: {cs_token_count}")

# Tag both test sets with Stanza and score the predictions against gold tags.
en_gold, en_pred = tag_with_stanza(en_sentences, 'en')
cs_gold, cs_pred = tag_with_stanza(cs_sentences, 'cs')

en_accuracy, cs_accuracy = (
    accuracy_score(en_gold, en_pred),
    accuracy_score(cs_gold, cs_pred),
)

Downloading Stanza models...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2026-01-18 12:06:40 INFO: Downloaded file to C:\Users\micha\stanza_resources\resources.json
2026-01-18 12:06:40 INFO: Downloading default packages for language: en (English) ...
2026-01-18 12:06:41 INFO: File exists: C:\Users\micha\stanza_resources\en\default.zip
2026-01-18 12:06:44 INFO: Finished downloading models and saved to C:\Users\micha\stanza_resources


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2026-01-18 12:06:44 INFO: Downloaded file to C:\Users\micha\stanza_resources\resources.json
2026-01-18 12:06:44 INFO: Downloading default packages for language: cs (Czech) ...
2026-01-18 12:06:45 INFO: File exists: C:\Users\micha\stanza_resources\cs\default.zip
2026-01-18 12:06:46 INFO: Finished downloading models and saved to C:\Users\micha\stanza_resources
2026-01-18 12:06:46 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Reading test data...
English sentences: 1464
Czech sentences: 1500
English tokens: 28397
Czech tokens: 22844


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2026-01-18 12:06:46 INFO: Downloaded file to C:\Users\micha\stanza_resources\resources.json
2026-01-18 12:06:47 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

2026-01-18 12:06:47 INFO: Using device: cuda
2026-01-18 12:06:47 INFO: Loading: tokenize
2026-01-18 12:06:47 INFO: Loading: pos
2026-01-18 12:06:58 INFO: Done loading processors!



Tagging EN sentences...


100%|██████████| 1464/1464 [01:13<00:00, 19.94it/s]
2026-01-18 12:08:11 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2026-01-18 12:08:11 INFO: Downloaded file to C:\Users\micha\stanza_resources\resources.json
2026-01-18 12:08:12 INFO: Loading these models for language: cs (Czech):
| Processor | Package      |
----------------------------
| tokenize  | pdt          |
| pos       | pdt_nocharlm |

2026-01-18 12:08:12 INFO: Using device: cuda
2026-01-18 12:08:12 INFO: Loading: tokenize
2026-01-18 12:08:12 INFO: Loading: pos
2026-01-18 12:08:14 INFO: Done loading processors!



Tagging CS sentences...


100%|██████████| 1500/1500 [01:39<00:00, 15.08it/s]


In [10]:
# Summarise the Stanza tagging accuracy for both languages.
_results_rule = "=" * 60
print("\n" + _results_rule)
print("RESULTS")
print(_results_rule)
for _language, _accuracy in (("English", en_accuracy), ("Czech", cs_accuracy)):
    print(f"{_language} accuracy: {_accuracy:.4f} ({_accuracy*100:.2f}%)")


RESULTS
English accuracy: 0.9783 (97.83%)
Czech accuracy: 0.9729 (97.29%)


In [12]:
# Confusion matrix of gold vs. Stanza-predicted tags on the English test set.
data_helpers.print_confusion_matrix(en_gold, en_pred, "English")


English Confusion Matrix:


Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
ADJ,1755,2,11,0,0,0,0,34,0,0,0,13,0,0,0,18,0
ADP,0,2833,12,0,2,0,0,0,0,1,0,2,0,15,0,0,0
ADV,37,36,1226,0,0,14,6,8,0,0,3,1,0,4,0,1,0
AUX,0,0,0,1520,0,0,0,0,0,0,0,0,0,0,0,13,0
CCONJ,0,0,0,0,987,0,0,0,0,0,0,0,0,1,0,0,0
DET,0,0,1,0,3,2515,1,0,0,0,1,0,0,0,0,0,0
INTJ,1,4,12,0,1,0,155,2,0,0,0,1,0,3,0,2,3
NOUN,25,0,0,1,0,0,2,4806,3,0,1,59,0,0,0,8,1
NUM,0,1,0,0,0,0,0,0,478,0,0,0,0,0,0,0,0
PART,0,1,0,1,0,1,0,0,0,668,0,0,0,0,0,0,0


In [13]:
# Confusion matrix of gold vs. Stanza-predicted tags on the Czech test set.
data_helpers.print_confusion_matrix(cs_gold, cs_pred, "Czech")


Czech Confusion Matrix:


Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
ADJ,2572,0,1,0,0,0,0,16,1,0,0,11,0,0,0,0,1
ADP,0,2109,0,0,0,0,0,3,0,0,1,0,0,0,0,1,0
ADV,1,2,878,0,16,1,0,10,0,5,3,0,0,0,0,1,0
AUX,0,0,0,774,0,0,0,0,0,0,0,0,0,0,0,2,0
CCONJ,0,0,12,0,731,0,0,0,0,2,0,0,0,11,0,0,0
DET,1,0,1,0,0,916,0,0,0,0,0,0,0,0,0,1,0
INTJ,0,0,0,0,0,0,14,2,0,3,0,1,0,0,0,0,0
NOUN,12,3,7,0,4,0,4,5525,0,0,0,32,0,0,6,3,4
NUM,0,0,0,0,0,0,0,0,616,0,0,0,0,0,0,0,0
PART,0,0,230,0,79,0,2,2,0,160,0,0,0,2,0,0,0


In [None]:
# Per-tag performance of the Stanza tagger on both test sets, computed from the
# gold and predicted tag lists produced by tag_with_stanza.
en_perf = data_helpers.print_per_tag_performance(en_gold, en_pred, "English")
cs_perf = data_helpers.print_per_tag_performance(cs_gold, cs_pred, "Czech")

# Part 2 - supervised HMM

In [None]:
# Train and evaluate a supervised HMM tagger per language. The three paths are
# the train, dev and test treebank files, passed in that order.
en_hmm_files = (
    "ud_data_filtered/en_gum-ud-train.conllu",
    "ud_data_filtered/en_gum-ud-dev.conllu",
    "ud_data_filtered/en_gum-ud-test.conllu",
)
en_acc_hmm, en_cm_hmm = hmm.train_and_eval_hmm(*en_hmm_files, language_name="English")

cs_hmm_files = (
    "ud_data_filtered/cs_pdtc-ud-train-la.conllu",
    "ud_data_filtered/cs_pdtc-ud-dev.conllu",
    "ud_data_filtered/cs_pdtc-ud-test.conllu",
)
cs_acc_hmm, cs_cm_hmm = hmm.train_and_eval_hmm(*cs_hmm_files, language_name="Czech")

# Accuracy summary for both languages.
hmm_summary_lines = [
    "\n" + "="*70,
    "FINAL HMM RESULTS",
    f"English HMM accuracy: {en_acc_hmm:.4f}",
    f"Czech-la HMM accuracy:   {cs_acc_hmm:.4f}",
]
print("\n".join(hmm_summary_lines))



Training HMM tagger for English
Converged after 123 iterations
[English] Tag transition lambdas: [4.2852429869090354e-08, 0.018333266080433657, 0.1259233291667251, 0.8557433619004113]
[English] Emission mus: [0.08769801810227804, 0.00012268744637180853, 0.9121792944513502]

Decoding English test set with Viterbi...


 48%|████▊     | 699/1464 [01:07<01:00, 12.61it/s]

## English supervised HMM confusion matrix

In [None]:
# Display the English confusion matrix returned by hmm.train_and_eval_hmm.
en_cm_hmm

## Czech-la supervised HMM confusion matrix

In [None]:
# Display the Czech (train-la) confusion matrix returned by hmm.train_and_eval_hmm.
cs_cm_hmm

# Part 3 - semi-supervised HMM

In [28]:
# Semi-supervised English run: the helper returns accuracy before and after
# Baum-Welch plus the gold/predicted tag lists for both evaluations.
en_semi_files = (
    "ud_data_filtered/en_gum-ud-train.conllu",
    "ud_data_filtered/en_gum-ud-dev.conllu",
    "ud_data_filtered/en_gum-ud-test.conllu",
)
(
    en_base, en_bw,
    en_base_gold, en_base_pred,
    en_bw_gold, en_bw_pred,
) = hmm.train_and_eval_hmm_semi_supervised(
    *en_semi_files,
    language_name="English",
    n_supervised_pairs=10_000,
    bw_iters=2,
)

_semi_rule = "="*70
print("\n" + _semi_rule)
print("SEMI-SUPERVISED RESULTS")
print(_semi_rule)
print(f"English baseline: {en_base:.4f}  -> after BW: {en_bw:.4f}")

# Confusion matrices for the baseline model and the model after Baum-Welch.
en_cm_baseline = data_helpers.print_confusion_matrix(en_base_gold, en_base_pred, "English Baseline")
en_cm_bw = data_helpers.print_confusion_matrix(en_bw_gold, en_bw_pred, "English After BW")



SEMI-SUPERVISED HMM (Baum-Welch) for English
[English] Supervised sentences: 391
[English] Unlabeled sentences:   9834
Converged after 54 iterations
[English] Initial lambdas: [0.001, 0, 0, 0.999]
[English] Initial mu:      [0.001, 0, 0.999]


Decoding English baseline: 100%|██████████| 1464/1464 [01:36<00:00, 15.20it/s]



[English] Baseline accuracy (10k supervised): 0.7285

[Full Baum-Welch] Iteration 1/1


100%|██████████| 9834/9834 [27:25<00:00,  5.98it/s] 


[Full Baum-Welch] avg -log P(words) per token: 9.2307


Decoding English after BW: 100%|██████████| 1464/1464 [01:21<00:00, 17.86it/s]



[English] After Baum-Welch accuracy: 0.7654

[Full Baum-Welch] Iteration 1/1


100%|██████████| 9834/9834 [25:23<00:00,  6.45it/s]  


[Full Baum-Welch] avg -log P(words) per token: 6.9151


Decoding English after BW: 100%|██████████| 1464/1464 [01:20<00:00, 18.10it/s]



[English] After Baum-Welch accuracy: 0.7695

[Full Baum-Welch] Iteration 1/1


  0%|          | 26/9834 [00:09<1:00:28,  2.70it/s]


KeyboardInterrupt: 

## EN SEMI-SUPERVISED (BASELINE) HMM CONFUSION MATRIX

In [None]:
# Display the confusion matrix of the English semi-supervised baseline
# (the value returned by data_helpers.print_confusion_matrix).
en_cm_baseline

## EN SEMI-SUPERVISED (AFTER BAUM-WELCH) HMM CONFUSION MATRIX

In [None]:
# Display the confusion matrix of the English model after Baum-Welch
# (the value returned by data_helpers.print_confusion_matrix).
en_cm_bw

In [None]:
# Semi-supervised Czech (train-la) run: the helper returns accuracy before and
# after Baum-Welch plus the gold/predicted tag lists for both evaluations.
cs_semi_files = (
    "ud_data_filtered/cs_pdtc-ud-train-la.conllu",
    "ud_data_filtered/cs_pdtc-ud-dev.conllu",
    "ud_data_filtered/cs_pdtc-ud-test.conllu",
)
(
    cs_base, cs_bw,
    cs_base_gold, cs_base_pred,
    cs_bw_gold, cs_bw_pred,
) = hmm.train_and_eval_hmm_semi_supervised(
    *cs_semi_files,
    language_name="Czech",
    n_supervised_pairs=10_000,
    bw_iters=2,
)
print(f"Czech baseline:   {cs_base:.4f}  -> after BW: {cs_bw:.4f}")

# Confusion matrices for the baseline model and the model after Baum-Welch.
cs_cm_baseline = data_helpers.print_confusion_matrix(cs_base_gold, cs_base_pred, "Czech Baseline")
cs_cm_bw = data_helpers.print_confusion_matrix(cs_bw_gold, cs_bw_pred, "Czech After BW")



SEMI-SUPERVISED HMM (Baum-Welch) for Czech
[Czech] Supervised sentences: 624
[Czech] Unlabeled sentences:   11896
Converged after 74 iterations
[Czech] Initial lambdas: [0.001, 0, 0, 0.999]
[Czech] Initial mu:      [0.001, 0, 0.999]


Decoding Czech baseline: 100%|██████████| 1500/1500 [01:12<00:00, 20.79it/s]



[Czech] Baseline accuracy (10k supervised): 0.6968

[Full Baum-Welch] Iteration 1/1


100%|██████████| 11896/11896 [32:25<00:00,  6.11it/s] 


[Full Baum-Welch] avg -log P(words) per token: 10.3929


Decoding Czech after BW: 100%|██████████| 1500/1500 [01:14<00:00, 20.10it/s]



[Czech] After Baum-Welch accuracy: 0.7127

[Full Baum-Welch] Iteration 1/1


100%|██████████| 11896/11896 [30:52<00:00,  6.42it/s] 


[Full Baum-Welch] avg -log P(words) per token: 8.2312


Decoding Czech after BW: 100%|██████████| 1500/1500 [01:03<00:00, 23.72it/s]



[Czech] After Baum-Welch accuracy: 0.7119

[Full Baum-Welch] Iteration 1/1


 37%|███▋      | 4358/11896 [16:40<27:37,  4.55it/s]     

## CZ-LA SEMI-SUPERVISED (BASELINE) HMM CONFUSION MATRIX

In [None]:
# Display the confusion matrix of the Czech (train-la) semi-supervised baseline
# (the value returned by data_helpers.print_confusion_matrix).
cs_cm_baseline

## CZ-LA SEMI-SUPERVISED (AFTER BAUM-WELCH) HMM CONFUSION MATRIX

In [None]:
# Display the confusion matrix of the Czech (train-la) model after Baum-Welch
# (the value returned by data_helpers.print_confusion_matrix).
cs_cm_bw

# BONUS EXPERIMENTS ON A DIFFERENT TRAINING SET (CZECH-CA)

In [26]:
# Supervised HMM trained on the alternative Czech training file (train-ca),
# evaluated with the same dev and test files as the train-la run.
cs_ca_hmm_files = (
    "ud_data_filtered/cs_pdtc-ud-train-ca.conllu",
    "ud_data_filtered/cs_pdtc-ud-dev.conllu",
    "ud_data_filtered/cs_pdtc-ud-test.conllu",
)
cs_ca_acc_hmm, cs_ca_cm_hmm = hmm.train_and_eval_hmm(*cs_ca_hmm_files, language_name="Czech")

_cs_ca_rule = "="*70
print("\n" + _cs_ca_rule)
print("CZECH-CA HMM RESULTS")
print(_cs_ca_rule)
print(f"Czech HMM accuracy:   {cs_ca_acc_hmm:.4f}")


Training HMM tagger for Czech
Converged after 61 iterations
[Czech] Tag transition lambdas: [0.00820063533417119, 0.05062266556872382, 0.251482464340909, 0.689694234756196]
[Czech] Emission mus: [0.27359881036668804, 0.00014568764750449192, 0.7262555019858076]

Decoding Czech test set with Viterbi...


100%|██████████| 1500/1500 [01:05<00:00, 22.96it/s]


[Czech] HMM Accuracy: 0.8269 (82.69%)

CZECH CA HMM RESULTS
Czech HMM accuracy:   0.8269





## CZ-CA SUPERVISED HMM CONFUSION MATRIX

In [27]:
# Display the confusion matrix returned by hmm.train_and_eval_hmm for the
# Czech model trained on train-ca.
cs_ca_cm_hmm

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,SYM,VERB,X
ADJ,1895,165,14,4,1,6,1,305,31,0,5,6,90,2,0,77,0
ADP,0,2068,0,0,0,0,0,3,0,0,38,0,4,0,0,1,0
ADV,25,12,734,2,16,4,0,64,1,2,13,0,18,10,0,16,0
AUX,3,0,0,757,0,0,0,10,0,0,1,0,0,2,0,3,0
CCONJ,3,0,13,0,733,0,0,0,0,1,0,0,0,6,0,0,0
DET,14,6,1,0,0,884,0,9,0,0,2,0,1,1,0,1,0
INTJ,0,0,0,0,0,0,3,14,3,0,0,0,0,0,0,0,0
NOUN,298,138,24,13,12,12,3,4661,75,0,15,15,231,10,0,90,3
NUM,24,3,1,0,0,0,0,25,539,0,0,0,22,0,0,2,0
PART,1,0,167,1,71,4,0,6,0,220,0,0,2,3,0,0,0


In [None]:
# Semi-supervised run on the alternative Czech training file (train-ca): the
# helper returns accuracy before and after Baum-Welch plus the gold/predicted
# tag lists for both evaluations.
cs_ca_base, cs_ca_bw, cs_ca_base_gold, cs_ca_base_pred, cs_ca_bw_gold, cs_ca_bw_pred = hmm.train_and_eval_hmm_semi_supervised(
    "ud_data_filtered/cs_pdtc-ud-train-ca.conllu",
    "ud_data_filtered/cs_pdtc-ud-dev.conllu",
    "ud_data_filtered/cs_pdtc-ud-test.conllu",
    language_name="Czech",
    n_supervised_pairs=10_000,
    bw_iters=2
)
# Report the train-ca results; cs_base / cs_bw belong to the earlier train-la
# run and must not be printed here.
print(f"CZECH-CA baseline:   {cs_ca_base:.4f}  -> after BW: {cs_ca_bw:.4f}")

## CZ-CA SEMI-SUPERVISED (BASELINE) HMM CONFUSION MATRIX

In [None]:
# Confusion matrix of the Czech train-ca baseline. Uses the cs_ca_* tag lists
# from the train-ca run, not cs_base_gold / cs_base_pred from the train-la run.
cs_ca_cm_baseline = data_helpers.print_confusion_matrix(cs_ca_base_gold, cs_ca_base_pred, "Czech ca Baseline")

## CZ-CA SEMI-SUPERVISED (AFTER BAUM-WELCH) HMM CONFUSION MATRIX

In [None]:
# Confusion matrix of the Czech train-ca model after Baum-Welch. Uses the
# cs_ca_* tag lists from the train-ca run, not cs_bw_gold / cs_bw_pred from the
# train-la run.
cs_ca_cm_bw = data_helpers.print_confusion_matrix(cs_ca_bw_gold, cs_ca_bw_pred, "Czech ca After BW")