In [20]:
import os
import pickle
import numpy as np
import tensorflow as tf

# Import project modules
from conllu_reader import ConlluReader
from algorithm import ArcEager, Transition, Sample
from conllu_token import Token
from model import ParserMLP

# Optional sanity check: report the TensorFlow build in use and how many
# GPU devices it can see. Nothing below depends on the result.
print("TensorFlow Version:", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

TensorFlow Version: 2.17.0
Num GPUs Available:  1


In [21]:
# Load the pre-built oracle dataset and expose its four entries
# (training_samples, dev_samples, deprels, actions) as notebook-level names.
dataset_path = "dataset.pkl"

# Fail fast: merely printing a message here would leave training_samples,
# dev_samples, deprels and actions undefined, and the notebook would only
# break later with a confusing NameError in a downstream cell.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(
        f"'{dataset_path}' not found. Please run main.py first to generate it."
    )

print(f"Loading dataset from {dataset_path}...")
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a dataset.pkl that you generated yourself (via main.py).
with open(dataset_path, "rb") as f:
    data = pickle.load(f)

training_samples = data["training_samples"]
dev_samples = data["dev_samples"]
deprels = data["deprels"]
actions = data["actions"]

print("Dataset loaded successfully!")
print(f"Training Samples: {len(training_samples)}")
print(f"Development Samples: {len(dev_samples)}")
print(f"Unique Dependency Labels: {len(deprels)}")

Loading dataset from dataset.pkl...
Dataset loaded successfully!
Training Samples: 81182
Development Samples: 4978
Unique Dependency Labels: 43


In [22]:
# Inspect a single training sample. Note that this is the sample at index
# 80000, even though the banner printed below is labelled "Sample 1".
sample = training_samples[80000]

print("--- Sample 1 ---")
# The parser state is read directly (stack S, buffer B) purely for display.
stack_forms = [t.form for t in sample.state.S]
print(f"State Stack: {stack_forms}")
buffer_head_forms = [t.form for t in sample.state.B[:3]]
print(f"State Buffer (first 3): {buffer_head_forms}...")
print(f"Gold Transition: {sample.transition}")

--- Sample 1 ---
State Stack: ['ROOT', 'The']
State Buffer (first 3): ['revolution', 'in', 'the']...
Gold Transition: LEFT-ARC-det


In [23]:
# Turn the sample inspected above into its feature list, requesting three
# stack positions and three buffer positions.
feature_window = {"nstack_feats": 3, "nbuffer_feats": 3}
feats = sample.state_to_feats(**feature_window)

print("Feature List:", feats)

# Layout observed in the recorded run (12 entries in total):
#   3 stack word forms, 3 buffer word forms,
#   then the UPOS tags for those same 6 positions, in the same order.
# Stack entries start from the last element of the printed stack, and
# positions with no token appear as '<PAD>'.

Feature List: ['The', 'ROOT', '<PAD>', 'revolution', 'in', 'the', 'DET', 'ROOT_UPOS', '<PAD>', 'NOUN', 'ADP', 'DET']


In [24]:
# Hyperparameters for the MLP parser; adjust dimensions and epochs here.
hyperparams = dict(
    word_emb_dim=200,
    hidden_dim=512,
    epochs=50,
    batch_size=64,
    learning_rate=0.0005,
)
model = ParserMLP(**hyperparams)

print("Starting training...")
# train() is given the raw samples; the recorded log shows it building the
# vocabulary and vectorizing the data itself before fitting.
# NOTE(review): no random seed is set in the visible cells, so metrics may
# differ from run to run.
model.train(training_samples, dev_samples)

Starting training...
Building vocabulary...
Vocab built: 6872 words, 20 UPOS tags.
Detected Feature Shape: 4 words, 4 tags.
Vectorizing data...
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 words_input (InputLayer)    [(None, 4)]                  0         []                            
                                                                                                  
 upos_input (InputLayer)     [(None, 4)]                  0         []                            
                                                                                                  
 embedding_4 (Embedding)     (None, 4, 200)               1374400   ['words_input[0][0]']         
                                                                                                  
 embedding_5 (Embedding)     (None, 4, 50)     

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring f

   3/1269 [..............................] - ETA: 1:40 - loss: 4.1836 - action_output_loss: 1.3795 - deprel_output_loss: 3.7388 - action_output_accuracy: 0.3281 - deprel_output_accuracy: 0.0938

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


   6/1269 [..............................] - ETA: 1:27 - loss: 4.1542 - action_output_loss: 1.3715 - deprel_output_loss: 3.7102 - action_output_accuracy: 0.3646 - deprel_output_accuracy: 0.2760

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  10/1269 [..............................] - ETA: 1:34 - loss: 4.1112 - action_output_loss: 1.3634 - deprel_output_loss: 3.6638 - action_output_accuracy: 0.3750 - deprel_output_accuracy: 0.3797

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  14/1269 [..............................] - ETA: 1:27 - loss: 4.0608 - action_output_loss: 1.3497 - deprel_output_loss: 3.6148 - action_output_accuracy: 0.4163 - deprel_output_accuracy: 0.4129

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  18/1269 [..............................] - ETA: 1:24 - loss: 4.0019 - action_output_loss: 1.3383 - deprel_output_loss: 3.5513 - action_output_accuracy: 0.4245 - deprel_output_accuracy: 0.4271

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  23/1269 [..............................] - ETA: 1:19 - loss: 3.9048 - action_output_loss: 1.3226 - deprel_output_loss: 3.4429 - action_output_accuracy: 0.4436 - deprel_output_accuracy: 0.4416

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  29/1269 [..............................] - ETA: 1:15 - loss: 3.7486 - action_output_loss: 1.2938 - deprel_output_loss: 3.2731 - action_output_accuracy: 0.4693 - deprel_output_accuracy: 0.4515

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  37/1269 [..............................] - ETA: 1:10 - loss: 3.5294 - action_output_loss: 1.2469 - deprel_output_loss: 3.0433 - action_output_accuracy: 0.4970 - deprel_output_accuracy: 0.4662

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  43/1269 [>.............................] - ETA: 1:05 - loss: 3.4028 - action_output_loss: 1.2004 - deprel_output_loss: 2.9364 - action_output_accuracy: 0.5214 - deprel_output_accuracy: 0.4644

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  49/1269 [>.............................] - ETA: 1:02 - loss: 3.2638 - action_output_loss: 1.1575 - deprel_output_loss: 2.8083 - action_output_accuracy: 0.5402 - deprel_output_accuracy: 0.4707

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  66/1269 [>.............................] - ETA: 54s - loss: 2.9353 - action_output_loss: 1.0375 - deprel_output_loss: 2.5304 - action_output_accuracy: 0.5874 - deprel_output_accuracy: 0.4858

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  71/1269 [>.............................] - ETA: 54s - loss: 2.8744 - action_output_loss: 1.0171 - deprel_output_loss: 2.4763 - action_output_accuracy: 0.5984 - deprel_output_accuracy: 0.4890

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


  80/1269 [>.............................] - ETA: 52s - loss: 2.7548 - action_output_loss: 0.9738 - deprel_output_loss: 2.3746 - action_output_accuracy: 0.6170 - deprel_output_accuracy: 0.5008

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


 120/1269 [=>............................] - ETA: 43s - loss: 2.3928 - action_output_loss: 0.8359 - deprel_output_loss: 2.0759 - action_output_accuracy: 0.6740 - deprel_output_accuracy: 0.5375

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


 125/1269 [=>............................] - ETA: 6:09 - loss: 2.3527 - action_output_loss: 0.8206 - deprel_output_loss: 2.0429 - action_output_accuracy: 0.6806 - deprel_output_accuracy: 0.5430

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring f

 246/1269 [====>.........................] - ETA: 2:58 - loss: 1.8252 - action_output_loss: 0.6562 - deprel_output_loss: 1.5586 - action_output_accuracy: 0.7480 - deprel_output_accuracy: 0.6252

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)




'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)




'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)




'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


Epoch 2/50

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)


Epoch 3/50

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)




'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)




In [25]:
from conllu_reader import ConlluReader

# Create the CoNLL-U reader (a fresh instance is created every time this
# cell runs).
reader = ConlluReader()

# Read the test sentences. inference=True is passed so that, per the original
# author's note, the file is treated as unparsed input.
test_path = "en_partut-ud-test_clean.conllu"
print("Loading test set...")
test_trees = reader.read_conllu_file(test_path, inference=True)
print(f"Loaded {len(test_trees)} sentences for testing.")

Loading test set...
Loaded 153 sentences for testing.


In [26]:
# Display the test sentence at index 1 to eyeball how its tokens were loaded.
test_trees[1]

[0	ROOT	ROOT	ROOT_UPOS	ROOT_CPOS	ROOT_FEATS	_	_	_	_,
 1	Any	any	DET	DI	PronType=Ind	_	_	_	_,
 2	use	use	NOUN	S	Number=Sing	_	_	_	_,
 3	of	of	ADP	E	_	_	_	_	_,
 4	the	the	DET	RD	Definite=Def|PronType=Art	_	_	_	_,
 5	work	work	NOUN	S	Number=Sing	_	_	_	_,
 6	other	other	ADJ	A	Degree=Pos	_	_	_	_,
 7	than	than	SCONJ	CS	_	_	_	_	_,
 8	as	as	ADP	E	_	_	_	_	_,
 9	authorized	authorize	VERB	V	Tense=Past|VerbForm=Part	_	_	_	_,
 10	under	under	ADP	E	_	_	_	_	_,
 11	this	this	DET	DD	Number=Sing|PronType=Dem	_	_	_	_,
 12	license	license	NOUN	S	Number=Sing	_	_	_	_,
 13	or	or	CCONJ	CC	_	_	_	_	_,
 14	copyright	copyright	NOUN	S	Number=Sing	_	_	_	_,
 15	law	law	NOUN	S	Number=Sing	_	_	_	_,
 16	is	be	AUX	VA	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	_	_	_	_,
 17	prohibited	prohibit	VERB	V	Tense=Past|VerbForm=Part	_	_	_	_,
 18	.	.	PUNCT	FS	_	_	_	_	_]

In [27]:
# Run the trained parser over the loaded test sentences.
print("Running inference on the test set...")
# The value returned by model.run() is what gets written to disk later.
# NOTE(review): whether run() also mutates test_trees in place is not visible
# from this notebook -- confirm in model.py before reusing test_trees.
# Per the original author, run() predicts HEAD and DEPREL for each token.
predicted_test_trees = model.run(test_trees)

print("Inference complete.")

Running inference on the test set...
Inference complete.


In [28]:
# Write the parser's predictions, before any post-processing, to a CoNLL-U file.
# NOTE(review): the file name says "three_feature", while the recorded training
# log reports "4 words, 4 tags" -- confirm the name matches the configuration.
output_path = "output_three_feature_raw.conllu"

print(f"Saving raw predictions to {output_path}...")
reader.write_conllu_file(output_path, predicted_test_trees)
print("File saved.")

Saving raw predictions to output_three_feature_raw.conllu...
File saved.


In [29]:
from postprocessor import PostProcessor

# Run the PostProcessor over the raw prediction file on disk and write the
# trees it returns to a second CoNLL-U file.
post = PostProcessor()
final_output_path = "output_three_feature_fixed.conllu"

print(f"Post-processing predictions in {output_path}...")
fixed_trees = post.postprocess(output_path)
reader.write_conllu_file(final_output_path, fixed_trees)

print(f"Post-processing complete. Final predictions saved to {final_output_path}")

Post-processing predictions in output_three_feature_raw.conllu...
Post-processing complete. Final predictions saved to output_three_feature_fixed.conllu


In [30]:
# Score the post-processed predictions with the conll18_ud_eval.py script.
# Arguments: the gold-standard file (test_clean) first, then the fixed system output.
# -v requests verbose output (the recorded run prints a per-metric table).
!python conll18_ud_eval.py en_partut-ud-test_clean.conllu output_three_feature_fixed.conllu -v

Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |    100.00 |    100.00 |    100.00 |    100.00
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |    100.00 |    100.00 |    100.00 |    100.00
AllTags    |    100.00 |    100.00 |    100.00 |    100.00
Lemmas     |    100.00 |    100.00 |    100.00 |    100.00
UAS        |     76.73 |     76.73 |     76.73 |     76.73
LAS        |     66.90 |     66.90 |     66.90 |     66.90
CLAS       |     52.26 |     51.88 |     52.07 |     51.88
MLAS       |     50.45 |     50.08 |     50.27 |     50.08
BLEX       |     52.26 |     51.88 |     52.07 |     51.88
