In [None]:
# =============================================
# extract_features_from_xml.ipynb - Youngjun Yu
# =============================================

"""
Usage:
    - Run this notebook to see how XML data is read and stored.
    - The function parse(file_path) returns a list of sentences,
      where each sentence is a list of (orth, ctag) pairs.
"""

import gzip
import xml.etree.ElementTree as ET

def _element_text(elem):
    """Return the stripped text of *elem*, or '' if the element or its text is missing."""
    # ElementTree sets .text to None for empty elements such as <orth/>.
    if elem is None or elem.text is None:
        return ""
    return elem.text.strip()


def parse(file_path):
    """
    Parses an XCES XML file (gzipped) and extracts sentences.

    Every <chunk> element found anywhere below the root is treated as a
    candidate sentence; only its direct <tok> children are read. For each
    token the <orth> text and the <ctag> text of the first <lex> child are
    collected. Tokens lacking any of these elements, or whose orth/ctag text
    is empty, are skipped, and chunks that end up with no tokens are dropped.

    Args:
        file_path (str): Path to the .xml.gz file (relative path).

    Returns:
        list of list of (str, str): A list of sentences;
                                    each sentence is a list of (orth, ctag) tuples.

    Raises:
        OSError: Propagated from gzip.open when the file cannot be read.
        xml.etree.ElementTree.ParseError: Propagated from ET.parse when the
            decompressed content is not well-formed XML.
    """
    # Binary mode hands raw bytes to the XML parser so that it honours the
    # encoding declared in the document instead of a hard-coded one.
    with gzip.open(file_path, 'rb') as f:
        root = ET.parse(f).getroot()

    # The tree is fully in memory here, so the file no longer needs to be open.
    sentences = []
    for chunk in root.findall('.//chunk'):
        sentence_tokens = []
        for tok in chunk.findall('tok'):
            lex_elem = tok.find('lex')  # only the first <lex> is consulted
            if lex_elem is None:
                continue

            orth = _element_text(tok.find('orth'))
            pos_tag = _element_text(lex_elem.find('ctag'))
            if orth and pos_tag:
                sentence_tokens.append((orth, pos_tag))

        if sentence_tokens:
            sentences.append(sentence_tokens)
    return sentences

if __name__ == "__main__":
    # Gzipped corpus files, given as relative paths.
    train_file, validate_file = "../Data/train.xml.gz", "../Data/validate.xml.gz"

    # Load the training split and report how many sentences it holds.
    train_sentences = parse(train_file)
    n_train = len(train_sentences)
    print(f"Number of sentences in train set: {n_train}")

    # Same for the validation split.
    validate_sentences = parse(validate_file)
    n_validate = len(validate_sentences)
    print(f"Number of sentences in validation set: {n_validate}")

    # Show the first training sentence, one (orth, ctag) pair per line.
    if n_train > 0:
        print("\nExample of first sentence (orth, ctag) pairs:")
        for token_pos in train_sentences[0]:
            print(token_pos)


Number of sentences in train set: 51397
Number of sentences in validation set: 17133

Example of first sentence (orth, ctag) pairs:
('Zabiję', 'fin:sg:pri:perf')
('cię', 'ppron12:sg:acc:m1:sec:nakc')
(',', 'interp')
('jeśli', 'comp')
('umrzesz', 'fin:sg:sec:perf')
('!', 'interp')
('"', 'interp')
