In [None]:
# ========================================
# baseline.ipynb - Youngjun Yu
# ========================================

"""
Overall steps:
    1) Parse the training and validation sets.
    2) Build context-window features for each token.
    3) Train two classifiers.
    4) Evaluate classifiers on the validation and test sets.
"""

import gzip
import xml.etree.ElementTree as ET
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# 1. Load and Parse data

def parse(file_path):
    """
    Parse a gzipped XCES XML file and extract its sentences.

    Every <chunk> element below the root is treated as one sentence. For each
    direct <tok> child, the surface form is read from <orth> and the tag from
    the <ctag> of the token's first <lex> element. A token is skipped when
    any of these elements is missing or when its <orth>/<ctag> text is empty
    (e.g. <orth/>); previously an empty element raised AttributeError.
    Chunks that yield no usable token are dropped.

    Args:
        file_path (str): Path to the .xml.gz file (relative path).

    Returns:
        list of list of (str, str): A list of sentences;
                                    each sentence is a list of (orth, ctag) tuples.
    """
    # The tree is fully materialised by ET.parse, so the file handle can be
    # released before the (potentially long) traversal starts.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        root = ET.parse(f).getroot()

    sentences = []
    for chunk in root.findall('.//chunk'):
        sentence_tokens = []
        for tok in chunk.findall('tok'):
            lex_elem = tok.find('lex')
            if lex_elem is None:
                continue

            # findtext() gives None for a missing element and '' for an
            # element without text; normalise both to '' before stripping.
            orth = (tok.findtext('orth') or '').strip()
            pos_tag = (lex_elem.findtext('ctag') or '').strip()

            if orth and pos_tag:
                sentence_tokens.append((orth, pos_tag))
        if sentence_tokens:
            sentences.append(sentence_tokens)
    return sentences

# Load the three corpus splits (train / validation / test) in one pass;
# the order of the paths matches the order of the names on the left.
train_sentences, validate_sentences, test_sentences = [
    parse(split_path)
    for split_path in (
        "../Data/train.xml.gz",
        "../Data/validate.xml.gz",
        "../Data/test-1-1.xml.gz",
    )
]


# 2. Create a function to build context features

def build_features(sentences, window=1):
    """
    Turn every token of every sentence into a feature dict plus its tag.

    Each feature dict holds the lowercased token under "token" and, for
    every distance d in 1..window, the lowercased neighbours under
    "left_d" and "right_d". Positions that fall outside the sentence are
    filled with the literal "<PAD>".

    Args:
        sentences: Sentences, each a sequence of (token, tag) pairs.
        window (int): Number of neighbours taken on each side of the token.

    Returns:
        (X, y): Two parallel lists - the feature dicts and the tags.
    """
    X, y = [], []
    distances = range(1, window + 1)

    for sentence in sentences:
        # Lowercase once per sentence instead of once per feature lookup.
        words = [word.lower() for word, _ in sentence]
        tags = [tag for _, tag in sentence]
        length = len(words)

        for position, (word, tag) in enumerate(zip(words, tags)):
            feats = {"token": word}

            # Neighbours to the left, nearest first.
            for d in distances:
                before = position - d
                feats[f"left_{d}"] = words[before] if before >= 0 else "<PAD>"

            # Neighbours to the right, nearest first.
            for d in distances:
                after = position + d
                feats[f"right_{d}"] = words[after] if after < length else "<PAD>"

            X.append(feats)
            y.append(tag)

    return X, y

# Build (features, labels) for each split with a one-token context window.
(
    (X_train_dict, y_train),
    (X_validate_dict, y_validate),
    (X_test_dict, y_test),
) = (
    build_features(split_sentences, window=1)
    for split_sentences in (train_sentences, validate_sentences, test_sentences)
)

# Combined train + validation data, in that order.
X_full_dict = [*X_train_dict, *X_validate_dict]
y_full = [*y_train, *y_validate]


# 3. Vectorize features
# Fit the vectorizer on the combined train + validation feature dicts and
# keep the result as a sparse matrix. Note that X_validate is therefore
# encoded with a vocabulary that was learned from the validation data too.
vec = DictVectorizer(sparse=True)
X_full = vec.fit_transform(X_full_dict)

# Encode the remaining splits with the already-fitted vectorizer.
X_validate, X_test = (
    vec.transform(feature_dicts)
    for feature_dicts in (X_validate_dict, X_test_dict)
)


# 4. Train two classifiers and evaluate

# Classifier 1: Naive Bayes
# Held-out validation setup.
# X_full already contains the validation examples, so a model fit on X_full
# and then asked to predict X_validate is being scored on data it was trained
# on. To obtain honest validation metrics, a separate vectorizer and separate
# models are fit on the training split only. The final models (NB, DT) are
# still fit on train + validation and are the ones used for the test set.
vec_train = DictVectorizer(sparse=True)
X_train = vec_train.fit_transform(X_train_dict)
X_validate_heldout = vec_train.transform(X_validate_dict)

# Classifier 1: Naive Bayes
NB_val = MultinomialNB()  # train-only model, used for validation metrics
NB_val.fit(X_train, y_train)
y_pred_NB_val = NB_val.predict(X_validate_heldout)

NB = MultinomialNB()  # final model, train + validation
NB.fit(X_full, y_full)
y_pred_NB_test = NB.predict(X_test)

# Classifier 2: Decision Tree
# Same hyper-parameters for the validation model and the final model so the
# validation scores are representative of the model applied to the test set.
DT_val = DecisionTreeClassifier(
    random_state=0,
    max_depth=100
)
DT_val.fit(X_train, y_train)
y_pred_DT_val = DT_val.predict(X_validate_heldout)

DT = DecisionTreeClassifier(
    random_state=0,
    max_depth=100
)
DT.fit(X_full, y_full)
y_pred_DT_test = DT.predict(X_test)


# 5. Evaluate and print metrics

def evaluate(y_true, y_pred, model_name):
    """
    Print accuracy and macro-averaged precision / recall / F1 for one model.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with y_true.
        model_name (str): Title shown in the report header.

    Returns:
        None. The report is written to stdout, followed by a blank line.
    """
    # With average='macro' the fourth element (support) carries no
    # information here, so only the first three scores are kept.
    scores = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    macro_precision, macro_recall, macro_f1 = scores[:3]
    acc = accuracy_score(y_true, y_pred)

    report_lines = [
        f"=== {model_name} ===",
        f"Accuracy: {acc:.4f}",
        f"Precision: {macro_precision:.4f}",
        f"Recall: {macro_recall:.4f}",
        f"F1-score: {macro_f1:.4f}",
        "",  # trailing blank line separating consecutive reports
    ]
    print("\n".join(report_lines))

# Evaluate on validation set
# Report every (split, model) combination: validation set first, then the
# test set, with Naive Bayes before the Decision Tree in each.
for gold_labels, predicted_labels, report_title in (
    (y_validate, y_pred_NB_val, "Naive Bayes (Validation)"),
    (y_validate, y_pred_DT_val, "Decision Tree (Validation)"),
    (y_test, y_pred_NB_test, "Naive Bayes (Test)"),
    (y_test, y_pred_DT_test, "Decision Tree (Test)"),
):
    evaluate(gold_labels, predicted_labels, model_name=report_title)

=== Naive Bayes (Validation) ===
Accuracy: 0.6923
Precision: 0.2474
Recall: 0.0903
F1-score: 0.1104

=== Decision Tree (Validation) ===
Accuracy: 0.4570
Precision: 0.1653
Recall: 0.0690
F1-score: 0.0784

=== Naive Bayes (Test) ===
Accuracy: 0.6368
Precision: 0.1960
Recall: 0.0708
F1-score: 0.0858

=== Decision Tree (Test) ===
Accuracy: 0.4529
Precision: 0.1169
Recall: 0.0511
F1-score: 0.0569

