In [None]:
# ========================================
# improved.ipynb - Youngjun Yu
# ========================================

import gzip
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# 1. Load and Parse data

def parse(file_path):
    """
    Parses an XCES XML file (gzipped) and extracts sentences.

    Every ``<chunk>`` element that directly contains ``<tok>`` children is
    treated as one sentence. For each token the surface form is read from
    ``<orth>`` and the tag from the ``<ctag>`` of the first ``<lex>`` child.
    Tokens that lack any of these elements, or whose ``<orth>``/``<ctag>``
    element carries no text at all (e.g. ``<orth/>``), are skipped.

    Args:
        file_path (str): Path to the .xml.gz file (relative path).

    Returns:
        list of list of (str, str): A list of sentences;
                                    each sentence is a list of (orth, ctag) tuples.
                                    Chunks without any usable token are omitted.
    """
    # ET.parse builds the whole tree in memory, so the file can be closed
    # before the tree is walked.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        root = ET.parse(f).getroot()

    sentences = []
    for chunk in root.findall('.//chunk'):
        sentence_tokens = []
        # Only direct <tok> children are read, so a chunk that merely wraps
        # other chunks contributes no tokens of its own.
        for tok in chunk.findall('tok'):
            orth_elem = tok.find('orth')
            # NOTE(review): only the first <lex> is used. If the corpus lists
            # several interpretations per token, confirm the first one is the
            # intended tag.
            lex_elem = tok.find('lex')
            if orth_elem is None or lex_elem is None:
                continue

            ctag_elem = lex_elem.find('ctag')
            if ctag_elem is None:
                continue

            # An empty element has ``text is None``; calling .strip() on it
            # would raise AttributeError, so treat it like a missing element.
            if orth_elem.text is None or ctag_elem.text is None:
                continue

            sentence_tokens.append(
                (orth_elem.text.strip(), ctag_elem.text.strip())
            )

        if sentence_tokens:
            sentences.append(sentence_tokens)
    return sentences

# Parse the three corpus splits (train / validation / test), in that order.
train_sentences, validate_sentences, test_sentences = [
    parse(split_path)
    for split_path in (
        "../Data/train.xml.gz",
        "../Data/validate.xml.gz",
        "../Data/test-1-1.xml.gz",
    )
]


# 2. Build improved features

def token_shape(token):
    """
    Map a token to a coarse character-class pattern.

    Each digit becomes 'd', each uppercase letter 'X' and every other
    letter 'x'; any non-alphanumeric character is copied through unchanged.
    An empty token yields an empty string.
    """
    def _char_class(ch):
        if ch.isdigit():
            return 'd'
        if not ch.isalpha():
            # Punctuation, whitespace, symbols: keep the character itself.
            return ch
        return 'X' if ch.isupper() else 'x'

    return "".join(_char_class(ch) for ch in token)


def build_improved_features(sentences, window=2):
    """
    Build a feature representation for each token using:
      - bigger context window
      - token shape

    Args:
        sentences: Iterable of sentences, each a sequence of
            (token, pos_tag) pairs.
        window (int): Number of neighbours to include on each side of the
            target token.

    Returns:
        (X, y): ``X`` is a list with one feature dict per token, ``y`` the
        list of the corresponding POS tags, both in corpus order. Each dict
        holds ``token`` and ``shape`` for the target plus, for every offset
        ``w`` in ``1..window``, ``left_w``, ``left_w_shape``, ``right_w`` and
        ``right_w_shape``. Positions outside the sentence are filled with
        ``"<PAD>"``.
    """
    X = []
    y = []
    for sent in sentences:
        # Each token appears in the window of up to 2 * window neighbours, so
        # lower-casing and shape extraction are done once per sentence rather
        # than once per (target, neighbour) pair.
        lowered = [tok.lower() for tok, _ in sent]
        shapes = [token_shape(tok) for tok, _ in sent]
        n_tokens = len(lowered)

        for i, (_, target_pos) in enumerate(sent):
            features = {
                # Current token (lowercased)
                "token": lowered[i],
                # Token shape
                "shape": shapes[i],
            }

            # Bigger context window
            for w in range(1, window + 1):
                left_i = i - w
                right_i = i + w

                # left token
                if left_i >= 0:
                    features[f"left_{w}"] = lowered[left_i]
                    features[f"left_{w}_shape"] = shapes[left_i]
                else:
                    features[f"left_{w}"] = "<PAD>"
                    features[f"left_{w}_shape"] = "<PAD>"

                # right token
                if right_i < n_tokens:
                    features[f"right_{w}"] = lowered[right_i]
                    features[f"right_{w}_shape"] = shapes[right_i]
                else:
                    features[f"right_{w}"] = "<PAD>"
                    features[f"right_{w}_shape"] = "<PAD>"

            X.append(features)
            y.append(target_pos)

    return X, y

X_train_dict, y_train = build_improved_features(train_sentences, window=2)
X_validate_dict, y_validate = build_improved_features(validate_sentences, window=2)
X_test_dict, y_test = build_improved_features(test_sentences, window=2)

# Train + validation pool; used only for the final models scored on the test set.
X_full_dict = X_train_dict + X_validate_dict
y_full = y_train + y_validate


# 3. Vectorize features

# `vec` is fitted on the training split only, so the validation split stays
# unseen by everything that is later evaluated on it.
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(X_train_dict)
X_validate = vec.transform(X_validate_dict)

# A second vectorizer covers train + validation for the final test-set models.
# X_full / X_test live in this feature space: pair them with NB_full / DT_full,
# and pair X_train / X_validate with NB / DT.
vec_full = DictVectorizer(sparse=True)
X_full = vec_full.fit_transform(X_full_dict)
X_test = vec_full.transform(X_test_dict)


# 4. Train classifiers (Naive Bayes, Decision Tree) on improved features

# Validation models: fitted on the training split only. Fitting them on
# train + validation and then scoring on validation would report performance
# on data the model has already seen.
NB = MultinomialNB()
NB.fit(X_train, y_train)
y_pred_nb_val = NB.predict(X_validate)

DT = DecisionTreeClassifier(
    random_state=0,
    max_depth=100
)
DT.fit(X_train, y_train)
y_pred_dt_val = DT.predict(X_validate)

# Final models: refitted on train + validation and used only for the test set.
NB_full = MultinomialNB()
NB_full.fit(X_full, y_full)
y_pred_nb_test = NB_full.predict(X_test)

DT_full = DecisionTreeClassifier(
    random_state=0,
    max_depth=100
)
DT_full.fit(X_full, y_full)
y_pred_dt_test = DT_full.predict(X_test)


# 5. Evaluate

def evaluate(y_true, y_pred, model_name=""):
    """
    Print accuracy and macro-averaged precision, recall and F1 for one model.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name (str): Label shown in the report header.

    Returns:
        None. The report is written to stdout, followed by an empty line.
    """
    # zero_division=0 is passed so that classes for which a score is
    # undefined count as 0 in the macro average.
    macro_scores = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]
    overall_accuracy = accuracy_score(y_true, y_pred)

    print(f"=== {model_name} ===")
    print(f"Accuracy: {overall_accuracy:.4f}")
    print(f"Precision: {macro_precision:.4f}")
    print(f"Recall: {macro_recall:.4f}")
    print(f"F1-score: {macro_f1:.4f}")
    print("")
    
# Report validation scores first, then test scores, for both classifiers.
for gold_labels, predicted_labels, report_title in (
    (y_validate, y_pred_nb_val, "Naive Bayes (Validation)"),
    (y_validate, y_pred_dt_val, "Decision Tree (Validation)"),
    (y_test, y_pred_nb_test, "Naive Bayes (Test)"),
    (y_test, y_pred_dt_test, "Decision Tree (Test)"),
):
    evaluate(gold_labels, predicted_labels, model_name=report_title)


###############################################################
# Feature Engineering (Leave-One-Feature-Out)
###############################################################

# Let's define the "attributes" as the presence of:
#   - 'token'
#   - 'shape'
#   - 'left_X', 'left_X_shape', 'right_X', 'right_X_shape' for X in [1..window]

def leave_one_out_experiment(X_dict, y, vec_fit, features_to_remove):
    """
    Drop the given feature keys from every feature dict and vectorize the rest.

    The input dicts are not modified; filtered copies are built instead.
    No model is trained or scored here.

    Args:
        X_dict: Iterable of feature dicts.
        y: Not used by this function; kept in the signature for callers.
        vec_fit: Object exposing ``transform`` (an already fitted vectorizer).
        features_to_remove: Container of feature keys to leave out.

    Returns:
        Whatever ``vec_fit.transform`` returns for the filtered dicts.
    """
    pruned_rows = [
        {
            name: value
            for name, value in row.items()
            if name not in features_to_remove
        }
        for row in X_dict
    ]
    return vec_fit.transform(pruned_rows)

# Base performance with all features
def _macro_scores(y_true, y_pred):
    """Return (precision, recall, f1, accuracy); P/R/F1 are macro-averaged."""
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    return precision, recall, f1, accuracy_score(y_true, y_pred)


base_nb_pred = NB.predict(X_validate)
base_dt_pred = DT.predict(X_validate)

base_nb_precision, base_nb_recall, base_nb_f1, base_nb_acc = _macro_scores(
    y_validate, base_nb_pred
)
base_dt_precision, base_dt_recall, base_dt_f1, base_dt_acc = _macro_scores(
    y_validate, base_dt_pred
)

# Each inner list is removed as one unit.
feature_groups = [
    ["token"],
    ["shape"],
    ["left_1", "left_1_shape", "right_1", "right_1_shape"],
    ["left_2", "left_2_shape", "right_2", "right_2_shape"],
]

# NOTE: the already-fitted NB and DT are reused without retraining, so each row
# shows how predictions change when a group is withheld at prediction time.
results = []
for group in feature_groups:
    # Validation features with this group dropped, in the feature space of `vec`.
    X_val_no_group = leave_one_out_experiment(X_validate_dict, y_validate, vec, group)

    # Naive Bayes without the group
    y_pred_nb_no_group = NB.predict(X_val_no_group)
    _, _, f1_nb, acc_nb = _macro_scores(y_validate, y_pred_nb_no_group)

    # Decision Tree without the group
    y_pred_dt_no_group = DT.predict(X_val_no_group)
    _, _, f1_dt, acc_dt = _macro_scores(y_validate, y_pred_dt_no_group)

    # Differences are (score with all features) - (score with the group removed),
    # so a positive value means the group was helping.
    results.append({
        "Feature Group Removed": group,
        "NB_Accuracy_Diff": base_nb_acc - acc_nb,
        "NB_F1_Diff": base_nb_f1 - f1_nb,
        "DT_Accuracy_Diff": base_dt_acc - acc_dt,
        "DT_F1_Diff": base_dt_f1 - f1_dt,
    })

# Print results in a table
df_results = pd.DataFrame(results)
print("=== Leave-One-Feature-Out Results (Difference in performance) ===")
print(df_results.to_string(index=False))


=== Naive Bayes (Validation) ===
Accuracy: 0.5330
Precision: 0.0865
Recall: 0.0304
F1-score: 0.0329

=== Decision Tree (Validation) ===
Accuracy: 0.4762
Precision: 0.2502
Recall: 0.0937
F1-score: 0.1093

=== Naive Bayes (Test) ===
Accuracy: 0.4903
Precision: 0.0733
Recall: 0.0249
F1-score: 0.0264

=== Decision Tree (Test) ===
Accuracy: 0.4645
Precision: 0.1439
Recall: 0.0626
F1-score: 0.0707

=== Leave-One-Feature-Out Results (Difference in performance) ===
                         Feature Group Removed  NB_Accuracy_Diff  NB_F1_Diff  DT_Accuracy_Diff  DT_F1_Diff
                                       [token]          0.148015    0.018288          0.262512    0.099593
                                       [shape]          0.041189    0.003862          0.184374    0.031922
[left_1, left_1_shape, right_1, right_1_shape]         -0.004789   -0.001370          0.054460    0.063413
[left_2, left_2_shape, right_2, right_2_shape]         -0.070601   -0.016463          0.016558    0.026570
