In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
import warnings

In [53]:
# 1. Load and Prepare Data
print("Loading dataset...")
dataset = pd.read_csv("/Users/gavinl/Desktop/XML_segmentation/Datasets/Training dataset/final_dataset_features.csv")  # Update path as needed

# Basic data cleaning
dataset = dataset.drop("Unnamed: 0", axis=1, errors='ignore')
dataset = dataset.drop("instruction", axis=1, errors='ignore')

# Check for missing values and handle them
print(f"Missing values before handling: {dataset.isnull().sum().sum()}")
dataset.fillna(0, inplace=True)  # Using 0 instead of 1 for missing values
print(f"Missing values after handling: {dataset.isnull().sum().sum()}")

# Display class distribution
print("\nClass distribution:")
print(dataset["output"].value_counts())
print("\nClass percentages:")
print(dataset["output"].value_counts(normalize=True) * 100)

# Separate features and target
X = dataset.drop("output", axis=1)
y = dataset["output"]

Loading dataset...
Missing values before handling: 0
Missing values after handling: 0

Class distribution:
output
MIDDLE    6011
NONE       775
START      695
END        695
Name: count, dtype: int64

Class percentages:
output
MIDDLE    73.520059
NONE       9.478963
START      8.500489
END        8.500489
Name: proportion, dtype: float64


In [54]:
# 2. Feature Selection
print("\nPerforming feature selection...")
# Select top k features based on ANOVA F-value
k = 30  # You can adjust this number
selector = SelectKBest(f_classif, k=k)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_features_indices]

print(f"Top {k} features selected:")
for i, feature in enumerate(selected_features):
    print(f"{i+1}. {feature}")

# Create a new dataframe with only selected features
X_selected_df = X[selected_features]


Performing feature selection...
Top 30 features selected:
1. has_any_regions
2. total_region_count
3. content_density_score
4. has_header_region
5. header_size_ratio
6. header_top_position
7. prev_page_has_signature
8. prev_page_emptiness
9. content_increase_from_prev
10. first_content_vertical_position
11. paragraph_to_region_ratio
12. has_no_signature
13. has_small_header_only
14. surrounded_by_content
15. content_continuity_score
16. consistent_layout_with_neighbors
17. has_signature
18. signature_at_bottom
19. next_page_emptiness
20. content_decrease_to_next
21. bottom_region_is_final
22. is_structural_boundary
23. content_discontinuity_score
24. regions_vs_document_avg
25. is_outlier_in_sequence




In [55]:
# 3. Splitting & Scaling
X_train, X_test, y_train, y_test = train_test_split(X_selected_df, y, test_size=0.2, random_state=42,stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [56]:
# 4. Hyperparameter Tuning
print("\nPerforming hyperparameter tuning...")
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],  # saga supports all penalties
    'l1_ratio': [0.5]    # Only used if penalty='elasticnet'
}

grid_search = GridSearchCV(
    LogisticRegression(multi_class='multinomial', max_iter=2000, class_weight='balanced'),
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")


Performing hyperparameter tuning...
Fitting 5 folds for each of 18 candidates, totalling 90 fits





Best parameters: {'C': 10, 'l1_ratio': 0.5, 'penalty': 'l1', 'solver': 'saga'}
Best cross-validation score: 0.8084


In [57]:
# 5. Train the model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)



In [58]:
# 6. Model Evaluation
print("\nEvaluating model performance...")
y_pred = best_model.predict(X_test_scaled)
y_prob = best_model.predict_proba(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Evaluating model performance...
Accuracy: 0.8123

Classification Report:
              precision    recall  f1-score   support

         END       0.52      0.78      0.62       139
      MIDDLE       0.96      0.81      0.88      1203
        NONE       0.74      0.89      0.81       155
       START       0.49      0.81      0.61       139

    accuracy                           0.81      1636
   macro avg       0.68      0.82      0.73      1636
weighted avg       0.86      0.81      0.83      1636



In [None]:
# 7. Confusion Matrix Visualization
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred)
labels = sorted(y.unique())

# Normalize confusion matrix
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Create heatmap
sns.heatmap(cm_norm, annot=cm, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix.png')

In [None]:
# 8. Feature Importance
plt.figure(figsize=(12, 8))
importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': np.mean(np.abs(best_model.coef_), axis=0)
})
importance = importance.sort_values('Importance', ascending=False)

sns.barplot(x='Importance', y='Feature', data=importance.head(15))
plt.title('Top 15 Feature Importances')
plt.tight_layout()
plt.savefig('feature_importance.png')

In [None]:
# 9. Performance per Class
plt.figure(figsize=(10, 6))
class_report = classification_report(y_test, y_pred, output_dict=True)
class_df = pd.DataFrame(class_report).transpose()
class_df = class_df.iloc[:-3]  # Remove avg rows

sns.barplot(x=class_df.index, y='f1-score', data=class_df)
plt.title('F1 Scores by Class')
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('class_performance.png')


In [None]:
# 10. Prediction Errors Analysis
errors_idx = y_test.index[y_test != y_pred]
errors_df = pd.DataFrame({
    'True': y_test[errors_idx],
    'Predicted': y_pred[errors_idx],
    'Probabilities': [y_prob[i] for i in range(len(y_prob)) if y_test.iloc[i] != y_pred[i]]
})

print("\nSample of misclassified instances:")
print(errors_df.head(10))

In [None]:
# 11. Cross-Validation Scores
cv_scores = cross_val_score(best_model, X_selected_df, y, cv=5, scoring='accuracy')
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

In [None]:
# 12. Save results and model
results = pd.DataFrame({"Predicted": y_pred, "Actual": y_test.values})
results.to_csv("results_multinomial.csv", index=False)

# Optionally save the model
from joblib import dump
dump(best_model, 'xml_page_classifier.joblib')

print("\nAnalysis complete. Results saved to 'results_multinomial.csv'")
print("Visualizations saved as PNG files")