In [None]:
import ast
import sys

sys.path.append("../../digitech_classify")

import joblib
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.model_selection import IterativeStratification

from pipeline.config import PROCESSED_DATA_DIR, MODELS_DIR
from pipeline.modeling.train import train_and_evaluate_multilabel, train_random_forest, evaluate_model


In [None]:
# Load the pre-computed embeddings together with their labels and organisation IDs.
# NOTE(review): allow_pickle=True unpickles object arrays on load; only use it with trusted files.
npz_path = PROCESSED_DATA_DIR / "training_set_multilabel_all-MiniLM-L6-v2.npz"
data = np.load(npz_path, allow_pickle=True)

X_train, y_train, org_ids = (
    data[field] for field in ("embeddings", "sector_label", "org_ID")
)

# Quick sanity check of what was loaded.
for caption, value in (
    ("Embeddings shape:", X_train.shape),
    ("Labels shape:", y_train.shape),
    ("First few labels:", y_train[:5]),
    ("org_ids dtype:", org_ids.dtype),
    ("y_train dtype:", y_train.dtype),
):
    print(caption, value)

In [None]:
# Multilabel binarization needs one list of labels per sample, so turn every
# string representation back into the Python object it describes.
parsed_labels = [ast.literal_eval(raw_label) for raw_label in y_train]


In [None]:


# Turn the per-sample label lists into a binary indicator matrix and keep the
# fitted binarizer: its `classes_` are reused as target names and it is saved
# next to every model further down.
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(parsed_labels)
print("Encoded labels shape:", y_encoded.shape)
print("Classes:", mlb.classes_)

In [None]:

# Run the project's train_and_evaluate_multilabel helper on the embeddings and
# the binarized labels; the returned mapping is used by the saving cell below.
print("Running complete multilabel training pipeline...")
print("=" * 60)

pipeline_kwargs = dict(
    target_names=mlb.classes_,
    test_size=0.2,
    cv=5,
    max_iter=1000,
    random_state=42,
    show_results=True,
)
pipeline_results = train_and_evaluate_multilabel(X_train, y_encoded, **pipeline_kwargs)

print("\nPipeline completed!")
fitted_model = pipeline_results['model']
print(f"Final model type: {type(fitted_model)}")

In [None]:
model_name = "oneVrest_logreg_v1"

# Persist the fitted model and the label binarizer under a shared name prefix
# so both can be reloaded together.
model_path = MODELS_DIR / f"{model_name}.joblib"
joblib.dump(pipeline_results['model'], model_path)
print(f"Model saved to: {model_path}")

mlb_path = MODELS_DIR / f"{model_name}__mlb.joblib"
joblib.dump(mlb, mlb_path)
print(f"MultiLabelBinarizer saved to: {mlb_path}")

# Save model metadata
metadata = {
    'model_type': 'OneVsRestClassifier_LogisticRegression',
    'feature_dim': X_train.shape[1],
    'n_classes': len(mlb.classes_),
    'class_names': list(mlb.classes_),
    # NOTE(review): X_train holds every loaded row here, so this count presumably
    # includes the rows the pipeline held out as its test split; confirm intent.
    'training_samples': X_train.shape[0],
    'test_samples': pipeline_results['X_test'].shape[0],
    'performance_metrics': {
        'jaccard_score': pipeline_results['evaluation_metrics']['jaccard_score'],
        'hamming_loss': pipeline_results['evaluation_metrics']['hamming_loss'],
        'cv_jaccard_mean': pipeline_results['cv_scores']['test_jaccard_samples'].mean(),
        'cv_jaccard_std': pipeline_results['cv_scores']['test_jaccard_samples'].std()
    },
    # Mirrors the arguments passed to train_and_evaluate_multilabel above.
    # NOTE(review): 'solver' is not passed explicitly in this notebook; confirm against the helper.
    'hyperparameters': {
        'max_iter': 1000,
        'solver': 'lbfgs',
        'random_state': 42,
        'test_size': 0.2,
        'cv_folds': 5
    }
}

# The dictionary was previously built but never written to disk; store it the
# same way the random-forest and MLP sections store theirs.
metadata_path = MODELS_DIR / f"{model_name}_metadata.joblib"
joblib.dump(metadata, metadata_path)
print(f"Model metadata saved to: {metadata_path}")

### Random Forest Classification


In [None]:

# Hold out 20% of the samples for evaluating the random forest.
# The split always starts from the full embedding matrix stored in `data`
# rather than from `X_train`, which this cell overwrites. Re-running the cell
# therefore never pairs an already-reduced X_train with the full y_encoded.
X_train, X_test, y_train, y_test = train_test_split(
    data["embeddings"], y_encoded,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=None  # Do NOT stratify for multilabel indicator y
)


In [None]:
# Fit the random-forest baseline on the training split via the project helper.
rf_settings = {
    "n_estimators": 200,
    "max_depth": 30,
    "random_state": 42,
    "multilabel": True,
}
rf_clf = train_random_forest(X_train, y_train, **rf_settings)

In [None]:
# Score the fitted forest on the held-out split; the returned mapping is
# turned into a table in the next cell.
eval_kwargs = dict(
    clf=rf_clf,
    X_test=X_test,
    y_test=y_test,
    target_names=mlb.classes_,
    multilabel=True,
    show_report=True,
)
results = evaluate_model(**eval_kwargs)

In [None]:
# Tabulate the classification report, transposed so that the report's
# top-level keys become the rows of the frame.
report_table = pd.DataFrame(results['classification_report'])
report_df = report_table.transpose()
display(report_df)

In [None]:
# Candidate settings for the randomized search over the random forest.
# NOTE(review): the `estimator__` prefix assumes rf_clf is a wrapper exposing
# an inner `estimator`; confirm against train_random_forest.
param_dist = dict(
    estimator__n_estimators=[100, 200, 300, 400],
    estimator__max_depth=[10, 20, 30, None],
    estimator__min_samples_split=[2, 5, 10],
    estimator__min_samples_leaf=[1, 2, 4],
    estimator__bootstrap=[True, False],
)

In [None]:
# Randomized hyper-parameter search for the random forest: 20 sampled
# configurations, 3-fold cross-validation, scored with samples-averaged F1.
search_settings = dict(
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1_samples',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(rf_clf, **search_settings)

random_search.fit(X_train, y_train)

In [None]:
# Summarise the outcome of the randomized search.
print(f"Best parameters: {random_search.best_params_}")
print(f"Best estimator: {random_search.best_estimator_}")
print(f"Best cross-validation score (mean f1_samples): {random_search.best_score_}")

In [None]:

# Keep the estimator selected by the randomized search and persist it together
# with the label binarizer and a description of how it was obtained.
best_rf_model = random_search.best_estimator_

model_name = "random_forest_v1"
model_path = MODELS_DIR / f"{model_name}.joblib"
joblib.dump(best_rf_model, model_path)
print(f"Best Random Forest model saved to: {model_path}")

mlb_path = MODELS_DIR / f"{model_name}__mlb.joblib"
joblib.dump(mlb, mlb_path)
print(f"MultiLabelBinarizer saved to: {mlb_path}")

metadata = {
    'model_type': 'RandomForest_RandomizedSearchCV_Best',
    'best_params': random_search.best_params_,
    'best_cv_score': random_search.best_score_,
    'feature_dim': X_train.shape[1],
    'n_classes': len(mlb.classes_),
    'class_names': list(mlb.classes_),
    'training_samples': X_train.shape[0],
    'test_samples': X_test.shape[0],
    # Read the search settings back from the search object instead of
    # repeating literals, so the metadata cannot drift from the actual search.
    'cv_folds': random_search.cv,
    'scoring_metric': random_search.scoring,
    'n_iter': random_search.n_iter
}

metadata_path = MODELS_DIR / f"{model_name}_metadata.joblib"
joblib.dump(metadata, metadata_path)
print(f"Model metadata saved to: {metadata_path}")

### MLP Classification

In [None]:
# Search space for the MLP. scipy's uniform(loc, scale) samples from
# [loc, loc + scale].
param_dist = {
    'hidden_layer_sizes': [(128,), (256,), (128, 128), (256, 128)],
    'activation': ['relu', 'tanh'],
    'alpha': uniform(1e-5, 1e-3),
    'learning_rate_init': uniform(1e-4, 1e-2),
    'max_iter': [400, 500, 600, 700]
}

mlp = MLPClassifier(random_state=42)

# The MLP search uses every sample. By this point X_train may have been
# replaced by the random forest's training split, which no longer has the same
# number of rows as y_encoded, so take the features from `data` instead.
X_all = data["embeddings"]

stratifier = IterativeStratification(n_splits=5, order=1)
cv_splits = list(stratifier.split(X_all, y_encoded))

# Randomized search with multilabel-aware cross-validation (skmultilearn version)
random_search = RandomizedSearchCV(
    estimator=mlp,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1_samples', #average_precision
    cv=cv_splits,
    verbose=2,
    n_jobs=2,
    random_state=42
)

random_search.fit(X_all, y_encoded)
print("Best params:", random_search.best_params_)
print("Best score:", random_search.best_score_)

In [None]:
# Keep a handle on the estimator selected by the MLP randomized search; the
# next cell writes it to disk.
best_mlp_model = random_search.best_estimator_

In [None]:
model_name = "mlp_multilabel_v1"

# Persist the selected MLP, the label binarizer and a description of the search.
model_path = MODELS_DIR / f"{model_name}.joblib"
joblib.dump(best_mlp_model, model_path)
print(f"Best MLP model saved to: {model_path}")

mlb_path = MODELS_DIR / f"{model_name}__mlb.joblib"
joblib.dump(mlb, mlb_path)
print(f"MultiLabelBinarizer saved to: {mlb_path}")

metadata = {
    'model_type': 'MLPClassifier_RandomizedSearchCV_Best',
    'best_params': random_search.best_params_,
    'best_cv_score': random_search.best_score_,
    'feature_dim': X_train.shape[1],
    'n_classes': len(mlb.classes_),
    'class_names': list(mlb.classes_),
    # The MLP search is fitted against y_encoded, so its row count is the
    # number of training samples regardless of what X_train currently holds.
    'training_samples': y_encoded.shape[0],
    # The MLP is assessed by cross-validation only; X_test belongs to the
    # random-forest split and was never used to evaluate this model.
    'test_samples': None,
    'cv_folds': len(cv_splits),
    'scoring_metric': random_search.scoring,
    'n_iter': random_search.n_iter
}

metadata_path = MODELS_DIR / f"{model_name}_metadata.joblib"
joblib.dump(metadata, metadata_path)
print(f"Model metadata saved to: {metadata_path}")


print("\nBest cross-validated F1 (samples):", random_search.best_score_)
print("Best hyperparameters:", random_search.best_params_)

In [14]:





# Per-class report for the best MLP configuration, built from out-of-fold
# predictions so every sample is predicted by a model that did not train on it.
best_mlp = MLPClassifier(**random_search.best_params_, random_state=42)

# Use the full embedding matrix: X_train may have been replaced by the random
# forest's training split and would then no longer line up with y_encoded.
y_pred_cv = cross_val_predict(
    best_mlp,
    data["embeddings"],
    y_encoded,
    cv=IterativeStratification(n_splits=5, order=1),
    method='predict',
    n_jobs=2
)


print(classification_report(y_encoded, y_pred_cv, target_names=mlb.classes_, zero_division=0))


                                                           precision    recall  f1-score   support

                          advanced-digital-communications       0.72      0.71      0.72      6953
                                  artificial-intelligence       0.71      0.70      0.71     12250
                                               blockchain       0.89      0.89      0.89     10686
                                           cloud-edge-iot       0.77      0.77      0.77     18151
                                            cybersecurity       0.82      0.83      0.82     22750
                                           data-analytics       0.76      0.74      0.75     10416
                               high-performance-computing       0.51      0.46      0.48      3507
microelectronics, high frequency chips and semiconductors       0.29      0.25      0.27       785
                                 next-generation-internet       0.68      0.65      0.67      4851
         