In [1]:
import sys
sys.path.append("../../digitech_classify")

import joblib
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from pipeline.config import PROCESSED_DATA_DIR, MODELS_DIR
from pipeline.modeling.train import train_model, evaluate_model
from pipeline.plots import plot_roc, plot_pr, plot_confusion_matrix



In [None]:
# Load the pre-computed training arrays from the processed-data directory.
# NOTE(security): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- only point this at files you produced yourself.
training_set_path = PROCESSED_DATA_DIR / "training_set_multilabel_all-MiniLM-L6-v2.npz"

# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the `with` block closes it once the arrays have been read into memory.
with np.load(training_set_path, allow_pickle=True) as data:
    X_train = data["embeddings"]
    y_train = data["sector_label"]
    org_ids = data["org_ID"]


print("Embeddings shape:", X_train.shape)
print("Labels shape:", y_train.shape)
print("First few labels:", y_train[:5])

In [None]:
from collections import Counter
import ast

import matplotlib.pyplot as plt


# Every entry of y_train is decoded with ast.literal_eval into a collection of
# labels; flatten all of them into one list so occurrences can be counted.
all_labels = [
    label
    for label_str in y_train
    for label in ast.literal_eval(label_str)
]

label_counts = Counter(all_labels)

# One bar per label, in first-seen order, with slanted tick labels.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(list(label_counts.keys()), list(label_counts.values()))
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Tech Sector')
ax.set_ylabel('Count')
ax.set_title('Distribution of Labels in y_train')
fig.tight_layout()
plt.show()

In [None]:

# MultiLabelBinarizer expects one collection of labels per sample. The
# label-distribution cell parses each y_train entry with ast.literal_eval, i.e.
# the entries are string representations of lists; passing those strings in
# directly would make the binarizer treat every character as its own label.
# Entries that are already collections are passed through unchanged.
y_train_labels = [
    ast.literal_eval(entry) if isinstance(entry, str) else list(entry)
    for entry in y_train
]

mlb = MultiLabelBinarizer()
y_all_encoded = mlb.fit_transform(y_train_labels)
assert y_all_encoded.shape[0] == len(X_train), "one encoded row is expected per embedding"
print("Encoded labels shape:", y_all_encoded.shape)

In [None]:

# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split repeatable across kernel restarts.
X_train_split, X_test, y_train_split, y_test = train_test_split(
    X_train,
    y_all_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the classifier on the training split only (the held-out split is kept for
# evaluation). train_model is the project helper imported from
# pipeline.modeling.train.
clf = train_model(X_train_split, y_train_split)

In [None]:
# Score the held-out split.
# NOTE(review): assumes clf follows the scikit-learn multi-label convention
# (e.g. OneVsRestClassifier): predict returns a multi-hot matrix and
# predict_proba one probability per label, both shaped (n_samples, n_labels).
# TODO confirm against pipeline.modeling.train.train_model.
probs = np.asarray(clf.predict_proba(X_test))
preds = np.asarray(clf.predict(X_test))

assert preds.shape == y_test.shape, "predictions must be multi-hot rows like y_test"
assert probs.shape == preds.shape, "expected one probability per (sample, label)"

# A sample can carry several labels, so there is no single predicted class to
# look up (the earlier searchsorted + fancy-index lookup only works for 1-D
# single-label predictions). Report the highest label probability per sample
# as its confidence score instead.
pred_confidence = probs.max(axis=1)

In [None]:
# The labels were encoded with the MultiLabelBinarizer `mlb`; no `label_encoder`
# exists in this notebook. classes_[i] is the name of column i of the encoded
# label matrix.
string_labels = mlb.classes_

In [None]:
# y_test holds multi-hot rows (one column per label), so it cannot be used to
# index an array of label names directly. inverse_transform turns each row back
# into a tuple of the label names that are switched on.
# NOTE(review): assumes preds is a 0/1 indicator matrix with the same columns
# as y_test -- confirm against what clf.predict returns.
df = pd.DataFrame({
    "true_label": mlb.inverse_transform(y_test),      # original label(s)
    "pred_label": mlb.inverse_transform(preds),       # predicted label(s)
    "pred_confidence": pred_confidence
})

In [None]:
# Evaluate on the held-out split. The targets must be y_test (the rows that
# belong to X_test); passing the full y_all_encoded pairs the test embeddings
# with labels of a different length and order.
# The third return value is kept under its own name so the `preds` used for the
# prediction table is not silently overwritten.
accuracy, report, eval_preds = evaluate_model(clf, X_test, y_test)

# NOTE(review): assumes `report` is a dict of per-row metric dicts (the shape
# of sklearn's classification_report(output_dict=True)) -- TODO confirm.
report_df = pd.DataFrame(report).transpose()

# Only keep rows that are class indices (filter out 'accuracy', 'macro avg', etc.)
class_keys = [str(i) for i in range(len(mlb.classes_))]
sector_report_df = report_df[report_df.index.isin(class_keys)].copy()

# Map the column index of each class back to its tech sector name. The labels
# were encoded with `mlb`; no `label_encoder` exists in this notebook.
sector_report_df['sector_name'] = [mlb.classes_[int(idx)] for idx in sector_report_df.index]

sector_report_df = sector_report_df[['sector_name', 'precision', 'recall', 'f1-score', 'support']]

sector_report_df = sector_report_df.reset_index(drop=True)

print(sector_report_df.head())

In [None]:
# Persist the fitted classifier under the models directory, creating the
# directory (and any missing parents) first.
# NOTE(review): only clf is written here; the label binarizer is not saved
# alongside it -- confirm whether downstream code needs it.
save_path = MODELS_DIR / "LregModel_v1.joblib"
save_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, save_path)

