In [3]:
import pandas as pd
import numpy as np
from datasets import load_from_disk


In [4]:
# Read the preprocessed dataset back from its on-disk location (path is relative
# to the notebook's working directory).
aug_stream_data = load_from_disk(dataset_path="data/processed/aug_stream_data")


In [5]:
# Switch the dataset's output format to pandas (this mutates `aug_stream_data`
# in place), then slice each split in full to materialise it.
aug_stream_data.set_format("pandas")
train_df, valid_df = (
    aug_stream_data[split][:] for split in ("train", "validation")
)


In [6]:
# Model inputs come from the "text" column, targets from the "label" column.
X_train = train_df["text"]
y_train = train_df["label"]
X_valid = valid_df["text"]
y_valid = valid_df["label"]

# Names attached to the "label" feature of the train split; used further down
# as display names for the classes.
labels = aug_stream_data["train"].features["label"].names


In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model; `model` is reused by the cell
# that embeds the validation texts.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# `encode` is documented to take a string or a list of strings. Handing it a
# plain list (instead of the pandas Series) avoids depending on how the library
# indexes a Series internally (label-based vs. positional lookup).
X_train_embed = model.encode(X_train.tolist(), show_progress_bar=True)


In [None]:
# Embed the validation texts with the same model. A plain list is passed because
# `encode` is documented for a string or list of strings, not a pandas Series.
X_valid_embed = model.encode(X_valid.tolist(), show_progress_bar=True)


In [7]:
# Sanity check: display the shapes of both embedding matrices side by side.
(X_train_embed.shape, X_valid_embed.shape)


((8914, 768), (1114, 768))

In [8]:
# Pool train and validation into a single feature matrix / target vector.
# Rows are stacked train-first so features and targets stay aligned.
X_embed = np.vstack((X_train_embed, X_valid_embed))
# Note: the original index labels of both Series are kept, so labels may repeat.
y = pd.concat((y_train, y_valid), axis=0)


In [None]:
from time import time
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVC

# Randomized hyper-parameter search for a linear SVM on the pooled embeddings
# (`X_embed`, `y`). Both the estimator and the sampler are seeded so that the
# sampled candidates and the fitted models are reproducible across runs; the
# seed matches the one used for the stand-alone LinearSVC later in the notebook.
print("Fitting the classifier to the training set")
t0 = time()
# Draw the regularisation strength C log-uniformly between 1e-3 and 1e3.
param_grid = {
    "C": loguniform(1e-3, 1e3),
}
clf = RandomizedSearchCV(
    LinearSVC(class_weight="balanced", random_state=42),
    param_grid,
    n_iter=40,  # number of sampled C values
    random_state=42,
)

clf = clf.fit(X_embed, y)

print("done in %0.3fs" % (time() - t0))
# This is a randomized search, not an exhaustive grid search.
print("Best estimator found by randomized search:")
print(clf.best_estimator_)


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Train a multiclass XGBoost booster on the sentence embeddings and score it on
# the validation split.
# NOTE(review): the original cell referenced `xgb`, `accuracy_score`, `X_test`,
# `y_test` and `label_mapping`, none of which are defined in this notebook, and
# fed the raw text Series to DMatrix. It is assumed here that the validation
# split is the intended hold-out set and that `labels` lists every class —
# confirm.
dtrain = xgb.DMatrix(X_train_embed, label=y_train)
# Name kept as `dtest` for continuity; it wraps the validation embeddings.
dtest = xgb.DMatrix(X_valid_embed, label=y_valid)

param = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': len(labels),
    'eval_metric': 'mlogloss'
}

num_round = 100
bst = xgb.train(param, dtrain, num_round)

# Evaluate the model. `multi:softmax` predicts class ids as floats, so cast
# them to integers before comparing against the integer targets.
y_pred = bst.predict(dtest).astype(int)
accuracy = accuracy_score(y_valid, y_pred)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Run every candidate classifier through `benchmark` and collect the results.
# NOTE(review): `benchmark` is not defined or imported in the visible cells —
# confirm it exists before this cell runs on a fresh kernel.
# The loop variable is named `estimator` rather than `clf` so that it does not
# overwrite the fitted RandomizedSearchCV object bound to `clf` earlier.
# LinearSVC and SGDClassifier are seeded so repeated runs are comparable.
results_embed = []
for estimator, name in (
    (LogisticRegression(class_weight="balanced"), "Logistic Regression"),
    (RidgeClassifier(class_weight="balanced"), "Ridge Classifier"),
    (KNeighborsClassifier(), "kNN"),
    (XGBClassifier(), "XGBoost"),
    (LinearSVC(class_weight="balanced", random_state=42), "Linear SVC"),
    (SGDClassifier(class_weight="balanced", random_state=42), "SGD Classifier"),
    (NearestCentroid(), "NearestCentroid"),
):
    print("=" * 80)
    print(name)
    results_embed.append(benchmark(estimator, name))


In [None]:
from sklearn.svm import LinearSVC
from src.utils import plot_confusion_matrix

# Fit a class-balanced, seeded linear SVM on the training embeddings only
# (`fit` returns the estimator itself), then predict the validation split.
svm_clf = LinearSVC(class_weight="balanced", random_state=42).fit(
    X_train_embed, y_train
)
y_preds = svm_clf.predict(X_valid_embed)

# Project helper; arguments are passed as (predictions, true labels, class names).
plot_confusion_matrix(y_preds, y_valid, labels)


In [None]:
from sklearn.metrics import classification_report

# Text summary of per-class metrics for the validation predictions, with the
# class names from `labels` used as row headings.
print(
    classification_report(
        y_true=y_valid,
        y_pred=y_preds,
        target_names=labels,
    )
)
