In [5]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# dataset = load_dataset("legacy-datasets/banking77")

from datasets import load_from_disk
dataset = load_from_disk("full_banking77_dataset")

train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['test'])

X_train = train_data['text']
y_train = train_data['label']
X_test = test_data['text']
y_test = test_data['label']

vectorizer = TfidfVectorizer(stop_words='english')

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

svm_model = SVC(probability=True,kernel='linear')

svm_model.fit(X_train_tfidf, y_train)

y_pred = svm_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 83.92%

Classification Report:
              precision    recall  f1-score   support

          -1       1.00      0.89      0.94         9
           0       0.97      0.95      0.96        40
           1       0.97      0.95      0.96        40
           2       0.97      0.97      0.97        40
           3       0.94      0.78      0.85        40
           4       0.97      0.95      0.96        40
           5       0.64      0.72      0.68        40
           6       0.85      0.88      0.86        40
           7       0.87      0.85      0.86        40
           8       0.90      0.95      0.93        40
           9       0.97      0.97      0.97        40
          10       0.74      0.57      0.65        40
          11       0.77      0.85      0.81        40
          12       0.77      0.82      0.80        40
          13       0.95      0.95      0.95        40
          14       0.63      0.82      0.72        40
          15       0.79      0.82      0

In [6]:
import joblib

In [7]:
joblib.dump(svm_model, 'svm_model_full.joblib')

['svm_model_full.joblib']

In [8]:
joblib.dump(vectorizer, 'vectorizer_full.joblib')

['vectorizer_full.joblib']

In [7]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the Banking77 dataset
dataset = load_dataset("legacy-datasets/banking77")

# Convert train and test splits to pandas DataFrames
train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['test'])

# Features (X) and Labels (y) for training
X_train = train_data['text']
y_train = train_data['label']
X_test = test_data['text']
y_test = test_data['label']

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the training data and transform the test data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Define the hyperparameter grid for tuning
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],  # Test multiple kernels
    'gamma': ['scale', 'auto'],           # 'scale' or 'auto' (only for rbf, poly, and sigmoid kernels)
    'degree': [3, 4]                      # Only for 'poly' kernel
}

# Initialize the SVM model
svm_model = SVC()

# Set up GridSearchCV
grid_search = GridSearchCV(
    svm_model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy'
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters and model
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Get the best estimator (trained model)
best_model = grid_search.best_estimator_

# Predict on the test data
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 