In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib
import numpy as np

# Load the data into a DataFrame from labeled_transactions.csv
df = pd.read_csv("labeled_transactions.csv")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["Description"], df["Category"], test_size=0.2, random_state=42)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize SVM classifier
svm_clf = SVC(kernel="linear")

# Perform 10-fold cross-validation
cv = StratifiedKFold(n_splits=10)
cross_val_scores = cross_val_score(svm_clf, X_train_tfidf, y_train, cv=cv, scoring="accuracy")

# Fit the model using the entire training set
svm_clf.fit(X_train_tfidf, y_train)

# Predict on the test set
svm_y_pred = svm_clf.predict(X_test_tfidf)

# Evaluate the model
print("SVM Classifier Report:")
print(classification_report(y_test, svm_y_pred))

# Calculate the mean cross-validation score
mean_cross_val_score = np.mean(cross_val_scores)
print(f"Mean cross-validation accuracy: {mean_cross_val_score:.4f}")

# Save the best model (SVM) and the vectorizer to disk
joblib.dump(svm_clf, "svm_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Best SVM model and vectorizer saved to disk.")


# Function to classify new transactions
def classify_transaction(description):
    description_tfidf = vectorizer.transform([description])
    svm_category = svm_clf.predict(description_tfidf)[0]
    print(f"SVM - Transaction category: {svm_category}")


# Example classification
new_transaction = "Rent for July"
classify_transaction(new_transaction)

SVM Classifier Report:
              precision    recall  f1-score   support

     Housing       0.83      0.71      0.77        62
      Income       0.97      0.95      0.96       192
 P2P Expense       0.80      1.00      0.89        99
    Transfer       0.85      0.86      0.86       176
     Utility       0.89      0.69      0.78        71

    accuracy                           0.88       600
   macro avg       0.87      0.84      0.85       600
weighted avg       0.88      0.88      0.88       600

Mean cross-validation accuracy: 0.8796
Best SVM model and vectorizer saved to disk.
SVM - Transaction category: Transfer
