# SVM Multilabel Classification with ReDSM5

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from tqdm import tqdm

In [11]:
# Load the ReDSM5 CSV and turn each ';'-delimited label string into a list of label names.
data = pd.read_csv('data/redsm5.csv')
data['labels'] = data['labels'].map(lambda raw: raw.split(';'))

# Input texts, kept in the same row order as the label matrix built below.
texts = data['text'].tolist()

# Encode the per-row label lists as a binary indicator matrix
# (one column per distinct label; column names are in mlb.classes_).
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['labels'])

In [12]:
# Hold out 20% of the samples for evaluation; the fixed seed (42) makes the split reproducible.
# Texts and label rows are split together so they stay aligned.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [13]:
# TF-IDF bag-of-words features over single words.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),         # unigrams only
    stop_words='english',       # drop scikit-learn's built-in English stop words
    use_idf=True,               # weight terms by inverse document frequency
    sublinear_tf=True,          # use 1 + log(tf) instead of raw term counts
    max_features=10_000_000     # vocabulary cap; presumably never reached on a corpus this small
)

In [14]:
# Learn the vocabulary and IDF weights from the training texts only, so nothing from the
# test split influences the features. tqdm just wraps the iterable to show progress.
train_progress = tqdm(X_train, desc='Fitting and transforming training data')
X_train_vectorized = vectorizer.fit_transform(train_progress)

# Project the test texts onto the vocabulary learned above (no refitting).
test_progress = tqdm(X_test, desc='Transforming testing data')
X_test_vectorized = vectorizer.transform(test_progress)

Fitting and transforming training data: 100%|██████████| 1187/1187 [00:00<00:00, 6676.13it/s]
Transforming testing data: 100%|██████████| 297/297 [00:00<00:00, 5909.82it/s]


In [15]:
# One-vs-rest: an independent linear SVM is trained for each label column of y_train.
# random_state pins the solver's randomness; tol tightens the stopping tolerance.
base_svm = LinearSVC(random_state=0, tol=1e-5)
svm_model = OneVsRestClassifier(base_svm)
svm_model.fit(X_train_vectorized, y_train)

In [16]:
# Predict labels for every held-out text with the fitted one-vs-rest model.
predictions = svm_model.predict(X_test_vectorized)

In [18]:
# Per-label precision / recall / F1 on the test split. zero_division=0 reports 0.0
# (instead of warning) for labels that were never predicted.
report = classification_report(
    y_test,
    predictions,
    target_names=mlb.classes_,
    zero_division=0,
)
print(report)

# NOTE: for multilabel indicator targets, accuracy_score is *subset* accuracy --
# a sample counts as correct only if every one of its labels matches exactly.
subset_accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {subset_accuracy}")

                   precision    recall  f1-score   support

        ANHEDONIA       0.75      0.24      0.36        25
  APPETITE_CHANGE       0.00      0.00      0.00        10
 COGNITIVE_ISSUES       0.00      0.00      0.00        10
   DEPRESSED_MOOD       0.62      0.19      0.29        70
          FATIGUE       0.77      0.36      0.49        28
      NO_SYMPTOMS       0.43      0.27      0.34        73
      PSYCHOMOTOR       0.00      0.00      0.00         8
     SLEEP_ISSUES       0.40      0.11      0.17        18
SUICIDAL_THOUGHTS       0.80      0.43      0.56        28
    WORTHLESSNESS       0.82      0.44      0.58        72

        micro avg       0.64      0.28      0.39       342
        macro avg       0.46      0.20      0.28       342
     weighted avg       0.60      0.28      0.37       342
      samples avg       0.30      0.28      0.28       342

Accuracy: 0.22895622895622897
