## Train support vector machine (SVM) classifier

In [1]:
# Selects the input data directory used further down: True reads from
# "extracted_data"; False reads from "prepared_data" (which is first passed
# through `check_prepared_data`).
use_extracted_data = False

In [2]:
import os

import pandas as pd
from joblib import dump
from sklearn.svm import SVC

from lib.check_files import check_prepared_data
from lib.pu import get_xy

In [3]:
# Random seed for reproducibility (passed to the classifier as `random_state`)
random_seed = 1234

In [4]:
# Input: training table, read from whichever data directory was selected
data_dir = "extracted_data" if use_extracted_data else "prepared_data"
if not use_extracted_data:
    # Only the prepared-data directory is passed through the file check
    check_prepared_data(data_dir, verbose=True)
data_filename = os.path.join(data_dir, "training_data.csv")

# Output: serialised classifier (the directory is created if it is missing)
output_dir = "outputs"
output_filename = os.path.join(output_dir, "svm_classifier.joblib")
os.makedirs(output_dir, exist_ok=True)

### Load and process training data

In [5]:
# Read the training table and discard unlabelled samples in one step:
# only rows labelled "positive" or "negative" are kept
data = pd.read_csv(data_filename).loc[
    lambda table: table["label"].isin(["positive", "negative"])
]

# Number of remaining samples per (region, label) combination
print(
    data.groupby(["region", "label"]).size()
)

region  label   
NAm     negative     57
        positive    217
SAm     negative    632
        positive    130
dtype: int64


In [6]:
# Extract relevant columns in NumPy array form
x, y = get_xy(data)
print(x.shape, y.shape)

(1036, 25) (1036,)


### Train the SVM classifier

In [7]:
# RBF-kernel support vector classifier. `probability=True` enables
# probability estimates (`predict_proba`) on the fitted model; per the
# scikit-learn documentation, `random_state` seeds the data shuffling used
# for those estimates, so the shared seed keeps the result reproducible.
svm_model = SVC(kernel="rbf", probability=True, random_state=random_seed)
# Fit on the full labelled training set extracted above
svm_model.fit(x, y)

### Use `joblib` to save model to file

In [8]:
# Serialise the fitted classifier to `output_filename`; the value returned by
# `dump` (shown as the cell output) is the list of files written
dump(svm_model, output_filename)

['outputs/svm_classifier.joblib']