In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC  

In [None]:
# Load the oversampled dataset and discard its "Unnamed: 0" column
# (presumably the index written out when the CSV was saved -- verify).
df_oversampled = (
    pd.read_csv("../data/resampled/oversampled_data.csv")
    .drop(columns=["Unnamed: 0"])
)
# Preview the first ten rows
df_oversampled.head(10)

In [None]:
# Load the undersampled dataset and discard its "Unnamed: 0" column
# (presumably the index written out when the CSV was saved -- verify).
df_undersampled = (
    pd.read_csv("../data/resampled/undersampled_data.csv")
    .drop(columns=["Unnamed: 0"])
)
# Preview the first ten rows
df_undersampled.head(10)

In [None]:
def createSVCModel(title, df, test_size=0.1, random_state=40, param_grid=None,
                   display_labels=(False, True)):
  """Tune, train and evaluate an SVC on ``df`` and return the fitted search.

  The frame is split into features (every column except ``"target"``) and
  the ``"target"`` column, divided into train/test sets, and a
  ``GridSearchCV`` over ``SVC`` is fitted on the training part. The best
  estimator, a classification report and a confusion-matrix plot for the
  test part are printed/shown as side effects.

  Parameters
  ----------
  title : str
    Heading printed between two separator lines before the results.
  df : pandas.DataFrame
    Data containing a ``"target"`` column; all other columns are used as
    features.
  test_size : float or int, default 0.1
    Forwarded to ``train_test_split`` (the default keeps 90% for training).
  random_state : int, default 40
    Seed forwarded to ``train_test_split`` so the split is reproducible.
  param_grid : dict or None, default None
    Hyperparameter grid for ``GridSearchCV``. ``None`` uses the built-in
    grid over ``C``, ``gamma`` and ``kernel``.
  display_labels : sequence or None, default (False, True)
    Tick labels for the confusion-matrix plot, in the order of the fitted
    classes. If ``None``, or if its length does not match the number of
    classes seen during training, the actual class values are used instead.

  Returns
  -------
  GridSearchCV
    The fitted search object (refit on the whole training split).

  Raises
  ------
  KeyError
    If ``df`` has no ``"target"`` column.

  Notes
  -----
  NOTE(review): the split is performed on ``df`` as given. If ``df`` was
  resampled before being passed in, held-out rows may duplicate training
  rows and the reported scores may be optimistic -- confirm with the caller.
  """
  separator = "============================================================="
  print(separator)
  print(title)
  print(separator)

  # Split features and target
  X = df.drop(columns=["target"])
  y = df["target"]

  # Split train-test set
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
  )

  # Assign possible hyperparameters for SVC (default grid unless overridden)
  if param_grid is None:
    param_grid = {
      'C': [0.1, 1, 10, 100],
      'gamma': [1, 0.1, 0.01, 0.001],
      'kernel': ['rbf', 'poly', 'sigmoid'],
    }

  # Search for best parameters for SVC, then train the dataset
  grid = GridSearchCV(SVC(), param_grid, refit=True)
  grid.fit(X_train, y_train)
  print(X_train.shape)

  # Print best hyperparameters tuning from GridSearchCV
  print("BEST PARAMETERS:")
  print(grid.best_estimator_)

  # Predict dataset from test set
  grid_predictions = grid.predict(X_test)

  # Get classification reports (precision, recall, and f1-score)
  print("CLASSIFICATION REPORT:")
  print(classification_report(y_test, grid_predictions))

  # Get confusion matrix. Pinning `labels` to the classes learned during
  # training keeps the matrix square with one row/column per class even when
  # a class happens to be absent from the (small) test split; otherwise the
  # matrix shrinks and no longer matches the number of display labels.
  classes = grid.classes_
  cm = confusion_matrix(y_test, grid_predictions, labels=classes)
  if display_labels is None or len(display_labels) != len(classes):
    # Supplied labels cannot describe this matrix; show the real class values.
    display_labels = classes
  cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=list(display_labels)
  )
  cm_display.plot()
  plt.show()

  # Return the trained model
  return grid


In [None]:
# Train and evaluate one SVC per resampling strategy, keeping each fitted search
oversampledModel = createSVCModel(
    title="OVERSAMPLED DATA",
    df=df_oversampled,
)
undersampledModel = createSVCModel(
    title="UNDERSAMPLED DATA",
    df=df_undersampled,
)

# Evaluation

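The model performs best when it is trained on the oversampled data, reaching 100% on each of the reported evaluation metrics. Note that the train/test split is made after the data was resampled, so the test set may contain copies (or near-copies) of training rows; this perfect score should be confirmed on a hold-out set taken before oversampling.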