In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.metrics import classification_report, ConfusionMatrixDisplay

import joblib

In [3]:
# Load the processed dataset (a JSON array of records) into a DataFrame.
df = pd.read_json('../../data/processed/dataset.json', orient='records')

# EDA

In [None]:
# Display the loaded DataFrame as the cell output for a quick visual check.
df

In [None]:
# Class balance: share of rows per value of the 'type' column, drawn as a pie chart.
df['type'].value_counts().plot(kind='pie', figsize=(5, 5))

In [6]:
# Features come from the 'pattern' column and labels from the 'type' column;
# both are converted to NumPy arrays of str.
X, y = (df[column].to_numpy().astype(str) for column in ('pattern', 'type'))

In [None]:
# Sanity check: the feature and label arrays should have the same length.
tuple(map(len, (X, y)))

# Modeling

In [8]:
# Hold out 25% of the samples for testing; stratifying on y keeps the label
# proportions the same in both splits, and the fixed seed makes the split repeatable.
trainX, testX, trainY, testY = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Character-level TF-IDF (capped at 1024 features) feeding an SVM classifier.
pipe = make_pipeline(
    TfidfVectorizer(input='content', lowercase=True, analyzer='char', max_features=1024),
    SVC(),
)

# Search space: 3 n-gram ranges x 2 values of C x 2 kernels = 12 candidates.
# Keys follow make_pipeline's auto-generated step names (lower-cased class names).
param_grid = dict(
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2), (1, 4)],
    svc__C=[1, 10],
    svc__kernel=['linear', 'rbf'],
)

# 2-fold cross-validated search over the grid, with verbose progress output.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=2, verbose=4)

grid.fit(trainX, trainY)

In [None]:
# Score of the fitted grid search on the held-out test split.
grid.score(X=testX, y=testY)

In [None]:
# Predict the held-out split with the fitted grid search, then print the
# per-class metrics report comparing true and predicted labels.
preds = grid.predict(X=testX)

print(classification_report(y_true=testY, y_pred=preds))

In [None]:
# Confusion matrix of the fitted grid search on the held-out test split.
ConfusionMatrixDisplay.from_estimator(estimator=grid, X=testX, y=testY)

In [None]:
# Show the hyperparameter combination selected by the grid search.
grid.best_params_

In [9]:
# Final model: char-level TF-IDF over 1- to 4-grams (max 1024 features) + RBF SVM with C=10.
# NOTE(review): the hyperparameters are hard-coded here, presumably copied from
# grid.best_params_ -- confirm they still match if the search is re-run.
pipe = make_pipeline(
    TfidfVectorizer(
        input='content',
        lowercase=True,
        analyzer='char',
        max_features=1024,
        ngram_range=(1, 4),
    ),
    SVC(C=10, kernel='rbf'),
)

pipe.fit(X=trainX, y=trainY)

In [None]:
# Score on the training split (compare with the test score to gauge overfitting).
pipe.score(X=trainX, y=trainY)

In [None]:
# Score of the final pipeline on the held-out test split.
pipe.score(X=testX, y=testY)

In [None]:
# Predict the held-out split with the final pipeline, then print the
# per-class metrics report comparing true and predicted labels.
preds = pipe.predict(X=testX)

print(classification_report(y_true=testY, y_pred=preds))

In [None]:
# Confusion matrix of the final pipeline on the held-out test split.
ConfusionMatrixDisplay.from_estimator(estimator=pipe, X=testX, y=testY)

In [None]:
# Persist the fitted pipeline to disk, written with pickle protocol 2 and
# compression level 3.
joblib.dump(
    value=pipe,
    filename='../../metamaska/models/payload_clf.joblib',
    compress=3,
    protocol=2,
)