In [2]:
import numpy as np
from sklearn import svm
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import f1_score, confusion_matrix

In [3]:
# Load the training sentences and their labels from .npy files.
# NOTE(review): allow_pickle=True makes np.load unpickle stored Python objects,
# which can execute arbitrary code if a file has been tampered with — only load
# .npy files that come from a trusted source.
trainSentences = np.load("data/training_sentences.npy", allow_pickle=True)
trainLabels = np.load("data/training_labels.npy", allow_pickle=True)

# Load the test sentences and labels the same way.
testSentences = np.load("data/test_sentences.npy", allow_pickle=True)
testLabels = np.load("data/test_labels.npy", allow_pickle=True)

In [4]:
def normalizeData(trainData, testData, metric=None):
    """Scale a train/test pair of feature matrices with the chosen method.

    Parameters:
        trainData: training feature matrix.
        testData: test feature matrix, transformed with the same scaler.
        metric: "standard" applies a StandardScaler fitted on trainData only;
            "L1" / "L2" apply a Normalizer with the matching norm;
            any other value (including the default None) leaves the data as is.

    Returns:
        A (train, test) tuple. For an unrecognised metric the original
        objects are returned untouched.
    """
    if metric == "standard":
        # Fit on the training data only, then reuse the same fitted scaler for both.
        standardScaler = StandardScaler().fit(trainData)
        return standardScaler.transform(trainData), standardScaler.transform(testData)

    if metric == "L1" or metric == "L2":
        # Normalizer is used without fitting; it is applied directly to each matrix.
        normName = 'l1' if metric == "L1" else 'l2'
        rowNormalizer = Normalizer(norm=normName)
        return rowNormalizer.transform(trainData), rowNormalizer.transform(testData)

    # Unknown or missing metric: pass the inputs through unchanged.
    return trainData, testData

In [5]:
class BagOfWords:
    """Bag-of-words featuriser backed by a word -> column-index dictionary."""

    def __init__(self):
        # Maps every known word to its column in the feature matrix.
        self.vocabulary = {}

    def buildVocabulary(self, data):
        """Add every unseen word in `data` to the vocabulary.

        `data` is an iterable of messages, each an iterable of words.
        New words receive consecutive ids continuing from the current
        vocabulary size, in order of first appearance.
        """
        for message in data:
            for word in message:
                # setdefault only inserts unseen words; the current size is the next free id.
                self.vocabulary.setdefault(word, len(self.vocabulary))

    def getFeatures(self, data):
        """Return an integer count matrix of shape (len(data), len(vocabulary)).

        Entry [i, j] is how many times the word with id j occurs in message i.
        Words missing from the vocabulary are skipped.
        """
        counts = np.zeros((len(data), len(self.vocabulary)), dtype=int)
        for rowIndex, message in enumerate(data):
            for word in message:
                column = self.vocabulary.get(word)
                # Column 0 is a valid id, so test against None rather than truthiness.
                if column is not None:
                    counts[rowIndex, column] += 1
        return counts

In [6]:
# Build the vocabulary from the training sentences only; words that occur
# only in the test sentences are skipped later by getFeatures.
bow = BagOfWords()
bow.buildVocabulary(trainSentences)

# Number of distinct words collected.
len(bow.vocabulary)

9522

In [7]:
# Convert both splits to word-count matrices over the training vocabulary,
# then apply the "L2" option of normalizeData (Normalizer with norm='l2').
normBowTrain, normBowTest = normalizeData(bow.getFeatures(trainSentences), bow.getFeatures(testSentences), "L2")

In [8]:
# Sanity check: (number of training messages, vocabulary size), then a preview.
print(normBowTrain.shape)
normBowTrain

(3734, 9522)


array([[0.35355339, 0.35355339, 0.35355339, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.19611614, 0.19611614,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.33333333],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [9]:
# Sanity check: the test matrix must have the same number of columns as the training matrix.
print(normBowTest.shape)
normBowTest

(1840, 9522)


array([[0. , 0. , 0.5, ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [10]:
# Train a linear-kernel SVM (C=1) on the normalised bag-of-words training features.
svmBow = svm.SVC(C=1, kernel='linear')
svmBow.fit(normBowTrain, trainLabels)

SVC(C=1, kernel='linear')

In [11]:
# Predict a label for every test message with the fitted SVM.
predictedBowLabels = svmBow.predict(normBowTest)
predictedBowLabels

array([0, 0, 0, ..., 0, 0, 1])

In [12]:
# Accuracy: fraction of test messages whose predicted label equals the true label.
accuracy = np.mean(predictedBowLabels == testLabels)
accuracy

0.9842391304347826

In [13]:
# F1 score on the test set, using f1_score's default settings.
F1Score = f1_score(testLabels, predictedBowLabels)
F1Score

0.9409368635437881

In [14]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
confMatrix = confusion_matrix(testLabels, predictedBowLabels)

# Flatten the matrix row by row; unpacking into four names requires a 2x2 matrix.
TN, FP, FN, TP = confMatrix.ravel()

# Print in matrix layout: first row is true label 0, second row is true label 1.
print(TN, FP)
print(FN, TP)

1580 5
24 231


In [15]:
# Per-feature weights of the fitted linear SVM.
coeff = svmBow.coef_
# Feature indices ordered from lowest to highest weight (np.argsort is ascending).
# [0] takes the first row of the result — assumes coef_ has a single row,
# i.e. a two-class problem; TODO confirm.
indexes = np.argsort(coeff)[0]

In [16]:
# Words for the first 10 entries of `indexes`, i.e. the 10 features with the
# LOWEST SVM coefficients (argsort is ascending), listed lowest first.
# Relies on dict insertion order: the i-th vocabulary key is the word with id i.
# NOTE(review): "mostPositive" does not refer to coefficient sign here; presumably
# these are the words pushing hardest towards label 0 — confirm the naming intent.
mostPositive = np.array(list(bow.vocabulary.keys()))[indexes[:10]]
list(mostPositive)

['&lt#&gt', 'me', 'i', 'Going', 'him', 'Ok', 'I', 'Ill', 'my', 'Im']

In [17]:
# Words for the last 10 entries of `indexes`, i.e. the 10 features with the
# HIGHEST SVM coefficients (argsort is ascending), so the largest weight comes last.
# Relies on dict insertion order: the i-th vocabulary key is the word with id i.
# NOTE(review): "mostNegative" does not refer to coefficient sign here; presumably
# these are the words pushing hardest towards label 1 — confirm the naming intent.
mostNegative = np.array(list(bow.vocabulary.keys()))[indexes[-10:]]
list(mostNegative)

['Text', 'To', 'mobile', 'CALL', 'FREE', 'txt', '&', 'Call', 'Txt', 'STOP']