### SVM Fitting and Results Analysis

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn import preprocessing
import json

Define a class that trains bagged SVMs, one ensemble per label, on a multilabel target set:

In [None]:
class MultiLabelSVM:
    '''
    Bagged SVM classifier for multilabel targets.

    One ensemble of ``B`` SVC models is trained per label column. Every model
    in an ensemble is fit on a class-balanced bootstrap sample: the positive
    examples resampled with replacement, plus an equally sized sample (also
    with replacement) of the negative examples.
    '''

    def __init__(self, B):
        '''
        Initialize a class instance.
        :param B: Number of bagged samples to run
        '''
        # fit_models[label][b] is the b-th bagged SVC for that label.
        # Stays None until fit() completes successfully.
        self.fit_models = None
        self.B = B

    def fit(self, X, y):
        '''
        Fit the features to given labels using bagged SVM method.
        :param X: features vectors for each example
        :param y: multilabel vectors for each example (2-D, one 0/1 column
                  per label)
        :return:
        :raises ValueError: if B is smaller than 1, if y is not 2-D, or if a
                            label has no positive or no negative examples.
        '''
        if self.B < 1:
            raise ValueError("B must be at least 1, got {}".format(self.B))

        y = np.asarray(y)
        if y.ndim != 2:
            raise ValueError(
                "y must be 2-D (examples x labels), got {} dimension(s)".format(y.ndim)
            )

        # Work on a private copy so the temporary 'target' column never
        # leaks into the caller's data.
        frame = pd.DataFrame(X, copy=True)

        # Collected locally and published only on success, so a failed fit
        # does not leave a half-built model list behind.
        fitted = []

        # work through each class in the multilabel
        for i in range(y.shape[1]):

            # attach this label's targets so features and labels can be
            # split and resampled together
            frame['target'] = y[:, i]

            # split up into examples with negative labels and positive labels
            X_pos = frame[frame['target'] == 1]
            X_neg = frame[frame['target'] == 0]

            # every bootstrap sample uses this many positives and negatives
            num_pos = len(X_pos)

            if num_pos == 0 or len(X_neg) == 0:
                raise ValueError(
                    "label {} needs at least one positive and one negative "
                    "example to build a balanced sample".format(i)
                )

            # store all bagged models for this label
            models = []

            for _ in range(self.B):

                # Bagging: bootstrap positives and negatives to equal size
                p = X_pos.sample(n=num_pos, replace=True)
                neg = X_neg.sample(n=num_pos, replace=True)

                # re-combine the positive and negative examples
                data = pd.concat([p, neg])

                # Fit and train the SVC
                m_i = SVC()
                m_i.fit(data.drop(columns='target'), data['target'])

                # add to tracked models
                models.append(m_i)

            fitted.append(models)

        self.fit_models = fitted

    def feature_accuracy(self, x_ex, y_known):
        '''
        Custom defined accuracy measure to track accuracy of each individual label.

        Each bagged model of a label is scored separately and the scores are
        averaged; the models are not combined into a single voted prediction.
        :param x_ex: Given examples to predict.
        :param y_known: Known multilabels for each example.
        :return: dictionary mapping each label index to a tuple of
                 (mean accuracy, mean recall, mean precision).
        '''
        y_known = np.asarray(y_known)
        out = {}

        for f, models in enumerate(self.fit_models):
            true = y_known[:, f]

            accs, recs, precs = [], [], []

            # score every bootstrapped model of this label
            for model in models:
                pred = model.predict(x_ex)
                accs.append(accuracy_score(true, pred))
                recs.append(recall_score(true, pred))
                precs.append(precision_score(true, pred))

            # average over all bootstrapped models
            n_models = len(models)
            out[f] = (
                sum(accs) / n_models,
                sum(recs) / n_models,
                sum(precs) / n_models,
            )

        print("results ", out)

        return out

Load and preprocess features.

In [None]:
# Load the feature matrix and the mood targets. Column 1 of the targets file
# is taken as the per-example label vector.
# NOTE: allow_pickle=True unpickles arbitrary objects -- only load trusted files.
features = np.load("../data/tp_source_trimmed.npy", allow_pickle=True)
moods = np.load("../data/moods_target_trimmed.npy", allow_pickle=True)[:, 1]
m_len = len(moods[0])

# Normalize the features with scikit-learn's default settings.
features = preprocessing.normalize(features)

# Convert every label vector to an array and stack them ONCE into a single
# 2-D (examples x moods) array. Stacking inside the per-example loop would
# rebuild the whole matrix on every iteration.
moods = np.stack([np.array(label_vec) for label_vec in moods])

Create a train/test split for the trial.

In [None]:
# Hold out 20% of the examples for testing; rows are shuffled before the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    moods,
    test_size=0.2,
    shuffle=True,
)

Train features with bagged multi label SVM.

In [None]:
# Train 30 bagged SVMs per mood label on the training split.
multi = MultiLabelSVM(B=30)
multi.fit(X=X_train, y=y_train)

Get results output for trained model and save to file.

In [None]:
# Score every label's ensemble on the held-out split.
stats = multi.feature_accuracy(X_test, y_test)

# Persist the per-label metrics as JSON (integer label keys become strings).
with open("../results/svm_bagged_results.json", "w") as outfile:
    outfile.write(json.dumps(stats))

#### Part 2 - Results analysis

Load in saved results file.

In [None]:
# Read the saved per-label metrics back in; keys come back as strings.
with open("../results/svm_bagged_results.json", "r") as json_file:
    results = json.loads(json_file.read())

Process file and collect results sets.

In [None]:
# Each saved entry is [accuracy, recall, precision] -- the order in which
# MultiLabelSVM.feature_accuracy stores them -- so index 1 is recall and
# index 2 is precision.
accuracy = []
recall = []
precision = []

for scores in results.values():
    accuracy.append(scores[0])
    recall.append(scores[1])
    precision.append(scores[2])


Get mood labels for results display.

In [None]:
# Read one mood name per line; the context manager closes the file handle.
labels = "../data/moods.txt"

with open(labels, "r") as f:
    l = [line.strip() for line in f]

Display results as average, max and min for accuracy, precision, and recall.

In [None]:
# For each metric print: best score with its mood, worst score with its mood,
# then the mean. argmax pairs with max and argmin with min so every printed
# label belongs to the value shown next to it.
for scores in (accuracy, precision, recall):
    print(max(scores), l[np.argmax(scores)])
    print(min(scores), l[np.argmin(scores)])
    print(np.mean(scores), '\n')


Count the positive examples for each mood -- that is, the number of examples
whose label for that mood is 1.

In [None]:
# Number of positive (== 1) examples per mood column. minlength=2 guarantees
# that index 1 exists even when a mood has no positive examples at all.
mood_counts = [
    np.bincount(moods[:, i], minlength=2)[1]
    for i in range(len(l))
]

Create a plot to compare percentage of positive labels versus recall
performance of model on that mood.

In [None]:
# Percentage of examples that are positive for each mood: divide by the
# number of examples (rows of moods), not by the number of moods.
m = 100.0 * np.array(mood_counts) / len(moods)

# Recall per mood, read straight from the saved results: each entry is
# [accuracy, recall, precision], so recall is index 1.
x = [scores[1] for scores in results.values()]

plt.scatter(x, m)

plt.xlabel("Recall Performance of SVM Model")
plt.ylabel("Percentage of Positive Examples in Each Mood")
plt.title("Positive Class Frequency Versus Recall Score")

# Overlay the least-squares straight line through the points.
best_fit = np.poly1d(np.polyfit(x, m, 1))
plt.plot(np.unique(x), best_fit(np.unique(x)), color='red')

plt.show()
