### Decision Tree and Random Forests for Mood Classification

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import json
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


Create a class that fits a separate binary classifier (using either model type) for
each mood in the label set.

In [2]:
class MultiLabelClassify:
    '''
    Multilabel classifier that fits one independent binary model per label.

    Every column of the label matrix is treated as its own binary (0/1)
    classification problem and gets its own random forest or decision tree.
    '''

    def __init__(self, model, random_state=None):
        '''
        Set up an instance of the classifier.
        :param model: Choice of 'RF' for random forest or 'DC' for decision tree
        :param random_state: optional seed forwarded to every underlying model so
        that repeated fits are reproducible; None (the default) leaves them unseeded
        :raises ValueError: if model is neither 'RF' nor 'DC'
        '''
        self.fit_models = None
        if model not in ('RF', 'DC'):
            # ValueError is a subclass of Exception, so callers catching
            # Exception keep working.
            raise ValueError('Choose RF or DC as model type')
        self.model_choice = model
        self.random_state = random_state

    def _new_model(self):
        '''
        Build a fresh, unfitted estimator of the configured type.
        :return: a RandomForestClassifier or DecisionTreeClassifier instance
        '''
        if self.model_choice == 'RF':
            return RandomForestClassifier(max_features='sqrt', bootstrap=True,
                                          n_estimators=100,
                                          random_state=self.random_state)
        return DecisionTreeClassifier(random_state=self.random_state)

    def _require_fitted(self):
        '''
        Fail with a clear message when the classifier is used before fit().
        :raises ValueError: if fit() has not completed successfully yet
        '''
        if self.fit_models is None:
            raise ValueError("Call fit() before predicting or scoring")

    @staticmethod
    def _safe_ratio(numerator, denominator):
        '''
        Divide two counts, returning 0.0 when the denominator is zero.
        :return: numerator / denominator, or 0.0 if the ratio is undefined
        '''
        return numerator / denominator if denominator else 0.0

    @staticmethod
    def _confusion_counts(pred, true):
        '''
        Count the binary confusion-matrix cells for one label.
        :param pred: predicted 0/1 values
        :param true: known 0/1 values, same length as pred
        :return: tuple (total, true_neg, true_pos, false_neg, false_pos) of plain
        Python ints, so the result can be written with json.dump
        :raises ValueError: on a length mismatch or a value other than 0 or 1
        '''
        if len(pred) != len(true):
            raise ValueError("Number of predictions and known labels differ")

        true_pos, false_pos, false_neg, true_neg = 0, 0, 0, 0
        for p, k in zip(pred, true):
            if p == 1 and k == 1:
                true_pos += 1
            elif p == 1 and k == 0:
                false_pos += 1
            elif p == 0 and k == 1:
                false_neg += 1
            elif p == 0 and k == 0:
                true_neg += 1
            else:
                raise ValueError("Unknwon label encountered")

        return len(pred), true_neg, true_pos, false_neg, false_pos

    def fit(self, X, y):
        '''
        Fit one model per label column with the provided features and labels.
        :param X: feature vectors, one per example
        :param y: matching labels; each row holds every label classification
        for the corresponding feature vector
        :raises ValueError: if X and y differ in length, or y is empty or not 2-D
        '''
        y = np.asarray(y)

        # Validate before touching y's contents so bad input yields a clear
        # message instead of an IndexError.
        if len(X) != len(y):
            raise ValueError("Incompatible featues and label lengths")
        if y.ndim != 2 or y.shape[0] == 0:
            raise ValueError("y must be a non-empty 2-D array of shape (n_examples, n_labels)")

        models = []
        for i in range(y.shape[1]):
            m_i = self._new_model()
            m_i.fit(X, y[:, i])
            models.append(m_i)

        # Publish only after every label has been fitted, so a failure part-way
        # through never leaves a half-built model list behind.
        self.fit_models = models

    def predict(self, x_ex):
        '''
        Predict the classification for all provided examples
        :param x_ex: feature vectors with example instances
        :return: a list with one entry per example; each entry is a list with one
        single-element array per label holding that label's prediction
        :raises ValueError: if the classifier has not been fitted
        '''
        self._require_fitted()

        n_examples = len(x_ex)
        if n_examples == 0:
            return []

        # One batched call per label model instead of one call per
        # (example, label) pair.
        per_label = [np.asarray(m.predict(x_ex)) for m in self.fit_models]

        # Slice with i:i + 1 so each element stays a length-1 array.
        return [[col[i:i + 1] for col in per_label] for i in range(n_examples)]

    def feature_accuracy(self, x_ex, y_known):
        '''
        Custom function to generate accuracy, precision, and recall results for overall
        and each class label. The overall (pooled over all labels) metrics are printed;
        a ratio whose denominator is zero is reported as 0.0 instead of raising.
        :param x_ex: examples of feature vector to be predicted
        :param y_known: known values of labels for each example
        :return: a dictionary mapping label index to the tuple
        (total, true_neg, true_pos, false_neg, false_pos)
        :raises ValueError: if the classifier has not been fitted, or a prediction or
        known label is neither 0 nor 1
        '''
        self._require_fitted()
        y_known = np.asarray(y_known)

        feats = {}
        t, f_p, f_n, t_n, t_p = 0, 0, 0, 0, 0

        for f, model in enumerate(self.fit_models):
            counts = self._confusion_counts(model.predict(x_ex), y_known[:, f])
            total, true_neg, true_pos, false_neg, false_pos = counts

            t += total
            t_n += true_neg
            t_p += true_pos
            f_n += false_neg
            f_p += false_pos

            feats[f] = counts

        print("Accuracy: ", self._safe_ratio(t_p + t_n, t))
        print("Precision: ", self._safe_ratio(t_p, t_p + f_p))
        print("Recall: ", self._safe_ratio(t_p, t_p + f_n))

        return feats


Load in data for features and labels

In [4]:
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects; only load
# files from a trusted source.
features = preprocessing.normalize(
    np.load("../data/tp_source_trimmed.npy", allow_pickle=True)
)

# Column 1 of the target file is taken as the per-example label vectors
# (presumably one 0/1 entry per mood -- confirm against the data export).
moods = np.load("../data/moods_target_trimmed.npy", allow_pickle=True)[:, 1]
m_len = len(moods[0])

# Stack the per-example vectors into a single array.
moods = np.stack(moods)

Create basic 80/20 random split for testing

In [5]:
# Fixed seed so the shuffled 80/20 split -- and every metric computed from it --
# is identical after a kernel restart.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    features, moods, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)

Fit the multi-label classifier using decision trees:

In [6]:
# 'DC' selects the decision-tree variant; fit() trains one model per label column.
multi_dc = MultiLabelClassify('DC')
multi_dc.fit(X=X_train, y=y_train)

KeyboardInterrupt: 

Report stats and save individual label results for analysis

In [None]:
# Prints the overall metrics and returns the per-label counts.
stats = multi_dc.feature_accuracy(x_ex=X_test, y_known=y_test)

# Persist the per-label counts for later analysis.
with open("../results/dc_results.json", "w") as outfile:
    outfile.write(json.dumps(stats))

Fit the multi-label classifier using random forests:

In [None]:
# NOTE: the name `multi_dc` is reused here, so from this point on it refers to
# the random-forest ('RF') classifier rather than the decision-tree one.
multi_dc = MultiLabelClassify('RF')
multi_dc.fit(X=X_train, y=y_train)

Report stats and save individual label results for analysis

In [None]:
# Prints the overall metrics and returns the per-label counts.
stats = multi_dc.feature_accuracy(x_ex=X_test, y_known=y_test)

# Persist the per-label counts for later analysis.
with open("../results/rf_results.json", "w") as outfile:
    outfile.write(json.dumps(stats))



#### Part 2: Results Analysis

Load in data from results and mood labels for reporting.

In [8]:
labels = "../data/moods.txt"

# Read one mood name per line, stripping surrounding whitespace. The context
# manager closes the file handle, which the bare open() never did.
with open(labels, "r") as f:
    l = [x.strip() for x in f]

# Paths of the per-label result files to analyse.
rf = "../results/rf_results.json"
dc = "../results/dc_results.json"

Define a function for reuse to analyze both results files:

In [None]:
def results_analysis(file_name, l):
    '''
    Print per-label accuracy, precision, and recall summaries for a results file.

    For each metric three lines are printed: the highest value with the mood it
    belongs to, the lowest value with the mood it belongs to, and the mean over
    the labels considered. Precision and recall only consider labels with at
    least one true positive; accuracy skips labels with no examples.
    :param file_name: path to a JSON file mapping label index to
    [total, true_neg, true_pos, false_neg, false_pos]
    :param l: list of mood names, indexed by label index
    '''
    with open(file_name, "r") as json_file:
        results = json.load(json_file)

    def report(values, label_ids):
        # Print best value/label, worst value/label, and the mean for one metric.
        if not values:
            print("No labels to report", '\n')
            return
        best = int(np.argmax(values))
        worst = int(np.argmin(values))
        # label_ids maps a position in `values` back to the real label index;
        # the two differ as soon as any label has been skipped.
        print(values[best], l[label_ids[best]])
        print(values[worst], l[label_ids[worst]])
        print(np.mean(values), '\n')

    accuracy, accuracy_ids = [], []
    precision, recall, positive_ids = [], [], []

    for key, counts in results.items():
        total, true_neg, true_pos, false_neg, false_pos = counts[:5]
        label_id = int(key)

        if total != 0:
            accuracy_ids.append(label_id)
            accuracy.append((true_neg + true_pos) / total)

        if true_pos != 0:
            # true_pos > 0 guarantees both denominators are non-zero.
            positive_ids.append(label_id)
            precision.append(true_pos / (true_pos + false_pos))
            recall.append(true_pos / (true_pos + false_neg))

    report(accuracy, accuracy_ids)
    report(precision, positive_ids)
    report(recall, positive_ids)

Report on Decision Tree results:

In [None]:
# Summarise the saved decision-tree results, using the mood names for the labels.
results_analysis(file_name=dc, l=l)


Report on Random Forest results:

In [None]:
# Summarise the saved random-forest results, using the mood names for the labels.
results_analysis(file_name=rf, l=l)

