In [20]:
# General imports
import glob
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
import re
import gensim
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Validation packages
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from xml.etree import ElementTree as ET
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
stop_words = set(stopwords.words('english')) 

def get_lyrics():
    print("Loading Lyrics...")

    pattern = re.compile(r"\[.*\]|[oaOA]+h+|[\?\.\,\'\!\;\:\(\)\`]+")
    lyrics = []
    files = os.listdir("data/lyrics/")
    idx = [int(f.split(".")[0]) for f in files]

    for f in files:
        with open("data/lyrics/" + f, "r", encoding="utf-8") as lines:
            lyrics += [" ".join([w for w in word_tokenize(re.sub(pattern, "", " ".join(lines))) if not w in stop_words])]
    return np.array(lyrics), idx

def get_mood():
    print("Loading Mood Targets...")
    d = pd.read_csv("data/preprocessed/spotify-data-preprocessed.csv", ",")
    mood_vecs = [np.argmax(x) for x in d.iloc[:,-4:].to_numpy()]
    return np.array(mood_vecs)


X, idx = get_lyrics() ; y = get_mood()[idx]
axis = np.sort(np.hstack((np.where(y==0), np.where(y==1))))
X = X[axis,].reshape(axis.shape[1]) ; y = y[axis,].reshape(axis.shape[1])

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 50

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

skf = StratifiedKFold(n_splits=5)

print("Loading Word Embeddings...")
with open("data/Twitter/glove.twitter.27B.50d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}


Loading Lyrics...
Loading Mood Targets...


In [39]:
from sklearn import svm

accs = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index ], y[test_index]

    model = gensim.models.Word2Vec(X_train, size=50)

    w2v = dict(zip(model.wv.index2word, model.wv.vectors))

    w2v_tfidf = Pipeline([
        ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
        # ("extra trees", LogisticRegression(solver = 'lbfgs'))])
        # ("extra trees", RandomForestClassifier())])
        ("extra trees", svm.NuSVC())])

    w2v_tfidf.fit(X_train, y_train)

    y_pred_lr = w2v_tfidf.predict(X_test)

    accs += [accuracy_score(y_test, y_pred_lr)]
    print("Accuracy: ", accs[-1])
print("Average accuracy: ", np.mean(accs))


Accuracy:  0.7415730337078652
Accuracy:  0.7303370786516854
Accuracy:  0.7344632768361582
Accuracy:  0.7740112994350282
Accuracy:  0.7457627118644068
Average accuracy:  0.7452294800990288


# Accuracy Scores

## SVCs

- 0.7125245984891766
- ovo: 0.7023868469497873

## NuSVC
- 0.7441122325906177

## RandomForest
- 0.7396368945597664

## Logistic Regression

In [42]:
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# create grid to evaluate model
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = w2v_tfidf.decision_function(xy).reshape(XX.shape)
# plot decision boundary and margins
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
# plot support vectors
ax.scatter(w2v_tfidf.support_vectors_[:, 0], w2v_tfidf.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.show()

AttributeError: 'Pipeline' object has no attribute 'support_vectors_'

In [41]:
from matplotlib import pyplot as plt

In [47]:
import warnings


def versiontuple(v):
    return tuple(map(int, (v.split("."))))


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)

In [48]:
plot_decision_regions(X_test, y_test, svm)

IndexError: too many indices for array

In [46]:
from matplotlib.colors import ListedColormap
markers = ('s', 'x', 'o', '-')
colors = ('red', 'blue', 'lightgreen', 'yellow')
cmap = ListedColormap(colors[:len(np.unique(y_test))])

<__main__.TfidfEmbeddingVectorizer at 0x26553dd92e0>