In [9]:
# General imports
import glob
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
import re
import gensim
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Validation packages
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix

from xml.etree import ElementTree as ET
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
  
stop_words = set(stopwords.words('english')) 

def get_lyrics():
    print("Loading Lyrics...")

    lyrics = []
    files = os.listdir("data/lyrics/")
    idx = [int(f.split(".")[0]) for f in files]

    for f in files:
        with open("data/lyrics/" + f, "r", encoding="utf-8") as lines:
            lyrics += [" ".join(lines)]
    return np.array(lyrics), idx

def get_mood():
    print("Loading Mood Targets...")
    d = pd.read_csv("data/preprocessed/spotify-data-preprocessed.csv", ",")
    mood_vecs = [np.argmax(x) for x in d.iloc[:,-4:].to_numpy()]
    return np.array(mood_vecs)


print("Loading Word Embeddings...")
with open("data/glove.840B.300d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}


Loading Word Embeddings...


In [13]:
X, idx = get_lyrics() ; y = get_mood()[idx]
sentiment = pd.read_csv("sentiment.csv", index_col=0)
# sentiment = sentiment.iloc[idx,[7, 9, 10, 12, 13, 14, 15, 28, 31, 32]].to_numpy()
sentiment = np.nan_to_num(sentiment)
# Change AXIS
# axis = np.sort(np.hstack((np.where(y==2), np.where(y==3))))
# X = X[axis,].reshape(axis.shape[1]) ; y = y[axis,].reshape(axis.shape[1])

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = 300

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

skf = StratifiedKFold(n_splits=5)

Loading Lyrics...
Loading Mood Targets...


In [16]:
from sklearn import svm


models = [
           LogisticRegression(solver = 'lbfgs'),
           RandomForestClassifier(),
           svm.NuSVC()
        ]

for m in models:
    accs = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index ], y[test_index]
        s_train, s_test = sentiment[train_index], sentiment[test_index]
        
        model = gensim.models.Word2Vec(X_train, size=300)
        w2v = dict(zip(model.wv.index2word, model.wv.vectors))
        tfidf = TfidfEmbeddingVectorizer(w2v)
        tfidf.fit(X_train, y_train)
        embeddings = tfidf.transform(X_train)
        e_test = tfidf.transform(X_test)

        #w2v_tfidf = np.hstack((embeddings, s_train))
        m.fit(embeddings, y_train)
        y_pred_lr = m.predict(e_test)

        accs += [accuracy_score(y_test, y_pred_lr)]
        
        # print("Accuracy: ", accs[-1])
        # print("Confusion Matrix: ", confusion_matrix(y_test, y_pred_lr))
    print(f"{m}[0]")
    print("Average accuracy: ", np.mean(accs))


LogisticRegression()[0]
Average accuracy:  0.41146131805157593
RandomForestClassifier()[0]
Average accuracy:  0.44011461318051576
NuSVC()[0]
Average accuracy:  0.45386819484240687


In [6]:
model = gensim.models.Word2Vec(X, size=50)
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

vectorizer = TfidfEmbeddingVectorizer(w2v).fit(X, y)
embeddings = vectorizer.transform(X)
embeddings.shape

(1745, 50)

In [13]:
d = pd.read_csv("data/preprocessed/spotify-data-preprocessed.csv", ",")
d.iloc[idx].to_csv()

In [8]:
embeddings.shape

(1396, 50)

In [34]:
embeddings.shape

(1396,)

In [38]:
sentiment.shape

(1745, 10)

Unnamed: 0,ECHOISMS,DUPLICATE_LINES,IS_TITLE_IN_LYRICS,VERB_PRESENT,VERB_PAST,VERB_FUTURE,ADJ_FREQUENCIES,PUNCT_FREQUENCIES,VERB_FREQUENCIES,X_FREQUENCIES
0,0.0,0.076087,False,0.901235,0.098765,0.000000,0.111413,0.040761,0.334239,0.000000
1,0.0,0.127479,False,0.555556,0.444444,0.000000,0.045326,0.186969,0.274788,0.000000
10,0.0,0.039683,True,0.625000,0.375000,0.000000,0.190476,0.055556,0.174603,0.000000
100,0.0,0.080247,False,0.541176,0.447059,0.011765,0.043210,0.061728,0.419753,0.000000
1000,0.0,0.000000,False,0.750000,0.250000,0.000000,0.024390,0.097561,0.243902,0.000000
...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.094203,False,0.838710,0.145161,0.016129,0.090580,0.003623,0.329710,0.000000
996,0.0,0.196078,True,0.600000,0.400000,0.000000,0.274510,0.000000,0.176471,0.078431
997,0.0,0.009009,False,0.950000,0.050000,0.000000,0.067568,0.067568,0.243243,0.000000
998,0.0,0.057279,False,0.823529,0.058824,0.117647,0.188544,0.040573,0.143198,0.000000


In [27]:
sentiment.columns

AttributeError: 'numpy.ndarray' object has no attribute 'columns'