In [None]:


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud
import re
import nltk




In [None]:


# Load the tab-separated song/tag table and rename its `id` column to `songID`,
# which is the column name the following cells read.
df = pd.read_csv('../raw/id_tags_dict.tsv', sep='\t').rename(columns={'id': 'songID'})




In [None]:


# Inspect the loaded tag table.
df




In [None]:


# Preview the first 30 rows.
df.head(30)




In [None]:


# Keep only the rows whose serialised tag dictionary is not the empty literal '{}'.
has_tags = df['(tag, weight)'] != '{}'
df = df[has_tags]





<h1 id="Explore-the-distributions-of-tags">Explore the distributions of tags<a class="anchor-link" href="#Explore-the-distributions-of-tags">¶</a></h1><p>First, create a dictionary that maps each tag to the list of (song, weight) pairs it was applied to, as follows: <br/>
{tag1: [(song, weight), (song, weight), ..., (song, weight)],<br/>
tag2: [(song, weight), (song, weight), ..., (song, weight)],<br/>
...,<br/>
tagN: [(song, weight), (song, weight), ..., (song, weight)]}</p>


In [None]:


# Build the tag -> [(song, weight), ...] index from the serialised tag dictionaries.
tag_songs_dict = dict()
for song, raw_tags in zip(df['songID'], df['(tag, weight)']):
    # The parsing below expects a string shaped like "{'tag': weight, ...}":
    # the surrounding braces and every single quote are removed first.
    tag_str = raw_tags.strip('{}').replace('\'', '')
    if tag_str == '':
        continue
    for couple in tag_str.split(','):
        # Split on the LAST colon so that a tag which itself contains ':' is kept
        # whole instead of having its tail parsed as the weight.
        tag, _, raw_score = couple.rpartition(':')
        tag_songs_dict.setdefault(tag.strip(), []).append((song, int(raw_score.strip())))
         




In [None]:


# Inspect the tag -> [(song, weight), ...] index.
tag_songs_dict





<h1 id="Mood-tagging:-WNAffect">Mood tagging: WNAffect<a class="anchor-link" href="#Mood-tagging:-WNAffect">¶</a></h1><p>WNAffect paper: <a href="https://www.researchgate.net/profile/Tibor-Polya/publication/269110514_Linguistic_Structure_Narrative_Structure_and_Emotional_Intensity/links/633436f1ff870c55cee353b1/Linguistic-Structure-Narrative-Structure-and-Emotional-Intensity.pdf#page=24">https://www.researchgate.net/profile/Tibor-Polya/publication/269110514_Linguistic_Structure_Narrative_Structure_and_Emotional_Intensity/links/633436f1ff870c55cee353b1/Linguistic-Structure-Narrative-Structure-and-Emotional-Intensity.pdf#page=24</a> <br/>
As done in <a href="https://archives.ismir.net/ismir2009/paper/000095.pdf">https://archives.ismir.net/ismir2009/paper/000095.pdf</a> <br/>
WordNet-Affect: <a href="https://github.com/clemtoy/WNAffect/tree/master">https://github.com/clemtoy/WNAffect/tree/master</a> <br/>
Genre list: <a href="http://www.musicgenreslist.com">http://www.musicgenreslist.com</a></p>


In [None]:


# Instantiate the WNAffect lookup object from two relative data paths
# (a WordNet 1.6 directory and a wn-domains directory, judging by their names).
from WNAffect.wnaffect import WNAffect
from WNAffect.emotion import Emotion
wna = WNAffect('../../utils/wordnet-1.6', '../../utils/wordnet-domains-sentiwords/wn-domains')




In [None]:


def assign_mood_tag(tag, level = -1):
    """Look up an emotion for every word of a tag.

    The tag is tokenised and POS-tagged with nltk, then each (word, POS) pair is
    passed to ``wna.get_emotion``. Words for which no emotion is returned are
    skipped.

    Args:
        tag: the tag text to analyse.
        level: ``-1`` keeps the emotion object exactly as returned by the lookup;
            any other value replaces it with ``emotion.get_level(level)``.

    Returns:
        A pair ``(moods, trigger_words)`` of parallel lists: the emotions found
        and the words that produced them, in token order.
    """
    tokens = nltk.word_tokenize(tag)
    mood = []
    trigger_words = []
    for word, pos in nltk.pos_tag(tokens):
        emotion = wna.get_emotion(word, pos)
        if emotion is None:
            continue
        mood.append(emotion if level == -1 else emotion.get_level(level))
        trigger_words.append(word)
    return mood, trigger_words





<ul>
<li>mood_songs_dict = dictionary containing the moods as keys and, as values, the list of (song, weight) pairs assigned to each mood <br/></li>
<li>all_moods = dictionary having the emotion name (str) as key and corresponding Emotion object as value <br/></li>
<li>trigger_words = set containing the words of the tags that trigger the WNA to assign a mood</li>
</ul>


In [None]:


# Map every mood name to the (song, weight) pairs of all tags that trigger it.
mood_songs_dict = dict()
all_moods = dict()
trigger_words = set()
for tag, songs_weights in tag_songs_dict.items():
    moods, t_words = assign_mood_tag(tag, 5)
    for mood, trigger in zip(moods, t_words):
        trigger_words.add(trigger)
        all_moods[mood.name] = mood
        # Extend a list owned by mood_songs_dict. Storing `songs_weights` itself
        # and extending it later would also grow the list inside tag_songs_dict.
        mood_songs_dict.setdefault(mood.name, []).extend(songs_weights)




In [None]:


# List the mood names found so far.
all_moods.keys()




In [None]:


# Inspect the mood -> [(song, weight), ...] mapping.
mood_songs_dict





<h3 id="Mood-distribution">Mood distribution<a class="anchor-link" href="#Mood-distribution">¶</a></h3>


In [None]:


# Bar chart: one bar per mood, height = number of (song, weight) entries for that mood.
graph_moods = list(mood_songs_dict.keys())
graph_songs = [len(entries) for entries in mood_songs_dict.values()]

fig = plt.figure()
plt.bar(graph_moods, graph_songs)
_ = plt.xticks(rotation=90)





<p>A LOT of love songs - possibly problematic: maybe the tags that WNA brought back to love don't imply that the song is about love, but that those who tagged the song loved it, as suggested by <a href="https://archives.ismir.net/ismir2009/paper/000095.pdf">https://archives.ismir.net/ismir2009/paper/000095.pdf</a> <br/>
Let's dive deeper and see which words used in the LastFM tags are linked to mood classification, since they could give some answers. 
In <a href="https://archives.ismir.net/ismir2009/paper/000095.pdf">https://archives.ismir.net/ismir2009/paper/000095.pdf</a> the authors discarded judgemental words (good, bad, ...), ambiguous words (love, loves, loved, like, likes, liked), and musical tags (upbeat, trance).</p>


In [None]:


# Write the trigger words to a text file, one per line, for inspection.
# They are sorted because `trigger_words` is a set of strings, whose iteration
# order is not stable between runs; sorting keeps the file reproducible.
with open('trigger_words.txt', 'w') as f:
    f.writelines(f"{word}\n" for word in sorted(trigger_words))





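<p>Note: the trigger words written above were then cleaned outside this notebook; the cleaned list is read back from <code>trigger_words_clean.txt</code> below.</p>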


In [None]:


# Read the cleaned trigger words (one per line), stripping surrounding whitespace.
# The `with` block closes the file handle, which was previously left open.
with open('trigger_words_clean.txt', 'r') as clean_triggers_file:
    clean_triggers = [trigger.strip() for trigger in clean_triggers_file]




In [None]:


# Number of trigger words kept after cleaning.
len(clean_triggers)




In [None]:


def assign_mood_tag_clean(tag, clean_triggers, level = -1):
    """Look up emotions for the words of a tag, considering only allowed trigger words.

    The tag is tokenised with nltk, tokens that are not in ``clean_triggers`` are
    discarded, and the remaining ones are POS-tagged and passed to
    ``wna.get_emotion``.

    Args:
        tag: the tag text to analyse.
        clean_triggers: collection of words allowed to trigger a mood.
        level: ``-1`` keeps the emotion object exactly as returned by the lookup;
            any other value replaces it with ``emotion.get_level(level)``.

    Returns:
        The list of emotions found, or ``None`` when no emotion was found.
    """
    kept_tokens = [tok for tok in nltk.word_tokenize(tag) if tok in clean_triggers]
    moods = []
    for word, pos in nltk.pos_tag(kept_tokens):
        emotion = wna.get_emotion(word, pos)
        if emotion is not None:
            moods.append(emotion if level == -1 else emotion.get_level(level))
    return moods or None




In [None]:


# Rebuild the mood -> [(song, weight), ...] mapping using only the cleaned trigger words.
mood_songs_dict = dict()
all_moods = dict()
trigger_words = set()
for tag, songs_weights in tag_songs_dict.items():
    moods = assign_mood_tag_clean(tag, clean_triggers, 5)
    if moods is None:
        continue
    for mood in moods:
        all_moods[mood.name] = mood
        # Extend a list owned by mood_songs_dict. Storing `songs_weights` itself
        # and extending it later would also grow the list inside tag_songs_dict.
        mood_songs_dict.setdefault(mood.name, []).extend(songs_weights)




In [None]:


# List the mood names found with the cleaned trigger words.
all_moods.keys()




In [None]:


# Inspect the rebuilt mood -> [(song, weight), ...] mapping.
mood_songs_dict




In [None]:


# Bar chart: one bar per mood, height = number of (song, weight) entries for that mood.
graph_moods = list(mood_songs_dict.keys())
graph_songs = [len(entries) for entries in mood_songs_dict.values()]

fig = plt.figure()
plt.bar(graph_moods, graph_songs)
_ = plt.xticks(rotation=90)





<p>Now it's important to examine how many tags each song has.</p>


In [None]:


# Invert mood -> [(song, weight)] into song -> {(mood, weight), ...}.
# Using a set means an identical (mood, weight) pair is stored only once per song.
labeled_songs_dict = dict()

for mood, song_list in mood_songs_dict.items():
    for song, weight in song_list:
        labeled_songs_dict.setdefault(song, set()).add((mood, weight))




In [None]:


# Inspect the song -> {(mood, weight), ...} mapping.
labeled_songs_dict




In [None]:


# Number of distinct (mood, weight) pairs attached to each song.
num_tags = [len(song_tags) for song_tags in labeled_songs_dict.values()]




In [None]:


# Distinct values of the per-song tag count.
set(num_tags)




In [None]:


# Histogram of how many mood tags each song carries.
plt.hist(num_tags, bins=15)
plt.xlabel('# tags per song')
plt.ylabel('# songs')

# A single show() is enough; the original called it twice.
plt.show()





<p>Some considerations:</p>
<ul>
<li>'thing'/'gravity' not an emotion --&gt; drop</li>
<li>many labels, varying number of songs per label --&gt; clustering the emotions</li>
</ul>


In [None]:


# Remove the two labels that are not emotions; a missing key is ignored.
for non_emotion in ('gravity', 'thing'):
    _ = mood_songs_dict.pop(non_emotion, None)





<h2 id="Emotions-clusters">Emotions clusters<a class="anchor-link" href="#Emotions-clusters">¶</a></h2><ul>
<li>CL0: positive-fear, ambiguous-expectation, surprise, ambiguous-agitation, positive-expectation, ambiguous-fear</li>
<li>CL1: affection, love, gratitude</li>
<li>CL2: general-dislike, compassion, humility</li>
<li>CL3: shame, anxiety, negative-fear, despair, daze</li>
<li>CL4: fearlessness, joy, positive-hope, liking, self-pride, enthusiasm, levity</li>
<li>CL5: sadness, pensiveness</li>
<li>CL6: neutral-unconcern, apathy, calmness</li>
</ul>


In [None]:


# Emotion clusters: the position of each inner list is the cluster id (CL0..CL6).
clusters = [
    ['positive-fear', 'ambiguous-expectation', 'surprise', 'ambiguous-agitation', 'positive-expectation', 'ambiguous-fear'],
    ['affection', 'love', 'gratitude'],
    ['general-dislike', 'compassion', 'humility'],
    ['shame', 'anxiety', 'negative-fear', 'despair', 'daze'],
    ['fearlessness', 'joy', 'positive-hope', 'liking', 'self-pride', 'enthusiasm', 'levity'],
    ['sadness', 'pensiveness'],
    ['neutral-unconcern', 'apathy', 'calmness'],
]

# Reverse lookup: emotion name -> id of the cluster that contains it.
label_map = {
    emotion: cluster_id
    for cluster_id, members in enumerate(clusters)
    for emotion in members
}

label_map




In [None]:


# Merge the per-mood song lists into per-cluster song lists.
clustered_label_dict = dict()
for label, songs_weights in mood_songs_dict.items():
    # Raises KeyError if a mood is not listed in any cluster.
    idx = label_map[label]
    # Extend a list owned by clustered_label_dict. Storing the mood's own list
    # and extending it later would mutate mood_songs_dict and duplicate entries
    # whenever this cell is re-run.
    clustered_label_dict.setdefault(idx, []).extend(songs_weights)




In [None]:


# Inspect the cluster id -> [(song, weight), ...] mapping.
clustered_label_dict





<p>Let's analyze again the number of tags per song</p>


In [None]:


# Invert cluster -> [(song, weight)] into song -> [(cluster, weight), ...].
# A list is used, so repeated (cluster, weight) pairs for a song are all kept.
new_labeled_songs_dict = dict()

for mood, song_list in clustered_label_dict.items():
    for song, weight in song_list:
        new_labeled_songs_dict.setdefault(song, list()).append((mood, weight))
            




In [None]:


# Inspect the song -> [(cluster, weight), ...] mapping.
new_labeled_songs_dict





<p>We can see how in this process we need to be careful with multiple tags: <br/></p>
<ul>
<li>it is possible that each song was tagged with the same number-label multiple times (different emotions belonging to the same cluster)</li>
<li>in order to fix this, when we create the new version of the labeled songs dictionary we check for other pre-existing tags of the same mood and keep the tag with the highest weight <br/></li>
</ul>
<p>The new version of the dictionary will be called unique_new_labeled_songs_dict to emphasize the uniqueness of tags per mood in each song.</p>


In [None]:


# For every song keep at most one entry per cluster: the one with the highest weight.
# The per-song entries are tracked in a dict (cluster -> best weight) rather than by
# removing from and appending to a list while iterating over it.
best_weight_by_song = dict()

for mood, song_list in clustered_label_dict.items():
    for song, weight in song_list:
        best = best_weight_by_song.setdefault(song, dict())
        if mood not in best:
            best[mood] = weight
        elif weight > best[mood]:
            # Delete before re-inserting so the updated cluster moves to the end,
            # matching the order produced by the previous remove()/append() logic.
            del best[mood]
            best[mood] = weight

# Expose the result in the shape used downstream: song -> [(cluster, weight), ...].
unique_new_labeled_songs_dict = {
    song: list(best.items()) for song, best in best_weight_by_song.items()
}
            




In [None]:


# Inspect the de-duplicated song -> [(cluster, weight), ...] mapping.
unique_new_labeled_songs_dict




In [None]:


# Number of cluster labels per song after de-duplication, and the distinct counts observed.
new_num_tags = [len(song_tags) for song_tags in unique_new_labeled_songs_dict.values()]

set(new_num_tags)




In [None]:


# Histogram of the number of cluster labels per song.
fig, ax = plt.subplots()
ax.hist(new_num_tags, bins=7)
ax.set_xlabel('# tags per song')
ax.set_ylabel('# songs')

plt.show()





<h2 id="Two-different-versions-of-the-dataset">Two different versions of the dataset<a class="anchor-link" href="#Two-different-versions-of-the-dataset">¶</a></h2><p>One with a single label (the one with the highest weight), one with all the labels</p>



<h3 id="Monolabel">Monolabel<a class="anchor-link" href="#Monolabel">¶</a></h3>


In [None]:


# Pick a single cluster label per song: the label with the highest weight.
# When several labels share that highest weight, the tie is broken by the average
# weight each of them has in the non-de-duplicated list (new_labeled_songs_dict).
monolabel_df = pd.DataFrame(unique_new_labeled_songs_dict.keys(), columns = ['songID'])
labels = list()
for song in monolabel_df['songID']:
    tags = unique_new_labeled_songs_dict[song]
    # The previous loop never updated max_weight, so it did not actually find the maximum.
    max_weight = max(weight for _, weight in tags)
    tie_moods = [mood for mood, weight in tags if weight == max_weight]
    if len(tie_moods) == 1:
        max_tag = tie_moods[0]
    else:
        full_tags = new_labeled_songs_dict[song]
        avg_dict = dict()
        for mood in tie_moods:
            mood_weights = [w for m, w in full_tags if m == mood]
            avg_dict[mood] = sum(mood_weights) / len(mood_weights)
        # Python 3 replacement for max(avg_dict.iteritems(), key=operator.itemgetter(1))[0];
        # on equal averages the first tied label wins.
        max_tag = max(avg_dict, key=avg_dict.get)

    labels.append(max_tag)

print(labels)




In [None]:


# Histogram of the single label assigned to each song (bin edges 0..7).
fig, ax = plt.subplots()
ax.hist(labels, bins=range(0,8))
ax.set_xlabel('Labels')
ax.set_ylabel('# songs')

plt.show()




In [None]:


from collections import Counter
# Count how many songs received each single label.
c = Counter(labels)

print(c.items())




In [None]:


# Attach the chosen label to each song (labels is in the same row order as monolabel_df).
monolabel_df['Label'] = labels
monolabel_df.head(30)





<h4 id="Monolabel-one-hot-encoding">Monolabel one-hot encoding<a class="anchor-link" href="#Monolabel-one-hot-encoding">¶</a></h4>


In [None]:


# One-hot version of the single-label dataset: one CL<i> column per cluster,
# all zero except the column of the song's label.
encoded_df = pd.DataFrame(unique_new_labeled_songs_dict.keys(), columns = ['songID'])
for cluster_id in range(len(clusters)):
    encoded_df[f'CL{cluster_id}'] = 0

# Rows are matched with monolabel_df by index label.
for idx in encoded_df.index:
    value = monolabel_df.loc[idx, 'Label']
    encoded_df.at[idx, f'CL{value}'] = 1

encoded_df





<h3 id="Multilabel">Multilabel<a class="anchor-link" href="#Multilabel">¶</a></h3><p>Encoding consideration: in order to be able to use the dataset, we need to decide how to encode multiple labels. The best way (i.e. the one for which the dataset is then ready to use) is to have as many label columns as labels, and to use binary indicators to mark whether a song has a certain tag.</p>


In [None]:


# Multi-label dataset: one CL<i> indicator column per cluster, set to 1 for every
# cluster the song is tagged with.
multilabel_df = pd.DataFrame(unique_new_labeled_songs_dict.keys(), columns = ['songID'])
for i in range(0, len(clusters)):
    multilabel_df[f'CL{i}'] = 0

for idx, song in multilabel_df['songID'].items():
    # Iterate this song's own tags. The original assigned them to a misspelled
    # name (`tagps`) and looped over a stale `tags` left over from an earlier cell.
    for cluster_id, _weight in unique_new_labeled_songs_dict[song]:
        multilabel_df.at[idx, f'CL{cluster_id}'] = 1

multilabel_df




In [None]:


# Inspect the single-label dataset.
monolabel_df





<h2 id="Save-datasets">Save datasets<a class="anchor-link" href="#Save-datasets">¶</a></h2>


In [None]:


# Save the three label-only datasets as CSV.
for frame, csv_path in (
    (monolabel_df, '../final_datasets/monolabel_df_no_lyrics.csv'),
    (multilabel_df, '../final_datasets/multilabel_df_no_lyrics.csv'),
    (encoded_df, '../final_datasets/encoded_df_no_lyrics.csv'),
):
    frame.to_csv(csv_path)





<h2 id="Add-lyrics">Add lyrics<a class="anchor-link" href="#Add-lyrics">¶</a></h2>


In [None]:


# Accumulators for the cleaned lyrics, one list per dataset variant
# (filled in the same row order as the corresponding DataFrame).
mono_lyr_column = list()
multi_lyr_column = list()
enc_lyr_column = list()

def clean_lyrics(lyrics, removable):
    """Remove filler words from lyrics and normalise whitespace.

    Every whole-word occurrence of an entry of ``removable`` (together with the
    whitespace that follows it) is replaced by a single space; runs of two or
    more whitespace characters are then collapsed to one space and the result is
    stripped. Matching is case-sensitive.

    Args:
        lyrics: the text to clean.
        removable: words to remove. The entries are joined into a regular
            expression alternation without escaping, so they are interpreted as
            regex fragments.

    Returns:
        The cleaned text. With an empty ``removable`` only the whitespace
        normalisation is applied.
    """
    # An empty alternation would match the empty string at every word boundary
    # and split text such as "don't" into "don ' t", so skip the removal step.
    if removable:
        pattern = re.compile(r'\b(?:' + r'|'.join(removable) + r')\b\s*')
        lyrics = pattern.sub(' ', lyrics)
    return re.sub(r'\s{2,}', ' ', lyrics).strip()
    

# Filler words stripped from every lyric.
REMOVABLE_WORDS = ['oh', 'na', 'la', 'eh', 'ah', 'yeah']


def _read_clean_lyrics(song_ids):
    """Read and clean the processed lyrics file of every song id, in order.

    Each file is opened in a ``with`` block so its handle is closed right after
    reading. A missing file raises ``FileNotFoundError``.
    """
    cleaned = list()
    for song_id in song_ids:
        with open(f'../raw/processed_lyrics/{song_id}.txt', 'r') as file:
            cleaned.append(clean_lyrics(file.read(), REMOVABLE_WORDS))
    return cleaned


# One column per dataset variant, each in the row order of its own DataFrame.
mono_lyr_column.extend(_read_clean_lyrics(monolabel_df['songID']))
enc_lyr_column.extend(_read_clean_lyrics(encoded_df['songID']))
multi_lyr_column.extend(_read_clean_lyrics(multilabel_df['songID']))




In [None]:


def _with_lyrics(frame, lyrics_column):
    """Return a copy of `frame` with a 'Lyrics' column, dropping rows whose lyrics are empty."""
    with_lyrics = frame.assign(Lyrics=lyrics_column)
    return with_lyrics[with_lyrics['Lyrics'] != '']


# Attach the cleaned lyrics to each dataset variant and drop songs without lyrics.
monolabel_lyrics_df = _with_lyrics(monolabel_df, mono_lyr_column)
encoded_lyrics_df = _with_lyrics(encoded_df, enc_lyr_column)
multilabel_lyrics_df = _with_lyrics(multilabel_df, multi_lyr_column)

# Save the three datasets with lyrics as CSV.
for frame, csv_path in (
    (monolabel_lyrics_df, '../final_datasets/monolabel_df_lyrics.csv'),
    (encoded_lyrics_df, '../final_datasets/encoded_df_lyrics.csv'),
    (multilabel_lyrics_df, '../final_datasets/multilabel_df_lyrics.csv'),
):
    frame.to_csv(csv_path)




In [None]:


# Inspect the one-hot dataset with lyrics.
encoded_lyrics_df





<h2 id="Other-attempts-for-clustering">Other attempts for clustering<a class="anchor-link" href="#Other-attempts-for-clustering">¶</a></h2>



<h3 id="Examine-co-occurrences-of-tags">Examine co-occurrences of tags<a class="anchor-link" href="#Examine-co-occurrences-of-tags">¶</a></h3>


In [None]:


from collections import defaultdict


def find_co_occurring_tags(tag_sets):
    """Count, for every tag, which other tags appear together with it.

    Args:
        tag_sets: iterable of tag collections (one collection per song).

    Returns:
        A defaultdict mapping each tag that co-occurs with at least one other tag
        to a dict with two entries:
        ``'co_occurring_tags'`` (set of the other tags seen with it) and
        ``'co_occurrence_counts'`` (defaultdict: other tag -> number of
        collections in which the two appear together).
        Because the result is a defaultdict, indexing it with an unknown tag
        creates an empty entry; use ``in`` to test for membership.
    """
    co_occurrences = defaultdict(lambda: {'co_occurring_tags': set(), 'co_occurrence_counts': defaultdict(int)})
    for tag_set in tag_sets:
        for tag in tag_set:
            for other_tag in tag_set:
                if tag == other_tag:
                    continue
                info = co_occurrences[tag]
                info['co_occurring_tags'].add(other_tag)
                info['co_occurrence_counts'][other_tag] += 1

    # No clean-up pass is needed: an entry is only created when a co-occurring
    # tag is recorded, so no entry can be empty. The former clean-up loop deleted
    # keys while iterating the dict, which raises RuntimeError if it ever runs.
    return co_occurrences

# Example usage: reduce every song's set of (tag, weight) tuples to the set of its tag names.
vals = list(labeled_songs_dict.values())
tag_sets = [{tag for tag, _weight in tag_weights} for tag_weights in vals]

co_occurrences = find_co_occurring_tags(tag_sets)

# Print, for each tag, how often every other tag appears alongside it.
for tag, co_occurring_tags in co_occurrences.items():
    print(f"Tag: {tag}, Co-occurring tags: {co_occurring_tags['co_occurrence_counts']}\n")




In [None]:


from apriori_python import apriori 

# Run apriori on the per-song tag sets with minSup=0.1 and minConf=0.1.
freqItemSet, rules = apriori(tag_sets, minSup=0.1, minConf=0.1)




In [None]:


# Inspect the frequent item sets returned by apriori.
freqItemSet


