# Notebook to start exploring the MusicO-Net Library

In [136]:
import pandas as pd
from collections import Counter
import numpy as np
import re

import torch
import torchtext
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, TensorDataset
import spacy
from sklearn.model_selection import train_test_split


spacy.load('en_core_web_sm')



<spacy.lang.en.English at 0x7fb383ebc400>

In [122]:

# Import relevant tabbles
df_features = pd.read_csv("data/musicoset_songfeatures/acoustic_features.csv", sep="\t")
df_lyrics = pd.read_csv("data/musicoset_songfeatures/lyrics.csv", sep="\t")
df_song = pd.read_csv("data/musicoset_metadata/songs.csv", sep="\t")

#SQL Join tables to create dataframe suitable for filtering and exploration
df = df_song.merge(right=df_features, how='inner', on="song_id")
df = df.merge(right=df_lyrics, how='inner', on="song_id")

In [123]:
df.head()

Unnamed: 0,song_id,song_name,billboard,artists,popularity,explicit,song_type,duration_ms,key,mode,...,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,"thank u, next","('Thank U, Next', 'Ariana Grande')",{'66CXWjxzNUsdJxJ2JdwvnR': 'Ariana Grande'},86,True,Solo,207320,1,1,...,0.229,0.717,0.653,0.0,0.101,-5.634,0.0658,0.412,106.966,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,Without Me,"('Without Me', 'Halsey')",{'26VFTg2z8YR0cCuwLzESi2': 'Halsey'},87,True,Solo,201661,6,1,...,0.297,0.752,0.488,9e-06,0.0936,-7.05,0.0705,0.533,136.041,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,"('Sicko Mode', 'Travis Scott')",{'0Y5tJX1MQlPlqiwlOH1tJY': 'Travis Scott'},85,True,Solo,312820,8,1,...,0.00513,0.834,0.73,0.0,0.124,-3.714,0.222,0.446,155.008,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
3,3KkXRkHbMCARz0aVfEt68P,Sunflower - Spider-Man: Into the Spider-Verse,('Sunflower (Spider-Man: Into The Spider-Verse...,"{'246dkjvS1zLTtiykXe5h60': 'Post Malone', '1zN...",92,False,Collaboration,158040,2,1,...,0.556,0.76,0.479,0.0,0.0703,-5.574,0.0466,0.913,89.911,
4,1rqqCSm0Qe4I9rUvWncaom,High Hopes,"('High Hopes', 'Panic! At The Disco')",{'20JZFwl6HVl6yg8a4H3ZqK': 'Panic! At The Disco'},86,False,Solo,190947,5,1,...,0.193,0.579,0.904,0.0,0.064,-2.729,0.0618,0.681,82.014,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."


In [124]:
# Filtering for relevant tracks
df_dance = df[df.danceability > 0.8]
df_dance = df_dance[df_dance.speechiness < 0.5]
df_dance = df_dance[df_dance.energy > 0.5]
df_dance = df_dance[df_dance.popularity >= 60]
df_dance = df_dance[df_dance.explicit == True]

print(len(df_dance))

129


In [128]:
#df_dance.lyrics = df_dance.lyrics.astype(str)
lyrics = df_dance.lyrics.tolist()
df_dance.lyrics = df_dance.lyrics.astype(str)

In [134]:
import string

translator = str.maketrans('', '', string.punctuation)


def split_text(x):
    text = x['lyrics']

    sections = text.split('\\n\\n')

    keys = {'Verse 1': np.nan, 'Verse 2': np.nan, 'Verse 3': np.nan, 'Verse 4': np.nan, 'Chorus': np.nan}

    lyrics = str()

    single_text = []

    res = {}

    for s in sections:

        key = s[s.find('[') + 1:s.find(']')].strip()

        if ':' in key:
            key = key[:key.find(':')]

        if key in keys:
            single_text += [x.lower().replace('(', '').replace(')', '').translate(translator) for x in
                            s[s.find(']') + 1:].split('\\n') if len(x) > 1]

        res['single_text'] = ' \n '.join(single_text)

    return pd.Series(res)


df = df_dance.join(df_dance.apply(split_text, axis=1))



In [135]:
df.dropna(inplace=True)
print(len(df))

129


In [150]:
text_as_list = []

frequencies = {}

uncommon_words = set()

MIN_FREQUENCY = 7

MIN_SEQ = 5

BATCH_SIZE = 32


def extract_text(text):
    global text_as_list

    text_as_list += [w for w in text.split(' ') if w.strip() != '' or w == '\n']



In [154]:



df['single_text'].apply(extract_text)


text_as_list = [e for e in text_as_list if e != '\n']
print(text_as_list)




In [155]:

print('Total words: ', len(text_as_list))

for w in text_as_list:
    frequencies[w] = frequencies.get(w, 0) + 1

uncommon_words = set([key for key in frequencies.keys() if frequencies[key] < MIN_FREQUENCY])

words = sorted(set([key for key in frequencies.keys() if frequencies[key] >= MIN_FREQUENCY]))

num_words = len(words)

word_indices = dict((w, i) for i, w in enumerate(words))

indices_word = dict((i, w) for i, w in enumerate(words))

print('Words with less than {} appearances: {}'.format(MIN_FREQUENCY, len(uncommon_words)))

print('Words with more than {} appearances: {}'.format(MIN_FREQUENCY, len(words)))

valid_seqs = []

end_seq_words = []

for i in range(len(text_as_list) - MIN_SEQ):

    end_slice = i + MIN_SEQ + 1

    if len(set(text_as_list[i:end_slice]).intersection(uncommon_words)) == 0:
        valid_seqs.append(text_as_list[i: i + MIN_SEQ])

        end_seq_words.append(text_as_list[i + MIN_SEQ])

print('Valid sequences of size {}: {}'.format(MIN_SEQ, len(valid_seqs)))

X_train, X_test, y_train, y_test = train_test_split(valid_seqs, end_seq_words, test_size=0.02, random_state=666)

print(X_train[2:7])

Total words:  240524
Words with less than 7 appearances: 3533
Words with more than 7 appearances: 2902
Valid sequences of size 5: 176243
[['wouldnt', 'piss', 'on', 'fire', 'to'], ['me', 'the', 'way', 'you', 'really'], ['to', 'look', 'in', 'i', 'brought'], ['at', 'the', 'os', 'i', 'drank'], ['right', 'thurr', 'swing', 'your', 'hips']]


In [85]:
def build_vocab(corpus, tokenizer):
    counter = Counter()
    for text in corpus:
        print(text)
        counter.update(tokenizer(text))

    return Vocab(counter)

In [86]:
# building a vocabulary and tokenizer
tokenizer = get_tokenizer('spacy', language='en')
vocab = build_vocab(lyrics, tokenizer)




['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun is down, freezin\' cold\nThat\'s how we already know, winter\'s here\nMy dawg would probably do it for a Louis belt\nThat\'s just all he know, he don\'t know nothin\' else\nI tried to show \'em, yeah\nI tried to show \'em, yeah, yeah\nYeah, yeah, yeah\nGoin\' on you with the pick and roll\nYoung La Flame, he in sicko mode\n\n[Part II]\n\n[Verse 1: Travis Scott & The Notorious B.I.G.]\nWoo, made this here with all the ice on in the booth\nAt the gate outside, when they pull up, they get me loose\nYeah, Jump Out boys, that\'s Nike boys, hoppin\' out coupes\nThis shit way too big, when we pull up, give me the loot\n(Gimme the loot!)\nWas off the Remy, had a Papoose\nHad to hit my old town to duck the news\nTwo four hour lockdown, we made no moves\nNow it\'s 4 a.m. and I\'m back up poppin\' with the crew\nI just landed in, Chase B mixes pop like Jamba Juice\nDifferent colored chains, think my jeweler really sellin\' fruits\nAnd they chokin\', 

In [84]:
print(vocab)

Vocab()


In [73]:

def data_process(corpus, vocab):
    data = list()
    for text in corpus:
        token_list = [vocab[token] for token in tokenizer(text)]
        for i in range(1, len(token_list)):
            n_gram_seq = torch.tensor(token_list[:i + 1], dtype=torch.long)
            data.append(n_gram_seq)
    return data


train_data = data_process(lyrics, vocab)

In [75]:
print(train_data[:10])

[]
