# Notebook to start exploring the MusicO-Net Library

In [50]:
import pandas as pd
from collections import Counter
import numpy as np

import torch
import torchtext
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, TensorDataset


In [36]:

# Import relevant tabbles
df_features = pd.read_csv("data/musicoset_songfeatures/acoustic_features.csv", sep="\t")
df_lyrics = pd.read_csv("data/musicoset_songfeatures/lyrics.csv", sep="\t")
df_song = pd.read_csv("data/musicoset_metadata/songs.csv", sep="\t")

#SQL Join tables to create dataframe suitable for filtering and exploration
df = df_song.merge(right=df_features, how='inner', on="song_id")
df = df.merge(right=df_lyrics, how='inner', on="song_id")

In [37]:
df.head()

Unnamed: 0,song_id,song_name,billboard,artists,popularity,explicit,song_type,duration_ms,key,mode,...,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,valence,tempo,lyrics
0,3e9HZxeyfWwjeyPAMmWSSQ,"thank u, next","('Thank U, Next', 'Ariana Grande')",{'66CXWjxzNUsdJxJ2JdwvnR': 'Ariana Grande'},86,True,Solo,207320,1,1,...,0.229,0.717,0.653,0.0,0.101,-5.634,0.0658,0.412,106.966,['[Verse 1]\nThought I\'d end up with Sean\nBu...
1,5p7ujcrUXASCNwRaWNHR1C,Without Me,"('Without Me', 'Halsey')",{'26VFTg2z8YR0cCuwLzESi2': 'Halsey'},87,True,Solo,201661,6,1,...,0.297,0.752,0.488,9e-06,0.0936,-7.05,0.0705,0.533,136.041,"[""[Verse 1]\nFound you when your heart was bro..."
2,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,"('Sicko Mode', 'Travis Scott')",{'0Y5tJX1MQlPlqiwlOH1tJY': 'Travis Scott'},85,True,Solo,312820,8,1,...,0.00513,0.834,0.73,0.0,0.124,-3.714,0.222,0.446,155.008,"['[Part I]\n\n[Intro: Drake]\nAstro, yeah\nSun..."
3,3KkXRkHbMCARz0aVfEt68P,Sunflower - Spider-Man: Into the Spider-Verse,('Sunflower (Spider-Man: Into The Spider-Verse...,"{'246dkjvS1zLTtiykXe5h60': 'Post Malone', '1zN...",92,False,Collaboration,158040,2,1,...,0.556,0.76,0.479,0.0,0.0703,-5.574,0.0466,0.913,89.911,
4,1rqqCSm0Qe4I9rUvWncaom,High Hopes,"('High Hopes', 'Panic! At The Disco')",{'20JZFwl6HVl6yg8a4H3ZqK': 'Panic! At The Disco'},86,False,Solo,190947,5,1,...,0.193,0.579,0.904,0.0,0.064,-2.729,0.0618,0.681,82.014,"[""[Intro]\nHigh, high hopes\n\n[Chorus]\nHad t..."


In [38]:
# Filtering for relevant tracks
df_dance = df[df.danceability > 0.8]
df_dance = df_dance[df_dance.speechiness < 0.5]
df_dance = df_dance[df_dance.energy > 0.5]
df_dance = df_dance[df_dance.popularity >= 60]
df_dance = df_dance[df_dance.explicit == True]

print(len(df_dance))

129


In [65]:
df_dance.lyrics = df_dance.lyrics.astype(str)
lyrics = df_dance.lyrics.tolist()
lyrics = "".join(lyrics)
tokenizer = get_tokenizer('basic_english', language='en')

print(tokenizer(lyrics))



In [59]:
print(len(lyrics))
print(lyrics)

509365


In [43]:
def build_vocab(corpus, tokenizer):
    counter = Counter()
    for text in corpus:
        counter.update(tokenizer(text))

    return Vocab(counter)

In [62]:
# building a vocabulary and tokenizer
tokenizer = get_tokenizer('basic_english', language='en')
vocab = build_vocab(lyrics, tokenizer)




In [49]:
print(tokenizer('intro  wanting my, '))

['intro', 'wanting', 'my', ',']


In [None]:

def data_process(corpus, vocab):
    data = list()
    for text in corpus:
        token_list = [vocab[token] for token in tokenizer(text)]
        for i in range(1, len(token_list)):
            n_gram_seq = torch.tensor(token_list[:i+1], dtype=torch.long)
            data.append(n_gram_seq)
    return data

train_data = data_process(lyrics, vocab)

In [35]:
print(train_data[:10])

[]
