In [1]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import numpy as np
import re
import random
import nltk
from scipy import sparse
from scipy.sparse import csr_matrix, vstack
from textblob import TextBlob
from langdetect import detect_langs
import pickle
from datetime import datetime

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

read data

In [2]:
if os.path.exists("data"):  # automatically recognize
    rootPath = "data" + os.sep
elif os.path.exists("dataset"):
    rootPath = "dataset" + os.sep
else:
    rootPath = input("Relative Folder: ").replace("\"", "").replace("\'", "").replace(os.sep, "") + os.sep

if not rootPath.endswith(os.sep) and not rootPath.endswith("/"):
    rootPath += os.sep


classes = []
data = []
for directory in os.listdir(rootPath):
    if os.path.isdir(rootPath + directory):
        classes.append(directory)
        currentWork = os.path.join(rootPath + directory)
        for file in os.listdir(currentWork):
            song = str(file).replace(".txt", "")
            currentFile = os.path.join(currentWork, file)
            '''
            with open(currentFile, 'r', encoding='UTF-8') as f:
                lines = f.readlines()
            with open(currentFile, 'w', encoding='UTF-8') as f:
                f.writelines(lines[1:])
            '''
            with open(currentFile, 'r', encoding='UTF-8') as f:
                lyric = f.read()
                size = os.path.getsize(currentFile)
                if size != 0:
                    data.append((directory, song, lyric))
    	            #print(currentFile)
                # lyric = "".join(ch for ch in lyric if unicodedata.category(ch)[0] != 'C')
                #if lyric!='' or lyric.isspace()==False:
                    #data.append((directory, song, lyric))

song_df = pd.DataFrame(data, columns=['artist','song', 'text'])
song_df

Unnamed: 0,artist,song,text
0,Adele,All I Ask - Adele,[00:14.230]I will leave my heart at the door\n...
1,Adele,All Night Parking (with Erroll Garner) Interlu...,[00:10.722]\n[00:29.181] I know you've got thi...
2,Adele,Best For Last - Adele,"[00:12.599]Wait, do you see my heart on my sle..."
3,Adele,Can I Get It - Adele,[00:04.566]\n[00:04.866] Pave me a path to fol...
4,Adele,Chasing Pavements - Adele,"[00:06.316]I've made up my mind,\n[00:08.777]\..."
...,...,...,...
3563,ZAYN,Vibez - ZAYN,[00:18.811]Don't keep me waiting\n[00:21.892]I...
3564,ZAYN,When Love's Around - ZAYN,"[00:21.724]Mm, never been in love\n[00:25.257]..."
3565,ZAYN,Windowsill - ZAYN,"[00:11.816]F!ckin' on the countertop, window t..."
3566,ZAYN,wRoNg - ZAYN,[00:00.00]wRoNg (Explicit) - ZAYN/Kehlani\n[00...


In [3]:
print(song_df['text'].iloc[300])

[00:05.55]Lil Mama and Avril Lavinge REMIX
[00:07.97]
[00:08.49]Lil Mama and Avril Lavinge REMIX
[00:11.48]Lil Mama and Avril Lavinge
[00:12.87]
[00:13.70]Hey hey you you I don't like your girlfriend
[00:16.58]No way no way I think you need a new one
[00:19.79]Hey hey you you I could be your girlfriend
[00:22.70]I could be your girl Lil Mama be your girlfriend
[00:25.43]Hey hey you you I know that you like me
[00:28.44]No way no way you know it's no a secret
[00:31.24]Hey hey you you I want to be your girlfriend
[00:34.50]I could be your girl Lil Mama be your girlfriend
[00:36.59]
[00:37.53]Don't get it twisted Lil Mama got my paper on
[00:40.46]That means im a paper chaser I chase my paper on
[00:43.34]And I know we chasin paper that you be chasin on
[00:45.99]I just deliver the lyrics that people focus on
[00:48.99]Hands and a boob and a chopped up song
[00:50.47]Put it in a store and they go cops on
[00:52.18]Everybody know that its no combo
[00:53.40]Ya Betta lay low like popo
[00:

brackets

In [4]:
text_in_round_brackets = sum(list(song_df['text'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
print('Number of round brackets: {}'.format(len(text_in_round_brackets)))

Number of round brackets: 8327


In [5]:
random.seed(0)
random.choices(text_in_round_brackets, k=20)

['loving me',
 'Oh, yeah-yeah-yeah-yeah',
 'ooh, yeah',
 "You\xa0wanna\xa0roll,\xa0that's\xa0how\xa0I\xa0roll",
 '*****, pick up your phone',
 'I might just',
 'Why you so mad?',
 'Alright',
 'All we got are',
 'And I like you',
 'Hudson Valley, NY',
 'understand it',
 'Glow',
 'Oh-oh',
 'Gimme your love, gimme your love',
 'on you',
 'Hope it never ends',
 'Mark',
 'no sleep',
 'No one has to know']

In [6]:
text_in_square_brackets = sum(list(song_df['text'].map(lambda s: re.findall(r'\[(.*?)\]',s))), [])
# text_in_square_brackets = sum(list(song_df['text'].map(lambda s: re.findall(r'\[.*?\] ', s))), [])
print('Number of square brackets: {}'.format(len(text_in_square_brackets)))

Number of square brackets: 185075


In [7]:
random.seed(0)
random.choices(text_in_square_brackets, k=20)
text_in_square_brackets[0]

'00:14.230'

In [8]:
text_in_curly_brackets = sum(list(song_df['text'].map(lambda s: re.findall(r'\{(.*)\}',s))), [])
print('Number of square brackets: {}'.format(len(text_in_curly_brackets)))

Number of square brackets: 3


In [9]:
# remove round brackets but not text within
song_df['text'] = song_df['text'].map(lambda s: re.sub(r'\(|\)', '', s))

# remove square brackest and text within
song_df['text'] = song_df['text'].map(lambda s: re.sub(r'\[(.*?)\]', '', s))
song_df['text'] = song_df['text'].map(lambda s: re.sub(r'\{(.*?)\}', '', s))

In [10]:
# count number of lines
song_df['lines'] = song_df['text'].map(lambda t: len(re.findall(r'\n', t)))
# remove line breaks
song_df['text'] = song_df['text'].map(lambda s: re.sub(r' \n|\n', ' ', s))

abbreviation reduction

In [11]:
replacement_pattern = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'I\'m', 'I am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would')
]

In [12]:
class RegexpReplacer(object):
    def __init__(self, patterns = replacement_pattern):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    def replace(self, text):
        s = text
        for (patterns, repl) in self.patterns:
            (s, count) = re.subn(patterns, repl, s)
        return s 

In [13]:
replacer = RegexpReplacer()
song_df['text'] = song_df['text'].map(replacer.replace)
print(song_df['text'].iloc[0])

I will leave my heart at the door I will not say a word They have all been said before you know So why do not we just play pretend Like we are not scared of what is coming next Or scared of having nothing left look do not get me wrong I know there is no tomorrow All I ask  Is if this is my last night with you Hold me like I am more than just a friend Give me a memory I can use Take me by the hand while we do what lovers do It matters how this ends Cause what if I never love again  I do not need your honesty It is already in your eyes and I am sure my eyes they speak for me No one knows me like you do And since you are the only one that mattered, tell me who do I run to  Look do not get me wrong I know there is no tomorrow All I ask  Is if this is my last night with you Hold me like I am more than just a friend Give me a memory I can use Take me by the hand while we do what lovers do It matters how this ends Cause what if I never love again  Let this be our lesson in love Let this be th

In [14]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
song_df['tokens'] = song_df['text'].map(tokenizer.tokenize)

print('Text:')
print(song_df['text'].iloc[0])

print('Tokens:')
print(song_df['tokens'].iloc[0])

Text:
I will leave my heart at the door I will not say a word They have all been said before you know So why do not we just play pretend Like we are not scared of what is coming next Or scared of having nothing left look do not get me wrong I know there is no tomorrow All I ask  Is if this is my last night with you Hold me like I am more than just a friend Give me a memory I can use Take me by the hand while we do what lovers do It matters how this ends Cause what if I never love again  I do not need your honesty It is already in your eyes and I am sure my eyes they speak for me No one knows me like you do And since you are the only one that mattered, tell me who do I run to  Look do not get me wrong I know there is no tomorrow All I ask  Is if this is my last night with you Hold me like I am more than just a friend Give me a memory I can use Take me by the hand while we do what lovers do It matters how this ends Cause what if I never love again  Let this be our lesson in love Let this

stop words

In [15]:

from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words("english"))
print(eng_stopwords)
song_df['n_stop_words'] = song_df['tokens'].apply(lambda x: len([w for w in x if w.lower() in eng_stopwords]))
print(song_df['n_stop_words'].iloc[0])
print(len(song_df['tokens'].iloc[0]))

{'further', 'its', 'ourselves', 'if', 'until', 'out', 's', 'my', 'to', 'didn', 'between', 'than', 'been', 're', 'were', 'these', 'a', 'over', "needn't", 'while', 'above', 'own', 'd', 'yourself', 'ain', 'wouldn', 'where', 'why', 'but', 'whom', 'which', 'all', 'because', 'each', 'few', 'o', 'it', "won't", 'mightn', 'won', 'needn', 'at', 'himself', 'up', 'him', 'does', 'hers', 'any', 'not', 'aren', "isn't", 'through', "hasn't", "mightn't", 'only', "you'd", 'again', "wouldn't", 'yours', 'under', 'our', "it's", 'before', 'being', 'doing', 'no', 'ours', "shouldn't", 'has', 'those', 'her', 'with', "weren't", 'have', 'what', 'weren', 'am', 'will', "you'll", 'mustn', 'don', "mustn't", 'then', 'in', "couldn't", 'myself', 'during', 've', "aren't", 'an', 'too', 'by', 'more', 'they', 'are', 'when', 'about', 'doesn', 'having', 'other', 't', 'the', 'them', 'into', 'is', 'hadn', 'now', "she's", 'that', 'haven', 'there', 'down', 'off', 'yourselves', 'of', 'nor', 'once', 'wasn', 'did', 'how', 'after', '

probability of parts of speech

In [16]:
class Fraction(object):
    def fraction_noun(self, text):
        word_count = text.__len__()
        pos_list = nltk.pos_tag(text)
        noun_count = len([w for w in pos_list if w[1] in ('NN','NNP','NNPS','NNS')])
        if word_count==0:
            return 1
        return (noun_count/word_count)

    def fraction_adj(self, text):
        word_count = text.__len__()
        pos_list = nltk.pos_tag(text)
        adj_count = len([w for w in pos_list if w[1] in ('JJ','JJR','JJS')])
        if word_count==0:
            return 1
        return (adj_count/word_count)
    
    def fraction_verbs(self, text):
        word_count = text.__len__()
        pos_list = nltk.pos_tag(text)
        verb_count = len([w for w in pos_list if w[1] in ('VB','VBD','VBG','VBN','VBP','VBZ')])
        if word_count==0:
            return 1
        return (verb_count/word_count)

    def fraction_adverbs(self, text):
        word_count = text.__len__()
        pos_list = nltk.pos_tag(text)
        adverb_count = len([w for w in pos_list if w[1] in ('RB','RBR','RBS')])
        if word_count==0:
            return 1
        return (adverb_count/word_count)

In [None]:
fraction = Fraction()
song_df['p_noun'] = song_df['tokens'].apply(fraction.fraction_noun)
song_df['p_adj'] = song_df['tokens'].apply(fraction.fraction_adj)
song_df['p_verb'] = song_df['tokens'].apply(fraction.fraction_verbs)
song_df['p_adv'] = song_df['tokens'].apply(fraction.fraction_adverbs)
print(song_df['p_noun'].iloc[0])
print(song_df['p_adj'].iloc[0])
print(song_df['p_verb'].iloc[0])
print(song_df['p_adv'].iloc[0])

In [None]:
# initialise stemmer
stemmer = nltk.stem.porter.PorterStemmer()

token = 'make'
print('{} -> {}'.format(token, stemmer.stem(token)))

token = 'makes'
print('{} -> {}'.format(token, stemmer.stem(token)))

token = 'making'
print('{} -> {}'.format(token, stemmer.stem(token)))

token = 'made'
print('{} -> {}'.format(token, stemmer.stem(token)))

make -> make
makes -> make
making -> make
made -> made


In [None]:
# create dictionary to map tokens their stem
token_to_stem = {}
# initialise word count
token_count = 0
# iterate through all songs
for lst in song_df['tokens']:
    # iterate through all tokens of song
    for token in lst:
        token_count += 1
        # check if token is in dictionary
        if token not in token_to_stem:
            # add token to dictionary
            token_to_stem[token] = stemmer.stem(token)
            
song_df['stems'] = song_df['tokens'].map(lambda lst: [token_to_stem[token] for token in lst])

print('Number of tokens: {}'.format(token_count))
print('Number of unique tokens: {}'.format(len(token_to_stem.keys())))
print('Number of unique stems: {}'.format(len(set(token_to_stem.values()))))

Number of tokens: 1177041
Number of unique tokens: 21785
Number of unique stems: 12966


In [None]:
# number of songs
print('number of songs: ', str(len(song_df)))

# number of artists
print('number of artists: ', str(len(song_df['artist'].unique())))

# distribution songs per artist
song_count_df = song_df.groupby('artist')[['song']].count()
fig = px.histogram(song_count_df, x='song', title='Songs per artist', labels={'song': 'Songs'})
fig.show()

number of songs:  3569
number of artists:  41


In [None]:
# words per song
song_df['n_stems'] = song_df['stems'].map(len)

fig = px.histogram(song_df, x='n_stems', title='Words per song')
fig.show()

In [None]:
# create dataframe with lists of artists
song_df['stems_str'] = song_df['stems'].map(lambda lst: ' '.join(lst))

# map text to artists
stems_to_artist = {}
for tp in song_df[['artist', 'stems_str']].itertuples(index=False):
    artist = tp[0]
    stems = tp[1]
    if stems in stems_to_artist:
        stems_to_artist[stems].append(artist)
    else:
        stems_to_artist[stems] = [artist]

# insert list of artists to dataframe
song_df['artists'] = song_df['stems_str'].map(stems_to_artist)
song_df['duplicates'] = song_df['artists'].map(len) - 1

# convert list of artists to set of artists
song_df['artists'] = song_df['artists'].map(set)
song_df['n_artists'] = song_df['artists'].map(len)

# remove duplicate songs
artist_text_df = song_df.drop_duplicates('stems_str')

In [None]:
# number of unique songs
print('Number of unique lyrics: {}'.format(sum(artist_text_df['duplicates'] == 0)))
# number of duplicate songs
print('Number of duplicate lyrics: {}'.format(sum(artist_text_df['duplicates'] > 0) + \
                                              sum(artist_text_df['duplicates'])))
# number of duplicates from same artist
print('Number of duplicate lyrics from same artist: {}'.format(sum(artist_text_df['duplicates'] + 1 - \
                                                                   artist_text_df['n_artists'])))
# number of duplicates from different artists
print('Number of duplicate lyrics from different artists: {}'.format(sum(artist_text_df['n_artists']\
                                                                         .loc[artist_text_df['duplicates'] > 0])))


song_df

Number of unique lyrics: 3551
Number of duplicate lyrics: 18
Number of duplicate lyrics from same artist: 10
Number of duplicate lyrics from different artists: 8


Unnamed: 0,artist,song,text,lines,tokens,n_stop_words,p_noun,p_adj,p_verb,p_adv,stems,n_stems,stems_str,artists,duplicates,n_artists
0,Adele,All I Ask - Adele,I will leave my heart at the door I will not s...,45,"[I, will, leave, my, heart, at, the, door, I, ...",197,0.122924,0.043189,0.282392,0.066445,"[i, will, leav, my, heart, at, the, door, i, w...",301,i will leav my heart at the door i will not sa...,{Adele},0,1
1,Adele,All Night Parking (with Erroll Garner) Interlu...,I know you have got things to do I do too I...,21,"[I, know, you, have, got, things, to, do, I, d...",107,0.133333,0.050000,0.227778,0.111111,"[i, know, you, have, got, thing, to, do, i, do...",180,i know you have got thing to do i do too i jus...,{Adele},0,1
2,Adele,Best For Last - Adele,"Wait, do you see my heart on my sleeve? It is ...",50,"[Wait, do, you, see, my, heart, on, my, sleeve...",278,0.094595,0.042793,0.254505,0.085586,"[wait, do, you, see, my, heart, on, my, sleev,...",444,wait do you see my heart on my sleev it is bee...,{Adele},0,1
3,Adele,Can I Get It - Adele,Pave me a path to follow And I will tread a...,54,"[Pave, me, a, path, to, follow, And, I, will, ...",224,0.128852,0.028011,0.210084,0.103641,"[pave, me, a, path, to, follow, and, i, will, ...",357,pave me a path to follow and i will tread ani ...,{Adele},0,1
4,Adele,Chasing Pavements - Adele,"I have made up my mind, Do not need to think ...",69,"[I, have, made, up, my, mind, Do, not, need, t...",179,0.146259,0.027211,0.306122,0.136054,"[i, have, made, up, my, mind, do, not, need, t...",294,i have made up my mind do not need to think it...,{Adele},0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3564,ZAYN,Vibez - ZAYN,Do not keep me waiting I have been waiting all...,52,"[Do, not, keep, me, waiting, I, have, been, wa...",181,0.159763,0.047337,0.292899,0.076923,"[do, not, keep, me, wait, i, have, been, wait,...",338,do not keep me wait i have been wait all night...,{ZAYN},0,1
3565,ZAYN,When Love's Around - ZAYN,"Mm, never been in love Never really loved some...",39,"[Mm, never, been, in, love, Never, really, lov...",100,0.228856,0.014925,0.233831,0.099502,"[mm, never, been, in, love, never, realli, lov...",201,mm never been in love never realli love someon...,{ZAYN},0,1
3566,ZAYN,Windowsill - ZAYN,"F!ckin' on the countertop, window to the floor...",63,"[F, ckin, on, the, countertop, window, to, the...",271,0.183932,0.033827,0.213531,0.090909,"[f, ckin, on, the, countertop, window, to, the...",473,f ckin on the countertop window to the floor s...,{ZAYN},0,1
3567,ZAYN,wRoNg - ZAYN,wRoNg Explicit - ZAYN/Kehlani Oh when I in th...,64,"[wRoNg, Explicit, ZAYN, Kehlani, Oh, when, I, ...",245,0.164062,0.072917,0.252604,0.091146,"[wrong, explicit, zayn, kehlani, oh, when, i, ...",384,wrong explicit zayn kehlani oh when i in the s...,{ZAYN},0,1


In [None]:
# randomly select artists
n_artist = 10
random.seed(0)

artist_select = random.choices(song_df['artist'].unique(), k=n_artist)

song_filter_df = song_df.loc[song_df['artist'].isin(artist_select)]
print('Total number of songs: {}'.format(len(song_filter_df)))
song_filter_df.groupby('artist')[['song']].count().reset_index().rename(columns={'song':'songs'})

Total number of songs: 938


Unnamed: 0,artist,songs
0,Coldplay,101
1,Drake,86
2,Harry Styles,22
3,Katy Perry,86
4,Lady Gaga,80
5,Lana Del Rey,114
6,Mariah Carey,194
7,Post Malone,51
8,Queen,163
9,Selena Gomez,41


In [None]:
fig = px.box(song_filter_df, x='artist', y='n_stems', title='Word count per song by artist')
fig.show()

In [None]:
# number of unique stems
song_df['n_unique_stems'] = song_df['stems'].map(lambda lst: len(set(lst)))
# ratio of unique stems
song_df['unique_stems_ratio'] = song_df['n_unique_stems'] / song_df['n_stems']

# attach column to selected artists
song_filter_df = song_filter_df.join(song_df['unique_stems_ratio'])

In [None]:
fig = px.box(song_filter_df, x='artist', y='unique_stems_ratio', title='Ratio of unique words to all words')
fig.show()

In [None]:
# calculate number of words per line
song_df['stems_per_line'] = song_df['n_stems'] / song_df['lines'].astype(float)

song_filter_df = song_filter_df.join(song_df[['stems_per_line']])

In [None]:
fig = px.box(song_filter_df, x='artist', y='stems_per_line', title='Words per line')
fig.show()

In [None]:
# calculate the ratio of stop words
song_df['p_stop_words'] = song_df['n_stop_words']/song_df['n_stems'].astype(float)
song_filter_df = song_filter_df.join(song_df[['p_stop_words']])
fig = px.box(song_filter_df, x='artist', y='p_stop_words', title='Ratio of Stop Words to All Words')
fig.show()

In [None]:
# calculate the ratio of noun words
song_df['p_noun_words'] = song_df['p_noun']
song_filter_df = song_filter_df.join(song_df[['p_noun_words']])
fig = px.box(song_filter_df, x='artist', y='p_noun_words', title='Ratio of Noun Words to All Words')
fig.show()

In [None]:
# calculate the ratio of adj words
song_df['p_adj_words'] = song_df['p_adj']
song_filter_df = song_filter_df.join(song_df[['p_adj_words']])
fig = px.box(song_filter_df, x='artist', y='p_adj_words', title='Ratio of Adjective Words to All Words')
fig.show()

In [None]:
# calculate the ratio of verb words
song_df['p_verb_words'] = song_df['p_verb']
song_filter_df = song_filter_df.join(song_df[['p_verb_words']])
fig = px.box(song_filter_df, x='artist', y='p_verb_words', title='Ratio of Verb Words to All Words')
fig.show()

In [None]:
# calculate the ratio of adv words
song_df['p_adv_words'] = song_df['p_adv']
song_filter_df = song_filter_df.join(song_df[['p_adv_words']])
fig = px.box(song_filter_df, x='artist', y='p_adv_words', title='Ratio of Adverb Words to All Words')
fig.show()

In [None]:
# initialise count vectorizer
cv = CountVectorizer()

# generate word counts
stem_count_vector = cv.fit_transform(song_df['stems_str'])

# compute idf
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(stem_count_vector)

TfidfTransformer()

In [None]:
# print idf values
tfidf_df = pd.DataFrame({'stem': cv.get_feature_names(), 'weight': tfidf_transformer.idf_})
 
# get lowest weights
tfidf_df.sort_values('weight').head()


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



Unnamed: 0,stem,weight
10989,the,1.048202
12497,you,1.06003
11168,to,1.074108
517,and,1.076827
5652,is,1.119781


In [None]:
# get highest weights
tfidf_df.sort_values('weight', ascending=False).head()

Unnamed: 0,stem,weight
6449,lovеsick,8.487174
7153,mural,8.487174
7177,muth,8.487174
7175,musti,8.487174
7174,muster,8.487174


In [None]:
# assign tf idf scores to each song
tf_idf_vector = tfidf_transformer.transform(stem_count_vector)

# attach count vectors to dataframe
tf_idf_vector_lst = [-1] * len(song_df)
for i in range(len(song_df)):
    tf_idf_vector_lst[i] = tf_idf_vector[i]
song_df['tf_idf_vector'] = tf_idf_vector_lst    

song_df['tf_idf_score'] = song_df['tf_idf_vector'].map(lambda vec: np.sum(vec.todense()))

# join valus to selected artists
song_filter_df = song_filter_df.join(song_df[['tf_idf_vector', 'tf_idf_score']])

In [None]:
fig = px.box(song_filter_df, x='artist', y='tf_idf_score', title='TFIDF scores of songs per artist')
fig.show()

In [None]:
# caclculate mean vector
def get_mean_vector(vec_lst):
    return csr_matrix(vstack(vec_lst).mean(axis=0))

In [None]:
# calculate mean vector over all songs of same artist
artist_df = song_df.groupby('artist').agg({'tf_idf_vector': get_mean_vector, 'song': len}).reset_index()\
                   .rename(columns={'song': 'songs'})

# get selected artists
artist_filter_df = artist_df.loc[artist_df['artist'].isin(song_filter_df['artist'])]

In [None]:
similarity_matrix = cosine_similarity(vstack(artist_filter_df['tf_idf_vector']), 
                                      vstack(artist_filter_df['tf_idf_vector']))
artist_names = artist_filter_df['artist'].tolist()
fig = go.Figure(data=go.Heatmap(z=np.flipud(similarity_matrix), x=artist_names, y=list(reversed(artist_names)), 
                                colorscale='balance', zmin=0.5, zmax=1.1))
fig.show()

In [None]:
artist_song_filter_df = pd.merge(artist_filter_df[['artist', 'tf_idf_vector', 'songs']].assign(key = 0), 
                                 song_filter_df[['artist', 'tf_idf_vector', 'song']].assign(key = 0), on='key', 
                                 suffixes=['_artist', '_song']).drop('key', axis=1).reset_index(drop=True)
artist_song_filter_df['same_artist'] = artist_song_filter_df['artist_artist'] == artist_song_filter_df['artist_song']

In [None]:
# calculate similarity of artist tf idf vector and song vector
def tf_idf_vector_similarity(artist_vector, song_vector, songs, same_artist):
    # check if song is from same artist
    if same_artist:
        # deduct song vector from artist vector
        artist_vector = (songs * artist_vector - song_vector) / (songs - 1)
    # calculate similarity
    return cosine_similarity(artist_vector, song_vector)[0][0]

In [None]:
artist_song_filter_df['vector_similarity'] = \
    artist_song_filter_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_artist'], 
                                                                     row['tf_idf_vector_song'], 
                                                                     row['songs'], row['same_artist']), axis=1)

In [None]:
df = artist_song_filter_df

fig = go.Figure()

fig.add_trace(go.Violin(x=df['artist_artist'][df['same_artist']],
                        y=df['vector_similarity'][df['same_artist']],
                        legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                        side='negative')
             )
fig.add_trace(go.Violin(x=df['artist_artist'][~df['same_artist']],
                        y=df['vector_similarity'][~df['same_artist']],
                        legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                        side='positive')
             )

fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode='overlay')
fig.update_layout(title='Similarity of Songs')
fig.update_xaxes(range=[-0.5, 9.5])
fig.update_yaxes(range=[-0.1, 0.8], title='Similarity')
fig.show()

In [None]:
polarity_lst = [-1] * len(song_df)
subjectivity_lst = [-1] * len(song_df)
for i, text in enumerate(song_df['text']):
    sentiment = TextBlob(text)
    polarity_lst[i] = sentiment.polarity
    subjectivity_lst[i] = sentiment.subjectivity
    
song_df['polarity'] = polarity_lst
song_df['subjectivity'] = subjectivity_lst

song_filter_df = song_filter_df.join(song_df[['polarity', 'subjectivity']])

In [None]:
fig = px.scatter(song_filter_df, x='polarity', y='subjectivity', color='artist', hover_data=['song'], 
                 title='Polarity and Subjectivity of Songs')
fig.show()

In [None]:
fig = px.box(song_filter_df, x='artist', y='polarity', title='Polarity by artist')
fig.show()

In [None]:
# parameter
# number of sets
n_set = {'train': 20, 'val': 20}
# number of artists per set
n_artist = 3
# minimum number of songs of one artist
n_song_min = 20
# maximum number of song - artist pairs per artist set
n_song_artist_max = 300

In [None]:
def select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max):
    song_count_df = song_df.groupby('artist')[['artist']].count().rename(columns={'artist': 'count'})
    artist_lst = list(song_count_df.loc[song_count_df['count'] >= n_song_min].index.values)

    n_set_total = sum(n_set.values())

    artist_set = []
    while len(artist_set) < n_set_total:
        new_artist = tuple(np.random.choice(artist_lst, size=n_artist, replace=False))
        if new_artist not in artist_set:
            artist_set.append(new_artist)

    # split artist sets
    artist_select = {}
    for field, n in n_set.items():
        i_select = np.random.choice(range(len(artist_set)), size=n, replace=False)
        artist_list = list(artist_set)
        artist_select[field] = [artist_list[i] for i in i_select]
        artist_set = [s for s in artist_set if s not in artist_select[field]]

    # create dataframe with all features
    feature_dict = {}
    # dictionary to map artist set id to list of artists
    set_id_to_artist_tp = {}

    i = 0
    for field, artist_set in artist_select.items():
        df_lst = []
        for artist_tp in artist_set:
            i += 1
            df = song_df.loc[song_df['artist'].isin(artist_tp), 
                             ['artist', 'song', 'n_stems', 'unique_stems_ratio', 'stems_per_line', 'p_stop_words',
                              'p_noun_words', 'p_adj_words', 'p_verb_words', 'p_adv_words',
                              'tf_idf_vector', 'tf_idf_score', 'polarity']]
            # check if number of songs is too high
            if len(df) * n_artist > n_song_artist_max:
                df = df.sample(int(n_song_artist_max / n_artist), random_state=0)
                
            df['artist_set_id'] = i
            set_id_to_artist_tp[i] = artist_tp
            df_lst.append(df)
        feature_dict[field] = pd.concat(df_lst)  
        print('Number of songs in {}: {}'.format(field, len(feature_dict[field])))

    # get all selected artists
    artist_select_set = set.union(*[set(sum(tp_lst, ())) for tp_lst in artist_select.values()])

    # create artist dataframe from training data
    df_lst = []
    for artist, df in song_df.loc[song_df['artist'].isin(artist_select_set)].groupby('artist'):
        dic = {'artist': artist}
        # calculate averages and standard diviations
        for field in ['n_stems', 'unique_stems_ratio', 'stems_per_line', 'p_stop_words',
                      'p_noun_words', 'p_adj_words', 'p_verb_words', 'p_adv_words','tf_idf_score', 'polarity']:
            dic[field + '_mean'] = df[field].mean()
            dic[field + '_std'] = df[field].std()

        # number of songs
        dic['songs'] = len(df)

        # calculate average tf idf vector
        dic['tf_idf_vector_mean'] = get_mean_vector(df['tf_idf_vector'])

        df_lst.append(pd.DataFrame(dic, index=[0]))
    artist_feature_df = pd.concat(df_lst)

    def get_features(df):
        # get artist set id
        artist_set_id = df['artist_set_id'].iloc[0]
        
        # get all artists
        artist_feature_select_df = artist_feature_df.loc[artist_feature_df['artist']\
                                                         .isin(set_id_to_artist_tp[artist_set_id])]

        # merge dataframes
        artist_song_feature_df = pd.merge(artist_feature_select_df.assign(key=0), df.assign(key=0), on='key', 
                                          suffixes=['_artist', '_song']).drop('key', axis=1)    
        artist_song_feature_df['same_artist'] = \
            artist_song_feature_df['artist_artist'] == artist_song_feature_df['artist_song']

        # calculate features
        for feature in ['n_stems', 'unique_stems_ratio', 'stems_per_line', 'p_stop_words',
                        'p_noun_words', 'p_adj_words', 'p_verb_words', 'p_adv_words','tf_idf_score', 'polarity']:
            artist_song_feature_df[feature + '_diff'] = \
                artist_song_feature_df[feature] - artist_song_feature_df[feature + '_mean']
            artist_song_feature_df[feature + '_diff_std'] = \
                artist_song_feature_df[feature + '_diff'] / artist_song_feature_df[feature + '_std']

        # calculate vector similarity between artist and song
        artist_song_feature_df['vector_similarity'] = \
            artist_song_feature_df.apply(lambda row: tf_idf_vector_similarity(row['tf_idf_vector_mean'], 
                                                      row['tf_idf_vector'], row['songs'], row['same_artist']), 
                                         axis=1)    
        return artist_song_feature_df

    artist_song_feature = {}
    for field in feature_dict:
        artist_song_feature[field] = feature_dict[field].groupby('artist_set_id').apply(get_features)\
                                                        .reset_index(drop=True)
        
    return artist_song_feature

In [None]:
np.random.seed(0)
artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

Number of songs in train: 2000
Number of songs in val: 2000


In [None]:
artist_song_feature['train'].iloc[0]

artist_artist                                                           Coldplay
n_stems_mean                                                          195.306931
n_stems_std                                                            99.225777
unique_stems_ratio_mean                                                 0.400452
unique_stems_ratio_std                                                  0.145555
stems_per_line_mean                                                     5.316034
stems_per_line_std                                                      1.574704
p_stop_words_mean                                                       0.520169
p_stop_words_std                                                        0.102326
p_noun_words_mean                                                       0.250816
p_noun_words_std                                                        0.152965
p_adj_words_mean                                                        0.059844
p_adj_words_std             

In [None]:
feature = ['n_stems_diff', 'n_stems_diff_std',
       'unique_stems_ratio_diff', 'unique_stems_ratio_diff_std',
       'stems_per_line_diff', 'stems_per_line_diff_std', 
       'p_stop_words_diff', 'p_stop_words_diff_std', 'p_noun_words_diff',
       'p_noun_words_diff_std', 'p_adj_words_diff', 'p_adj_words_diff_std',
       'p_verb_words_diff', 'p_verb_words_diff_std', 'p_adv_words_diff', 'p_adv_words_diff_std',
       'tf_idf_score_diff','tf_idf_score_diff_std', 'polarity_diff', 'polarity_diff_std',
       'vector_similarity']
df_lst = []
for f in feature:
    df = artist_song_feature['train'][['same_artist']]
    df['feature'] = f
    df['value'] = artist_song_feature['train'][f]
    df_lst.append(df)
feature_df = pd.concat(df_lst)
feature_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,same_artist,feature,value
0,False,n_stems_diff,34.693069
1,False,n_stems_diff,2.693069
2,False,n_stems_diff,56.693069
3,False,n_stems_diff,1.693069
4,False,n_stems_diff,77.693069


In [None]:
def violine_feature_plot(feature_df, feature_select):

    fig = go.Figure()
    df = feature_df.loc[feature_df['feature'].isin(feature_select)]

    fig.add_trace(go.Violin(x=df['feature'][df['same_artist']],
                            y=df['value'][df['same_artist']],
                            legendgroup='Same Artist', scalegroup='Same Artist', name='Same Artist',
                            side='negative')
                 )
    fig.add_trace(go.Violin(x=df['feature'][~df['same_artist']],
                            y=df['value'][~df['same_artist']],
                            legendgroup='Different Artists', scalegroup='Different Artists', name='Different Artists',
                            side='positive')
                 )

    fig.update_traces(meanline_visible=True)
    fig.update_layout(violingap=0, violinmode='overlay')
    fig.update_layout(title='Feature Comparison')
    fig.update_xaxes(title='Feature')
    return fig

In [None]:
fig = violine_feature_plot(feature_df, ['p_stop_words_diff_std',
                                        'p_noun_words_diff_std', 'p_adj_words_diff_std', 'p_verb_words_diff_std', 'p_adv_words_diff_std',
                                        ])
fig.update_xaxes(range=[-0.5, 4.5])
fig.show()

In [None]:
fig = violine_feature_plot(feature_df, ['vector_similarity'])
fig.update_xaxes(range=[-1, 1])
fig.show()

In [None]:
def prepare_data(df, feature_org, feature_abs):
    for f in feature_abs:
        df[f] = df[f].abs()
    X = df[feature_org + feature_abs].values
    y = df['same_artist'].values
    
    return X, y

def select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, feature_org, feature_abs, 
                                pipeline):
    artist_song_feature = select_artist_song_create_feature(song_df, n_set, n_artist, n_song_min, n_song_artist_max)

    # prepare data
    X, y = prepare_data(artist_song_feature['train'], feature_org, feature_abs)

    pipeline = pipeline.fit(X, y)
    
    return artist_song_feature, pipeline


In [None]:
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
# prepare data create and train pipeline
n_artist = 3
n_song_min = 5
n_set = {'train': 100}
n_song_artist_max = 200

feature_org = ['n_stems', 'unique_stems_ratio', 'stems_per_line', 'p_stop_words', 'p_noun_words', 
               'p_adj_words', 'p_verb_words', 'p_adv_words', 'tf_idf_score', 'polarity', 'vector_similarity']
feature_abs = ['n_stems_diff', 'n_stems_diff_std', 'unique_stems_ratio_diff', 'unique_stems_ratio_diff_std', 
               'stems_per_line_diff', 'stems_per_line_diff_std', 'p_stop_words_diff', 'p_stop_words_diff_std', 'p_noun_words_diff',
               'p_noun_words_diff_std', 'p_adj_words_diff', 'p_adj_words_diff_std','p_verb_words_diff', 'p_verb_words_diff_std', 
               'p_adv_words_diff', 'p_adv_words_diff_std','tf_idf_score_diff', 'tf_idf_score_diff_std', 
               'polarity_diff', 'polarity_diff_std']

# pipeline = Pipeline([('scale', StandardScaler()), 
#                      ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
#                                                 class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

# pipeline = Pipeline([('scale', StandardScaler()), 
#                      ('clf', svm.SVC(decision_function_shape='ovr', probability=True))])


# pipeline = Pipeline([('scale', StandardScaler()), 
#                      ('clf', tree.DecisionTreeClassifier())])


# pipeline = Pipeline([('scale', StandardScaler()), ('clf', RandomForestClassifier(n_estimators=10))])


pipeline = Pipeline([('scale', StandardScaler()), ('clf', AdaBoostClassifier(n_estimators=100))])


# clf1 = LogisticRegression(random_state=1)
# clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
# clf3 = svm.SVC(decision_function_shape='ovr', probability=True)

# pipeline = Pipeline([('scale', StandardScaler()), 
#                         ('clf',VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft'))])


np.random.seed(1)
artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, n_song_artist_max, 
                                                            feature_org, feature_abs, pipeline)
                                                            


Number of songs in train: 6600


In [None]:

# feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})

# feature_importance_df = pd.DataFrame({'feature': feature_org+feature_abs, 'coefficient':pipeline['clf'].coef_[0]})


# px.bar(feature_importance_df.sort_values('coefficient'), x='feature', y='coefficient')

In [None]:
def predict_artist(df, feature_org, feature_abs, pipeline, top_n):
    # prepare data
    X, y = prepare_data(df, feature_org, feature_abs)
    
    # get probability
    proba = pipeline.predict_proba(X)
    # attach to dataframe
    df['probability'] = proba[:, 1]
    df['correct_prediction'] = df['artist_artist'] == df['artist_song']
    
    # get artist song pairs with highest probability
    predict_select = df.sort_values('probability', ascending=False).groupby(['artist_set_id']).head(top_n)\
                       .groupby(['artist_set_id'])['correct_prediction'].max()
    
    # get accuracy
    print('Accuracy: {}'.format(predict_select.mean()))
    
    return predict_select

In [None]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=1)

Accuracy: 0.84


In [None]:
artist_predict_df = predict_artist(artist_song_feature['train'], feature_org, feature_abs, pipeline, top_n=2)

Accuracy: 0.99


In [None]:
n_artist_lst = [2, 4, 8, 16]
top_n_lst = [1, 2, 4, 8]
n_song_artist_max = 16
np.random.seed(2)

n_set = {'train': 50, 'val': 50}

feature_org = ['n_stems', 'unique_stems_ratio', 'stems_per_line', 'p_stop_words', 'p_noun_words', 
               'p_adj_words', 'p_verb_words', 'p_adv_words', 'tf_idf_score', 'polarity', 'vector_similarity']
feature_abs = ['n_stems_diff', 'n_stems_diff_std', 'unique_stems_ratio_diff', 'unique_stems_ratio_diff_std', 
               'stems_per_line_diff', 'stems_per_line_diff_std', 'p_stop_words_diff', 'p_stop_words_diff_std', 'p_noun_words_diff',
               'p_noun_words_diff_std', 'p_adj_words_diff', 'p_adj_words_diff_std','p_verb_words_diff', 'p_verb_words_diff_std', 
               'p_adv_words_diff', 'p_adv_words_diff_std','tf_idf_score_diff', 'tf_idf_score_diff_std', 
               'polarity_diff', 'polarity_diff_std']

# pipeline = Pipeline([('scale', StandardScaler()), 
#                      ('clf', LogisticRegression(solver='lbfgs', max_iter=3000, 
#                                                 class_weight={False: 1/n_artist, True:(n_artist - 1)/n_artist}))])

# pipeline = Pipeline([('scale', StandardScaler()), 
#                      ('clf', svm.SVC(decision_function_shape='ovr', probability=True))])


# pipeline = Pipeline([('scale', StandardScaler()), 
#                      ('clf', tree.DecisionTreeClassifier())])

# pipeline = Pipeline([('scale', StandardScaler()), ('clf', RandomForestClassifier(n_estimators=10))])

pipeline = Pipeline([('scale', StandardScaler()), ('clf', AdaBoostClassifier(n_estimators=100))])

# clf1 = LogisticRegression(random_state=1)
# clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
# clf3 = svm.SVC(decision_function_shape='ovr', probability=True)

# pipeline = Pipeline([('scale', StandardScaler()), 
#                         ('clf',VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft'))])

result_lst = []

for n_artist in n_artist_lst:
    print(datetime.now())
    print('n_artist: {}'.format(n_artist))
    
    artist_song_feature, pipeline = select_songs_train_pipeline(song_df, n_set, n_artist, n_song_min, 
                                                                n_song_artist_max, feature_org, feature_abs, pipeline)
    
    for top_n in [n for n in top_n_lst if n < n_artist]:
        print('top_n: {}'.format(top_n))
        
        predict_select = predict_artist(artist_song_feature['val'], feature_org, feature_abs, pipeline, top_n=top_n)
        
        result_dict = {'n_artist': n_artist, 'top_n': top_n, 'accuracy': predict_select.mean()}
        result_lst.append(result_dict)
        
    print('')
    
result_df = pd.DataFrame(result_lst)

2022-05-07 11:08:16.894297
n_artist: 2
Number of songs in train: 400
Number of songs in val: 400
top_n: 1
Accuracy: 0.88

2022-05-07 11:08:19.460629
n_artist: 4
Number of songs in train: 200
Number of songs in val: 200
top_n: 1
Accuracy: 0.62
top_n: 2
Accuracy: 0.78

2022-05-07 11:08:21.914111
n_artist: 8
Number of songs in train: 100
Number of songs in val: 100
top_n: 1
Accuracy: 0.18
top_n: 2
Accuracy: 0.46
top_n: 4
Accuracy: 0.84

2022-05-07 11:08:24.359996
n_artist: 16
Number of songs in train: 50
Number of songs in val: 50
top_n: 1
Accuracy: 0.22
top_n: 2
Accuracy: 0.32
top_n: 4
Accuracy: 0.5
top_n: 8
Accuracy: 0.78



In [None]:
fig = px.line(result_df, x='n_artist', y='accuracy', color='top_n', 
              title='Accuracy vs number of artist and number of top selections', 
              labels={'n_artist': 'Number of artists per set', 'top_n': 'Top predictions'})\
        .update_traces(mode='lines+markers')
fig.show()