In [1]:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd
import re
import numpy as np
import spacy
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


In [2]:
#given the artist's main metrolyrics html page, the function extracts lyrics of all the songs that are on the metrolyrics website for the artist 
def download_lyrics(html):  
    list_of_links = []
    individual_links = []
    artist = (str(re.findall('com\/.+\.', html))[6:-10])
    df = pd.DataFrame(columns = ['song', 'artist', 'song name']) 
    artist_page = requests.get(html)
    artist_page_bs = soup(artist_page.text, 'html.parser')
    links_all = artist_page_bs.find_all(attrs = {'class' : 'pagination'})
    links_all_a = str(links_all[0].find_all('a'))
    final_links = re.findall(f'http:\/\/www\.metrolyrics\.com\/{artist}-alpage-.+html', links_all_a)
    final_links = final_links[:-1]
    

    for i in final_links: #get all individual links for the artist 
        each_page = requests.get(i)
        each_page_text = each_page.text
        separate_links = re.findall(f'http:\/\/www\.metrolyrics\.com\/.+lyrics-{artist}.html', each_page_text)
        individual_links.append(separate_links)
    all_links = [item for sublist in individual_links for item in sublist]  
      
    for i in all_links:
        each_song = requests.get(i)
        each_song_text = each_song.text
        html = soup(each_song_text, 'html.parser')
        results = html.find_all(attrs = {'id' : "lyrics-body-text"})
        l = str([results[0].find_all('p')[i].text for i in range(len(results[0].find_all('p')))])
        df = df.append({'song' : l, 'artist': artist, 'song name' : i}, ignore_index=True)
    
    return df
    
    

In [3]:
df_taylorswift = download_lyrics('http://www.metrolyrics.com/taylor-swift-lyrics.html')
df_lana_del_rey = download_lyrics('http://www.metrolyrics.com/lana-del-rey-lyrics.html')
df_maroon_5 = download_lyrics('http://www.metrolyrics.com/maroon-5-lyrics.html')

complete_df = df_taylorswift.append(df_lana_del_rey, ignore_index=True)
complete_df = complete_df.append(df_maroon_5, ignore_index = True)

281

In [4]:
#this section cleans the generated lyrics

pattern = r'\[|\]' #remove the start of the list []
clean_list = []

for i in complete_df['song']: 
    new = re.sub(pattern, '', i)
    clean_list.append(new)

final_lyrics = [] #replace the start of the string
for i in clean_list:
    new = i.replace('"', "'")
    new1 = new.strip()
    final_lyrics.append(new1)    

new_pattern = r'\\n' #replace new line with blank 
cleanest_list = []
for i in final_lyrics:
    new = re.sub(new_pattern, ' ', i)
    cleanest_list.append(new)
    
new_pattern = r'\\' #remove \\ 
the_final_lyrics = []
for i in cleanest_list:
    new = re.sub(new_pattern,'', i)
    the_final_lyrics.append(new)  
    
new_pattern = r'âx80x94|âx80|âx80x92|âx80x94|ãx88|' #remove random characters 
my_lyrics = []
for i in the_final_lyrics:
    new = re.sub(new_pattern,'', i)
    my_lyrics .append(new)      
    

In [5]:
#this section uses spacy to lemmatize the lyrics 

nlp = spacy.load('en_core_web_sm') 

corpus = my_lyrics
tokenized_corpus = []
for each in corpus:
    tokens = nlp(each)
    tokenized_corpus.append(tokens)

final_lemma = []
for i in tokenized_corpus:
    intermediate_list = []
    for word in i: 
        if not word.is_stop:
            intermediate_list.append(word.lemma_)
    final_lemma.append(' '.join(intermediate_list))    

In [6]:
y =['Taylor Swift'] * len(df_taylorswift) + ['Lana Del Rey'] * len(df_lana_del_rey) + ['Maroon 5'] * len(df_maroon_5) 


In [7]:
df_before_split = pd.DataFrame({'lyrics' : final_lemma, 'artist' : y})

df_before_split['artist'].value_counts() 

#there is inbalance of classes, hence decided to upscale 

maximum = df_before_split['artist'].value_counts().max()

for la in df_before_split['artist'].value_counts().index:
    label_df = df_before_split[df_before_split['artist'] == la]
    diff = maximum - len(label_df)
    df_before_split = df_before_split.append(label_df.sample(diff))


In [8]:
X = df_before_split['lyrics']
y = df_before_split['artist']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
cv = CountVectorizer(stop_words = 'english')
out = cv.fit_transform(X_train)
df = pd.DataFrame(out.todense(), columns = cv.get_feature_names()) 

In [10]:
tf = TfidfTransformer()
transformed = tf.fit_transform(out) #convert the original sparse matrix to the TfidTransformer 
tdf = pd.DataFrame(transformed.todense(), columns=cv.get_feature_names()) 

In [11]:
X_train_transformed = tdf.values

In [20]:
m = MultinomialNB(alpha = 2.0) 
m.fit(X_train_transformed, y_train)
training_accuracy_score = m.score(X_train_transformed, y_train) 
print('Training accuracy score:', training_accuracy_score)

Training accuracy score: 0.8434782608695652


In [22]:
#cross_validation_Scores
acc = cross_val_score(m, X_train_transformed, y_train, cv = 5)

print("Mean of accuracy scores:", acc.mean())
print("Scores array:", acc)

Mean of accuracy scores: 0.6841451117113291
Scores array: [0.61870504 0.69565217 0.75362319 0.67391304 0.67883212]


In [23]:
cv_test = cv.transform(X_test)
tfcounts_test = tf.transform(cv_test)
tf_test = pd.DataFrame(tfcounts_test.todense(), columns=cv.get_feature_names()) 
X_test_transformed = tf_test.values

In [24]:
predictions = m.predict(X_test_transformed)

In [27]:
testing_accuracy_score = m.score(X_test_transformed, y_test)

print("Testing accuracy score:", testing_accuracy_score)

Testing accuracy score: 0.683982683982684


In [29]:
song = ["I want summertime play"]
counts = cv.transform(song)
tfcounts = tf.transform(counts)
tfcounts.todense()
m.predict(tfcounts), m.predict_proba(tfcounts).round(2)

#predicts Lana Del Rey 

(array(['Lana Del Rey'], dtype='<U12'), array([[0.52, 0.29, 0.2 ]]))

In [30]:
song_2 = ["Christmas photographs fall"]
counts = cv.transform(song_2)
tfcounts = tf.transform(counts)
m.predict(tfcounts), m.predict_proba(tfcounts).round(2)

#predicts Taylor Swift 

(array(['Taylor Swift'], dtype='<U12'), array([[0.27, 0.28, 0.45]]))