#!/usr/bin/env python
#coding: utf-8

# Lyrics Predictor


In [104]:
import requests
from bs4 import BeautifulSoup
import re
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from fuzzywuzzy import fuzz
import itertools
import pandas as pd
import numpy as np
import operator

In [17]:
# Global variables
ARTIST_1 = []
ARTIST_2 = []
LYRICS_ARTIST_1 = []
LYRICS_ARTIST_2 = []

In [18]:
def url_finder(artist_name):
    '''
    Finds url of an artist's page on lyrics.com given artist name.
    
    Parameters
    ----------
    artist_name : str
        Name of the artist
    
    Returns
    -------
    url_artist_page : str
        url path of the artist page on lyrics.com
    '''
    artist_search_path_name = artist_name.replace(' ', '%20')
    search_url = 'https://www.lyrics.com/serp.php?st=' + artist_search_path_name
    response = requests.get(search_url)
    search_page_html = response.text
    search_page_soup = BeautifulSoup(search_page_html)#, features="lxml")
    artist_page_path_name = search_page_soup.find(attrs={'class': 'table tdata'}).tbody.tr.td.a.get('href')
    url_artist_page = 'https://www.lyrics.com/' + artist_page_path_name
    # TODO: Iterate the several options in the search page to confirm selection
    # Example: There are 2 links for artist 'Nirvana'. This function selects the first one, which only has 1 song.
    # The main Nirvana page is the second link. 
    return url_artist_page

In [19]:
def scrape_links(url):
    '''
    Scrapes the links to all songs of an artist provided their page on lyrics.com.
    
    Parameters
    ----------
    url : str
        the url of artist page on lyrics.com

    Returns
    -------
    list
        a list of strings with the links to the all song lyrics on the artist page on lyrics.com
    '''
    response = requests.get(url)
    html_artist = response.text
    artist_name = re.findall('Albums by <strong>([^<]+)', html_artist)[0] # correct spelling of the artist's name
    soup_artist = BeautifulSoup(html_artist)#, features="lxml")
    html_links = soup_artist.find_all(attrs={'class': 'tal qx'})
    lyrics_links = []
    song_titles = []
    for entry in html_links:
        str_entry = str(entry.find_all('a')[0])
        path_end, song_title = re.findall('\/l[^<]+', str_entry)[0].split(sep='">') # extracts links and titles
        if song_title not in song_titles: # this removes some duplicates before downloading songs
            lyrics_links.append('https://www.lyrics.com' + path_end)
            song_titles.append(song_title)
    return lyrics_links, artist_name

In [20]:
def list_lyrics_artist(artist_name, num_links):
    '''
    Creates two lists of strings:
        - one with all the lyrics;
        - one with the name of the artist the same number of times.
    '''
    #TODO: find out a way to add the num_links by reading the file names
    # if I don't want to scrape lyrics but read  files already downloaded
    num_len = len(str(num_links))
    lyrics_list = []
    for i in range(1, num_links + 1):
        try:
            with open('./songs/' + artist_name.replace(' ', '_') + '_' + str(i).zfill(num_len) + '.txt', 'r') as f:
                lyrics_list.append(f.read())
        except: # file is not found because num_links > number of actual lyrics (some were empty and skipped)
            continue 
    purged_list = purge_duplicates(lyrics_list) # calls function to purge duplicates
    artist_list = [artist_name for i in range(len(purged_list))]
    # return lyrics_list
    return purged_list, artist_list

In [21]:
def purge_duplicates(lyrics_list):
    '''
    Purges lyrics that are more than 50% similar to one another
    '''
    lyrics_purged = list(set(lyrics_list))
    # Purges exact duplicates. Order is changed. David Bowie went down to 498 from 1170.
    fuzzy_suspects = []
    wuzzy_suspects = [] 
    fw_ratios = [] 
    for i, j in itertools.combinations(lyrics_purged, 2):
        if fuzz.ratio(i, j) > 50:
            fuzzy_suspects.append(i)
            wuzzy_suspects.append(j) # Lists of songs with similarity ratio above 50%.
            fw_ratios.append(fuzz.ratio(i, j)) # List of fuzzywuzzy ratios.
    lyrics_to_purge = []
    for i, j in zip(fuzzy_suspects, wuzzy_suspects):
        if len(i) > len(j):
            lyrics_to_purge.append(j) 
        else:
            lyrics_to_purge.append(i) # Chooses the shortest of each suspect to be purged
    #return fuzzy_suspects, wuzzy_suspects, fw_ratios
    return [item for item in lyrics_purged if item not in lyrics_to_purge]

In [22]:
def scrape_lyrics(artist_name):
    '''
    Scrapes the lyrics of all songs of an artist provided their name.
    The lyrics are then saved as separate files in the folder 'songs' in current directory.
    
    Parameters
    ----------
    url : str
        the url of artist page on lyrics.com
    '''
    url = url_finder(artist_name) # calls function to find artist website
    # TODO: Confirm the artist selected is the one the user called.
    lyrics_links, artist_name = scrape_links(url) # calls function to scrape links to all lyrics of the artist
    # also: artist_name reassignment corrects capitalization and accentuation in input
    for i, song in enumerate(lyrics_links, 1):
        response_song = requests.get(song)
        html_song = response_song.text
        soup_song = BeautifulSoup(html_song)#, features="lxml")
        html_lyrics = str(soup_song.find(attrs={'class': "lyric-body"}))
        if html_lyrics == 'None':
            continue # Skips dead links
        else:
            lyrics = soup_song.find(attrs={'class': "lyric-body"}).text
            num_len = len(str(len(lyrics_links))) # length of number of songs
            with open('./songs/' + artist_name.replace(' ', '_') + '_' + str(i).zfill(num_len) + '.txt', 'w') as f:
                #zfill(num_len) makes sure there are enough leading 0's before the number to sort files easily.
                f.writelines(lyrics)
            continue
    #return artist_name, i
    return list_lyrics_artist(artist_name, i) # NOTE: i is the number of links/iterations, not lyrics/songs!


In [None]:
def build_model(X, y):
    '''
    Trains Logistic Regression model
    '''
    Xtrain, Xtest, ytrain, ytest = tts(X, y)
    cv = CountVectorizer()

## Main Program

In [23]:
artist_name_1 = input('Insert the name of an artist: ')

Insert the name of an artist: metronomy


In [24]:
# scrape lyrics
LYRICS_ARTIST_1, ARTIST_1 = scrape_lyrics(artist_name_1)
# get already scraped files
#LYRICS_ARTIST_1, ARTIST_1 = list_lyrics_artist(artist_name_1, num_links)

In [25]:
artist_name_2 = input('Insert the name of another artist: ')

Insert the name of another artist: blood orange


In [26]:
LYRICS_ARTIST_2, ARTIST_2 = scrape_lyrics(artist_name_2)

In [34]:
X = LYRICS_ARTIST_1 + LYRICS_ARTIST_2
y = ARTIST_1 + ARTIST_2

In [35]:
Xtrain, Xtest, ytrain, ytest = tts(X, y) 

In [148]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
Xtrain_t = vectorizer.fit_transform(Xtrain) # Fit and transform X

In [118]:
pipe = Pipeline([('count', CountVectorizer()), ('tfid', TfidfTransformer())]).fit(Xtrain)

In [119]:
pipe['count'].transform(Xtrain).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [120]:
pipe['tfid'].idf_

array([4.73766962, 4.73766962, 4.73766962, ..., 3.35137526, 4.73766962,
       4.73766962])

In [123]:
Xtrain_t = pipe.transform(Xtrain)

In [149]:
m = LogisticRegression(C=0.001, class_weight='balanced') # <-- regularize here
m.fit(Xtrain_t, ytrain)
m.score(Xtrain_t, ytrain)

1.0

In [150]:
cross_val_score(m, Xtrain_t, ytrain)

array([0.52941176, 0.70588235, 0.76470588, 0.8125    , 0.6875    ])

In [151]:
Xtest_t = vectorizer.transform(Xtest)

In [152]:
Xtest_t

<28x6312 sparse matrix of type '<class 'numpy.float64'>'
	with 851 stored elements in Compressed Sparse Row format>

In [153]:
ypred = m.predict(Xtest_t)
ypred

array(['Metronomy', 'Blood Orange', 'Blood Orange', 'Blood Orange',
       'Metronomy', 'Blood Orange', 'Blood Orange', 'Blood Orange',
       'Metronomy', 'Blood Orange', 'Blood Orange', 'Blood Orange',
       'Blood Orange', 'Metronomy', 'Blood Orange', 'Blood Orange',
       'Blood Orange', 'Blood Orange', 'Metronomy', 'Blood Orange',
       'Metronomy', 'Metronomy', 'Metronomy', 'Blood Orange', 'Metronomy',
       'Blood Orange', 'Metronomy', 'Blood Orange'], dtype='<U12')

In [146]:
len(np.array(ytest))

28

In [154]:
Xtest_t_df = pd.DataFrame(Xtest_t.todense(), columns=vectorizer.get_feature_names(), index=ytest)
Xtest_t_df

Unnamed: 0,18,18 ig3,1992,1992 playing,19s,19s plain,20,20 20,20 bag,20 thinking,...,young,young girl,young man,young mystery,young oh,young seen,ça,ça elle,étaient,étaient moins
Metronomy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.041665,0.0,0.0,0.0
Blood Orange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood Orange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood Orange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Metronomy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Metronomy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood Orange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.120145,0.0,0.156416,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood Orange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Metronomy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood Orange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
Xtrain_t_df = pd.DataFrame(Xtrain_t.todense(), columns=vectorizer.get_feature_names(), index=ytrain)

In [101]:
Xtrain_t_df

Unnamed: 0,18,1992,19s,20,279,45,aaaaah,ache,acted,actin,...,year,years,yellin,yellow,yes,yo,yoda,young,ça,étaient
Blood Orange,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Metronomy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Blood Orange,0,0,0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Blood Orange,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Metronomy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Metronomy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Metronomy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
Metronomy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Blood Orange,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Count Vectorizer

In [258]:
vectorizer = CountVectorizer(stop_words='english', )

In [259]:
Xtrain_t = vectorizer.fit_transform(Xtrain) # Fit and transform X

In [260]:
m = LogisticRegression(class_weight='balanced', C=0.01) # <-- regularize here
m.fit(Xtrain_t, ytrain)
m.score(Xtrain_t, ytrain)

0.966903073286052

In [261]:
Xtrain_t

<423x5945 sparse matrix of type '<class 'numpy.int64'>'
	with 21174 stored elements in Compressed Sparse Row format>

In [262]:
print(operator.itemgetter(*np.argsort(m.coef_[0]))(vectorizer.get_feature_names())[-20:])

('songs', 'queda', 'sending', 'longing', 'trade', 'milk', 'él', 'las', 'come', 'mami', 'let', 'una', 'just', 'ck', 'heal', 'papi', 'que', 'en', 'care', 'hot')


In [263]:
print(operator.itemgetter(*np.argsort(m.coef_[0]))(vectorizer.get_feature_names())[:20])

('ll', 'baby', 'oh', 've', 'away', 'yeah', 'want', 'think', 'feel', 'tonight', 'way', 'know', 'need', 'dance', 'life', 'man', 'got', 'home', 'going', 'tell')


In [264]:
cv_score_5 = cross_val_score(m, Xtrain_t, ytrain)

In [265]:
Xtest_t = vectorizer.transform(Xtest) # ONLY TRANSFORM!
m_score_5 = m.score(Xtest_t, ytest)

In [266]:
ypred_5 = m.predict(Xtest_t)

In [267]:
probs = m.predict_proba(Xtest_t)
probs_5 = pd.DataFrame(probs)

In [268]:
Xtest_t_df = pd.DataFrame(Xtest_t.todense(), columns=vectorizer.get_feature_names(), index=ytest)

In [269]:
Xtest_t_df

Unnamed: 0,000,10,1000,12,13th,14,15,16,16th,18,...,zoos,zu,zéro,ça,égal,él,és,étaient,était,être
Metronomy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
David Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
David Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
David Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
David Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
David Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Blood Orange,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
David Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
David Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Model scores**

In [184]:
# Stop-words: english
cv_score_1, m_score_1, ypred_1[:20], probs_1

(array([0.8       , 0.8       , 0.81176471, 0.8452381 , 0.80952381]),
 0.7588652482269503,
 array(['David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'Blood Orange', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie'],
       dtype='<U12'),
             0         1         2         3
 0    0.018180  0.042287  0.771481  0.168052
 1    0.019576  0.095716  0.792189  0.092519
 2    0.004560  0.015230  0.930564  0.049646
 3    0.014871  0.067229  0.726680  0.191221
 4    0.011651  0.039250  0.910426  0.038673
 ..        ...       ...       ...       ...
 136  0.003943  0.031233  0.949779  0.015045
 137  0.015138  0.160839  0.742500  0.081523
 138  0.010321  0.043672  0.882175  0.063832
 139  0.008726  0.044003  0.918291  0.028981
 140  0.007550  0.127557  0.842493  0.022400
 
 [1

In [194]:
# max_df=0.75
cv_score_2, m_score_2, ypred_2[:20], ytest[:20], probs_2.head(20)

(array([0.8       , 0.8       , 0.8       , 0.83333333, 0.79761905]),
 0.7659574468085106,
 array(['David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'Blood Orange', 'David Bowie',
        'Metronomy', 'Blood Orange', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie'],
       dtype='<U12'),
 ['Metronomy',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'Blood Orange',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'Blood Orange',
  'David Bowie',
  'Blood Orange',
  'Arca',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie'],
            0         1         2         3
 0   0.004763  0.010995  0.958049  0.026192
 1   0.019473  0.050947  0.804932  0.124649
 2   0.000910  0.003842  0.977248  0.018001
 3   0.008764  0.166602  0.725740  

In [202]:
# Stop-words: english, max_df = 0.75
cv_score_3, m_score_3, ypred_3[:20], probs_3

(array([0.8       , 0.8       , 0.81176471, 0.8452381 , 0.80952381]),
 0.7588652482269503,
 array(['David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'Blood Orange', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie'],
       dtype='<U12'),
             0         1         2         3
 0    0.018180  0.042287  0.771481  0.168052
 1    0.019576  0.095716  0.792189  0.092519
 2    0.004560  0.015230  0.930564  0.049646
 3    0.014871  0.067229  0.726680  0.191221
 4    0.011651  0.039250  0.910426  0.038673
 ..        ...       ...       ...       ...
 136  0.003943  0.031233  0.949779  0.015045
 137  0.015138  0.160839  0.742500  0.081523
 138  0.010321  0.043672  0.882175  0.063832
 139  0.008726  0.044003  0.918291  0.028981
 140  0.007550  0.127557  0.842493  0.022400
 
 [1

In [236]:
# no stop words, ngram_range=(1, 2)
cv_score_4, m_score_4, ypred_4[:20], ytest[:20], probs_4

(array([0.8       , 0.8       , 0.81176471, 0.82142857, 0.80952381]),
 0.7801418439716312,
 array(['David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'Blood Orange', 'David Bowie',
        'Metronomy', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie'],
       dtype='<U12'),
 ['Metronomy',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'Blood Orange',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'Blood Orange',
  'David Bowie',
  'Blood Orange',
  'Arca',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie'],
             0         1         2         3
 0    0.002155  0.024390  0.942810  0.030645
 1    0.024651  0.041639  0.833693  0.100017
 2    0.000477  0.004440  0.981766  0.013316
 3    0.007024  0.164456  0.7462

In [270]:
# stop words english, model: class_weight_balanced
cv_score_5, m_score_5, ypred_5[:20], ytest[:20], probs_5

(array([0.75294118, 0.8       , 0.77647059, 0.75      , 0.83333333]),
 0.7588652482269503,
 array(['Metronomy', 'David Bowie', 'David Bowie', 'Metronomy',
        'David Bowie', 'David Bowie', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'Blood Orange', 'David Bowie',
        'David Bowie', 'Blood Orange', 'David Bowie', 'David Bowie',
        'David Bowie', 'David Bowie', 'Blood Orange', 'David Bowie'],
       dtype='<U12'),
 ['Metronomy',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'Blood Orange',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'Blood Orange',
  'David Bowie',
  'Blood Orange',
  'Arca',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie',
  'David Bowie'],
             0         1         2         3
 0    0.047231  0.067850  0.429456  0.455464
 1    0.084392  0.228246  0.435303  0.252059
 2    0.011752  0.030865  0.803500  0.153883
 3    0.040510  0.114479  0.3619

## TF-IDF

In [238]:
vectorizer = TfidfVectorizer()
Xtest_t = vectorizer.fit_transform(Xtest)
Xtest_t_df = pd.DataFrame(Xtest_t.todense(), columns=vectorizer.get_feature_names(), index=ytest)

In [239]:

Xtest_t_df

Unnamed: 0,15,1984,1st,32nd,52,6x,aaa,aaaaah,aah,abc,...,yore,york,you,young,your,yourself,youth,ziggy,zombies,zone
Metronomy,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.202371,0.0,0.018575,0.000000,0.0,0.0,0.0,0.0
David Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
David Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.055381,0.0,0.021785,0.000000,0.0,0.0,0.0,0.0
David Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.127828,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
David Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.033827,0.0,0.067655,0.0,...,0.0,0.026766,0.007840,0.0,0.021587,0.089241,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
David Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.051842,0.0,0.071375,0.000000,0.0,0.0,0.0,0.0
Blood Orange,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.220091,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
David Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.074970,0.000000,0.0,0.0,0.0,0.0
David Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.131787,0.0,0.045360,0.000000,0.0,0.0,0.0,0.0


In [None]:
# class weights= balanced
# pipeline: count vectorizer + tfidf
