In [2]:
import re
import requests
import time
import os
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [12]:
base = 'https://www.lyrics.com'

In [9]:
def scrape_songs(artist, number):
    """Download the pages listing all songs for chosen artist"""
    url = f"https://www.lyrics.com/artist/{artist}/{number}"
    result = requests.get(url)
    r = result.text
    pattern = 'href="(/lyric/\d+/[^\/]+/[^"]+)"'
    list_of_songs = re.findall(pattern, r, re.IGNORECASE)
    return list_of_songs

In [13]:
def get_links(songs):
    """Creates URL from list of songs"""
    list_of_links = []
    for i in songs:
        url = base + i
        list_of_links.append(url)
    return list_of_links

In [None]:
def download_songs(artist, links):
    """Writes downloaded songs to txt files"""
    brackets = '%5B.+%5D'
    parens = '%28.+%29'
    whitespace = '/s+'
    pluses = '\+'
    fullstops = '\.'
    other_punct = '%\w\w'
    pattern = "|".join([brackets, parens, whitespace, pluses, fullstops, other_punct])
    for link in links:
        f = requests.get(link)
        e = f.text
        ln = link.find(artist)
        filename = f'{link[ln+len(artist)+1:]}'
        filename = re.sub(pattern, '', filename)
        filename = f'{artist}-{filename}.txt'
        file = open(filename, 'w', encoding='utf-8')
        file.write(e)
        time.sleep(5)
        file.close()

In [29]:
def clean_lyrics(doc):
    """Does some further cleaning on the lyrics files"""
    soup = BeautifulSoup(doc)
    clean = soup.find_all('pre')[0].text.replace("\n", " ")
    apostrophes = r"â€™"
    squarebrackets = r"\[.+\]"
    clean = re.sub(squarebrackets, '', clean)
    clean = re.sub(apostrophes, "'", clean)
    return clean

In [28]:
def parse(path):
    """iterates through txt files, cleans them and saves to a list of strings"""
    list_of_lyrics = []
    for filename in os.listdir(path):
        # filename = os.fsdecode(file)
        if filename.endswith(".txt"):
            html = open(path+filename)
            doc = html.read()
            html.close()
            clean = clean_lyrics(doc)
            list_of_lyrics.append(clean)
    return list_of_lyrics

In [10]:
rhead_songs = scrape_songs('Radiohead', 41092)

In [16]:
rhead_songs[:3]

['/lyric/1943143/Radiohead/Lurgee',
 '/lyric/10519958/Radiohead/Street+Spirit+%28Fade+Out%29+%5BAlbum+Version%5D',
 '/lyric/14704093/Radiohead/Exit+Music']

In [17]:
rhead_links = get_links(rhead_songs)

In [None]:
download_songs('Radiohead', rhead_links)

In [25]:
lyrics = parse('./')

### We now have a cleaned text corpus! We can now begin to train and apply our Naive Bayes classifier to predict whether a lyric "belongs" more to Radiohead or Kate Bush:

We create our labels - a list of Kate Bush 125 times followed by Radiohead 148 times:

In [42]:
labels = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".txt"):
        if filename.startswith("Kate"):
            labels.append("Kate Bush")
        else:
            labels.append("Radiohead")

labels

['Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate Bush',
 'Kate

Then we define the Count Vectorizer and tell it to remove English stop words:

In [43]:
cv = CountVectorizer(stop_words="english")

In [44]:
vec = cv.fit_transform(corpus_rhkb)

In [45]:
vec

<273x3688 sparse matrix of type '<class 'numpy.int64'>'
	with 10235 stored elements in Compressed Sparse Row format>

sklearn stores the data as a 'sparse matrix', which we can't directly see yet. This is a special, memory-saving way to save data in long-format. We can see that it's 273x3688 because there are 273 songs and 3,688 distinct words

In [47]:
cv.vocabulary_

{'drifting': 939,
 'twisting': 3402,
 'whiteout': 3585,
 'blackbird': 314,
 'braille': 394,
 'wenceslasaire': 3560,
 'avalanche': 183,
 'come': 654,
 'man': 1910,
 've': 3453,
 'got': 1342,
 '44': 43,
 'swans': 3151,
 'melting': 1954,
 'deamondi': 810,
 'pavlova': 2253,
 'eiderfalls': 1004,
 '10': 0,
 'santanyeroofdikov': 2687,
 '11': 1,
 'stellatundra': 3053,
 '12': 2,
 'hunter': 1547,
 'dream': 929,
 '13': 3,
 'faloop': 1097,
 'njoompoola': 2136,
 '14': 4,
 'zebranivem': 3683,
 '15': 6,
 'spangladasha': 2975,
 '16': 7,
 'albadune': 104,
 '17': 8,
 'hironocrashka': 1491,
 '18': 9,
 'hooded': 1514,
 'wept': 3562,
 'joe': 1641,
 '32': 28,
 'don': 910,
 'know': 1707,
 'just': 1659,
 'eskimo': 1046,
 'let': 1776,
 'hear': 1444,
 '50': 50,
 'words': 3634,
 'snow': 2929,
 '19': 11,
 'phlegm': 2282,
 'neige': 2110,
 '20': 12,
 'mountainsob': 2059,
 '21': 13,
 'anklebreaker': 130,
 '22': 15,
 'erase': 1040,
 'dust': 973,
 '23': 17,
 'shnamistoflopp': 2807,
 '24': 18,
 'terrablizza': 3239,
 '2

In [48]:
vec.data.shape

(10235,)

In [49]:
vec.todense()

matrix([[1, 1, 1, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [50]:
vec.todense().shape

(273, 3688)

To further explore our count-vector, we apply a TF-IDF transformation. This measures the "originality" of a word by comparing the number of times it appears in a document with the number of documents that the word appears in, overall.

In [51]:
tf = TfidfTransformer()

In [52]:
vec_tf = tf.fit_transform(vec)

In [53]:
vec_tf.todense()

matrix([[0.05994988, 0.05584385, 0.05994988, ..., 0.        , 0.05994988,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

So now our word data is normalised according to "originality", we can set up the model for training:

In [54]:
X = vec_tf

In [55]:
y = labels

In [56]:
m = MultinomialNB()
m.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [57]:
m.score(X, y)

0.9926739926739927

In [58]:
X.todense().shape

(273, 3688)

## Now to answer that age-old question: are 2pac's lyrics closer to Radiohead's or Kate Bush's?

We read in a txt file of 2pac lyrics:

In [85]:
cd old_lyric_files

C:\Users\Dave\Desktop\Week_4\song_lyrics\old_lyric_files


In [86]:
tupactxt = open('tupac.txt')

In [87]:
cd ..

C:\Users\Dave\Desktop\Week_4\song_lyrics


In [88]:
tupac = tupactxt.read()

In [93]:
soupac = BeautifulSoup(tupac)

In [118]:
tupactxt.close()

In [104]:
cleanpac_str = soupac.text.replace('\\n', ' ').replace("[with my mind", "(with my mind").replace("\\\'", "\'").replace('\n', '')
cleanpac_str

'["The nights grow cold My search for gold is leading nowhere Whichever lonely road I take It seems to go nowhere It\'s a fight to survive just until tomorrow How can I display what I know I\'m worthy of When they turn me away  The doors are closed to such as I A boy from nowhere But not to those who merely buy the right to go where They\'ll be met with respect, not humiliation A man\'s place on earth I have come to realize is decided by birth  So what\'s the future No matter where I go I will still belong in Andalusia Where we don\'t know where The next penny\'s coming from Something\'s wrong  I\'m bound to Spain, I won\'t remain A boy from nowhere There has to be a place for me And I must go there I don\'t fantasize, unlike a million others Who must bow and scrape For my one means of escape Is to flourish a cape  I\'ll fight all odds And fight the Gods if they oppose me I have to win, I won\'t give in No one who knows me would expect me to fail For the want of trying Not a man alive 

But this is still a single string - we need to make it into a list before we clean it further (remove square brackets etc. - if we remove anything within square brackets now, we will delete almost the whole string as there are square brackets around the outside!)

In [107]:
strtolst = '\[\"|\"\]'

In [110]:
cleanpac_str2 = re.sub(strtolst, '', cleanpac_str)
cleanpac_str2

'The nights grow cold My search for gold is leading nowhere Whichever lonely road I take It seems to go nowhere It\'s a fight to survive just until tomorrow How can I display what I know I\'m worthy of When they turn me away  The doors are closed to such as I A boy from nowhere But not to those who merely buy the right to go where They\'ll be met with respect, not humiliation A man\'s place on earth I have come to realize is decided by birth  So what\'s the future No matter where I go I will still belong in Andalusia Where we don\'t know where The next penny\'s coming from Something\'s wrong  I\'m bound to Spain, I won\'t remain A boy from nowhere There has to be a place for me And I must go there I don\'t fantasize, unlike a million others Who must bow and scrape For my one means of escape Is to flourish a cape  I\'ll fight all odds And fight the Gods if they oppose me I have to win, I won\'t give in No one who knows me would expect me to fail For the want of trying Not a man alive Ha

In [115]:
cleanpac_lst = cleanpac_str2.split('", "')
cleanpac_lst

["The nights grow cold My search for gold is leading nowhere Whichever lonely road I take It seems to go nowhere It's a fight to survive just until tomorrow How can I display what I know I'm worthy of When they turn me away  The doors are closed to such as I A boy from nowhere But not to those who merely buy the right to go where They'll be met with respect, not humiliation A man's place on earth I have come to realize is decided by birth  So what's the future No matter where I go I will still belong in Andalusia Where we don't know where The next penny's coming from Something's wrong  I'm bound to Spain, I won't remain A boy from nowhere There has to be a place for me And I must go there I don't fantasize, unlike a million others Who must bow and scrape For my one means of escape Is to flourish a cape  I'll fight all odds And fight the Gods if they oppose me I have to win, I won't give in No one who knows me would expect me to fail For the want of trying Not a man alive Had to beg or 

Now we have a list of 2pac lyrics. We finally remove the square brackets and their contents (as they are non-lyrical):

In [119]:
tupac = []
for lyric in cleanpac_lst:
    lyric = re.sub(squarebrackets, '', lyric)
    tupac.append(lyric)
tupac

["The nights grow cold My search for gold is leading nowhere Whichever lonely road I take It seems to go nowhere It's a fight to survive just until tomorrow How can I display what I know I'm worthy of When they turn me away  The doors are closed to such as I A boy from nowhere But not to those who merely buy the right to go where They'll be met with respect, not humiliation A man's place on earth I have come to realize is decided by birth  So what's the future No matter where I go I will still belong in Andalusia Where we don't know where The next penny's coming from Something's wrong  I'm bound to Spain, I won't remain A boy from nowhere There has to be a place for me And I must go there I don't fantasize, unlike a million others Who must bow and scrape For my one means of escape Is to flourish a cape  I'll fight all odds And fight the Gods if they oppose me I have to win, I won't give in No one who knows me would expect me to fail For the want of trying Not a man alive Had to beg or 

In [120]:
tupac_vec = cv.transform(tupac)

In [121]:
tupac_vec = tf.transform(tupac_vec)

In [122]:
m.predict(tupac_vec)

array(['Radiohead', 'Radiohead', 'Radiohead', 'Radiohead', 'Radiohead',
       'Radiohead', 'Radiohead', 'Radiohead', 'Radiohead', 'Radiohead',
       'Radiohead', 'Radiohead', 'Kate Bush', 'Radiohead', 'Radiohead',
       'Radiohead', 'Radiohead', 'Radiohead', 'Radiohead', 'Kate Bush',
       'Radiohead', 'Radiohead', 'Radiohead', 'Radiohead', 'Kate Bush',
       'Radiohead', 'Radiohead', 'Radiohead', 'Radiohead', 'Radiohead',
       'Radiohead', 'Radiohead'], dtype='<U9')

In [123]:
m.predict_proba(tupac_vec)

array([[0.43196507, 0.56803493],
       [0.49194176, 0.50805824],
       [0.33738744, 0.66261256],
       [0.38682172, 0.61317828],
       [0.33285259, 0.66714741],
       [0.34651345, 0.65348655],
       [0.45981743, 0.54018257],
       [0.3092086 , 0.6907914 ],
       [0.40541017, 0.59458983],
       [0.45156938, 0.54843062],
       [0.48153247, 0.51846753],
       [0.37708111, 0.62291889],
       [0.56631176, 0.43368824],
       [0.4486827 , 0.5513173 ],
       [0.46137944, 0.53862056],
       [0.49600862, 0.50399138],
       [0.39545822, 0.60454178],
       [0.41879573, 0.58120427],
       [0.45787546, 0.54212454],
       [0.51415237, 0.48584763],
       [0.39537537, 0.60462463],
       [0.40427306, 0.59572694],
       [0.41003558, 0.58996442],
       [0.46508063, 0.53491937],
       [0.50259179, 0.49740821],
       [0.39514772, 0.60485228],
       [0.40455128, 0.59544872],
       [0.4640867 , 0.5359133 ],
       [0.35057183, 0.64942817],
       [0.38338287, 0.61661713],
       [0.

So Radiohead's lyrics match 2pac's more than Kate Bush's do. Who would have thought it?!

Finally, we calculate the empirical log probability of features given a class, `P(x_i|y)`

In [124]:
m.feature_log_prob_

array([[-8.31541457, -8.26307152, -8.31541457, ..., -8.29778259,
        -8.31541457, -8.37363619],
       [-8.36780064, -8.36780064, -8.36780064, ..., -8.36780064,
        -8.36780064, -8.29089547]])

Then we can take the difference between the 2 in order to find out which words were the most "discriminating" features.

In [125]:
one = m.feature_log_prob_[0]
two = m.feature_log_prob_[1]

diff = one - two

In [126]:
df = pd.DataFrame(diff, index= list(sorted(cv.vocabulary_.keys())) ).sort_values(by=0, ascending=False)
df

Unnamed: 0,0
ooh,1.217529
love,1.061749
woman,1.003803
mmh,0.946821
rolling,0.864306
night,0.863766
water,0.793761
hear,0.735749
deeper,0.734851
tell,0.729337


SpaCy stuff:

In [None]:
vec2 = cv.fit_transform(corpus_tupac)

In [None]:
nlp = spacy.load(en_core_web_sm)

In [None]:
first = nlp(corpus_rhead[0])

In [None]:
first

In [None]:
first.text

In [None]:
corpus_rhead[0]

In [None]:
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

In [None]:
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

In [None]:
lemmas = lemmatizer('i ended up where i belonged', univ_pos = 'VERB')

In [None]:
lemmas