In [1]:
import pandas as pd
import os, codecs
import spacy
from collections import Counter

In [2]:
FOLDER = "data/"

In [3]:
filename = "lyrics_per_song.csv"
df = pd.read_csv(FOLDER + filename)
df.head()

Unnamed: 0,Song,Performer,Year,Decade,Lyrics
0,eastside,"benny blanco, halsey",2019,2010,"Uh\nYeah, yeah\n\nWhen I was young, I fell in ..."
1,wait for you,elliott yamin,2007,2000,I never felt nothing in the world like this be...
2,wildflower,skylark,1973,1970,She's faced the hardest times you could imagin...
3,even though i'm leaving,luke combs,2019,2010,"Daddy, I'm afraid, won't you stay a little whi..."
4,do re mi,blackbear,2017,2010,"Do, re, mi, fa, so\nYeah, yeah, yeah, oh\nDo, ..."


In [4]:
nlp = spacy.load('en')

In [5]:
docs = list()
for lyrics_doc in os.listdir(FOLDER):
    if ".txt" in lyrics_doc:
        print(lyrics_doc)
        with codecs.open(os.path.join(FOLDER,lyrics_doc),encoding="utf8") as f:
            docs.append(f.read())

2010_lyrics.txt
1990_lyrics.txt
1970_lyrics.txt
2020_lyrics.txt
1960_lyrics.txt
1980_lyrics.txt
1950_lyrics.txt
2000_lyrics.txt


In [6]:
#preview first lines of 2020_lyrics.txt
print(docs[3][0:300])

I'm like the water when your ship rolled in that night
Rough on the surface, but you cut through like a knife
And if it was an open-shut case
I never would've known from that look on your face
Lost in your current like a priceless wine

The more that you say, the less I know
Wherever you stray, I fo


In [7]:
#remove new lines
docs = [" ".join(d.split()) for d in docs]
#preview
print(docs[3][0:300])

I'm like the water when your ship rolled in that night Rough on the surface, but you cut through like a knife And if it was an open-shut case I never would've known from that look on your face Lost in your current like a priceless wine The more that you say, the less I know Wherever you stray, I fol


In [8]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(stopwords))
print('First ten stop words:',list(stopwords)[:10])

Number of stop words: 326
First ten stop words: ['which', 'hers', 'meanwhile', 'me', 'move', 'your', '’m', 'keep', 'well', 'except']


In [24]:
# add some more stopwords
for w in ["got", "know", "\n", "\n\n", " "]:
    nlp.vocab[w].is_stop = True

**Most common words, without stop words and punctuation:**

In [12]:
def get_common_words(doc, n):
    text = nlp(doc)
    words = [token.text for token in text if token.is_stop != True and token.is_punct != True]

    word_freq = Counter(words)
    common_words = word_freq.most_common()
    
    # print five most common tokens
    print(f"{common_words[0:n]}\n")
    return common_words[0:n]

In [13]:
for doc in docs:
    get_common_words(doc, 5)

[('yeah', 408), ('oh', 270), ('love', 189), ('gon', 189), ('ft', 178)]

[('love', 262), ('baby', 259), ('HOUSE', 166), ('ai', 155), ('time', 136)]

[('love', 251), ('oh', 147), ('baby', 126), ('Oh', 125), ('ron', 124)]

[('na', 90), ('hope', 31), ('hold', 30), ('baby', 25), ('Yeah', 23)]

[('State', 342), ('court', 236), ('love', 228), ('baby', 203), ('suit', 179)]

[('love', 282), ('BLOOM', 276), ('PG', 205), ('G', 185), ('Bloom', 154)]

[('HARRY', 465), ('says', 392), ('PERRY', 237), ('HARMONY', 214), ('Bloom', 186)]

[('love', 222), ('oh', 190), ('girl', 167), ('baby', 166), ('time', 163)]



In [22]:
nlp.max_length = 2084661

In [23]:
with codecs.open(os.path.join(FOLDER,"all_lyrics.txt"),encoding="utf8") as f:
            all_doc = f.read()
get_common_words(all_doc, 100)

[('\n', 33442), ('\n\n', 1957), ('love', 1573), ('baby', 1003), ('oh', 896), ('yeah', 893), ('Oh', 768), ('time', 720), ('na', 643), ('way', 592), ('gon', 592), ('want', 588), ('right', 570), ('let', 566), ('girl', 563), ('ai', 562), ('little', 541), ('wanna', 514), (' ', 509), ('come', 505), ('man', 487), ('Yeah', 476), ('need', 474), ('HARRY', 465), ('day', 458), ('night', 445), ('says', 445), ('think', 407), ('good', 394), ('said', 388), ('away', 382), ('life', 376), ('feel', 376), ('heart', 374), ('State', 354), ('Bloom', 352), ('tell', 351), ('Come', 347), ('ooh', 344), ('world', 328), ('old', 323), ('eyes', 321), ('ft', 309), ('Baby', 304), ('wo', 302), ('Hey', 296), ('head', 293), ('Let', 292), ('hand', 291), ("'Cause", 289), ('thing', 280), ('home', 278), ('BLOOM', 277), ('find', 271), ('court', 266), ('real', 265), ('ta', 262), ('ya', 259), ('face', 240), ('la', 240), ('better', 237), ('PERRY', 237), ('long', 235), ('shit', 223), ('going', 221), ('God', 219), ('case', 219), ('

[('\n', 33442),
 ('\n\n', 1957),
 ('love', 1573),
 ('baby', 1003),
 ('oh', 896),
 ('yeah', 893),
 ('Oh', 768),
 ('time', 720),
 ('na', 643),
 ('way', 592),
 ('gon', 592),
 ('want', 588),
 ('right', 570),
 ('let', 566),
 ('girl', 563),
 ('ai', 562),
 ('little', 541),
 ('wanna', 514),
 (' ', 509),
 ('come', 505),
 ('man', 487),
 ('Yeah', 476),
 ('need', 474),
 ('HARRY', 465),
 ('day', 458),
 ('night', 445),
 ('says', 445),
 ('think', 407),
 ('good', 394),
 ('said', 388),
 ('away', 382),
 ('life', 376),
 ('feel', 376),
 ('heart', 374),
 ('State', 354),
 ('Bloom', 352),
 ('tell', 351),
 ('Come', 347),
 ('ooh', 344),
 ('world', 328),
 ('old', 323),
 ('eyes', 321),
 ('ft', 309),
 ('Baby', 304),
 ('wo', 302),
 ('Hey', 296),
 ('head', 293),
 ('Let', 292),
 ('hand', 291),
 ("'Cause", 289),
 ('thing', 280),
 ('home', 278),
 ('BLOOM', 277),
 ('find', 271),
 ('court', 266),
 ('real', 265),
 ('ta', 262),
 ('ya', 259),
 ('face', 240),
 ('la', 240),
 ('better', 237),
 ('PERRY', 237),
 ('long', 235),
