In [1]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ccal0507/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data

In [4]:
df = pd.read_csv('lyrics.csv')
# Keep only songs that have both lyrics and a release date.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments made in later cells do not write into a slice
# (avoids pandas' SettingWithCopyWarning).
df = df[~(pd.isna(df['lyrics']) | pd.isna(df['release date']))].copy()

# Songs whose description is not the '?' placeholder, kept for *maybe* future use.
# NOTE: this snapshot is taken before the cleaning cells run, so it holds the raw lyrics.
data = df.query("description != '?'")

In [5]:
# Original data description 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 373 entries, 0 to 384
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist        373 non-null    object
 1   title         373 non-null    object
 2   lyrics        373 non-null    object
 3   description   373 non-null    object
 4   release date  373 non-null    object
dtypes: object(5)
memory usage: 17.5+ KB


In [6]:
# Nonempty description data
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 313 entries, 0 to 384
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist        313 non-null    object
 1   title         313 non-null    object
 2   lyrics        313 non-null    object
 3   description   313 non-null    object
 4   release date  313 non-null    object
dtypes: object(5)
memory usage: 14.7+ KB


# Data Cleaning


### Cleaning `lyrics`, `artist`, and `title`  
In this section we remove unnecessary text from the `lyrics` column (everything up to the "Lyrics" header and the trailing "Embed" marker, plus most punctuation). In addition, we lowercase the `artist` and `title` columns. 

In [7]:
def clean_lyrics(test_str):
    """Normalize one raw lyrics string that is wrapped in scraped boilerplate.

    Steps, in order:
      0. Coerce to ``str``, lowercase, and drop straight apostrophes so that
         contractions tokenize as one word ("don't" -> "dont").
      1. Drop everything up to and including the first occurrence of the word
         "lyrics", and drop the trailing "<digits>embed" marker.
      2. Remove the punctuation characters ``. , - ? : ! ;``. Parentheses and
         newlines are kept.

    A missing header or trailing marker is tolerated: that side of the text is
    simply left uncut instead of raising ``AttributeError`` on a failed match.

    Parameters
    ----------
    test_str : object
        Raw lyrics; anything that is not a string is passed through ``str()``.

    Returns
    -------
    str
        The lowercased, trimmed lyrics with the listed punctuation removed.
    """
    # 0) Lowercase and remove apostrophes for tokenization purposes
    text = str(test_str).lower().replace("'", "")

    # 1) Remove unnecessary textual data around the actual lyrics
    header = re.search(r'lyrics', text)
    footer = re.search(r'\d*embed$', text)
    start = header.end() if header else 0
    end = footer.start() if footer else len(text)
    text = text[start:end]

    # 2) Remove punctuation (except parentheses)
    return re.sub(r'[.,\-?:!;]', '', text)

In [8]:
# Overwrite the raw lyrics with their cleaned form and lowercase artist/title.
# Not idempotent: running this cell a second time feeds the already-stripped
# lyrics back into clean_lyrics, so re-run the data-loading cell first.
df['lyrics'] = df['lyrics'].apply(clean_lyrics)
df['artist'] = df['artist'].str.lower()
df['title'] = df['title'].str.lower()

In [9]:
df

Unnamed: 0,artist,title,lyrics,description,release date
0,ray charles,hit the road jack,\nhit the road jack and doncha come back\nno m...,This tongue and cheek verbal duel of a couple ...,August 1961
1,ray charles,georgia on my mind,\ngeorgia\ngeorgia\nthe whole day through\n(th...,Written by Hoagy Carmichael and Stuart Gorrell...,September 1960
2,ray charles,i’ve got a woman,\nwell\n\ni got a woman way over town\nthats g...,Ray Charles released “I’ve Got a Woman” as a s...,December 1954
3,ray charles,i can’t stop loving you,\n(i cant stop loving you)\nive made up my min...,?,1962
4,ken nordine,yellow,in the beginning\noh long before that\nwhen li...,?,"January 1, 1966"
...,...,...,...,...,...
380,raveena,temptation,ahahah\nahahah\n\nmiss temptation i dont think...,"In “Temptation”, Raveena opens up about her bi...","October 23, 2018"
381,the notorious b.i.g.,juicy,"\n(""fuck all you hoes"" get a grip motherfucker...",“Juicy” is the first single from Big’s debut a...,"August 9, 1994"
382,the notorious b.i.g.,big poppa,\nuh uh check it out (yeah) uh\njunior mafia u...,“Big Poppa” was The Notorious B.I.G.’s first t...,"February 20, 1995"
383,the notorious b.i.g.,suicidal thoughts,\nhello\naw shit nigga the fuck time is it man...,In this final track off of The Notorious B.I.G...,"September 13, 1994"


In [11]:
# Parse the free-form 'release date' strings and keep only the year.
# NOTE(review): the column mixes formats (e.g. "August 1961", "1962",
# "January 1, 1966"), so this relies on pandas inferring the format of each
# value -- confirm it still parses on the pandas version in use.
df['year'] = pd.to_datetime(df['release date']).dt.year
df

Unnamed: 0,artist,title,lyrics,description,release date,year
0,ray charles,hit the road jack,\nhit the road jack and doncha come back\nno m...,This tongue and cheek verbal duel of a couple ...,August 1961,1961
1,ray charles,georgia on my mind,\ngeorgia\ngeorgia\nthe whole day through\n(th...,Written by Hoagy Carmichael and Stuart Gorrell...,September 1960,1960
2,ray charles,i’ve got a woman,\nwell\n\ni got a woman way over town\nthats g...,Ray Charles released “I’ve Got a Woman” as a s...,December 1954,1954
3,ray charles,i can’t stop loving you,\n(i cant stop loving you)\nive made up my min...,?,1962,1962
4,ken nordine,yellow,in the beginning\noh long before that\nwhen li...,?,"January 1, 1966",1966
...,...,...,...,...,...,...
380,raveena,temptation,ahahah\nahahah\n\nmiss temptation i dont think...,"In “Temptation”, Raveena opens up about her bi...","October 23, 2018",2018
381,the notorious b.i.g.,juicy,"\n(""fuck all you hoes"" get a grip motherfucker...",“Juicy” is the first single from Big’s debut a...,"August 9, 1994",1994
382,the notorious b.i.g.,big poppa,\nuh uh check it out (yeah) uh\njunior mafia u...,“Big Poppa” was The Notorious B.I.G.’s first t...,"February 20, 1995",1995
383,the notorious b.i.g.,suicidal thoughts,\nhello\naw shit nigga the fuck time is it man...,In this final track off of The Notorious B.I.G...,"September 13, 1994",1994


### Building our Feature Matrix 

In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.utils import np_utils

In [8]:
# Restrict the training corpus to a single artist: one lyrics string per song.
curr_df = df.loc[df['artist'] == 'the notorious b.i.g.']
corpus = curr_df['lyrics'].tolist()

In [9]:
# Shared module-level tokenizer: fitted inside get_sequence_of_tokens and
# read again by generate_text to encode the seed and decode predictions.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared ``tokenizer`` on ``corpus`` and build n-gram prefixes.

    Every text in ``corpus`` is encoded as a list of token ids, and each
    prefix of that list with at least two tokens becomes one training
    sequence (tokens[:2], tokens[:3], ..., the full list).

    Returns
    -------
    (list of list of int, int)
        The prefix sequences, and ``len(tokenizer.word_index) + 1``
        (presumably +1 to leave index 0 free for padding -- TODO confirm).
    """
    # Tokenization: learn the vocabulary from the whole corpus.
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    # Convert each text to token ids and collect every prefix of length >= 2.
    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

In [10]:
inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [11]:
total_words

816

In [12]:
def generate_padded_sequences(input_sequences):
    """Left-pad the sequences to a common length and split off the labels.

    All sequences are padded at the front to the length of the longest one.
    The last column of the padded matrix is the label, one-hot encoded over
    the notebook-level ``total_words`` classes; the remaining columns are
    the predictors.

    Returns
    -------
    (predictors, label, max_sequence_len)
    """
    max_sequence_len = max(map(len, input_sequences))
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    predictors = padded[:, :-1]
    # One-hot encode the final token of every sequence.
    label = np_utils.to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, label, max_sequence_len

In [13]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [14]:
max_sequence_len

812

In [15]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dense(softmax over the vocabulary),
    compiled with categorical cross-entropy and the Adam optimizer.

    Parameters
    ----------
    max_sequence_len : int
        Length of the padded sequences; the model input is one token shorter
        (``max_sequence_len - 1``).
    total_words : int
        Size of the embedding's input vocabulary and of the softmax output.
    embedding_dim : int, optional
        Width of the embedding vectors (default 10, the previous fixed value).
    lstm_units : int, optional
        Number of LSTM units (default 100, the previous fixed value).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden layer: a single LSTM
    model.add(LSTM(lstm_units))

    # Output layer: one probability per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

model = create_model(max_sequence_len, total_words)

In [16]:
model.fit(predictors, label, epochs=10, verbose=2)

Epoch 1/10
88/88 - 62s - loss: 6.2524
Epoch 2/10
88/88 - 59s - loss: 5.8697
Epoch 3/10
88/88 - 60s - loss: 5.8151
Epoch 4/10
88/88 - 61s - loss: 5.7586
Epoch 5/10
88/88 - 60s - loss: 5.6797
Epoch 6/10
88/88 - 60s - loss: 5.6311
Epoch 7/10
88/88 - 61s - loss: 5.4989
Epoch 8/10
88/88 - 62s - loss: 5.4054
Epoch 9/10
88/88 - 61s - loss: 5.3115
Epoch 10/10
88/88 - 59s - loss: 5.2034


<keras.callbacks.History at 0x7f21c83e5100>

In [25]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On each step the current text is encoded with the shared ``tokenizer``,
    left-padded to ``max_sequence_len - 1`` tokens, and passed to
    ``model.predict``; the highest-scoring index is mapped back to a word and
    appended. If the predicted index has no word in the tokenizer's
    vocabulary, an empty string is appended instead.

    Parameters
    ----------
    seed_text : str
        Text to start from.
    next_words : int
        Number of words to append.
    model :
        Object with a Keras-style ``predict(x, verbose=...)`` method.
    max_sequence_len : int
        Padded sequence length used at training time.

    Returns
    -------
    str
        The seed plus generated words, title-cased.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # verbose=0 keeps predict from printing a progress bar for every word.
        probabilities = model.predict(padded, verbose=0)
        # Batch of one: take the scalar argmax rather than a 1-element array.
        predicted_index = int(np.argmax(probabilities, axis=1)[0])

        # Direct id -> word lookup instead of scanning word_index each step.
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text.title()

In [28]:
print(generate_text("You want", 20, model, max_sequence_len))

You Want I The Fuckin To My Fuckin To To My Back To My Back To My Fuckin To My Back To


## Text Summarization Baseline Model 1

In [14]:
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.2.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
     |████████████████████████████████| 6.2 MB 6.3 MB/s            
[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp38-cp38-manylinux2014_x86_64.whl (13.7 MB)
     |████████████████████████████████| 13.7 MB 48.1 MB/s            
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
     |████████████████████████████████| 181 kB 54.9 MB/s            
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (663 kB)
     |████████████████████████████████| 663 kB 4.2 MB/s            
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux20

Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
You should consider upgrading via the '/home/ccal0507/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [19]:
import spacy 
import spacy.lang.en.stop_words as STOP_WORDS
from string import punctuation 

We will first attempt to summarize a single song.

In [32]:
# 383 is an index *label* (the frame keeps its original CSV row labels), not a position.
text = df.loc[383, 'lyrics']
text

'\nhello\naw shit nigga the fuck time is it man\noh god damn\nnigga do you know what time it is\naw shit what the fucks goin on\nyou aight\nah nigga what the fuck is wrong with you\n\nwhen i die fuck it i wanna go to hell\ncause im a piece of shit it aint hard to fuckin tell (what you talkin bout man)\nit dont make sense goin to heaven with the goodiegoodies\ndressed in white i like black timbs and black hoodies (aw man)\ngodll probably have me on some real strict shit\nno sleepin all day no gettin my dick licked\nhangin with the goodiegoodies loungin in paradise\nfuck that shit i wanna tote guns and shoot dice (you talkin some crazy shit now nigga)\nall my life i been considered as the worst\nlyin to my mother even stealin out her purse (ah)\ncrime after crime from drugs to extortion\ni know my mother wish she got a fuckin abortion\nshe dont even love me like she did when i was younger (yo get a hold of yourself nigga)\nsuckin on her chest just to stop my fuckin hunger\ni wonder if i 

In [28]:
# STOP_WORDS is the spacy.lang.en.stop_words *module* (imported under that
# alias), hence the doubled name to reach the actual collection of stop words.
stopwords = list(STOP_WORDS.STOP_WORDS)

# Load the small English pipeline fetched by the download cell above.
nlp = spacy.load('en_core_web_sm')

In [33]:
doc = nlp(text)

In [36]:
tokens = [token.text for token in doc]
print(tokens)

['\n', 'hello', '\n', 'aw', 'shit', 'nigga', 'the', 'fuck', 'time', 'is', 'it', 'man', '\n', 'oh', 'god', 'damn', '\n', 'nigga', 'do', 'you', 'know', 'what', 'time', 'it', 'is', '\n', 'aw', 'shit', 'what', 'the', 'fucks', 'goin', 'on', '\n', 'you', 'aight', '\n', 'ah', 'nigga', 'what', 'the', 'fuck', 'is', 'wrong', 'with', 'you', '\n\n', 'when', 'i', 'die', 'fuck', 'it', 'i', 'wanna', 'go', 'to', 'hell', '\n', 'cause', 'i', 'm', 'a', 'piece', 'of', 'shit', 'it', 'ai', 'nt', 'hard', 'to', 'fuckin', 'tell', '(', 'what', 'you', 'talkin', 'bout', 'man', ')', '\n', 'it', 'do', 'nt', 'make', 'sense', 'goin', 'to', 'heaven', 'with', 'the', 'goodiegoodies', '\n', 'dressed', 'in', 'white', 'i', 'like', 'black', 'timbs', 'and', 'black', 'hoodies', '(', 'aw', 'man', ')', '\n', 'godll', 'probably', 'have', 'me', 'on', 'some', 'real', 'strict', 'shit', '\n', 'no', 'sleepin', 'all', 'day', 'no', 'gettin', 'my', 'dick', 'licked', '\n', 'hangin', 'with', 'the', 'goodiegoodies', 'loungin', 'in', 'parad

In [37]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
# Encoding how many times each token appears in the song 

# Map each distinct token text to its number of occurrences in `doc`,
# keyed in order of first appearance.
word_frequencies = {}

for word in doc:
    word_frequencies[word.text] = word_frequencies.get(word.text, 0) + 1

# Ideas: Deal with \n and () differently --> they mean something else in lyrics 

In [43]:
word_frequencies

{'\n': 47,
 'hello': 1,
 'aw': 3,
 'shit': 8,
 'nigga': 12,
 'the': 15,
 'fuck': 5,
 'time': 2,
 'is': 7,
 'it': 5,
 'man': 8,
 'oh': 1,
 'god': 3,
 'damn': 1,
 'do': 3,
 'you': 11,
 'know': 3,
 'what': 5,
 'fucks': 1,
 'goin': 3,
 'on': 8,
 'aight': 1,
 'ah': 2,
 'wrong': 1,
 'with': 3,
 '\n\n': 1,
 'when': 4,
 'i': 31,
 'die': 2,
 'wanna': 3,
 'go': 1,
 'to': 16,
 'hell': 1,
 'cause': 1,
 'm': 10,
 'a': 6,
 'piece': 1,
 'of': 7,
 'ai': 2,
 'nt': 8,
 'hard': 1,
 'fuckin': 6,
 'tell': 2,
 '(': 19,
 'talkin': 3,
 'bout': 1,
 ')': 19,
 'make': 1,
 'sense': 1,
 'heaven': 1,
 'goodiegoodies': 2,
 'dressed': 1,
 'in': 5,
 'white': 1,
 'like': 6,
 'black': 2,
 'timbs': 1,
 'and': 6,
 'hoodies': 1,
 'godll': 1,
 'probably': 1,
 'have': 1,
 'me': 10,
 'some': 2,
 'real': 1,
 'strict': 1,
 'no': 3,
 'sleepin': 1,
 'all': 2,
 'day': 1,
 'gettin': 1,
 'my': 16,
 'dick': 1,
 'licked': 1,
 'hangin': 1,
 'loungin': 1,
 'paradise': 1,
 'that': 3,
 'tote': 1,
 'guns': 1,
 'shoot': 1,
 'dice': 1,
 'cra