In [1]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ccal0507/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Data

In [4]:
# Load the raw lyrics table.
df = pd.read_csv('lyrics.csv')

# Keep only songs that have both lyrics and a release date.
df = df.dropna(subset=['lyrics', 'release date'])

# Subset whose description is not the '?' marker, kept for *maybe* future use.
data = df.query("description != '?'")

In [5]:
# Summary of `df` after dropping rows that lack lyrics or a release date
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 373 entries, 0 to 384
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist        373 non-null    object
 1   title         373 non-null    object
 2   lyrics        373 non-null    object
 3   description   373 non-null    object
 4   release date  373 non-null    object
dtypes: object(5)
memory usage: 17.5+ KB


In [6]:
# Summary of `data`: the rows of `df` whose description is not '?'
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 313 entries, 0 to 384
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   artist        313 non-null    object
 1   title         313 non-null    object
 2   lyrics        313 non-null    object
 3   description   313 non-null    object
 4   release date  313 non-null    object
dtypes: object(5)
memory usage: 14.7+ KB


# Data Cleaning


### Cleaning `lyrics`, `artist`, and `title`  
In this section we remove any unnecessary text found in the `lyrics` column. In addition, we lowercase the `artist` and `title` columns.

In [7]:
def clean_lyrics(test_str):
    """Normalise one raw lyrics string.

    Steps, in order:
      0. Cast to ``str``, lowercase, and drop apostrophes (so contractions
         stay a single token later on).
      1. Cut away everything up to and including the first occurrence of
         ``lyrics`` and the trailing ``<digits>embed`` marker at the end of
         the string. If either marker is absent, that side is left untouched
         instead of raising.
      2. Remove the punctuation characters ``. , - ? : ! ;`` (parentheses are
         kept on purpose).

    Parameters
    ----------
    test_str : object
        Raw lyrics; any value is accepted because it is passed through
        ``str()`` first.

    Returns
    -------
    str
        The cleaned, lowercased lyrics.
    """
    # 0) Removing apostrophes for tokenization purposes
    text = str(test_str).lower().replace("'", "")

    # 1) Removing unnecessary textual data
    header = re.search(r'lyrics', text)
    footer = re.search(r'\d*embed$', text)
    # A missing marker used to crash on `None.end()` / `None.start()`; fall
    # back to the string boundaries so already-clean text passes through.
    start = header.end() if header else 0
    end = footer.start() if footer else len(text)
    text = text[start:end]

    # 2) Removing any punctuation (except parentheses)
    return re.sub(r'[.,\-?:!;]', '', text)

In [8]:
# Strip the wrapper text from the lyrics and lowercase the other text columns.
df['lyrics'] = df['lyrics'].apply(clean_lyrics)
df[['artist', 'title']] = df[['artist', 'title']].apply(lambda column: column.str.lower())

In [103]:
# Preview the first two cleaned rows.
# NOTE(review): the saved output already shows a `year` column, which is only
# created in the next cell, so this cell was last run out of order.
df.head(2)

Unnamed: 0,artist,title,lyrics,description,release date,year
0,ray charles,hit the road jack,\nhit the road jack and doncha come back\nno m...,This tongue and cheek verbal duel of a couple ...,August 1961,1961
1,ray charles,georgia on my mind,\ngeorgia\ngeorgia\nthe whole day through\n(th...,Written by Hoagy Carmichael and Stuart Gorrell...,September 1960,1960


In [104]:
# Parse the release dates and keep only the calendar year.
release_dates = pd.to_datetime(df['release date'])
df['year'] = release_dates.dt.year
df.head(2)

Unnamed: 0,artist,title,lyrics,description,release date,year
0,ray charles,hit the road jack,\nhit the road jack and doncha come back\nno m...,This tongue and cheek verbal duel of a couple ...,August 1961,1961
1,ray charles,georgia on my mind,\ngeorgia\ngeorgia\nthe whole day through\n(th...,Written by Hoagy Carmichael and Stuart Gorrell...,September 1960,1960


### Building our Feature Matrix 

In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.utils import np_utils

In [8]:
# Restrict the corpus to a single artist; each list element is one full song.
curr_df = df.loc[df['artist'].eq('the notorious b.i.g.')]
corpus = curr_df['lyrics'].tolist()

In [9]:
# Shared tokenizer: fitted here and read again when generating text.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level ``tokenizer`` and build n-gram prefix sequences.

    Every element of ``corpus`` is encoded as one token-id sequence, and each
    prefix of that sequence with at least two tokens becomes a training
    sample.

    Returns
    -------
    (list[list[int]], int)
        The prefix sequences and the vocabulary size, i.e. the number of
        entries in ``tokenizer.word_index`` plus one.
    """
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in corpus:
        encoded = tokenizer.texts_to_sequences([line])[0]
        # Prefixes of length 2, 3, ..., len(encoded)
        input_sequences.extend(encoded[:stop] for stop in range(2, len(encoded) + 1))
    return input_sequences, total_words

In [10]:
# Fit the tokenizer on the artist's songs and build the n-gram prefix sequences.
inp_sequences, total_words = get_sequence_of_tokens(corpus)

In [11]:
# Vocabulary size: number of entries in the tokenizer's word index plus one.
total_words

816

In [12]:
def generate_padded_sequences(input_sequences):
    """Pad the sequences to equal length and split them into inputs and labels.

    Sequences are padded at the front up to the length of the longest one.
    The last token of each padded sequence is the label (one-hot encoded), and
    the tokens before it are the predictors.

    Note: the number of classes comes from the module-level ``total_words``,
    not from a parameter.

    Returns
    -------
    (predictors, label, max_sequence_len)
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    predictors = padded[:, :-1]
    label = np_utils.to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, label, max_sequence_len

In [13]:
# Build the model inputs, the one-hot labels, and the padded sequence length.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [14]:
# Length of the longest n-gram sequence, i.e. the padded width.
max_sequence_len

812

In [15]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture: Embedding (10 dimensions) -> LSTM (100 units) -> Dense
    softmax over ``total_words`` classes. The model is compiled with
    categorical cross-entropy loss and the Adam optimizer.

    Parameters
    ----------
    max_sequence_len : int
        Padded sequence length; the model input is one token shorter.
    total_words : int
        Vocabulary size, used for the embedding input and the output layer.
    """
    layers = [
        # Input embedding layer
        Embedding(total_words, 10, input_length=max_sequence_len - 1),
        # Hidden layer 1 - LSTM
        LSTM(100),
        # Output layer
        Dense(total_words, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = create_model(max_sequence_len, total_words)

In [16]:
# Train for 10 epochs; verbose=2 prints a per-epoch summary instead of a progress bar.
model.fit(predictors, label, epochs=10, verbose=2)

Epoch 1/10
88/88 - 62s - loss: 6.2524
Epoch 2/10
88/88 - 59s - loss: 5.8697
Epoch 3/10
88/88 - 60s - loss: 5.8151
Epoch 4/10
88/88 - 61s - loss: 5.7586
Epoch 5/10
88/88 - 60s - loss: 5.6797
Epoch 6/10
88/88 - 60s - loss: 5.6311
Epoch 7/10
88/88 - 61s - loss: 5.4989
Epoch 8/10
88/88 - 62s - loss: 5.4054
Epoch 9/10
88/88 - 61s - loss: 5.3115
Epoch 10/10
88/88 - 59s - loss: 5.2034


<keras.callbacks.History at 0x7f21c83e5100>

In [25]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On each step the current text is encoded with the module-level
    ``tokenizer``, front-padded to ``max_sequence_len - 1`` tokens, and passed
    to ``model``. The highest-probability class is turned back into a word and
    appended to the text.

    Parameters
    ----------
    seed_text : str
        Starting text.
    next_words : int
        Number of words to append.
    model :
        Object with a ``predict`` method returning per-class scores with
        shape ``(1, n_classes)``.
    max_sequence_len : int
        Padded sequence length used when the model was trained.

    Returns
    -------
    str
        The extended text in title case.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # verbose=0 avoids printing a progress bar for every generated word.
        probabilities = model.predict(token_list, verbose=0)
        # Scalar class index, rather than comparing ints against a 1-element array.
        predicted_index = int(np.argmax(probabilities, axis=1)[0])

        # Direct reverse lookup instead of scanning word_index on every step.
        # An index with no word maps to "", as before.
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text.title()

In [28]:
# Generate 20 more words from the seed "You want" with the trained model.
print(generate_text("You want", 20, model, max_sequence_len))

You Want I The Fuckin To My Fuckin To To My Back To My Back To My Fuckin To My Back To


## Text Summarization Baseline Model 1

In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_sm

In [19]:
import spacy 
import spacy.lang.en.stop_words as STOP_WORDS
from string import punctuation 

We will first attempt to summarize a single song.

In [106]:
# Lyrics of the row labelled 0, used as the single song to summarize.
text = df.loc[0, 'lyrics']
text

'\nhit the road jack and doncha come back\nno more no more no more no more\nhit the road jack and doncha come back no more\nwhatd you say\nhit the road jack and doncha come back\nno more no more no more no more\nhit the road jack and doncha come back no more\nold woman old woman oh you treat me so mean\nyoure the meanest old woman that ive ever seen\ni guess if you say so\nill have to pack my things and go (thats right)\nhit the road jack and doncha come back\nno more no more no more no more\nhit the road jack and doncha come back no more\nwhatd you say\nhit the road jack and doncha come back\nno more no more no more no more\nhit the road jack and doncha come back no more\nnow baby listen baby dont you treat me this way\ncause ill be back on my feet some day\ndont care if you do cause its understood\nyou aint got no money you just a no good\nwell i guess if you say so\nill have to pack my things and go (thats right)\nhit the road jack and doncha come back\nno more no more no more no mo

In [85]:
# `STOP_WORDS` is the imported `spacy.lang.en.stop_words` module; its
# `STOP_WORDS` attribute holds the actual stop-word collection.
stopwords = list(STOP_WORDS.STOP_WORDS)

# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

In [86]:
# Run the spaCy pipeline over the song's lyrics.
doc = nlp(text)

In [87]:
# Text of every spaCy token in the song; newlines and parentheses come out as
# separate tokens (see the printed list).
tokens = [token.text for token in doc]
print(tokens)

['\n', 'hit', 'the', 'road', 'jack', 'and', 'doncha', 'come', 'back', '\n', 'no', 'more', 'no', 'more', 'no', 'more', 'no', 'more', '\n', 'hit', 'the', 'road', 'jack', 'and', 'doncha', 'come', 'back', 'no', 'more', '\n', 'what', 'd', 'you', 'say', '\n', 'hit', 'the', 'road', 'jack', 'and', 'doncha', 'come', 'back', '\n', 'no', 'more', 'no', 'more', 'no', 'more', 'no', 'more', '\n', 'hit', 'the', 'road', 'jack', 'and', 'doncha', 'come', 'back', 'no', 'more', '\n', 'old', 'woman', 'old', 'woman', 'oh', 'you', 'treat', 'me', 'so', 'mean', '\n', 'you', 're', 'the', 'meanest', 'old', 'woman', 'that', 'i', 've', 'ever', 'seen', '\n', 'i', 'guess', 'if', 'you', 'say', 'so', '\n', 'ill', 'have', 'to', 'pack', 'my', 'things', 'and', 'go', '(', 'that', 's', 'right', ')', '\n', 'hit', 'the', 'road', 'jack', 'and', 'doncha', 'come', 'back', '\n', 'no', 'more', 'no', 'more', 'no', 'more', 'no', 'more', '\n', 'hit', 'the', 'road', 'jack', 'and', 'doncha', 'come', 'back', 'no', 'more', '\n', 'what', 

In [90]:
# Count how often each token occurs in the song:
#   all_frequencies  - every token, including newlines, punctuation and stop words
#   word_frequencies - only tokens that are not punctuation, newlines or stop words

all_frequencies = {}
word_frequencies = {}

for token in doc:
    term = token.text
    all_frequencies[term] = all_frequencies.get(term, 0) + 1

    is_content_word = (
        term not in punctuation
        and term != '\n'
        and term not in stopwords
    )
    if is_content_word:
        word_frequencies[term] = word_frequencies.get(term, 0) + 1

# Ideas: Deal with \n and () and stopwords differently --> they mean something else in lyrics 
    #    and encode their positional index to encode chorus vs. verse vs. bridge 

In [91]:
# Compare the filtered counts with the counts over all tokens.
print(word_frequencies)
print(all_frequencies)

{'hit': 12, 'road': 12, 'jack': 12, 'doncha': 18, 'come': 18, 'd': 4, 'old': 3, 'woman': 3, 'oh': 2, 'treat': 2, 'mean': 2, 'meanest': 1, 've': 1, 'seen': 1, 'guess': 2, 'ill': 3, 'pack': 2, 'things': 2, 's': 2, 'right': 2, 'baby': 3, 'listen': 1, 'nt': 5, 'way': 1, 'cause': 2, 'feet': 1, 'day': 1, 'care': 1, 'understood': 1, 'ai': 1, 'got': 1, 'money': 1, 'good': 1, 'uh': 1, 'understand': 1, 'trying': 1}
{'\n': 37, 'hit': 12, 'the': 13, 'road': 12, 'jack': 12, 'and': 14, 'doncha': 18, 'come': 18, 'back': 19, 'no': 38, 'more': 36, 'what': 5, 'd': 4, 'you': 15, 'say': 6, 'old': 3, 'woman': 3, 'oh': 2, 'treat': 2, 'me': 3, 'so': 3, 'mean': 2, 're': 2, 'meanest': 1, 'that': 4, 'i': 4, 've': 1, 'ever': 1, 'seen': 1, 'guess': 2, 'if': 3, 'ill': 3, 'have': 2, 'to': 4, 'pack': 2, 'my': 3, 'things': 2, 'go': 2, '(': 8, 's': 2, 'right': 2, ')': 8, 'now': 2, 'baby': 3, 'listen': 1, 'do': 4, 'nt': 5, 'this': 1, 'way': 1, 'cause': 2, 'be': 1, 'on': 1, 'feet': 1, 'some': 1, 'day': 1, 'care': 1, 'it

In [92]:
# Highest raw count in each dictionary; max_word_frequency is the divisor used
# to normalize the filtered counts.
max_all_frequency = max(all_frequencies.values())
max_word_frequency = max(word_frequencies.values())

max_all_frequency, max_word_frequency

(38, 18)

In [93]:
# Scale every count by the largest one so the most frequent word scores 1.0.
# The dictionary is updated in place, so re-running this cell divides again.
for term, count in word_frequencies.items():
    word_frequencies[term] = count / max_word_frequency

In [94]:
# Normalized frequencies (raw count divided by the largest count).
print(word_frequencies)

{'hit': 0.6666666666666666, 'road': 0.6666666666666666, 'jack': 0.6666666666666666, 'doncha': 1.0, 'come': 1.0, 'd': 0.2222222222222222, 'old': 0.16666666666666666, 'woman': 0.16666666666666666, 'oh': 0.1111111111111111, 'treat': 0.1111111111111111, 'mean': 0.1111111111111111, 'meanest': 0.05555555555555555, 've': 0.05555555555555555, 'seen': 0.05555555555555555, 'guess': 0.1111111111111111, 'ill': 0.16666666666666666, 'pack': 0.1111111111111111, 'things': 0.1111111111111111, 's': 0.1111111111111111, 'right': 0.1111111111111111, 'baby': 0.16666666666666666, 'listen': 0.05555555555555555, 'nt': 0.2777777777777778, 'way': 0.05555555555555555, 'cause': 0.1111111111111111, 'feet': 0.05555555555555555, 'day': 0.05555555555555555, 'care': 0.05555555555555555, 'understood': 0.05555555555555555, 'ai': 0.05555555555555555, 'got': 0.05555555555555555, 'money': 0.05555555555555555, 'good': 0.05555555555555555, 'uh': 0.05555555555555555, 'understand': 0.05555555555555555, 'trying': 0.0555555555555

In [95]:
# Materialize spaCy's sentence spans so they can be counted and reused.
sentence_tokens = list(doc.sents)
sentence_tokens

[
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 whatd you say
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 old woman old woman,
 oh you treat me so mean
 youre the meanest old woman that ive ever seen,
 i guess if you say so
 ill have to pack my things and go (thats right),
 
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 whatd you say
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 now baby listen baby dont you treat me this way
 cause ill be back on my feet some day
 dont care if you do cause its understood
 you aint got no money you just a no good,
 well i guess if you say so
 ill have to pack my things and go (thats right),
 
 hit the road jack and doncha come back
 no more no more no 

In [96]:
# Score each sentence as the sum of the normalized frequencies of its words.
# Sentences with no scored word get no entry at all.
sentence_scores = {}

for sentence in sentence_tokens:
    for token in sentence:
        key = token.text.lower()
        if key in word_frequencies:
            sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_frequencies[key]

In [97]:
# Summed word-frequency score for each sentence.
print(sentence_scores)

{
hit the road jack and doncha come back
no more no more no more no more
hit the road jack and doncha come back no more
whatd you say
hit the road jack and doncha come back
no more no more no more no more
hit the road jack and doncha come back no more
old woman old woman: 16.88888888888889, oh you treat me so mean
youre the meanest old woman that ive ever seen
: 0.8333333333333333, i guess if you say so
ill have to pack my things and go (thats right): 0.7222222222222223, 
hit the road jack and doncha come back
no more no more no more no more
hit the road jack and doncha come back no more
whatd you say
hit the road jack and doncha come back
no more no more no more no more
hit the road jack and doncha come back no more
now baby listen baby dont you treat me this way
cause ill be back on my feet some day
dont care if you do cause its understood
you aint got no money you just a no good
: 18.44444444444446, well i guess if you say so
ill have to pack my things and go (thats right): 0.722222

In [98]:
from heapq import nlargest

In [100]:
# Keep 30% of the detected sentences; int() truncates towards zero.
select_length = int(len(sentence_tokens) * 0.3)
select_length

2

In [101]:
# Take the `select_length` highest-scoring sentences, best first.
summary = nlargest(select_length, sentence_scores, 
                   key = sentence_scores.get)

In [102]:
# Show the selected sentences.
summary

[
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 whatd you say
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 now baby listen baby dont you treat me this way
 cause ill be back on my feet some day
 dont care if you do cause its understood
 you aint got no money you just a no good,
 
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 whatd you say
 hit the road jack and doncha come back
 no more no more no more no more
 hit the road jack and doncha come back no more
 old woman old woman]

The summary is not very good, and this approach will likely do worse with rap songs. We will have to figure out a way to remove explicit lyrics.