In [8]:
%pip install nltk swifter swifter[notebook]

Note: you may need to restart the kernel to use updated packages.


In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import random
import string
import swifter
import torch

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [11]:
# Load csv file
df = pd.read_csv('subset-song-lyrics.csv')

In [12]:
print(f"number of songs: {len(df)}")
df.head()

number of songs: 12295


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\r\nKilla Cam, ...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},[Produced by Irv Gotti]\r\n\r\n[Intro]\r\nYeah...,3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\r\nAnd these bastards fi...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\r\n\...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\r\nSo they ask me\r\n""Young boy\r\nWha...",6,en,en,en


# Preprocessing

In [13]:
nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
(len(df[df["language"] == "en"]), len(df))

(12064, 12295)

In [15]:
ps = PorterStemmer()
stopwords_en = set(stopwords.words('english'))

# Create a copy of the dataframe
df_proc = df.copy()

# Select only english songs
df_proc = df_proc[df_proc["language"] == "en"]

# Select columns we care about
df_proc = df_proc[["title","lyrics"]]
# Rename column
df_proc.rename(columns={"lyrics": "tokens"}, inplace=True)

# Convert to lowercase
df_proc["tokens"] = df_proc["tokens"].str.lower()
# Remove text between square brackets and any non alphanumeric / whitespace characters
df_proc["tokens"] = df_proc["tokens"].str.replace(re.compile(r"\[.{0,100}\]|[^\w\s]"), "", regex=True)
# Split text into words
df_proc["tokens"] = df_proc["tokens"].str.rsplit()
# Remove stopwords and stem tokens
def remove_stopwords_and_stem(tokens):
    return [ps.stem(token) for token in tokens if token not in stopwords_en]
df_proc["tokens"] = df_proc["tokens"].swifter.apply(remove_stopwords_and_stem)

Pandas Apply: 100%|██████████| 12064/12064 [01:32<00:00, 131.12it/s]


In [17]:
(", ".join(df_proc.iloc[5]["tokens"][:20]), df.iloc[5]["lyrics"])

('haha, uhhuh, homo, young, mula, babi, say, he, sweet, make, wanna, lick, wrapper, remix, babi, lollipop, lollipop, breasts, like, dolli',
 '[Intro: Lil Wayne]\r\nHaha\r\nUh-huh\r\nNo homo (Young Mula, baby!)\r\nI say, he\'s so sweet, make her wanna lick the wrapper\r\nRemix, baby!\r\n\r\n[Verse 1: Kanye West]\r\nLollipop, lollipop, breastses just like Dolly Parton\r\nShe ride my spaceship \'til she hit the top\r\nThat hit the spot\r\n\'Til she ask, "How many li-i-li-i-licks do it take" \'til she get to shop?\r\nDon\'t worry why my wrists got so freeze\r\nTell a girl, "Like Doritos, that\'s not \'cho cheese"\r\nTell her friends, "Like Fritos, I\'m tryin\' to lay"\r\nI can\'t only have one, and I ain\'t trying to wait\r\nThis a song with Wayne, so you know it\'s gon\' melt\r\nBut you ain\'t finna murder me like everybody else\r\nI\'ma rap like I got some type respect for myself\r\nI don\'t do it for my health, man, I do it for the belt\r\nMan, I do it to the death, \'til the roof get m

In [18]:
df_proc.head()

Unnamed: 0,title,tokens
0,Killa Cam,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,"[yeah, hah, yeah, rocafella, invit, somethin, ..."
2,Forgive Me Father,"[mayb, caus, im, eatin, bastard, fiend, grub, ..."
3,Down and Out,"[ugh, killa, babi, kany, 1970, heron, flow, hu..."
4,Fly In,"[ask, young, boy, gon, second, time, around, g..."


In [13]:
# Save the testing-song-lyrics.csv file as a pickle file
torch.save(df_proc, 'subset_documents.pkl')

# Test if the pickle file is saved correctly
df_reloaded = torch.load('subset_documents.pkl')
df_reloaded.head()

Unnamed: 0,title,tokens
0,Killa Cam,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,"[yeah, hah, yeah, rocafella, invit, somethin, ..."
2,Forgive Me Father,"[mayb, caus, im, eatin, bastard, fiend, grub, ..."
3,Down and Out,"[ugh, killa, babi, kany, 1970, heron, flow, hu..."
4,Fly In,"[ask, young, boy, gon, second, time, around, g..."


# Generate Query (still a lot to do)

In [81]:
# Functions to Select Verses
def getFirstVerses(lyricsString, amount):
    verseList = re.split('\r\n', lyricsString)
    FinalList = [i for i in verseList if (len(i) > 1 and i[0] != '[')]
    return " ".join(FinalList[:amount])

def getFirstVersesOfChorus(lyricsString, amount):
    List = re.split('\r\n', lyricsString)
    verseList = [i for i in List if len(i) > 1]
    for i in range(len(verseList)):
        if "[Chorus" in verseList[i] or "[Hook" in verseList[i]:
            return " ".join(verseList[i+1:i+amount+1])
    return getFirstVerses(lyricsString, amount)

def getRandomVerses(lyricsString, amount):
    verseList = re.split('\r\n', lyricsString)
    FinalList = [i for i in verseList if (len(i) > 1 and i[0] != '[')]
    rd = random.randint(0,len(FinalList)-amount)
    return " ".join(FinalList[rd:rd+amount])

In [82]:
# Functions to Degrade message

#Function to create typo by neighbouring letter
NeighbouringKeys = {}
NeighbouringKeys['q'] = "qwas"
NeighbouringKeys['w'] = "qwase"
NeighbouringKeys['e'] = "wsedr"
NeighbouringKeys['r'] = "edrft"
NeighbouringKeys['t'] = "rftgy"
NeighbouringKeys['y'] = "tgyhu"
NeighbouringKeys['u'] = "yhuji"
NeighbouringKeys['i'] = "ujiko"
NeighbouringKeys['o'] = "ikolp"
NeighbouringKeys['p'] = "olp"

NeighbouringKeys['a'] = "qwasz"
NeighbouringKeys['s'] = "wazsxed"
NeighbouringKeys['d'] = "sxedcrf"
NeighbouringKeys['f'] = "dcrfvtg"
NeighbouringKeys['g'] = "fvtgbyh"
NeighbouringKeys['h'] = "gbyhnuj"
NeighbouringKeys['j'] = "hnujmik"
NeighbouringKeys['k'] = "jmikol"
NeighbouringKeys['l'] = "kolp"

NeighbouringKeys['z'] = "azsx"
NeighbouringKeys['x'] = "zsxdc"
NeighbouringKeys['c'] = "xdcfv"
NeighbouringKeys['v'] = "cfvgb"
NeighbouringKeys['b'] = "vgbhn"
NeighbouringKeys['n'] = "bhnjm"
NeighbouringKeys['m'] = "njmk"

englishLetters = NeighbouringKeys.keys()

def typos(text,prob=0.01):

    resultingText = ""

    for letter in text:
        if not letter in englishLetters:
            newLetter = letter
        else:
            if random.random() < prob:
                newLetter = random.choice(NeighbouringKeys[letter])
            else: 
                newLetter = letter
        resultingText += newLetter
    
    return resultingText


#Function to (maybe) invert 2 adjacent letters (do force=True to force it to happen)
def invertAdjacentLetters(text, force=False):
    rd = random.randint(0,len(text)-2)
    if not force:
        if text[rd] in englishLetters and text[rd+1] in englishLetters:
            return text[:rd]+text[rd+1]+text[rd]+text[rd+2:]
        else:
            return text
    else :
        while not(text[rd] in englishLetters and text[rd+1] in englishLetters):
            rd = random.randint(0,len(text)-2)
        return text[:rd]+text[rd+1]+text[rd]+text[rd+2:]

#Function to (maybe) remove a letter (do force=True to force it to happen)
def removeLetter(text, force=False):
    rd = random.randint(0,len(text)-1)
    if not force:
        if text[rd] in englishLetters:
            return text[:rd]+text[rd+1:]
        else:
            return text
    else :
        while not(text[rd] in englishLetters):
            rd = random.randint(0,len(text)-1)
        return text[:rd]+text[rd+1:]
    
#Function to (maybe) double a letter (do force=True to force it to happen)
def doubleLetter(text, force=False):
    rd = random.randint(0,len(text)-1)
    if not force:
        if text[rd] in englishLetters:
            return text[:rd+1]+text[rd]+text[rd+1:]
        else:
            return text
    else :
        while not(text[rd] in englishLetters):
            rd = random.randint(0,len(text)-1)
        return text[:rd+1]+text[rd]+text[rd+1:]

#Function to add a common misspell
CommonMisspelling = {"absence" : ["absense", "absentse", "abcense", "absance"], "acceptable" : ["acceptible"], "their" : ["there", "they're"], "there" : ["their", "they're"], "they're" : ["their", "there"], "your" : ["you're"], "you're" : ["your"]}

def addCommonMisspell(text):
    for word in CommonMisspelling.keys():
        if word in text:
            return text.replace(word, random.choice(CommonMisspelling[word]))
    return text


In [83]:
# Create a copy of the dataframe
df_query = df.copy()

# Select only english songs
df_query = df_query[df_query["language"] == "en"]

# Select columns we care about
df_query = df_query[["title","lyrics","views"]]

# Create a weight column
maxViews = max(df_query["views"])

df_query["weight"] = (df_query["views"]/maxViews) ** 0.5 * 0.5 + 0.1

In [86]:
def getNQueries(df,n):
    df_sampl = df.sample(n=n, weights='weight').reset_index(drop=True)
    Queries = []
    for i in range(n):
        text = df_sampl['lyrics'][i]
        rd = random.random()
        if rd<0.6:
            query = getFirstVersesOfChorus(text,random.randint(1,2))
        elif rd<0.9:
            query = getFirstVerses(text,random.randint(1,2))
        else:
            query = getRandomVerses(text,random.randint(1,2))
        if random.randint(0,1) == 0:
            query = addCommonMisspell(query)
        query = typos(query)
        for j in range(random.randint(0,3)):
            query = invertAdjacentLetters(query)
        for j in range(random.randint(0,2)):
            query = removeLetter(query)
        for j in range(random.randint(0,2)):
            query = doubleLetter(query)
        Queries.append([query,df_sampl['title'][i]])
    return Queries

In [87]:
getNQueries(df_query,1000)

[['She dropped the gun and started running down the coriodr',
  'Millie Fell Off the Fire Escape'],
 ["Cauuse yyou played yuo'reself", 'You Played Yourself'],
 ['She ulghts up like a Christmas tree (Ethel: "Don\'t rty thiw at home!")',
  'Im The Baby Gotta Love Me'],
 ["There's no confusion in her conclusion", 'Shes On It'],
 ['(Go) go, go, go, go, go and on the count of three (Go) go, go, og, go, go and on the cout of three',
  'GO'],
 ["This cccan't be life or is t", 'Make Them Hear You'],
 ["You see, I'm just Marsshall Mtahers (Marshall Mathers) I'm juts a reulaar guy",
  'Marshall Mathers'],
 ['I was stabbed by Satah On the day that I as born',
  'I Was Stabbed By Satan'],
 ['Just give me a fuckin beat Just gie me a fuckin bea', 'Gimme ah Beat'],
 ['Mna, man Loook at the ssky', 'Ghetto Supastar That Is What You Are'],
 ["Mic Check - 1,2 - What a nigga - gon' do When the 165 Crew run upon ypu?",
  'Micc Checc'],
 ['‘Cause any man who wouldd jump in front of a mnivan',
  'Any Man Fuc