In [3]:
%pip install nltk swifter swifter[notebook]

Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
Collecting dask[dataframe]>=2.10.0
  Downloading dask-2023.12.1-py3-none-any.whl (1.2 MB)
Collecting partd>=1.2.0
  Downloading partd-1.4.1-py3-none-any.whl (18 kB)
Collecting cloudpickle>=1.5.0
  Downloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Collecting toolz>=0.10.0
  Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
Collecting locket
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py): started
  Building wheel for swifter (setup.py): finished with status 'done'
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16513 sha256=201c9933a2e9b6b98fb446761713acc7cc463baa4056b8ab4940750469b5acb8
  Stored in directory: c:\users\paula\appdata\local\pip\cache\wheels\7b\4a\7e\bcc48cf10e10fcf5b4dae464a66b523756db6b950e02129680
Successfully built swifter
Installing collected packages: toolz, locket, partd, cloudp

You should consider upgrading via the 'C:\Users\paula\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [184]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import random
import string
import swifter
import torch

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Load csv file
df = pd.read_csv('subset-song-lyrics.csv')

In [15]:
print(f"number of songs: {len(df)}")
df.head()

number of songs: 12295


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


# Preprocessing

In [16]:
nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ffactory/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ffactory/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
(len(df[df["language"] == "en"]), len(df))

(12064, 12295)

In [186]:
ps = PorterStemmer()
stopwords_en = set(stopwords.words('english'))

# Create a copy of the dataframe
df_proc = df.copy()

# Select only english songs
df_proc = df_proc[df_proc["language"] == "en"]

# Select columns we care about
df_proc = df_proc[["title", "lyrics", "views"]]

# Convert to lowercase
df_proc["lyrics"] = df_proc["lyrics"].str.lower()
# Remove any non-alphanumeric / whitespace characters
df_proc["lyrics"] = df_proc["lyrics"].str.replace(re.compile(r"[^\w\s]"), "", regex=True)
# Remove newlines
df_proc["lyrics"] = df_proc["lyrics"].str.replace("\n", " ", regex=False)
# Remove text between square brackets
df_proc["lyrics"] = df_proc["lyrics"].str.replace(re.compile(r"\[.{0,100}\]"), "", regex=True)
# Split text into words
df_proc["tokens"] = df_proc["lyrics"].str.rsplit()

In [187]:
# Remove stopwords and stem tokens
def remove_stopwords_and_stem(tokens):
    return " ".join([ps.stem(token) for token in tokens if token not in stopwords_en])


df_proc["text"] = df_proc["tokens"].swifter.apply(remove_stopwords_and_stem)
df_proc.drop(columns=["tokens", "lyrics"], inplace=True)

Pandas Apply:   0%|          | 0/12064 [00:00<?, ?it/s]

In [188]:
# Compare the original lyrics with the tokenized lyrics
(df.iloc[5]["lyrics"][0:100], df_proc.iloc[5]["text"][0:100])

("[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (Young Mula, baby!)\nI say, he's so sweet, make her wanna lick",
 'intro lil wayn haha uhhuh homo young mula babi say he sweet make wanna lick wrapper remix babi vers ')

In [189]:
df_proc.head()

Unnamed: 0,title,views,text
0,Killa Cam,173166,choru opera steve camron killa cam killa cam c...
1,Can I Live,468624,produc irv gotti intro yeah hah yeah rocafella...
2,Forgive Me Father,4743,mayb caus im eatin bastard fiend grub carri pu...
3,Down and Out,144404,produc kany west brian miller intro camron kan...
4,Fly In,78271,intro ask young boy gon second time around gon...


In [190]:
# Save as a pickle file
torch.save(df_proc[["title", "text"]], 'subset-documents.pkl')

# Test if the pickle file is saved correctly
df_reloaded = torch.load('subset-documents.pkl')
df_reloaded.head()

Unnamed: 0,title,text
0,Killa Cam,choru opera steve camron killa cam killa cam c...
1,Can I Live,produc irv gotti intro yeah hah yeah rocafella...
2,Forgive Me Father,mayb caus im eatin bastard fiend grub carri pu...
3,Down and Out,produc kany west brian miller intro camron kan...
4,Fly In,intro ask young boy gon second time around gon...


# Generate Query

In [151]:
# Functions to Select Verses
def getFirstVerses(lyricsString, amount):
    verseList = re.split('\n', lyricsString)
    FinalList = [i for i in verseList if (len(i) > 1 and i[0] != '[')]
    return " ".join(FinalList[:amount])


def getFirstVersesOfChorus(lyricsString, amount):
    List = re.split('\n', lyricsString)
    verseList = [i for i in List if len(i) > 1]
    for i in range(len(verseList)):
        if "[Chorus" in verseList[i] or "[Hook" in verseList[i]:
            return " ".join(verseList[i + 1:i + amount + 1])
    return getFirstVerses(lyricsString, amount)


def getRandomVerses(lyricsString, amount):
    verseList = re.split('\n', lyricsString)
    FinalList = [i for i in verseList if (len(i) > 1 and i[0] != '[')]
    rd = random.randint(0, len(FinalList) - amount)
    return " ".join(FinalList[rd:rd + amount])

In [152]:
# Functions to Degrade message

#Function to create typo by neighbouring letter
NeighbouringKeys = {
    'q': "qwas",
    'w': "qwase",
    'e': "wsedr",
    'r': "edrft",
    't': "rftgy",
    'y': "tgyhu",
    'u': "yhuji",
    'i': "ujiko",
    'o': "ikolp",
    'p': "olp",
    'a': "qwasz",
    's': "wazsxed",
    'd': "sxedcrf",
    'f': "dcrfvtg",
    'g': "fvtgbyh",
    'h': "gbyhnuj",
    'j': "hnujmik",
    'k': "jmikol",
    'l': "kolp",
    'z': "azsx",
    'x': "zsxdc",
    'c': "xdcfv",
    'v': "cfvgb",
    'b': "vgbhn",
    'n': "bhnjm",
    'm': "njmk"
}

englishLetters = NeighbouringKeys.keys()


def typos(text, prob=0.01):
    resultingText = ""

    for letter in text:
        if not letter in englishLetters:
            newLetter = letter
        else:
            if random.random() < prob:
                newLetter = random.choice(NeighbouringKeys[letter])
            else:
                newLetter = letter
        resultingText += newLetter

    return resultingText


#Function to (maybe) invert 2 adjacent letters (do force=True to force it to happen)
def invertAdjacentLetters(text, force=False):
    rd = random.randint(0, len(text) - 2)
    if not force:
        if text[rd] in englishLetters and text[rd + 1] in englishLetters:
            return text[:rd] + text[rd + 1] + text[rd] + text[rd + 2:]
        else:
            return text
    else:
        while not (text[rd] in englishLetters and text[rd + 1] in englishLetters):
            rd = random.randint(0, len(text) - 2)
        return text[:rd] + text[rd + 1] + text[rd] + text[rd + 2:]


#Function to (maybe) remove a letter (do force=True to force it to happen)
def removeLetter(text, force=False):
    rd = random.randint(0, len(text) - 1)
    if not force:
        if text[rd] in englishLetters:
            return text[:rd] + text[rd + 1:]
        else:
            return text
    else:
        while not (text[rd] in englishLetters):
            rd = random.randint(0, len(text) - 1)
        return text[:rd] + text[rd + 1:]


#Function to (maybe) double a letter (do force=True to force it to happen)
def doubleLetter(text, force=False):
    rd = random.randint(0, len(text) - 1)
    if not force:
        if text[rd] in englishLetters:
            return text[:rd + 1] + text[rd] + text[rd + 1:]
        else:
            return text
    else:
        while not (text[rd] in englishLetters):
            rd = random.randint(0, len(text) - 1)
        return text[:rd + 1] + text[rd] + text[rd + 1:]


CommonMisspelling = {"absence": ["absense", "absentse", "abcense", "absance"], "acceptable": ["acceptible"], "their": ["there", "they're"],
                     "there": ["their", "they're"], "they're": ["their", "there"], "your": ["you're"], "you're": ["your"]}

# Add a common misspelling
def addCommonMisspell(text):
    for word in CommonMisspelling.keys():
        if word in text:
            return text.replace(word, random.choice(CommonMisspelling[word]))
    return text


In [171]:
def generate_qrels(df, n):
    # Select only english songs
    df = df[df["language"] == "en"]
    # Select columns we care about
    df = df[["title","lyrics","views"]]
    # Add 'weight' column
    max_views = max(df["views"])
    df["weight"] = (df["views"] / max_views) ** 0.5 * 0.5 + 0.1

    df_sampled = df.sample(n // 2, weights='weight')
    
    def generate_positive_qrel(document):
        text = document['lyrics']
        rd = random.random()
        if rd < 0.6:
            query = getFirstVersesOfChorus(text, random.randint(1, 2))
        elif rd < 0.9:
            query = getFirstVerses(text, random.randint(1, 2))
        else:
            query = getRandomVerses(text, random.randint(1, 2))
        
        if random.randint(0, 3) == 0:
            query = addCommonMisspell(query)
        query = typos(query)
        
        for j in range(len(query)):
            rand = random.randint(0, 50)
            if rand == 0:
                query = invertAdjacentLetters(query)
            elif rand == 1:
                query = removeLetter(query)
            if rand == 2:
                query = doubleLetter(query)
        
        doc_id = document.name
        return pd.Series([query, doc_id, 1], index=['text', 'doc_id', 'relevance'])

    def generate_negative_qrel(positive_qrel):
        negative_doc_id = positive_qrel['doc_id']
        while negative_doc_id == positive_qrel['doc_id']:
            negative_doc_id = df.iloc[random.randint(0, len(df) - 1)].name
    
        return pd.Series([positive_qrel['text'], negative_doc_id, 0])

    positive_qrels = df_sampled.apply(generate_positive_qrel, axis=1, result_type='expand')
    negative_qrels = positive_qrels.apply(generate_negative_qrel, axis=1, result_type='broadcast')
    
    return pd.concat([positive_qrels, negative_qrels]).reset_index(drop=True)

In [178]:
# Save qrels as pickle file
qrels = generate_qrels(df, 10000)
torch.save(qrels, 'subset-qrels.pkl')

In [182]:
qrels

Unnamed: 0,text,doc_id,relevance
0,Intro: Syn,7865,1
1,"Brand Nubian baby, heere to lip it again And y...",11134,1
2,From a nickel and dime ass nigga,8475,1
3,"Verse 1 (Killa Tay) We pump lugs, and punk thu...",11702,1
4,(I) Fuck with your osul likee ethher,230,1
...,...,...,...
9995,"All in together now (no, now, now) What are yo...",9078,0
9996,"Bling bling, every ttime I come around your ci...",1016,0
9997,Ha Huh mann Im trippiin out right now,2695,0
9998,All the ganggstas they gon' ride to tis,1822,0


# TF-IDF
In order to be able to work on the data easier, we are going to make a string made out of our tokens, to then do the TF-IDF

In [185]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_proc['text'])

feature = tfidf_vectorizer.get_feature_names_out()

KeyError: 'text'

In [31]:
doc_vector = tfidf_matrix[0].toarray()
#df with words and their tf-idf values
df_tfidf = pd.DataFrame(list(zip(feature, doc_vector.flatten())), columns=['Word', 'TF-IDF'])

df_tfidf = df_tfidf.sort_values(by='TF-IDF', ascending=False)
print(df_tfidf)

            Word    TF-IDF
10769        cam  0.800548
34638      killa  0.588214
56995       sing  0.042899
13152       clap  0.041008
65962        uhh  0.018107
...          ...       ...
24039   foodmart  0.000000
24040  foodstamp  0.000000
24041      fooey  0.000000
24042       foof  0.000000
72003        𝑤𝑎𝑠  0.000000

[72004 rows x 2 columns]
