In [3]:
%pip install nltk swifter swifter[notebook]




You should consider upgrading via the 'C:\Users\paula\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import random
import string
import swifter
import torch

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Load csv file
df = pd.read_csv('subset-song-lyrics.csv')

In [6]:
print(f"number of songs: {len(df)}")
df.head()

number of songs: 12295


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


# Preprocessing

In [7]:
nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paula\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paula\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
(len(df[df["language"] == "en"]), len(df))

(12064, 12295)

In [9]:
ps = PorterStemmer()
stopwords_en = set(stopwords.words('english'))

# Create a copy of the dataframe
df_proc = df.copy()

# Select only english songs
df_proc = df_proc[df_proc["language"] == "en"]

# Select columns we care about
df_proc = df_proc[["title", "lyrics", "views"]]

# Convert to lowercase
df_proc["lyrics"] = df_proc["lyrics"].str.lower()
# Remove any non-alphanumeric / whitespace characters
df_proc["lyrics"] = df_proc["lyrics"].str.replace(re.compile(r"[^\w\s]"), "", regex=True)
# Remove newlines
df_proc["lyrics"] = df_proc["lyrics"].str.replace("\n", " ", regex=False)
# Remove text between square brackets
df_proc["lyrics"] = df_proc["lyrics"].str.replace(re.compile(r"\[.{0,100}\]"), "", regex=True)
# Split text into words
df_proc["tokens"] = df_proc["lyrics"].str.rsplit()

In [10]:
# Remove stopwords and stem tokens
def remove_stopwords_and_stem(tokens):
    return " ".join([ps.stem(token) for token in tokens if token not in stopwords_en])


df_proc["text"] = df_proc["tokens"].swifter.apply(remove_stopwords_and_stem)
df_proc.drop(columns=["tokens", "lyrics"], inplace=True)

Pandas Apply:   0%|          | 0/12064 [00:00<?, ?it/s]

In [11]:
# Compare the original lyrics with the tokenized lyrics
(df.iloc[5]["lyrics"][0:100], df_proc.iloc[5]["text"][0:100])

("[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (Young Mula, baby!)\nI say, he's so sweet, make her wanna lick",
 'intro lil wayn haha uhhuh homo young mula babi say he sweet make wanna lick wrapper remix babi vers ')

In [12]:
df_proc.head()

Unnamed: 0,title,views,text
0,Killa Cam,173166,choru opera steve camron killa cam killa cam c...
1,Can I Live,468624,produc irv gotti intro yeah hah yeah rocafella...
2,Forgive Me Father,4743,mayb caus im eatin bastard fiend grub carri pu...
3,Down and Out,144404,produc kany west brian miller intro camron kan...
4,Fly In,78271,intro ask young boy gon second time around gon...


In [13]:
# Save as a pickle file
torch.save(df_proc[["title", "text"]], 'subset-documents.pkl')

# Test if the pickle file is saved correctly
df_reloaded = torch.load('subset-documents.pkl')
df_reloaded.head()

Unnamed: 0,title,text
0,Killa Cam,choru opera steve camron killa cam killa cam c...
1,Can I Live,produc irv gotti intro yeah hah yeah rocafella...
2,Forgive Me Father,mayb caus im eatin bastard fiend grub carri pu...
3,Down and Out,produc kany west brian miller intro camron kan...
4,Fly In,intro ask young boy gon second time around gon...


# Generate Query

In [14]:
# Functions to Select Verses
def getFirstVerses(lyricsString, amount):
    verseList = re.split('\n', lyricsString)
    FinalList = [i for i in verseList if (len(i) > 1 and i[0] != '[')]
    return " ".join(FinalList[:amount])


def getFirstVersesOfChorus(lyricsString, amount):
    List = re.split('\n', lyricsString)
    verseList = [i for i in List if len(i) > 1]
    for i in range(len(verseList)):
        if "[Chorus" in verseList[i] or "[Hook" in verseList[i]:
            return " ".join(verseList[i + 1:i + amount + 1])
    return getFirstVerses(lyricsString, amount)


def getRandomVerses(lyricsString, amount):
    verseList = re.split('\n', lyricsString)
    FinalList = [i for i in verseList if (len(i) > 1 and i[0] != '[')]
    rd = random.randint(0, len(FinalList) - amount)
    return " ".join(FinalList[rd:rd + amount])

In [15]:
# Functions to Degrade message

#Function to create typo by neighbouring letter
NeighbouringKeys = {
    'q': "qwas",
    'w': "qwase",
    'e': "wsedr",
    'r': "edrft",
    't': "rftgy",
    'y': "tgyhu",
    'u': "yhuji",
    'i': "ujiko",
    'o': "ikolp",
    'p': "olp",
    'a': "qwasz",
    's': "wazsxed",
    'd': "sxedcrf",
    'f': "dcrfvtg",
    'g': "fvtgbyh",
    'h': "gbyhnuj",
    'j': "hnujmik",
    'k': "jmikol",
    'l': "kolp",
    'z': "azsx",
    'x': "zsxdc",
    'c': "xdcfv",
    'v': "cfvgb",
    'b': "vgbhn",
    'n': "bhnjm",
    'm': "njmk"
}

englishLetters = NeighbouringKeys.keys()


def typos(text, prob=0.01):
    resultingText = ""

    for letter in text:
        if not letter in englishLetters:
            newLetter = letter
        else:
            if random.random() < prob:
                newLetter = random.choice(NeighbouringKeys[letter])
            else:
                newLetter = letter
        resultingText += newLetter

    return resultingText


#Function to (maybe) invert 2 adjacent letters (do force=True to force it to happen)
def invertAdjacentLetters(text, force=False):
    rd = random.randint(0, len(text) - 2)
    if not force:
        if text[rd] in englishLetters and text[rd + 1] in englishLetters:
            return text[:rd] + text[rd + 1] + text[rd] + text[rd + 2:]
        else:
            return text
    else:
        while not (text[rd] in englishLetters and text[rd + 1] in englishLetters):
            rd = random.randint(0, len(text) - 2)
        return text[:rd] + text[rd + 1] + text[rd] + text[rd + 2:]


#Function to (maybe) remove a letter (do force=True to force it to happen)
def removeLetter(text, force=False):
    rd = random.randint(0, len(text) - 1)
    if not force:
        if text[rd] in englishLetters:
            return text[:rd] + text[rd + 1:]
        else:
            return text
    else:
        while not (text[rd] in englishLetters):
            rd = random.randint(0, len(text) - 1)
        return text[:rd] + text[rd + 1:]


#Function to (maybe) double a letter (do force=True to force it to happen)
def doubleLetter(text, force=False):
    rd = random.randint(0, len(text) - 1)
    if not force:
        if text[rd] in englishLetters:
            return text[:rd + 1] + text[rd] + text[rd + 1:]
        else:
            return text
    else:
        while not (text[rd] in englishLetters):
            rd = random.randint(0, len(text) - 1)
        return text[:rd + 1] + text[rd] + text[rd + 1:]


CommonMisspelling = {"absence": ["absense", "absentse", "abcense", "absance"], "acceptable": ["acceptible"], "their": ["there", "they're"],
                     "there": ["their", "they're"], "they're": ["their", "there"], "your": ["you're"], "you're": ["your"]}

# Add a common misspelling
def addCommonMisspell(text):
    for word in CommonMisspelling.keys():
        if word in text:
            return text.replace(word, random.choice(CommonMisspelling[word]))
    return text


In [16]:
def generate_qrels(df, n):
    # Select only english songs
    df = df[df["language"] == "en"]
    # Select columns we care about
    df = df[["title","lyrics","views"]]
    # Add 'weight' column
    max_views = max(df["views"])
    df["weight"] = (df["views"] / max_views) ** 0.5 * 0.5 + 0.1

    df_sampled = df.sample(n // 2, weights='weight')
    
    def generate_positive_qrel(document):
        text = document['lyrics']
        rd = random.random()
        if rd < 0.6:
            query = getFirstVersesOfChorus(text, random.randint(1, 2))
        elif rd < 0.9:
            query = getFirstVerses(text, random.randint(1, 2))
        else:
            query = getRandomVerses(text, random.randint(1, 2))
        
        if random.randint(0, 3) == 0:
            query = addCommonMisspell(query)
        query = typos(query)
        
        for j in range(len(query)):
            rand = random.randint(0, 50)
            if rand == 0:
                query = invertAdjacentLetters(query)
            elif rand == 1:
                query = removeLetter(query)
            if rand == 2:
                query = doubleLetter(query)
        
        doc_id = document.name
        return pd.Series([query, doc_id, 1], index=['text', 'doc_id', 'relevance'])

    def generate_negative_qrel(positive_qrel):
        negative_doc_id = positive_qrel['doc_id']
        while negative_doc_id == positive_qrel['doc_id']:
            negative_doc_id = df.iloc[random.randint(0, len(df) - 1)].name
    
        return pd.Series([positive_qrel['text'], negative_doc_id, 0])

    positive_qrels = df_sampled.apply(generate_positive_qrel, axis=1, result_type='expand')
    negative_qrels = positive_qrels.apply(generate_negative_qrel, axis=1, result_type='broadcast')
    
    return pd.concat([positive_qrels, negative_qrels]).reset_index(drop=True)

In [17]:
# Save qrels as pickle file
qrels = generate_qrels(df, 10000)
torch.save(qrels, 'subset-qrels.pkl')

In [18]:
qrels

Unnamed: 0,text,doc_id,relevance
0,"Smoke and maintain, and maitian And maintain, ...",9861,1
1,"I'm single again, back on the prowwl I thought...",2699,1
2,"Freeze! Motheerfuckker, I'm ill, fuck a fuckki...",4387,1
3,I know you like my style,6665,1
4,"Hey, I'm looking for (Good love, yeah)",537,1
...,...,...,...
9995,"One-one-one-one, mic check one Galloin t a hoo...",3095,0
9996,So.. what is all this talk aobut the 'mrk of t...,10626,0
9997,"Y'all cain't stop it Y'all acin't stop, gangst...",91,0
9998,I know I disppeared Anr poppd up inn Pérès maa...,9883,0


# TF-IDF
In order to be able to work on the data easier, we are going to make a string made out of our tokens, to then do the TF-IDF

In [19]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_proc['text'])

feature = tfidf_vectorizer.get_feature_names_out()

In [20]:
doc_vector = tfidf_matrix[0].toarray()
#df with words and their tf-idf values
df_tfidf = pd.DataFrame(list(zip(feature, doc_vector.flatten())), columns=['Word', 'TF-IDF'])

df_tfidf = df_tfidf.sort_values(by='TF-IDF', ascending=False)
print(df_tfidf)

        Word    TF-IDF
11036    cam  0.802686
35284  killa  0.585084
58006   sing  0.042337
13452   clap  0.041190
67073    uhh  0.018195
...      ...       ...
24439   fong  0.000000
24440   foni  0.000000
24441  fonic  0.000000
24442   fonk  0.000000
73198    𝑤𝑎𝑠  0.000000

[73199 rows x 2 columns]


# Word2Vec


In [28]:
from gensim.models import Word2Vec

print(df_proc.columns)

Index(['title', 'views', 'text', 'doc_vectors'], dtype='object')


In [29]:
# Extract the tokenized lyrics as a list of lists
corpus = df_proc['text'].apply(lambda x: x.split()).tolist()

# Train Word2Vec model
w2v_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)

In [33]:
def average_word_vectors(tokens, model, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for token in tokens:
        if token in model.wv:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[token])
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

df_proc['doc_vectors'] = [average_word_vectors(tokens, w2v_model, 100) for tokens in corpus]

In [36]:
from sklearn.metrics.pairwise import cosine_similarity
#find similar songs to the first one.
similarity_matrix = cosine_similarity(df_proc['doc_vectors'].tolist(), [df_proc['doc_vectors'].iloc[0]])
similar_songs_indices = np.argsort(similarity_matrix[:, 0])[::-1]

top_similar_songs = df_proc.iloc[similar_songs_indices[1:10]][['title', 'text']]
print(top_similar_songs)
#i am just trying things out rn

                     title                                               text
12055    Do Ya Thing Remix  intro remix remix remix dipset killa cam duked...
12067        Halftime Show  intro killa dipset check stat man seventyeight...
6721              Me Killa  domm doom doom doom doom krazi killa bone kill...
10098  Killa Cal Lifestyle  intro doc doom uh black knight nigga killa cal...
6704                   BNK  hook whisper killa killa killa killa killa kil...
10364       Juggalo Anthem  intro violent j killa kick anthem like juggalo...
12059   Killa Season Intro  intro dukedagod get killa season let start shi...
9296              D.P.G./K  intro bg knocc haha yeah orgin babi gangsta mo...
4529      Still the Reason  vers 1 camron unh yo ten bird ill serv em brok...


In [37]:
def generate_training_data(corpus,window_size,vocab_size,word_to_index,length_of_corpus,sample=None):

    training_data =  []
    training_sample_words =  []
    for i,word in enumerate(corpus):

        index_target_word = i
        target_word = word
        context_words = []

        #when target word is the first word
        if i == 0:  

            # trgt_word_index:(0), ctxt_word_index:(1,2)
            context_words = [corpus[x] for x in range(i + 1 , window_size + 1)] 


        #when target word is the last word
        elif i == len(corpus)-1:

            # trgt_word_index:(9), ctxt_word_index:(8,7), length_of_corpus = 10
            context_words = [corpus[x] for x in range(length_of_corpus - 2 ,length_of_corpus -2 - window_size  , -1 )]

        #When target word is the middle word
        else:

            #Before the middle target word
            before_target_word_index = index_target_word - 1
            for x in range(before_target_word_index, before_target_word_index - window_size , -1):
                if x >=0:
                    context_words.extend([corpus[x]])

            #After the middle target word
            after_target_word_index = index_target_word + 1
            for x in range(after_target_word_index, after_target_word_index + window_size):
                if x < len(corpus):
                    context_words.extend([corpus[x]])


        trgt_word_vector,ctxt_word_vector = get_one_hot_vectors(target_word,context_words,vocab_size,word_to_index)
        training_data.append([trgt_word_vector,ctxt_word_vector])   
        
        if sample is not None:
            training_sample_words.append([target_word,context_words])   
        
    return training_data,training_sample_words

### i am trying to adapt this code: https://github.com/rahul1728jha/Word2Vec_Implementation/blob/master/Word_2_Vec.ipynb

In [39]:
# Define other parameters
window_size = 5  # Set your desired window size
vocab_size = len(your_vocabulary)  # Replace your_vocabulary with your actual vocabulary
word_to_index = your_word_to_index_dict  # Replace with your actual word_to_index dictionary
length_of_corpus = len(corpus)

# Call the function to generate training data
training_data, training_sample_words = generate_word_similarity_training_data(corpus, window_size, vocab_size, word_to_index, length_of_corpus, sample=None)

word2vec_model = Word2Vec(sentences=your_training_data, vector_size=your_vector_size, window=your_window_size, min_count=1, workers=4)

NameError: name 'your_vocabulary' is not defined

In [40]:
def cosine_similarity_word(word1, word2, model):
    vector1 = model.wv[word1]
    vector2 = model.wv[word2]
    similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    return similarity

In [None]:
similarity_score = cosine_similarity_word('word1', 'word2', word2vec_model)
print(f"Similarity between 'word1' and 'word2': {similarity_score}")