In [17]:
import spacy
import pandas as pd
import numpy as np
import nltk 
import string
import os
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm
# Load the 'en_core_web_md' spaCy pipeline once at module level; lyrics_preprocess
# calls this shared `nlp` object both to lemmatize the lyrics and to vectorize them.
nlp = spacy.load('en_core_web_md')

In [18]:
# Directory holding the lyrics .txt files. The default is the original,
# machine-specific location; set the LYRICS_DATA_DIR environment variable to
# run the notebook against another directory without editing this cell.
folder_path = os.environ.get(
    "LYRICS_DATA_DIR",
    "/Volumes/Macintosh HD – dane/GitHub/masters/first semester/Unsupervised Learning/article_2/data",
)
# The helper functions below list the current working directory, so switch to it.
os.chdir(folder_path)

In [19]:
# Function which creates a list; each element of the final list is a list of one song's lines
def get_lyrics(list_of_songs=None):
    """Read every ``.txt`` file in the current working directory as one song.

    Each file is parsed with ``pd.read_csv`` into a single ``'lines'`` column,
    one row per line of the file, and converted to a plain Python list.

    Parameters
    ----------
    list_of_songs : list, optional
        Existing list to append the songs to. When omitted, a new list is
        created on every call (a shared ``[]`` default would keep growing
        across calls).

    Returns
    -------
    list
        One inner list of lines per ``.txt`` file, in ``os.listdir()`` order.
    """
    if list_of_songs is None:
        list_of_songs = []

    # Read from the same directory that is being listed, so the listing and
    # the file paths can never point at different folders.
    data_dir = os.getcwd()

    for file in os.listdir():  # iterate through the working directory
        if not file.endswith(".txt"):
            continue

        file_path = os.path.join(data_dir, file)
        # sep='\b' with quoting=3 (csv.QUOTE_NONE) keeps each text line in one
        # field -- presumably because a backspace never occurs in the lyrics.
        lyrics = pd.read_csv(file_path, sep='\b', quoting=3, encoding='utf-8', header=None, names=['lines'])
        list_of_songs.append(lyrics['lines'].tolist())  # song lines as a list

    return list_of_songs

In [20]:
# Function which outputs a list of vectors created from the lyrics
def lyrics_preprocess(songs_list, stopwords, songs_vectors=None, stop_words_check=None):
    """Turn each song into a single document vector.

    For every song the lines are joined, lemmatized with the module-level
    ``nlp`` pipeline, stripped of punctuation tokens, lower-cased and filtered
    against ``stopwords``; the remaining words are re-joined and passed through
    ``nlp`` again, and the resulting ``.vector`` is collected.

    Parameters
    ----------
    songs_list : iterable of list of str
        One list of text lines per song.
    stopwords : container of str
        Lower-case words to drop after lemmatization.
    songs_vectors : list, optional
        Existing list to append vectors to; a new list is created per call
        when omitted.
    stop_words_check : list, optional
        Collector for stop words that survived filtering; a new list is
        created per call when omitted.

    Returns
    -------
    list
        One vector per song, in the order of ``songs_list``.

    Raises
    ------
    AssertionError
        If any stop word is still present after filtering.
    """
    # Fresh lists per call: shared ``[]`` defaults would keep the vectors of
    # earlier calls and return them again.
    if songs_vectors is None:
        songs_vectors = []
    if stop_words_check is None:
        stop_words_check = []

    for song in tqdm(songs_list):

        text = " ".join(song)  # whole song as one string
        doc = nlp(text)

        tokens = [token.lemma_ for token in doc]  # tokenize + lemmatize

        # NOTE: this is a substring test against the punctuation *string*, so it
        # drops single punctuation marks (and the empty string) but keeps runs
        # such as '...', which are handled through the stop-word list instead.
        tokens = [token for token in tokens if token not in string.punctuation]
        tokens = [token.lower() for token in tokens]  # lower-case
        tokens = [token for token in tokens if token not in stopwords]  # remove stop words

        # Sanity check: no stop word may survive the filter above.
        stop_words_check.extend(word for word in tokens if word in stopwords)
        assert len(stop_words_check) == 0, 'Error: not all of the stopwords were deleted from text'

        tokens_concat = " ".join(tokens)  # cleaned words back into one string
        sentence_vec = nlp(tokens_concat)  # vectorize
        songs_vectors.append(sentence_vec.vector)  # save the song vector

    return songs_vectors

In [21]:
# Function which creates column names from the names of the .txt files in the directory
def create_column_names(list_of_columns=None):
    """Return the names of the ``.txt`` files in the working directory, without extension.

    Only ``.txt`` files are considered -- the same filter ``get_lyrics`` uses --
    so the names line up one-to-one with the songs it loads. Other entries
    (e.g. ``.DS_Store``) are ignored instead of being truncated into bogus names.

    Parameters
    ----------
    list_of_columns : list, optional
        Existing list to append the names to; a new list is created per call
        when omitted.

    Returns
    -------
    list of str
        File names with the ``.txt`` suffix removed, in ``os.listdir()`` order.
    """
    if list_of_columns is None:
        list_of_columns = []

    for file in os.listdir():
        if file.endswith(".txt"):
            list_of_columns.append(file[:-len(".txt")])

    return list_of_columns

In [22]:
# Work on a copy: updating STOP_WORDS directly would mutate spaCy's shared
# module-level set for everything else that uses it in this kernel.
stop = set(STOP_WORDS)
# Extra tokens to drop: ellipses, verse numbers and chorus/repeat markers.
stop.update(['...', '....', '1', '2', '3', '4', '5', 'chorus', ':]', '[:'])

In [23]:
# Extracting song vectors.
# Fresh lists are passed explicitly so that re-running this cell always starts
# from empty accumulators instead of appending to results of a previous run.
lyrics = get_lyrics([])
lyrics_vectors = lyrics_preprocess(lyrics, stopwords = stop, songs_vectors = [], stop_words_check = [])
df_columns = create_column_names([])

100%|██████████| 48/48 [04:53<00:00,  6.12s/it]


In [24]:
# One column per lyrics file: every song vector becomes a column, labelled
# with the names derived from the file names.
songs_df = pd.DataFrame(lyrics_vectors).T.set_axis(df_columns, axis=1)

In [25]:
# Preview the first rows (vector dimensions) of the per-file vector table.
songs_df.head()

Unnamed: 0,prince,dickinson,beatles,bob-dylan,bjork,johnny-cash,disney,janisjoplin,kanye,bob-marley,...,r-kelly,drake,britney-spears,bruce-springsteen,nicki-minaj,kanye-west,paul-simon,nickelback,eminem,bruno-mars
0,0.488899,-0.151497,0.866229,0.33205,0.464633,-0.052686,0.628423,1.015842,0.393763,0.855792,...,0.799079,0.404629,1.180542,0.407606,0.457071,0.121589,0.254363,0.492601,0.342824,1.013985
1,1.610341,0.748771,1.241061,1.255925,1.078212,1.197751,1.076042,0.849392,1.128111,0.946536,...,1.237406,1.262244,1.398354,1.33931,1.20513,1.069831,0.998072,1.366088,1.29577,1.322053
2,-1.936098,-1.636477,-2.377264,-1.891242,-2.123817,-2.158009,-1.888964,-2.310341,-2.193906,-2.262774,...,-2.35212,-2.181932,-2.494427,-2.47629,-2.033452,-2.097285,-2.303176,-2.186932,-2.072237,-2.35697
3,-0.821185,-0.192223,-0.816632,-0.467427,-0.406551,-0.025675,-0.709705,-1.149756,-0.394666,-0.309796,...,-0.881137,-0.501777,-1.125783,-0.479958,-0.32915,-0.649858,-0.55331,-0.386956,-0.240733,-0.748238
4,-0.371704,0.994015,-0.149797,0.647207,0.377207,0.430123,0.280071,-0.508992,0.116053,-0.125651,...,-0.039799,-0.022557,-0.342595,0.280042,-0.274202,-0.136441,0.717278,0.202165,-0.023635,-0.443886


In [26]:
# Write the vectors into the parent of the data directory. The location is
# derived from folder_path rather than repeating the absolute path, so the
# two cannot drift apart.
songs_df.to_csv(os.path.join(os.path.dirname(os.path.normpath(folder_path)), 'songs_vectors.csv'))