In [None]:
!pip install gensim

In [None]:
from gensim.models import KeyedVectors
import os

# Location of the pretrained word2vec binary. Set the WORD2VEC_PATH environment
# variable to load the file from somewhere else; when it is unset, the original
# local path is used so existing setups keep working unchanged.
path = os.environ.get(
    'WORD2VEC_PATH',
    os.path.join('C:', os.sep, 'Users', 'Gavin', 'Downloads', 'CS4641', 'Project', 'GoogleNews-vectors-negative300.bin'),
)

# Load vectors directly from the file (binary word2vec format)
model = KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
import re
import string
from collections import defaultdict

import numpy as np

# Stop words to exclude. Entries are lowercase with apostrophes already removed,
# which is the form tokens have after cleaning in create_lyric_matrix.
# A frozenset gives constant-time membership tests.
_STOP_WORDS = frozenset([
    'i', 'id', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'yall', 'yalls',
    'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'hes',
    'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'em', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
    'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
    'being', 'have', 'has', 'had', 'having', 'got', 'gotta', 'do', 'does', 'did', 'doing', 'a',
    'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'til', 'while', 'of', 'at',
    'by', 'for', 'with', 'about', 'bout', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'theres', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can',
    'will', 'just', 'dont', 'should', 'shouldve', 'now', 'aint', 'arent', 'couldnt', 'didnt',
    'doesnt', 'hadnt', 'hasnt', 'havent', 'isnt', 'mightnt', 'mustnt', 'neednt', 'shant',
    'shouldnt',
])

# A dropped-g ending such as "runnin’" or "runnin'": "in" inside a word, followed
# by an apostrophe that does NOT continue into more letters. The lookahead keeps
# contractions such as "ain’t" intact (a plain "in’" -> "ing" replacement would
# turn it into "aingt").
_DROPPED_G = re.compile(r"(?<=\w)in[’'](?!\w)")

# Deletion table for ASCII punctuation plus typographic quotes. The curly quotes
# are not part of string.punctuation, so without them "don’t" would never be
# reduced to the stop word "dont".
_STRIP_CHARS = str.maketrans('', '', string.punctuation + '’‘“”')


def create_lyric_matrix(directory=None, embeddings=None, vector_size=300):
    """Build the word-vector dataset for every ``.txt`` lyric file in a directory.

    For each file the first line is skipped, as is any line containing ``[`` or
    ``]`` (presumably section headers such as "[Chorus]" -- confirm against the
    data). A line containing "Embed" has that marker and the digits trailing the
    line removed. The remaining text is lowercased, stripped of punctuation and
    split on whitespace; stop words and one-character tokens are dropped.

    Args:
        directory: Folder containing the lyric files. Defaults to the original
            hard-coded project path when omitted.
        embeddings: Mapping from word to vector that raises ``KeyError`` for
            unknown words. Defaults to the global ``model`` when omitted.
        vector_size: Number of columns of the returned matrix when no word
            could be embedded (default 300).

    Returns:
        A tuple ``(lyric_matrix, lyric_embeddings, word_frequencies)``:

        * ``lyric_matrix`` -- float64 NumPy array with one row per distinct
          embedded word, in order of first appearance (files are visited in
          sorted filename order so the row order is reproducible).
        * ``lyric_embeddings`` -- dict mapping each embedded word to its vector;
          its insertion order matches the rows of ``lyric_matrix``.
        * ``word_frequencies`` -- dict mapping each file's full path to a
          ``defaultdict(int)`` of token counts. Counts include tokens that have
          no vector in ``embeddings``.
    """
    if directory is None:
        directory = os.path.join('C:', os.sep, 'Users', 'Gavin', 'Downloads', 'CS4641', 'Project', '4641project', 'lyrics')
    if embeddings is None:
        # Fall back to the word2vec model loaded at module level.
        embeddings = model
    print(directory)

    # Word -> vector for every distinct word that has an embedding
    lyric_embeddings = {}

    # Rows of the final matrix; stacked once at the end instead of re-copying
    # the whole matrix for every new word.
    vectors = []

    # Per-file word counts, keyed by full file path
    word_frequencies = {}

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):  # Consider only .txt files
            continue
        file_path = os.path.join(directory, filename)
        wordfreq = defaultdict(int)
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        for line in lines[1:]:
            # Cleaning up the data
            if '[' in line or ']' in line:
                continue
            if 'Embed' in line:
                # Drop the marker, then only the digits at the END of the line.
                # Trailing whitespace is removed first so the newline does not
                # shield the digits; digits elsewhere in the line are kept.
                line = line.replace('Embed', '').rstrip().rstrip(string.digits)
            line = _DROPPED_G.sub('ing', line)
            words = line.translate(_STRIP_CHARS).lower().split()

            for word in words:
                if len(word) <= 1 or word in _STOP_WORDS:
                    continue
                wordfreq[word] += 1
                if word in lyric_embeddings:
                    continue
                try:
                    vector = embeddings[word]
                except KeyError:
                    # Word is not in the embedding vocabulary: counted, not clustered.
                    continue
                lyric_embeddings[word] = vector
                vectors.append(vector)
        word_frequencies[file_path] = wordfreq

    if vectors:
        lyric_matrix = np.vstack(vectors).astype(np.float64, copy=False)
    else:
        lyric_matrix = np.empty((0, vector_size))

    # Returns the lyric dataset for clustering (NumPy array), a dictionary that maps
    # words to their vector, and a dictionary to obtain word frequency per file
    return lyric_matrix, lyric_embeddings, word_frequencies

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.cluster import KMeans

#Obtain word vector data
km_matrix, dictionary, frequencies = create_lyric_matrix()

#Perform K-means
# K-means starts from randomly chosen centroids; a fixed random_state makes the
# labels and centroids identical on every run of the notebook.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(km_matrix)

#Checking to see if it works
labels = kmeans.labels_  # cluster index assigned to each row of km_matrix
centroids = kmeans.cluster_centers_  # one centroid vector per cluster

print("Labels:", labels)
print("Centroids:", centroids)

In [None]:
from sklearn.mixture import GaussianMixture

#Obtain word vector data
gmm_matrix, dictionary, frequencies = create_lyric_matrix()

#Perform GMM estimate
# The mixture parameters are initialised randomly; a fixed random_state makes
# the fitted means, covariances and labels identical on every run.
gmm = GaussianMixture(n_components=2, random_state=0)
gmm.fit(gmm_matrix)

#Checking to see if it works
labels = gmm.predict(gmm_matrix)  # most likely component for each row of gmm_matrix
means = gmm.means_
covariances = gmm.covariances_

print("Labels:", labels)
print("Means:", means)
print("Covariances:", covariances)

In [None]:
from sklearn.cluster import DBSCAN

# Build the word-vector dataset; the word->vector and frequency dictionaries
# are kept alongside it.
db_matrix, dictionary, frequencies = create_lyric_matrix()

# Configure DBSCAN (neighbourhood radius eps=3, at least 2 samples per core
# point) and fit it in one step; fit() returns the estimator itself.
dbscan = DBSCAN(eps=3, min_samples=2).fit(db_matrix)

# Sanity check: cluster label assigned to each row of db_matrix
labels = dbscan.labels_
print("Labels:", labels)