In [1]:
!pip install gensim



In [2]:
from gensim.models import KeyedVectors
import os

path = os.path.join('C:', os.sep, 'Users', 'Gavin', 'Downloads', 'CS4641', 'Project', 'GoogleNews-vectors-negative300.bin')

# Load vectors directly from the file
model = KeyedVectors.load_word2vec_format(path, binary=True)

In [38]:
import string
import numpy as np
import hashlib
from collections import defaultdict

def create_lyric_matrix():
    # Stop words to exclude
    stop_words = ['i', 'id', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'yall', 'yalls', "youre", "youve", "youll", "youd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'hes', 'him', 'his', 'himself', 'she', "shes", 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'em', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "thatll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'got', 'gotta', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'because', 'as', 'until', 'til', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'bout', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'theres', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', "dont", 'should', "shouldve", 'now', 'aint', 'arent', 'couldnt', "didnt", "doesnt", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neednt", "shant", 'shouldnt']
    nums = "0123456789"

    directory = os.path.join('C:', os.sep, 'Users', 'Gavin', 'Downloads', 'CS4641', 'Project', '4641project', 'lyrics')

    # Initialize a dictionary to store embed-vector to word mappings
    lyric_embeddings = {}

    # Empty array to add word vectors to
    lyric_matrix = np.array([]).reshape(0,300)

    # Dictionary for word frequency of each song (given by filename):
    word_frequencies = {}

    # Iterate over the files in the directory
    iter = 0
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):  # Consider only .txt files
            file_path = os.path.join(directory, filename)
            wordfreq = defaultdict(int)
            # Read the text file
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
            for line in lines[1:]:
                # Cleaning up the data
                if '[' in line or ']' in line:
                    continue
                if 'Embed' in line:
                    line = line.replace('Embed', '')
                    try:
                        while line[-1] in nums:
                            line = line.replace(line[-1], '')
                    except IndexError:
                        continue
                line = line.replace("in'", "ing")
                translator = str.maketrans('', '', string.punctuation)
                no_punctuation = line.translate(translator)
                words = no_punctuation.strip().lower().split()
                # Obtain vector and add to dictionaries: we will use this to obtain words from our clusters and frequencies
                for word in words:
                    if word not in stop_words and len(word) > 1:
                        wordfreq[word] += 1
                        if wordfreq[word] == 1:
                            try:
                                vector = model[word]
                                lyric_matrix = np.vstack([lyric_matrix, vector])
                                lyric_embeddings[iter] = word
                                iter += 1
                            except KeyError:
                                continue
            word_frequencies[file_path] = wordfreq
    #Function returns the lyric dataset for clustering (NumPy array), a dictionary the maps words to their vector, and a dictionary to obtain word frequency
    return lyric_matrix, lyric_embeddings, word_frequencies

In [4]:
!pip install scikit-learn



In [5]:
!pip install nltk



In [20]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
                                              0.0/636.8 kB ? eta -:--:--
     ------------------                     307.2/636.8 kB 9.6 MB/s eta 0:00:01
     -------------------------------------- 636.8/636.8 kB 8.0 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [42]:
from sklearn.cluster import KMeans
from textblob import TextBlob
from sklearn.cluster import KMeans
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn
nltk.download('sentiwordnet')
nltk.download('wordnet')

#Obtain word vector data
km_matrix, dictionary, frequencies = create_lyric_matrix()

#Ground truth
ground_truth = []

#Lines to exclude from matrix, as we couldn't find a sentiment value for the individual word
exclude_lines = []

sia = SentimentIntensityAnalyzer()

c0_words = []
c1_words = []

iter = 0
for row in km_matrix:
    word = dictionary[iter]
    blob = TextBlob(word)
    blob_sent = blob.sentiment.polarity
    synset = list(swn.senti_synsets(word))
    if synset:
        swn_sent = synset[0].pos_score() - synset[0].neg_score()
    vader_sent = sia.polarity_scores(word)['compound']
    sents = np.array([blob_sent, swn_sent, vader_sent])
    if sents.sum() == 0:
        exclude_lines.append(iter)
        iter += 1
        continue
    avg_sent = sents.sum() / np.absolute(np.sign(sents)).sum()
    if avg_sent >= 0:
        ground_truth.append(0)
        c0_words.append(word)
    else:
        ground_truth.append(1)
        c1_words.append(word)
    iter += 1

#Perform K-means
kmeans = KMeans(n_clusters=2)
kmeans.fit(km_matrix)

#Checking to see if it works
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

pred_labels = np.delete(labels, exclude_lines)
ground_truth = np.array(ground_truth)

#Precision, recall, F-measure
conf_matrix = np.array([[0, 0], [0, 0]])
for i in range(ground_truth.shape[0]):
    conf_matrix[pred_labels[i]][ground_truth[i]] += 1

print(conf_matrix)

print(c0_words)
print(c1_words)

#DB index

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gavin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\Gavin\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gavin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  super()._check_params_vs_input(X, default_n_init=10)


[[1041  830]
 [ 147  146]]
['buddies', 'yeah', 'yea', 'high', 'know', 'say', 'thank', 'god', 'true', 'due', 'like', 'old', 'well', 'live', 'yeah', 'find', 'high', 'webs', 'hand', 'mind', 'senses', 'save', 'friends', 'wish', 'blend', 'different', 'like', 'many', 'fast', 'catching', 'feeling', 'know', 'right', 'feel', 'ive', 'dreaming', 'like', 'feeling', 'much', 'thankful', 'favorite', 'kind', 'chance', 'kiss', 'mama', 'read', 'concrete', 'focused', 'crib', 'laughing', 'joke', 'fly', 'rich', 'friends', 'love', 'know', 'find', 'great', 'good', 'really', 'wish', 'could', 'win', 'watcher', 'love', 'hey', 'booboo', 'much', 'complexity', 'momma', 'really', 'say', 'yes', 'geeked', 'well', 'please', 'play', 'truthfully', 'slap', 'hope', 'fly', 'better', 'like', 'practice', 'know', 'bust', 'pray', 'make', 'whole', 'right', 'dash', 'boy', 'higher', 'crack', 'familiar', 'kind', 'big', 'yeah', 'senses', 'yeah', 'existent', 'wide', 'spark', 'new', 'ferrari', 'web', 'hero', 'opp', 'arachnophobia', '

In [None]:
from sklearn.mixture import GaussianMixture

#Obtain word vector data
gmm_matrix, dictionary, frequencies = create_lyric_matrix()

#Perform GMM estimate
gmm = GaussianMixture(n_components=2)
gmm.fit(gmm_matrix)

#Checking to see if it works
labels = gmm.predict(gmm_matrix)
means = gmm.means_
covariances = gmm.covariances_

print("Labels:", labels)
print("Means:", means)
print("Covariances:", covariances)

In [None]:
from sklearn.cluster import DBSCAN

#Obtain data from create_lyric_matrix
db_matrix, dictionary, frequencies = create_lyric_matrix()

#Perform DBScan
dbscan = DBSCAN(eps=3, min_samples=2)
dbscan.fit(db_matrix)

#Checking to see if it works
labels = dbscan.labels_

print("Labels:", labels)