In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import gensim

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from gensim.models import Doc2Vec

In [None]:
# Single VADER analyzer instance, created once here and reused by get_vader_score.
sid = SentimentIntensityAnalyzer()

def get_vader_score(song):
    '''
    Return the VADER 'compound' polarity score for the given lyrics text,
    computed with the module-level `sid` analyzer.
    '''
    return sid.polarity_scores(song)['compound']

def get_tb_score(song):
    '''
    Return the TextBlob sentiment polarity for the given lyrics text.
    '''
    return TextBlob(song).sentiment.polarity

In [None]:
# Load the lyrics table and discard every row whose 'Lyrics' value is missing.
# NOTE(review): the CSV path is an empty string here -- fill in the real file before running.
df = pd.read_csv('', encoding='UTF-8').dropna(axis=0, how='any', subset=['Lyrics'])

In [None]:
# Additional tokens to treat as stop words on top of NLTK's English list.
add_stopwords = [
    'la', 'ay', 'oh', 'ohh', 'ooh', 'yeah', 'hey', 'im', 'like', 'get', 'go',
    'na', 'got', 'duh', 'ya', 'ah', 'let', 'cuz', 'wit', 'da', 'gonna',
    'cause', 'imma', 'dem', 'dey', 'dats', 'wat', 'dont', 'cant', 'wanna',
    'see', 'make', 'want', 'youre', 'keep', 'lets', 'dat', 'ba', 'bah',
    'aint', 'aah', 'say', 'aye', 'come',
]

# Final stop-word list: NLTK's English stop words followed by the extras above.
stopwords = nltk.corpus.stopwords.words('english') + add_stopwords

In [None]:
def text_processing(text, stops=False):
    '''
    Remove all unusual text patterns that are often found in song lyrics.

    Steps, in order: lowercase; drop bracketed tags such as [Chorus]; drop
    repeat markers such as (x2); drop words containing a character repeated
    three or more times; turn every run of whitespace (including new lines)
    into a single space; rewrite a trailing "n'" as "ng"; optionally drop
    stop words; replace non-alphabetical characters with spaces; collapse
    repeated spaces and trim both ends.

    Parameters
    ----------
    text : str
        Raw lyrics.
    stops : bool, default False
        When True, words found in the module-level `stopwords` list are removed.

    Returns
    -------
    str
        The cleaned lyrics, with no leading or trailing spaces.
    '''
    # lowercase and remove anything between brackets ([Chorus] for example)
    text = text.lower()
    text = re.sub(r'\[.+?\]', ' ', text)

    # Remove occurrences of (x2) or (x3) etc
    text = re.sub(r'\(?x\d+\)?', ' ', text)

    # Remove words with stretched out sounds ie. 'aaaaahhhh'
    text = re.sub(r'\b\w*?(\w)\1{2,}\w*', ' ', text)

    # Remove new lines (and any other whitespace runs). This has to happen
    # before the "n'" rule and the stop-word split below, otherwise words at
    # the end of a line are missed by both.
    text = re.sub(r'\s+', ' ', text)

    # Change rockin' to rocking, ie. (also when it is the very last word)
    text = re.sub(r"n'(?= |$)", 'ng', text)

    # Remove stopwords (default false)
    if stops:
        text = ' '.join(word for word in text.split(' ') if word not in stopwords)

    # Remove numbers & other non-alphabetical characters
    text = re.sub(r'[^a-zA-Z ]', ' ', text)

    # Remove extra spaces (leading/trailing, doubles). str.strip() returns a
    # new string, so its result must be the value that is returned.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

In [None]:
# Renumber the rows 0..n-1; rows were dropped earlier, so the index may have gaps.
df.reset_index(drop=True, inplace=True)

# Identifier built from the song title and the artist name.
df['ID'] = df['Song'] + '_' + df['Artist']

# Clean every lyric with the default settings (stop words are kept).
df['Lyrics'] = df['Lyrics'].apply(text_processing)

# Placeholder sentiment columns, filled in later by sentiment_scores.
for score_column in ('Vader', 'TextBlob'):
    df[score_column] = np.nan

In [None]:
def cluster_plot(sample, cluster_size, random_state=None):
    '''
    Embed lyrics with Doc2Vec, cluster the embeddings with K-Means and save a
    2-D PCA scatter plot of the clusters to 'cluster.png'.

    Parameters
    ----------
    sample : pandas.Series
        One cleaned lyrics string per song.
    cluster_size : int
        Number of K-Means clusters.
    random_state : int or None, default None
        Seed forwarded to KMeans so the clustering can be reproduced.

    Returns
    -------
    list of int
        Cluster label of every song, in the same order as `sample`.
    '''
    # Doc2Vec expects every document as a list of word tokens. Passing the
    # raw string would make gensim iterate over it character by character.
    content_train = [
        gensim.models.doc2vec.TaggedDocument(song.split(), [tag])
        for tag, song in enumerate(sample.values)
    ]

    print('Number of songs processed: ', len(content_train))

    d2v_model = Doc2Vec(content_train, vector_size=100, window=10, min_count=500, workers=7, dm=1, alpha=0.025, min_alpha=0.001)

    # NOTE(review): passing the corpus to the constructor normally trains the
    # model already, so this looks like a second training pass; start_alpha is
    # also smaller than end_alpha. Kept as-is -- confirm both are intended.
    d2v_model.train(content_train, total_examples=d2v_model.corpus_count, epochs=10, start_alpha=0.002, end_alpha=0.016)

    # gensim >= 4 exposes the document vectors as `dv.vectors`; older
    # releases use `docvecs.vectors_docs`.
    if hasattr(d2v_model, 'dv'):
        doc_vectors = d2v_model.dv.vectors
    else:
        doc_vectors = d2v_model.docvecs.vectors_docs

    # Fit exactly once so that the labels and the centroids plotted below come
    # from the same K-Means run.
    kmeans_model = KMeans(n_clusters=cluster_size, init='k-means++', max_iter=100, random_state=random_state)
    labels = kmeans_model.fit_predict(doc_vectors).tolist()

    # Project the embeddings and the centroids onto the same two components.
    pca = PCA(n_components=2).fit(doc_vectors)
    datapoint = pca.transform(doc_vectors)
    centroidpoint = pca.transform(kmeans_model.cluster_centers_)

    # Keep the original two colours; fall back to a colormap when more
    # clusters are requested than the fixed palette can cover.
    base_palette = ["#53868B", "#DB2929"]
    if cluster_size <= len(base_palette):
        color = [base_palette[i] for i in labels]
    else:
        cmap = plt.get_cmap('tab20')
        color = [cmap(i % cmap.N) for i in labels]

    plt.figure(figsize=(12,12))
    plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)

    plt.title('K-Means Cluster of Song Lyrics')
    plt.xticks([])
    plt.yticks([])

    plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='x', s=150, c='#000000')

    plt.savefig('cluster.png')

    return labels

In [None]:
def sentiment_scores(df):
    '''
    Score every lyric in `df` with VADER and TextBlob and return the mean
    score of each of the two clusters.

    The frame is modified in place: its index is reset to 0..n-1 and the
    'Vader' and 'TextBlob' columns are overwritten with the computed scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Lyrics' and 'Cluster' (labels 0 and 1).

    Returns
    -------
    tuple
        (c1_v, c2_v, c1_tb, c2_tb): mean Vader score of cluster 0 and of
        cluster 1, then mean TextBlob score of cluster 0 and of cluster 1.
        A mean is NaN when its cluster has no rows.
    '''
    df.reset_index(drop=True, inplace=True)

    # Score every row of the frame that was passed in; the row count must
    # come from `df` itself, not from any particular decade subset.
    df['Vader'] = df['Lyrics'].apply(get_vader_score).astype(float)
    df['TextBlob'] = df['Lyrics'].apply(get_tb_score).astype(float)

    in_cluster_1 = df['Cluster'] == 0
    in_cluster_2 = df['Cluster'] == 1

    # Average only the score column; averaging the whole frame would also try
    # to take the mean of the text columns.
    c1_v = df.loc[in_cluster_1, 'Vader'].mean()
    c2_v = df.loc[in_cluster_2, 'Vader'].mean()

    c1_tb = df.loc[in_cluster_1, 'TextBlob'].mean()
    c2_tb = df.loc[in_cluster_2, 'TextBlob'].mean()

    return c1_v, c2_v, c1_tb, c2_tb

In [None]:
# Cluster all lyrics into 2 groups and store each song's cluster label on the frame.
df['Cluster'] = cluster_plot(df['Lyrics'], 2)

In [None]:
# Split the songs by decade using the 'Year' column. Each subset is an
# explicit copy because sentiment_scores later modifies the frame it receives
# in place; writing into a bare filtered slice triggers SettingWithCopyWarning.
sixties_set = df[df['Year'] < 1970].copy()
seventies_set = df[(df['Year'] >= 1970) & (df['Year'] < 1980)].copy()
eighties_set = df[(df['Year'] >= 1980) & (df['Year'] < 1990)].copy()
nineties_set = df[(df['Year'] >= 1990) & (df['Year'] < 2000)].copy()
millenial_set = df[(df['Year'] >= 2000) & (df['Year'] < 2010)].copy()
tens_set = df[df['Year'] >= 2010].copy()

In [None]:
# One (c1_v, c2_v, c1_tb, c2_tb) tuple per decade, in chronological order.
scores = [
    sentiment_scores(decade_set)
    for decade_set in (
        sixties_set,
        seventies_set,
        eighties_set,
        nineties_set,
        millenial_set,
        tens_set,
    )
]

In [None]:
# Regroup the per-decade tuples into one list per (cluster, scorer) pair.
c1_vscore = [decade_scores[0] for decade_scores in scores]
c2_vscore = [decade_scores[1] for decade_scores in scores]
c1_tbscore = [decade_scores[2] for decade_scores in scores]
c2_tbscore = [decade_scores[3] for decade_scores in scores]

In [None]:
# Line chart of the mean Vader score of both clusters, one point per decade.
years = range(1965, 2025, 10)

fig, ax = plt.subplots(figsize=(8,8))
ax.plot(years, c1_vscore)
ax.plot(years, c2_vscore)

ax.set_title('Mean Vader Sentiment Score Per Decade')
ax.set_xlabel('Year')
ax.set_ylabel('Vader Sentiment Score')
ax.legend(labels=['Cluster 1', 'Cluster 2'], loc='lower left')

fig.savefig('Vader.png')

In [None]:
# Line chart of the mean TextBlob score of both clusters, one point per decade.
years = range(1965, 2025, 10)

fig, ax = plt.subplots(figsize=(8,8))
ax.plot(years, c1_tbscore)
ax.plot(years, c2_tbscore)

ax.set_title('Mean TextBlob Sentiment Score Per Decade')
ax.set_xlabel('Year')
ax.set_ylabel('TextBlob Sentiment Score')
ax.legend(labels=['Cluster 1', 'Cluster 2'], loc='lower left')

fig.savefig('tb.png')

In [None]:
# Display the per-decade mean TextBlob scores of the first cluster.
c1_tbscore