In [2]:
import pandas as pd
from collections import defaultdict
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import Birch, KMeans
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.corpus import words
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from bokeh.plotting import figure 
from bokeh.io import output_notebook, show
from bokeh.palettes import Spectral11
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
import pandas as pd
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

#  Analyzing the lyrics of  Beyonce and 50-cent's songs

In this notebook I have attempted to analyze and compare the songs of Beyonce and 50-Cent using the various methods and techniques available to us in Visual Analytics.

The dataset I am using was sourced from Kaggle and contains the lyrics of 362,237 songs from 18,231 artists.

For the purpose of this experiment I have considered the songs of Beyonce and 50-cent for comparison and analysis. There are a total of 249 songs for Beyonce and 425 songs for 50-cent.

For each artist, the text from their songs has been preprocessed (parsed, tokenized, stemmed and stopwords removed). The frequency of each word among all the songs for each artist is then computed by parsing through the processed tokens of all the songs of the respective artist.

Visualizations being produced are in the form of 
* Word cloud showing us the most frequently occurring words in the songs of Beyonce and 50-Cent.
* Bar chart showing the top 20 words among all the songs for each artist.
* Visualization of the trend in how the use of expletives or swear words has changed over the years for each artist.



In [3]:
# NLTK's English SnowballStemmer; later cells use it to reduce each token to its stem.
stemmer = SnowballStemmer("english")

# Load the lyrics CSV and keep only the three columns needed for text processing.
df = pd.read_csv('lyrics.csv')[['artist', 'lyrics', 'song']]


# Processing the text : 

* Parsing the file : This involves reading the CSV files to extract the songs of Beyonce and 50-cent and format them into pandas dataframes
* Tokenizing the text : The text is broken up into tokens.
* Stemming the words : Each token is converted to its root or stem, e.g. eating is converted to eat, happily is converted to happy and so on.
* Removal of stop words (also involves removal of punctuation) : Stop words, i.e. frequently occurring words in English such as 'and','but','is','are','he' etc., are removed. Also removed are punctuation and special characters such as '?','!', '#' etc.
* Computing high frequency words : The frequency of each word is computed. We do this because words which occur more frequently are often more important to a document and the goal is to identify such words.

The processed lyrics of the songs of both artists were written into text files 'beyonce_lyrics.txt' and '50_cent_lyrics.txt'


# Preprocessing text of Beyonce's songs

In [4]:
# Per-song accumulators shared by both artists. This cell (re)creates them and adds
# Beyonce's songs; the 50-Cent cell appends to the same lists, so run the two in order.
song_lyrics_list = []   # normalized (stop words removed, stemmed) lyrics, one string per song
song_name_list = []     # song name for the lyrics at the same position in song_lyrics_list
song_color_list = []    # plot color per song ('red' marks a Beyonce song)
artist_name_list = []   # display name of the artist per song

# Extract all the records for which Beyonce is the singer
df_beyonce = df[df['artist'] == 'beyonce-knowles']
# Stop words such as 'and', 'but', 'is', 'are', 'he' etc.
stopWords = set(stopwords.words('english'))
# Word -> number of occurrences over all of Beyonce's songs
beyonce_word_count_dict = defaultdict(int)
# Tokenizer that keeps runs of word characters, which also drops punctuation
tokenizer = RegexpTokenizer(r'\w+')

# The context manager guarantees the file is closed even if a song fails to process.
with open('beyonce_lyrics.txt', 'w', encoding='utf-8') as f:
    # Columns are selected by name rather than by position so that this cell keeps
    # reading the right data even after `df` is re-read with all columns later on.
    for song_name, raw_lyrics in zip(df_beyonce['song'], df_beyonce['lyrics']):
        filtered_song_word_list = []  # stemmed tokens of the current song (excluding stop words)
        # str() turns missing lyrics (NaN) into text so the tokenizer does not fail
        for word in tokenizer.tokenize(str(raw_lyrics)):
            word = word.lower()
            if word not in stopWords:
                filtered_song_word_list.append(stemmer.stem(word))
                # The count is keyed by the un-stemmed token, while the stemmed form
                # goes into the normalized lyrics.
                beyonce_word_count_dict[word] += 1
        normalized_lyrics = " ".join(filtered_song_word_list)
        song_lyrics_list.append(normalized_lyrics)
        song_name_list.append(song_name)
        song_color_list.append('red')
        artist_name_list.append('Beyonce')
        # One song per line; without the separator the last word of a song would be
        # fused with the first word of the next one in the output file.
        f.write(normalized_lyrics + "\n")

# Preprocessing text of 50-Cent's songs

In [5]:
# Extract all the records for which 50-cent is the singer
df_50_cent = df[df['artist'] == '50-cent']
# Stop words such as 'and', 'but', 'is', 'are', 'he' etc.
stopWords = set(stopwords.words('english'))
# Word -> number of occurrences over all of 50-Cent's songs
fifty_cent_word_count_dict = defaultdict(int)
# Tokenizer that keeps runs of word characters, which also drops punctuation
tokenizer = RegexpTokenizer(r'\w+')

# NOTE: this cell appends to song_lyrics_list / song_name_list / song_color_list /
# artist_name_list, which are created by the Beyonce preprocessing cell. Re-running
# only this cell adds the 50-Cent songs a second time; re-run the Beyonce cell first.

# The context manager guarantees the file is closed even if a song fails to process.
with open('50_cent_lyrics.txt', 'w', encoding='utf-8') as f:
    # Columns are selected by name rather than by position so that this cell keeps
    # reading the right data even after `df` is re-read with all columns later on.
    for song_name, raw_lyrics in zip(df_50_cent['song'], df_50_cent['lyrics']):
        filtered_song_word_list = []  # stemmed tokens of the current song (excluding stop words)
        # str() turns missing lyrics (NaN) into text so the tokenizer does not fail
        for word in tokenizer.tokenize(str(raw_lyrics)):
            word = word.lower()
            if word not in stopWords:
                filtered_song_word_list.append(stemmer.stem(word))
                # The count is keyed by the un-stemmed token, while the stemmed form
                # goes into the normalized lyrics.
                fifty_cent_word_count_dict[word] += 1
        normalized_lyrics = " ".join(filtered_song_word_list)
        song_lyrics_list.append(normalized_lyrics)
        song_name_list.append(song_name)
        song_color_list.append('blue')
        artist_name_list.append('50-Cent')
        # One song per line; without the separator the last word of a song would be
        # fused with the first word of the next one in the output file.
        f.write(normalized_lyrics + "\n")

# Visualizing the text

Three different types of visualizations in Bokeh have been produced:
1. Word cloud representing the most important terms in terms of frequency of the songs of both the artists.
2. Bar charts showing the top 20 most frequently occurring words.
3. Trend in how the number of swear words have changed over the years.

# Word cloud:

In [6]:
# Build a word cloud from Beyonce's word-frequency dictionary and save it as a PNG.
wc = WordCloud().generate_from_frequencies(beyonce_word_count_dict)
default_colors = wc.to_array()
wc.to_file("Beyonce_lyrics.png")

# Show the saved image in a Bokeh figure. The image is anchored at (x=0, y=1)
# with width and height 1, so it fills the unit-square data range.
url = r'Beyonce_lyrics.png'
source = ColumnDataSource(data={'url': [url]})

p = figure(x_range=(0, 1), y_range=(0, 1), plot_width=800, plot_height=400)
p.image_url(url='url', x=0, y=1, w=1, h=1, source=source)
show(p)



In [7]:
# Build a word cloud from 50-Cent's word-frequency dictionary and save it as a PNG.
wc = WordCloud().generate_from_frequencies(fifty_cent_word_count_dict)
default_colors = wc.to_array()
wc.to_file("50_cent_lyrics.png")

# Show the saved image in a Bokeh figure. The image is anchored at (x=0, y=1)
# with width and height 1, so it fills the unit-square data range.
url = r'50_cent_lyrics.png'
source = ColumnDataSource(data={'url': [url]})

p = figure(x_range=(0, 1), y_range=(0, 1), plot_width=800, plot_height=400)
p.image_url(url='url', x=0, y=1, w=1, h=1, source=source)
show(p)

# Get the top 20 most frequent words in songs by Beyonce

In [8]:
# Sort the (word, count) pairs by count, most frequent first.
beyonce_word_count_tuples = sorted(beyonce_word_count_dict.items(), key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing 0..19) does not fail when fewer than 20 distinct words exist.
top_word_counts = beyonce_word_count_tuples[:20]
labels = [word for word, _ in top_word_counts]    # the 20 most frequent words
counts = [count for _, count in top_word_counts]  # their occurrence counts

# Tooltip shown when hovering over a bar
hover = HoverTool(
        tooltips=[
            ("word", "@x"),
            ("frequency of word", "@y"),
        ])
# Data for the bars: x = word (categorical), y = count
source = ColumnDataSource(dict(x=labels, y=counts))
# Bar plot with one bar per word; default=1 keeps the range valid if there are no words
p = figure(x_range=labels, plot_width=1000, plot_height=500, y_range=(0, max(counts, default=1)), title="Top 20 most frequently words in Beyonce's songs", tools=[hover])
p.vbar(x='x', top='y', width=0.9, source=source)
# The x axis carries the words and the y axis their counts (the labels were swapped before).
p.xaxis.axis_label = "Word"
p.yaxis.axis_label = "Number of times the word occurs"
show(p)



# Get the top 20 most frequent words in songs by 50-cent

In [9]:
# Sort the (word, count) pairs by count, most frequent first.
fifty_cent_word_count_tuples = sorted(fifty_cent_word_count_dict.items(), key=lambda x: x[1], reverse=True)
# Slicing (rather than indexing 0..19) does not fail when fewer than 20 distinct words exist.
top_word_counts = fifty_cent_word_count_tuples[:20]
labels = [word for word, _ in top_word_counts]    # the 20 most frequent words
counts = [count for _, count in top_word_counts]  # their occurrence counts

# Tooltip shown when hovering over a bar
hover = HoverTool(
        tooltips=[
            ("word", "@x"),
            ("frequency of word", "@y"),
        ])
# Data for the bars: x = word (categorical), y = count
source = ColumnDataSource(dict(x=labels, y=counts))
# Bar plot with one bar per word; default=1 keeps the range valid if there are no words
p = figure(x_range=labels, plot_width=1000, plot_height=500, y_range=(0, max(counts, default=1)), title="Top twenty most frequently words in 50-Cent's songs", tools=[hover])
p.vbar(x='x', top='y', width=0.9, source=source)
# The x axis carries the words and the y axis their counts (the labels were swapped before).
p.xaxis.axis_label = "Word"
p.yaxis.axis_label = "Number of times the word occurs"
show(p)



# Observation and Analysis

From the word cloud plots as well as the bar charts of the word counts we can see that
* Explicit and vulgar language occurs more frequently in the songs of 50-cent (example: n\*\*\*\*, shit, f\*\*\*\* etc.)
* There are more occurrences of slang words in 50-cent's songs compared to Beyonce's (example: ya, yeah)
* There are also words that are common among both artists such as 'like','see', 'go' which are commonly seen in most pop and rap songs today.

# Trend of number of swear words per song  per year for Beyonce  and 50-cent

* The following code segments counts the number of swear words encountered per year per song for Beyonce and 50-cent separately so that we can compare the trend in their usage of explicit words 
* Since the number of songs released by 50-cent and beyonce vary widely per year, we compute the average number of swear words per song as opposed to the total number of swear words in songs per year.
* Also, songs by Beyonce from before 2006 are missing in the dataset, so I could not perform the analysis for those years.


# Compute the trend of the average number of swear words per song for Beyonce over the years (from 2006 - data for songs before 2006 is missing)

In [10]:
# Read the swear-word list. It is tokenized with the same tokenizer as the lyrics,
# so multi-word entries in the file are split into individual words.
with open('Googe_swear_words.txt', 'r') as f:
    swear_words_list = tokenizer.tokenize(f.read())
# Set copy for constant-time membership tests; swear_words_list itself stays a list
# because later cells refer to it.
swear_words_lookup = set(swear_words_list)

# Re-read the CSV with all columns: the 'year' column is needed here.
df = pd.read_csv('lyrics.csv')
df_beyonce = df[df['artist'] == 'beyonce-knowles']

explicit_word_count_per_year_beyonce = defaultdict(int)  # year -> explicit words (later: average per song)
songs_per_year_beyonce = defaultdict(int)                # year -> number of songs

# Iterate over the column values directly. Looking rows up with .loc and a 0..n-1
# counter only works if the filtered frame happens to be labelled 0..n-1.
for year, raw_lyrics in zip(df_beyonce['year'], df_beyonce['lyrics']):
    songs_per_year_beyonce[year] += 1
    explicit_words_in_song = 0
    for word in tokenizer.tokenize(str(raw_lyrics)):
        word = word.lower()
        # Count words that are not stop words and appear in the swear-word list
        if word not in stopWords and word in swear_words_lookup:
            explicit_words_in_song += 1
    # Adding the per-song count (even when it is 0) creates an entry for every year
    # that has songs, so a year without explicit words averages to 0 instead of
    # being left out of the trend.
    explicit_word_count_per_year_beyonce[year] += explicit_words_in_song

# Turn the yearly totals into averages per song.
for year, n_songs in songs_per_year_beyonce.items():
    explicit_word_count_per_year_beyonce[year] = explicit_word_count_per_year_beyonce[year] / n_songs

# The years 2002-2005 are set to NaN so the plot shows a gap rather than a value.
for missing_year in (2002, 2003, 2004, 2005):
    explicit_word_count_per_year_beyonce[missing_year] = float('nan')

In [None]:
# (year, average explicit words per song) pairs, ordered by year.
explicit_word_count_per_year_beyonce_tuples = sorted(
    explicit_word_count_per_year_beyonce.items(), key=lambda pair: pair[0]
)

# Tooltip shown when hovering over a point of the trend line
hover = HoverTool(
        tooltips=[
            ("(year,number of explicit words)", "(@x, @y)"),
        ])

# This figure is not shown in this cell; the 50-Cent trend is added to it in a later cell.
p = figure(title='Average number of explitives per song over the years', width=700, height=700, tools=[hover])

years = [year for year, _ in explicit_word_count_per_year_beyonce_tuples]
number_of_explitives = [average for _, average in explicit_word_count_per_year_beyonce_tuples]

source = ColumnDataSource(dict(x=years, y=number_of_explitives))
# One circle per year, joined by a line
p.circle('x', 'y', source=source, color='red', alpha=0.5, legend='beyonce')
p.line('x', 'y', source=source, color='red', alpha=0.5, legend='beyonce')


# Compute the trend of the average number of swear words per song for 50-Cent over the years (from 2002)

In [None]:
# Requires `df` with the 'year' column and `swear_words_list` from the Beyonce trend cell.
df_50_cent = df[df['artist'] == '50-cent']
df_50_cent = df_50_cent.reset_index()

# Set copy of the swear-word list for constant-time membership tests.
swear_words_lookup = set(swear_words_list)

explicit_word_count_per_year_50_cent = defaultdict(int)  # year -> explicit words (later: average per song)
songs_per_year_50_cent = defaultdict(int)                # year -> number of songs

for year, raw_lyrics in zip(df_50_cent['year'], df_50_cent['lyrics']):
    songs_per_year_50_cent[year] += 1
    explicit_words_in_song = 0
    for word in tokenizer.tokenize(str(raw_lyrics)):
        word = word.lower()
        # Count words that are not stop words and appear in the swear-word list
        if word not in stopWords and word in swear_words_lookup:
            explicit_words_in_song += 1
    # Adding the per-song count (even when it is 0) creates an entry for every year
    # that has songs, so a year without explicit words averages to 0 instead of
    # being left out of the trend.
    explicit_word_count_per_year_50_cent[year] += explicit_words_in_song

# Turn the yearly totals into averages per song, using the song counts gathered
# above instead of re-filtering the frame once per year.
for year, n_songs in songs_per_year_50_cent.items():
    explicit_word_count_per_year_50_cent[year] = explicit_word_count_per_year_50_cent[year] / n_songs

In [None]:

# (year, average explicit words per song) pairs, ordered by year.
explicit_word_count_per_year_50_cent_tuples = sorted(
    explicit_word_count_per_year_50_cent.items(), key=lambda pair: pair[0]
)

years = [year for year, _ in explicit_word_count_per_year_50_cent_tuples]
number_of_explitives = [average for _, average in explicit_word_count_per_year_50_cent_tuples]
source = ColumnDataSource(dict(x=years, y=number_of_explitives))

# Add the 50-Cent series to the figure `p` created in the Beyonce trend-plot cell.
p.circle('x', 'y', source=source, color='blue', alpha=0.5, legend='50-cent')
p.line('x', 'y', source=source, color='blue', alpha=0.5, legend='50-cent')

# Legend in the top-left corner; clicking an entry hides or shows that artist's series.
p.legend.location = "top_left"
p.legend.click_policy = "hide"

# Axis labels
p.xaxis.axis_label = "Year"
p.yaxis.axis_label = "Average number of explitives per song"

show(p)

# Observation and Analysis

* 50-cent uses nearly **3 times** as much explicit language in his songs compared to Beyonce.
* Until 2010, for both Beyonce and 50-cent we observe a downward trend in the number of expletives used per song over the years.
* After 2010 there is a resurgence, i.e. a trend which points to an overall increase in the number of expletives used per song per year.

# Clustering: Using t-SNE (t-distributed stochastic neighbor embedding) to compress the data to 2 dimensions and perform clustering on this reduced dimensional space.
The aim of this section is to cluster similar songs with each other regardless of whether they belong to Beyonce or 50-cent. The approach taken is as follows:
* Using the bag of words model i.e. CountVectorizer from scikit-learn, a vector where the features are the 'words' and the values are 'counts' of the words is created. 
* Using these vectors we compress the data into 2 dimensions using t-SNE.
* Birch clustering on these 2 dimensions is performed to analyze the groups in the data.

I referred to this blogpost (http://intelligentonlinetools.com/blog/2016/06/05/using-python-for-mining-data-from-twitter-visualization-and-other-enchancements/) provided among the midterm resource links for guidance.

The number of dimensions in the feature vector of the bag of words model has been restricted to 5000.

# The Code below plots a scatter plot of the sample points in the reduced dimensional space

In [None]:
# Bag-of-words model over the normalized lyrics of both artists, limited to the
# 5000 most frequent terms; the sparse result is converted to a dense array.
vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words='english', max_features=5000)
train_data_features = vectorizer.fit_transform(song_lyrics_list).toarray()

# t-SNE model that compresses the data into 2 dimensions (fixed seed for repeatable layouts)
model = TSNE(n_components=2, random_state=0)
# Y has one row per song and one column per t-SNE dimension
Y = model.fit_transform(train_data_features)

# Words used as features. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fall back to the old name on old versions.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
np.set_printoptions(suppress=True)

# Tooltip shown when hovering over a point of the scatter plot
hover = HoverTool(
        tooltips=[
            ("song name:", "@name"),
            ("artist:", "@artist"),
        ])
# Plot the songs, red for Beyonce and blue for 50-Cent. The per-point colors live in
# the data source and are referenced by column name: Bokeh does not accept a Python
# list for a glyph property when a user-supplied source is passed.
p = figure(tools=[hover])
source = ColumnDataSource(dict(x=Y[:, 0], y=Y[:, 1], name=song_name_list, artist=artist_name_list, color=song_color_list))
p.circle('x', 'y', source=source, color='color', alpha=0.5)
# Empty glyphs whose only purpose is to create one legend entry per artist
source = ColumnDataSource(dict(x=[], y=[], name=[]))
p.circle('x', 'y', source=source, color='red', legend='Beyonce')
p.circle('x', 'y', source=source, color='blue', legend='50-Cent')
# Axis labels and legend position
p.xaxis.axis_label = "t-SNE dimension 1"
p.yaxis.axis_label = "t-SNE dimension 2"
p.legend.location = "top_left"
show(p)

# Analysis of t-SNE Dimensionality reduction
On visual inspection we can see that Beyonce's songs are placed close to each other, and songs with similar themes are even closer together. The structure is very nuanced, and by hovering over points that are near each other we can see they have common themes.
* For example 'Drunk in love', 'Crazy in love' and 'Still in Love', which are all songs by Beyonce that have a similar theme, are close to each other.
* Also songs and their remixes (which obviously share the same lyrics with some differences) are clustered together. For 50-cent, the remixes of the same song can also be seen close to each other. For example, remixes of 'In Da Club' are placed close to each other in this 2-D space.

Hence we can see that t-SNE has done a good job at placing similar songs close to each other in the vector space.


# Clustering using BIRCH (balanced iterative reducing and clustering using hierarchies) clustering
The Scikit-learn implementation of Birch clustering is used for performing clustering of song in the t-SNE reduced dimensional space. 
* The Average Silhouette score of all the points in the sample space is used to assess the quality of the clusters.
* The Silhouette score has a value in the range [-1,1]. A score close to +1 implies that a point is far away from neighboring clusters, a score of 0 implies that a point is close to the boundary between two clusters and a negative score implies that a point may have been assigned to the wrong cluster.
* The cluster number with the highest average Silhouette score is chosen as the optimal cluster number and the songs are then clustered using this 'optimal' number of clusters.
* In the following code I have performed clustering using cluster numbers ranging from 2 to 20 and computed the average Silhouette score for each number of clusters.
* Bar plot representing the average Silhouette score for each number of clusters is shown below

In [None]:
cluster_number = []          # cluster counts as strings (categorical x-axis labels)
silhouette_score_list = []   # average silhouette score for the matching cluster count

# Run Birch for every cluster count from 2 to 20 and record the average silhouette score.
print("The cluster numbers and their respective average Silhouette scores are:")
for n_clusters in range(2, 21):
    brc = Birch(branching_factor=50, n_clusters=n_clusters, threshold=0.5, compute_labels=True)
    # Fit on the 2-D t-SNE coordinates and label the same points
    clustering_result = brc.fit(Y).predict(Y)
    # NOTE(review): the labels come from clustering Y (t-SNE space) but the score is
    # computed on train_data_features (bag-of-words space) - confirm this is intended.
    silhouette_avg = silhouette_score(train_data_features, clustering_result)
    cluster_number.append(str(n_clusters))
    silhouette_score_list.append(silhouette_avg)
    print('num_cluster:',n_clusters,'average silhouette score',silhouette_avg )

# Tooltip shown when hovering over a bar
hover = HoverTool(
        tooltips=[
            ("number of clusters", "@x"),
            ("Silhouette score", "@y"),
        ])
# One bar per cluster count; the y range starts at 0, so negative scores are not visible
source = ColumnDataSource(dict(x=cluster_number, y=silhouette_score_list))
p = figure(x_range=cluster_number, plot_height=250, y_range=(0, max(silhouette_score_list)), title="Average Silhouette scores for each number of clusters", tools=[hover])
p.vbar(x='x', top='y', width=0.9, source=source)
show(p)

# BIRCH Clustering using the optimal number of clusters 
* The number of clusters with the best Silhouette score is 2 (score=0.763). 
* I will now perform Birch clustering using this optimal number of clusters.

# Visualization of Results
The code for performing clustering using this optimal number of clusters i.e. 2 as well as the results of clustering is shown below:

In [None]:
# Perform BIRCH clustering of the 2-D t-SNE coordinates into 2 clusters
brc = Birch(branching_factor=50, n_clusters=2, threshold=0.5, compute_labels=True)
brc.fit(Y)
clustering_result = brc.predict(Y)

# Cluster label per song, and the matching plot color:
# orange for cluster 1, green for every other label (cluster 0).
cluster = list(clustering_result)
cluster_color = ['orange' if label == 1 else 'green' for label in clustering_result]

# Tooltip shown when hovering over a point of the scatter plot
hover = HoverTool(
        tooltips=[
            ("cluster number", "@cluster"),
            ("song name", "@name"),
            ("artist name", "@artist"),
        ])

p = figure(title="Clustering of the sample points into 2 clusters using BIRCH clustering", tools=[hover])
# The per-point colors live in the data source and are referenced by column name:
# Bokeh does not accept a Python list for a glyph property when a user-supplied
# source is passed.
source = ColumnDataSource(dict(x=Y[:, 0], y=Y[:, 1], cluster=cluster, name=song_name_list, artist=artist_name_list, color=cluster_color))
p.circle('x', 'y', source=source, color='color', alpha=0.5)
# Empty glyphs whose only purpose is to create one legend entry per cluster
source = ColumnDataSource(dict(x=[], y=[], name=[]))
p.circle('x', 'y', source=source, color='orange', legend='Cluster 1')
p.circle('x', 'y', source=source, color='green', legend='Cluster 0')
# Axis labels and legend position
p.xaxis.axis_label = "t-SNE dimension 1"
p.yaxis.axis_label = "t-SNE dimension 2"
p.legend.location = "top_left"
show(p)
 

# Analysis of Clustering results
From the clustering results, on hovering over the points and observing the names of the songs, I noticed that:
# Cluster 0 Analysis
* Cluster 0 (green) has more songs that are about love compared to cluster 1 (orange). This is true for both Beyonce and 50-cent.
* Songs by Beyonce with names containing the word love include 'Still in love','Crazy in love','Why don't you love me?','Love on top', 'Keep giving your love to me'. These songs are even situated close to each other.
* This is also true for 50-Cent, where we see songs like 'Baltimore love thing', 'Thug,love', 'Love,hate,love'.
* Cluster 0 also has songs related to death in general that are clustered close to each other, such as 'Body bags', 'I'm supposed to die tonight', 'Life's on the line' by 50-Cent. Another group of songs related to the same theme that are close to each other includes 'I'll still kill','The good die young','the Funeral','If dead men could talk', all by 50-cent.
# Cluster 1 Analysis
* Cluster 1 has more songs with themes such as power, status, money and wealth.
* Examples of this would include 'Swag level', 'You will never take my crown', 'Business mind' by 50-Cent and 'Diamonds' by Beyonce.
* Cluster 1 also has songs related to stars and constellations such as 'Gift from Virgo' and 'Wishing on a Star' by Beyonce.
* Hence, we can say that Cluster 1 contains songs that are based on themes such as aspirations and dreams as well as wealth and power.
