In [None]:
# import required packages
import spotipy  # use terminal to pip install PyLyrics
import numpy
from spotipy.oauth2 import SpotifyClientCredentials

# Spotify API setup.
# Create an app on the Spotify developer dashboard to obtain a client id and secret.
# The credentials are taken from the environment when SPOTIPY_CLIENT_ID /
# SPOTIPY_CLIENT_SECRET are set, so real secrets never have to be saved inside the
# notebook; the placeholder strings below are only the fallback.
import os

username='your-uname'
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'your-client-id')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'your-client-secret')

# Client-credentials flow: app-level access to the catalogue.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

# `sp` is the API client used by every following cell.
# spotipy API reference:
# https://spotipy.readthedocs.io/en/2.9.0/#api-reference
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Selected artist for analysis: Bob Dylan.
# The search may return several artists matching "Bob" or "Dylan";
# the first hit (position 0) is taken as the intended artist.
name = "Bob Dylan" #chosen artist
result = sp.search(q='artist:' + name, type='artist') #search query
artist_matches = result["artists"]["items"]
if not artist_matches:
    # Same exception type as indexing an empty list, but with a message that says what went wrong.
    raise IndexError("Spotify search returned no artists for " + repr(name))
artist_uri = artist_matches[0]["uri"] #'spotify:artist:74ASZWbe4lXaubB36ztrGX'

# Page through the artist's albums: one request returns at most `limit` entries,
# so keep requesting with a growing offset until everything has been collected.
limit=50
offset=0
# Store artist's albums' uris in a list
album_uris = []

while True:
    sp_albums = sp.artist_albums(artist_uri, limit=limit, offset=offset)
    page_items = sp_albums['items']
    album_uris.extend(album['uri'] for album in page_items)
    # Stop when the reported total is reached. Also stop on an empty page:
    # without that guard an inconsistent "total" would keep the loop running forever.
    if not page_items or len(album_uris) >= sp_albums["total"]:
        break
    offset += limit
album_uris

#album_uris


In [None]:
len(album_uris)

In [None]:
# get all album tracks using the uri

def albumSongs(uri):
    """Register every track URI of one album in the global ``artist_tracks`` dict.

    Parameters
    ----------
    uri : album URI (or id) handed straight to ``sp.album_tracks``.

    Each track URI becomes a key mapped to an empty dict. Nothing is returned.
    """
    tracks = sp.album_tracks(uri) #pull data on album tracks (first page)
    while tracks:
        for item in tracks['items']: #for each song track
            artist_tracks[item['uri']] = {}
        # The track listing is paginated; follow the "next" link so that albums
        # with more tracks than fit on one page are not silently truncated.
        tracks = sp.next(tracks) if tracks.get('next') else None

        
# Track URIs of all albums; albumSongs adds one key per track.
artist_tracks = {}

for album_uri in album_uris:
    albumSongs(album_uri)
artist_tracks


In [None]:
len(album_uris)

In [None]:
# get all tracks' names and album release dates
# sp.tracks takes several track ids per request, so the metadata is fetched in
# batches instead of issuing one HTTP request per track.
TRACK_BATCH_SIZE = 50  # largest number of ids the "several tracks" endpoint accepts per call
track_uris = list(artist_tracks)
track_names=[]
track_album_release_dates = []
for start in range(0, len(track_uris), TRACK_BATCH_SIZE):
    batch = sp.tracks(track_uris[start:start + TRACK_BATCH_SIZE])
    for track in batch["tracks"]:
        if track is None:
            # presumably the API's placeholder for an id it could not resolve - nothing to record
            continue
        track_names.append(track["name"])
        track_album_release_dates.append(track["album"]["release_date"])



In [None]:
# build the tracks DataFrame in one step from the two parallel lists
import pandas as pd
dat = pd.DataFrame({
    'track_name': track_names,
    'release_date': track_album_release_dates,
})


In [None]:
dat.shape

In [None]:
# ignore live, remix, remastered, deluxe and alternate-take versions
# Credit:
# NOTE: this is plain substring matching on the lower-cased title, so any title
# that merely contains one of the keywords (e.g. "live") is dropped as well.
EXCLUDED_TITLE_KEYWORDS = (
    'live', 'deluxe', 'remix', 'rmx', 'remastered',
    'take 2', 'take 3', 'take 4', 'take 5', 'take 6', 'take 7', 'take 8',
    'take 11', 'take 13',
)
mask = [not any(keyword in s.lower() for keyword in EXCLUDED_TITLE_KEYWORDS)
        for s in dat.track_name.values]
# .copy() makes the filtered frame independent of the original, so adding columns
# to it later does not raise SettingWithCopyWarning.
dat = dat[mask].copy()
dat.shape


In [None]:
# import more packages 
from PyLyrics import *  # use terminal to pip install PyLyrics
import re
import nltk
import os


In [None]:
# get lyrics from tracks iterating over each track
track_lyrics = []
for track_title in dat.track_name.values:
    try:
        lyrics = PyLyrics.getLyrics('Bob Dylan', track_title)
        track_lyrics.append(lyrics)
    except Exception:
        # Best effort: the lookup sometimes fails (e.g. songs recorded live do not have
        # lyrics stored); mark the row with a placeholder that is filtered out afterwards.
        # `except Exception` rather than a bare `except` so that interrupting the kernel
        # (KeyboardInterrupt) still stops this long-running loop.
        track_lyrics.append('exception')
dat['lyrics'] = track_lyrics


In [None]:
dat.shape

In [None]:
# drop the rows whose lyrics lookup failed (they carry the 'exception' placeholder)
mask = [s.lower() != 'exception' for s in dat.lyrics.values]
dat = dat[mask]
dat.shape


In [None]:
# check new dataset
dat

In [None]:
# import packages
import nltk
import re
import string
import numpy as np

In [None]:
# start with text analysis - data munging
# clean the lyrics and make a corpus for each album

# Persist the raw lyrics as CSV.
# to_csv returns None when it is given a path, so its result is not assigned.
# index=False: after the filtering above the row labels are just leftovers, and
# writing them would add a junk "Unnamed: 0" column when the file is read back.
dat.to_csv("~/Case_Studies/lyrics.csv", index=False)

In [None]:
# reload the raw lyrics; index_col=False keeps pandas from using the first CSV column as the index
lyrics_csv_path = "~/Case_Studies/lyrics.csv"
bob_dylan_lyrics = pd.read_csv(lyrics_csv_path, index_col=False)

In [None]:
# remove punctuation
# Character class that matches any single character listed in string.punctuation.
punctuation = re.compile('[%s]' % re.escape(string.punctuation))


def remove_punctuation(doc):
    """Return ``doc`` lower-cased, with every ``string.punctuation`` character removed."""
    lowered = doc.lower()
    return re.sub(punctuation, '', lowered)


# Lower-case every lyric and strip its punctuation.
# Series.map also works on an empty frame, whereas np.vectorize cannot infer an
# output type from zero inputs and raises.
bob_dylan_lyrics['processed_text'] = bob_dylan_lyrics['lyrics'].map(remove_punctuation)

# remove stop words
stop_words = nltk.corpus.stopwords.words('english')
# Set membership is a hash lookup; the list would be scanned once for every term.
stop_word_lookup = set(stop_words)

bob_dylan_lyrics['processed_text'] = bob_dylan_lyrics['processed_text'].apply(
    lambda text: ' '.join(term for term in text.split() if term not in stop_word_lookup))

# stem
porter = nltk.stem.porter.PorterStemmer()

bob_dylan_lyrics['processed_text'] = bob_dylan_lyrics['processed_text'].apply(
    lambda text: ' '.join(porter.stem(term) for term in text.split()))

In [None]:
# tokenize
# Placeholder column only: every row holds an empty string stored with object dtype
# (no tokens are computed in this cell).
bob_dylan_lyrics['tokenized'] = pd.Series('', index=bob_dylan_lyrics.index, dtype=object)

In [None]:
# persist the processed lyrics as CSV
# NOTE(review): absolute, machine-specific path - has to be adjusted to run anywhere else.
processed_csv_path = '/Users/Chandni/Documents/Education/NCSU/MSA/Case_Studies/lyrics_processed.csv'
bob_dylan_lyrics.to_csv(processed_csv_path)

In [None]:
# import packages for creating word cloud
from wordcloud import WordCloud
import matplotlib.pylab as plt

In [None]:
# load only the column that feeds the word cloud
fields = ['processed_text']
bd_lyrics = pd.read_csv(
    '/Users/Chandni/Documents/Education/NCSU/MSA/Case_Studies/lyrics_processed.csv',
    usecols=fields,
)



In [None]:
# create wordcloud
# Join all processed lyrics into one text, once, and reuse it for both clouds.
# Rows whose text was empty come back from the CSV as NaN; they are skipped
# because str.join only accepts strings.
all_lyrics_text = ' '.join(bd_lyrics['processed_text'].dropna())
bd_wordcloud = WordCloud().generate(all_lyrics_text)

# alternative: pass precomputed word frequencies via WordCloud.generate_from_frequencies

# lower max_font_size, change the maximum number of words and lighten the background:
bd_wordcloud_1 = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(all_lyrics_text)
plt.figure()

plt.imshow(bd_wordcloud_1, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Save the word cloud image next to the other case-study outputs.
# Unlike the pandas readers/writers used elsewhere in this notebook, the image
# writer opens the file name as given, so "~" has to be expanded explicitly.
bd_wordcloud_1.to_file(os.path.expanduser("~/Case_Studies/bd_wordcloud_V1.png"))

In [None]:
# import more required packages 
import datetime
import pandas as pd

# need year for sentiment analysis and trend analysis
# Reload the processed lyrics from the location this notebook wrote them to.
# (Nothing in this notebook saves lyrics_processed.csv under ~/Case_Studies, so
# reading from there would fail or pick up a stale copy.)
bd_lyrics = pd.read_csv('/Users/Chandni/Documents/Education/NCSU/MSA/Case_Studies/lyrics_processed.csv')
lyrics_df = pd.DataFrame(bd_lyrics)

# extract the year from the release date column
lyrics_df['release_year'] = pd.DatetimeIndex(lyrics_df['release_date']).year

# Inspect the tail as well, since some dates are provided in year-only format.
# (Only a cell's last expression is displayed, so the earlier shape/head calls were no-ops.)
lyrics_df.tail()

In [None]:
# Goal of the next cells: merge the processed lyrics of all songs released in the
# same year into one row per year.
# Only two columns are needed for that: the processed text and the release year.
analysis_columns = ['processed_text', 'release_year']
text_date_df = lyrics_df[analysis_columns]

text_date_df.shape



In [None]:
# try the aggregation on a small subset first - the first three rows
test_df = text_date_df.iloc[:3]

test_df.head()


In [None]:
# test subset: one row per release year, lyrics of that year joined with single spaces
grouped_test_df = (
    test_df
    .groupby(['release_year'], as_index=False)
    .agg({'processed_text': ' '.join})
)

grouped_test_df.head()

In [None]:
# dump the grouped test subset to CSV for a manual check
grouped_test_df.to_csv(path_or_buf='~/Case_Studies/grouped_test_df.csv')

In [None]:
# full data set: one row per release year, lyrics of that year joined with single spaces
grouped_text_date_df = (
    text_date_df
    .groupby(['release_year'], as_index=False)
    .agg({'processed_text': ' '.join})
)



In [None]:
# check shape
grouped_text_date_df.shape


In [None]:
# write the ENTIRE grouped data set to CSV for a manual check
grouped_text_date_df.to_csv(path_or_buf='~/Case_Studies/grouped_all_data_df.csv')

In [None]:
grouped_text_date_df.head()

In [None]:
# import more packages 
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [None]:
#import SentimentIntensityAnalyzer class 
#from vaderSentiment.vaderSentiment module. 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [None]:
# check only for positive sentiment first
sid = SentimentIntensityAnalyzer()

# Score each year's combined lyrics and assign the whole column at once
# (instead of growing it cell by cell with .at inside an iterrows loop).
grouped_text_date_df['positive'] = [
    sid.polarity_scores(text)['pos'] for text in grouped_text_date_df['processed_text']
]

lyrics_year = grouped_text_date_df.groupby(['release_year'])['positive']

In [None]:
lyrics_year

In [None]:
lyrics_year.head()

In [None]:
# check for positive and negative sentiment and create final dataset for sentiment analysis
sid = SentimentIntensityAnalyzer()

# Work on a real copy: a plain assignment would only create a second name for the
# same DataFrame, not a new one.
new_grouped_data = grouped_text_date_df.copy()

# Score each year's text once and reuse the result for both columns.
scores = [sid.polarity_scores(text) for text in new_grouped_data['processed_text']]
new_grouped_data['positive'] = [score['pos'] for score in scores]
new_grouped_data['negative'] = [score['neg'] for score in scores]

# Select the columns with a list: indexing a GroupBy with a bare tuple of column
# names is rejected by pandas 2.x.
lyrics_year = new_grouped_data.groupby(['release_year'])[['positive', 'negative']]


In [None]:
lyrics_year.head()

In [None]:
# check new data sets shape for this should include the positive and negative columns 
new_grouped_data.shape

In [None]:
new_grouped_data.head()

In [None]:
# export the sentiment analysis data set as CSV; it is the input for the Tableau visualization
new_grouped_data.to_csv(path_or_buf='~/Case_Studies/Spotify_Senti_Analysis.csv')