<a href="https://colab.research.google.com/github/cnn22/SingerSongwriter/blob/main/TaylorSwift_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spotipy

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.1-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.1 spotipy-2.23.0


In [2]:
import pandas as pd
import pickle
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import requests

# 1. Gather Data

## 1.1 Get Lyrics

In [3]:
# Fetch the album catalogue from the Taylor Swift API into the `albums` DataFrame.
url = "https://taylor-swift-api.sarbo.workers.dev/albums"
# requests applies no timeout by default; without one an unresponsive server
# would stall this cell indefinitely.
response = requests.get(url, timeout=30)

if response.status_code == 200:
  data = response.json()
  albums = pd.DataFrame(data)
else:
  # `albums` is not (re)assigned on failure, so dependent cells should not be run.
  print(f'Failed to retrieve data. Status code: {response.status_code}')

In [4]:
# Fetch every song (across all albums) from the Taylor Swift API into the `songs` DataFrame.
url = 'https://taylor-swift-api.sarbo.workers.dev/songs'
# requests applies no timeout by default; without one an unresponsive server
# would stall this cell indefinitely.
response = requests.get(url, timeout=30)

if response.status_code == 200:
  data = response.json()
  songs = pd.DataFrame(data)
else:
  # `songs` is not (re)assigned on failure, so dependent cells should not be run.
  print(f'Failed to retrieve data. Status code: {response.status_code}')

In [5]:
# Download the lyrics of every song listed in `songs`, one request per song_id,
# and collect them into `lyrics_df` (columns: song_title, lyrics).
lyrics_url = 'https://taylor-swift-api.sarbo.workers.dev/lyrics/'

# Accumulate plain records and build the DataFrame once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies every earlier row each time.
lyrics_records = []

for song in songs['song_id']:
  url = f'{lyrics_url}{song}'
  # Explicit timeout so a single stalled request cannot hang the whole loop.
  response = requests.get(url, timeout=30)

  if response.status_code == 200:
    data = response.json()
    lyrics_records.append({'song_title': data['song_title'], 'lyrics': data['lyrics']})
  else:
    # Best effort: report the failure and carry on with the remaining songs.
    print(f'Failed to retrieve data. Status code: {response.status_code}')

# Passing `columns` keeps the expected schema even if no request succeeded.
lyrics_df = pd.DataFrame(lyrics_records, columns=['song_title', 'lyrics'])

In [6]:
# Join the downloaded lyrics onto the song list by title, discarding the
# duplicate title column that comes from lyrics_df.
lyrics = songs.merge(
    lyrics_df,
    how='inner',
    left_on='title',
    right_on='song_title',
).drop(columns=['song_title'])

# Rename the album's own 'title' so it does not collide with the song title,
# then bring the album details in via album_id.
albums = albums.rename(columns={'title': 'album_title'})
lyrics = lyrics.merge(albums, how='inner', on='album_id')
# lyrics.to_excel('all_taylor_swift_lyrics.xlsx', index = True)


## 1.2 Get song attributes

In [7]:
# Load the song/lyrics table from disk, replacing the `songs` frame fetched from the API above.
# NOTE(review): no cell in this notebook writes 'all_taylor_swift_lyrics.csv' (the only
# export above is a commented-out .xlsx) — confirm how this file is produced.
songs = pd.read_csv('all_taylor_swift_lyrics.csv', index_col = False)

In [8]:
# Tag every row with the artist, then add empty placeholder columns that the
# Spotify lookups further down fill in.
songs['Artist'] = 'Taylor Swift'
for placeholder_column in (
    'SpotifyID', 'Tempo', 'Loudness', 'Key', 'isExplicit', 'Danceability',
    'Energy', 'Liveness', 'Duration', 'Popularity', 'Valence',
):
    songs[placeholder_column] = None

In [9]:
# Spotify API credentials (blank in the shared notebook; supply your own before running).
cid = ""
secret = ""

# Build the client-credentials manager, then the Spotify client (`sp`) that the
# helper functions in this notebook call.
client_credentials_manager = SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [10]:
#getSongID scans Spotify search results for the track whose title and first
# credited artist match the song/artist held in the data frame
def getSongID(trackName, trackArtist, trackResults):
    """Return the Spotify ID of the first search hit matching a title and artist.

    Parameters
    ----------
    trackName : str
        Song title to look for (compared case-insensitively).
    trackArtist : str
        Artist name, compared case-insensitively against the FIRST credited
        artist of each hit only.
    trackResults : dict
        Search payload shaped like ``{'tracks': {'items': [...]}}`` where each
        item carries ``'name'``, ``'artists'`` and ``'id'`` keys.

    Returns
    -------
    str or None
        The ``'id'`` of the first matching item, or ``None`` if nothing matches.
    """
    # Lower-case the targets once instead of on every iteration.
    wanted_name = trackName.lower()
    wanted_artist = trackArtist.lower()

    for track in trackResults['tracks']['items']:
        # Check the title first so the artist list is only inspected for hits
        # whose title already matches (the previous bitwise `&` evaluated both
        # sides unconditionally).
        if track['name'].lower() != wanted_name:
            continue
        artists = track['artists']
        # A hit with no credited artists cannot match; skip it instead of
        # raising IndexError on artists[0].
        if artists and artists[0]['name'].lower() == wanted_artist:
            return track['id']
    return None

In [11]:
#getExplicit: look up a track by its Spotify ID and report whether it is flagged explicit
def getExplicit(songID):
    """Return the 'explicit' field of the track returned by ``sp.track(songID)``."""
    track_info = sp.track(songID)
    return track_info['explicit']

In [12]:
#getTrackAttributes: fetch the audio-feature record (tempo, loudness, key, duration, ...) for one Spotify ID
def getTrackAttributes(songID):
    """Return the first entry of ``sp.audio_features(tracks=songID)``."""
    feature_entries = sp.audio_features(tracks=songID)
    first_entry = feature_entries[0]
    return first_entry

In [13]:
def getPopularity(songID):
  """Return the 'popularity' field of the track returned by ``sp.track(songID)``."""
  track_info = sp.track(songID)
  return track_info['popularity']

In [14]:
#getSongKey function that takes in a pitch class and returns the key. Reference: https://en.wikipedia.org/wiki/Pitch_class
def getSongKey(pitchClass):
    """Translate a pitch-class number (0-11) into its key name.

    Parameters
    ----------
    pitchClass : int
        Pitch class in standard notation (0 = C, 1 = C#/Db, ... 11 = B).

    Returns
    -------
    str or None
        The key name; enharmonic pairs are written ``'<sharp>, <flat>'``.
        Any value outside 0-11 (for example -1 or None) yields ``None``.
    """
    key_names = {
        0: 'C',
        1: 'C#, Db',
        2: 'D',
        3: 'D#, Eb',
        4: 'E',
        5: 'F',
        6: 'F#, Gb',
        7: 'G',
        8: 'G#, Ab',
        9: 'A',
        # Was 'A# Bb'; the comma makes it consistent with the other enharmonic pairs.
        10: 'A#, Bb',
        11: 'B',
    }
    try:
        return key_names.get(pitchClass)
    except TypeError:
        # Unhashable input can never equal an integer pitch class.
        return None

In [15]:
# Search Spotify (US market, first 10 hits) for each title and store the ID of
# the hit that getSongID accepts; rows with no accepted hit receive None.
for row_label, song_title, song_artist in zip(songs.index, songs['title'], songs['Artist']):
  search_hits = sp.search(q=song_title, type='track', market= 'US', limit=10, offset=0)
  songs.at[row_label, 'SpotifyID'] = getSongID(song_title, song_artist, search_hits)

In [59]:
# Hand-curated Spotify IDs, keyed by the exact `title` value in `songs`.
# These are applied unconditionally, overriding whatever the search cell stored.
MANUAL_SPOTIFY_IDS = {
    'tis the damn season': '4GBkffrtA51p17JH35irGA',
    "Soon You'll Get Better": '4AYtqFyFbX0Xkc2wtcygTr',
    "It's Nice To Have A Friend": '1SmiQ65iSAbPto6gPFlBYm',
    "Don't Blame Me": '1R0a2iXumgCiFb7HEZ7gUE',
    "New Year's Day": '7F5oktn5YOsR9eR5YsFtqb',
    "Question…?": '0heeNYlwOGuUSe7TgUD27B',
    "Snow on the Beach": '4zmKGsrXjLmljb5fTaBTot',
    "So It Goes…": '5PxFv9yJEg9dxvbZggykro',
    "it's time to go": '6RvmQShxVQwJ0AWMtP6JoU',
    "Teardrops on My Guitar": '7zMcNqs55Mxer82bvZFkpg',
    "Breathe": '49mWEy5MgtNujgT7xU3emT',
    "Tell Me Why": '3rnI1UCyGJvUTVvT97VQr5',
    "We Were Happy": '34V9RiEPe8MNdU32qJsJa1',

    "Nothing New (Taylor’s version)": '01K4zKU104LyJ8gMb7227B',
    "Everything Has Changed (Taylor’s version)": '7qEUFOVcxRI19tbT68JcYK',
    "I Bet You Think About Me (Taylor’s version)": '4CkgMiMqZ5JzW9iYXSTMTL',
    "Red (Taylor’s version)": '4OAuvHryIVv4kMDNSLuPt6',
    "Run (Taylor’s version)": '4IQkfUsrwXol38VV3U7t7T',
    "The Very First Night (Taylor’s version)": '6pYNq0ZwpPVazKzsqpf0G8',
    "22 (Taylor’s version)": '3yII7UwgLF6K5zW3xad3MP',
    "State of Grace (Taylor’s version)": '6lzc0Al0zfZOIFsFvBS1ki',
    "Sad Beautiful Tragic (Taylor’s version)": '73qMN9bXy7MSPwwGfH3wQr',
    "Better Man (Taylor’s version)": '4OmFmE0fzcMG6g0Y8p4eSD',
    "I Almost Do (Taylor’s version)": '2r9CbjYgFhtAmcFv1cSquB',
    "Forever Winter (Taylor’s version)": '3oGVx9RBmiYGv5ZCecWLkx',
    "Ronan (Taylor’s version)": '7nWui6jiMM2m9qFmET1Mtj',
    "Babe (Taylor’s version)": '0v4z1tuZvn6LGknom9Qx7d',
    # FIXME(review): the next two titles share one Spotify ID, which looks like a
    # copy-paste slip — confirm the correct ID for each before trusting their metrics.
    "Begin (Taylor’s version)": '05GsNucq8Bngd9fnd4fRa0',
    "Girl at Home (Taylor’s version)": '05GsNucq8Bngd9fnd4fRa0',
    "All Too Well (Taylor’s version)": '3nsfB1vus2qaloUdcBZvDu',
    "Holy Ground (Taylor’s version)": '7J4b3LVCIGO4CMBDFLPoP6',
    "Treacherous (Taylor’s version)": '3S7HNKPakdwNEBFIVTL6dZ',
    "The Lucky One (Taylor’s version)": '4e5ayHsOLJNLTGfjau2mEw',
    "We Are Never Ever Getting Back Together (Taylor’s version)": '5YqltLsjdqFtvqE7Nrysvs',
    "Come Back… Be Here (Taylor’s version)": '4pNApnaUWAL2J4KO2eqokq',
    "I Knew You Were Trouble (Taylor’s version)": '6AtZLIzUINvExIUy4QhdjP',
    "The Moment I Knew (Taylor’s version)": '0NRHj8hDwwmSPaA41o379r',
    "Starlight (Taylor’s version)": '7A2cNLRT0YJc1yjxHlKihs',
    "Stay Stay Stay (Taylor’s version)": '7eQj6r5PIdYKEIZjucBMcq',
    "Message in a Bottle (Taylor’s version)": '3z6XUommYDWPHeFhmhhT6j',
    "All Too Well (10 minute version) (Taylor’s version)": '5enxwA8aAbwZbf5qCHORXi',
}

# Titles absent from `songs` simply match no rows, exactly as with the
# individual assignments this table replaces.
for override_title, override_id in MANUAL_SPOTIFY_IDS.items():
    songs.loc[songs['title'] == override_title, 'SpotifyID'] = override_id

# Keep only songs that ended up with a Spotify ID. The explicit copy makes the
# filtered frame independent of the original, so the in-place writes in later
# cells act on a real DataFrame rather than a slice of one.
songs = songs[songs['SpotifyID'].notna()].copy()

In [62]:
# songs[songs.SpotifyID.isnull()].count()

In [61]:
# Iterate through the songs data frame and fill in each track's attributes from
# Spotify's API: isExplicit and popularity (Get Track) plus tempo, loudness,
# key, danceability, energy, liveness, duration and valence (audio features).

for index, row in songs.iterrows():
    spotify_id = row['SpotifyID']

    # One Get Track request supplies both 'explicit' and 'popularity'; calling
    # getExplicit and getPopularity separately fetched the same track twice.
    track = sp.track(spotify_id)
    songs.at[index, 'isExplicit'] = track['explicit']
    songs.at[index, 'Popularity'] = track['popularity']

    attributes = getTrackAttributes(spotify_id)
    if attributes is None:
        # NOTE(review): the audio-features lookup is understood to yield a null
        # entry for tracks it has no analysis for — report and move on rather
        # than aborting the whole loop on a TypeError.
        print(f'No audio features returned for SpotifyID {spotify_id}; audio columns left empty.')
        continue

    #filling in values
    songs.at[index, 'Tempo'] = attributes['tempo']
    songs.at[index, 'Loudness'] = attributes['loudness']
    songs.at[index, 'Key'] = getSongKey(attributes['key'])
    songs.at[index, 'Danceability'] = attributes['danceability']
    songs.at[index, 'Energy'] = attributes['energy']
    songs.at[index, 'Liveness']= attributes['liveness']
    songs.at[index, 'Duration'] = attributes['duration_ms']/1000 #converting from milliseconds to seconds
    songs.at[index, 'Valence'] = attributes['valence']

In [63]:
# songs.to_csv('TAYLOR_SWIFT_FINAL_DATASET.csv', index = False)

# 2. Sentiment Analysis

In [67]:
!pip install vaderSentiment



In [68]:
import pandas as pd
import pickle as pk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [69]:
# songs = pd.read_csv('TAYLOR_SWIFT_FINAL_DATASET.csv')

In [70]:
# Score every song's lyrics with VADER and keep one record per song.
analyzer = SentimentIntensityAnalyzer()

sentiment_scores = []

for song_title, album_name, lyric_text in zip(songs['title'], songs['album_title'], songs['lyrics']):
  polarity = analyzer.polarity_scores(lyric_text)
  sentiment_scores.append(dict(
      Title=song_title,
      Album=album_name,
      Positive=polarity['pos'],
      Neutral=polarity['neu'],
      Negative=polarity['neg'],
      Compound=polarity['compound'],
  ))

In [83]:
# for sentiment_data in sentiment_scores:
#     print(f"Album: {sentiment_data['Album']}")
#     print(f"Track: {sentiment_data['Title']}")
#     print(f"Positive: {sentiment_data['Positive']}")
#     print(f"Neutral: {sentiment_data['Neutral']}")
#     print(f"Negative: {sentiment_data['Negative']}")
#     print(f"Compound: {sentiment_data['Compound']}")
#     print("\n")  # Add a newline for better readability between song data

In [72]:
# One row per song: Title, Album and the four VADER scores collected above.
sentiment_df = pd.DataFrame(sentiment_scores)

In [73]:
# Display the per-song sentiment table.
sentiment_df

Unnamed: 0,Title,Album,Positive,Neutral,Negative,Compound
0,Blank Space,1989,0.164,0.688,0.149,0.8537
1,Style,1989,0.096,0.898,0.007,0.9894
2,Out Of The Woods,1989,0.227,0.766,0.007,0.9995
3,All You Had to Do Was Stay,1989,0.128,0.850,0.022,0.9932
4,Shake It Off,1989,0.089,0.500,0.411,-0.9998
...,...,...,...,...,...,...
160,Bigger Than the Whole Sky,Midnights,0.000,0.957,0.043,-0.8665
161,Paris,Midnights,0.220,0.734,0.046,0.9971
162,Glitch,Midnights,0.116,0.862,0.022,0.9750
163,Dear Reader,Midnights,0.205,0.729,0.066,0.9921


In [74]:
# Inner-join the song table with its sentiment scores on (album, title).
# sentiment_df's own Title/Album columns are kept alongside title/album_title in the result.
taylor_swift_sentiment_analysis_df = pd.merge(songs, sentiment_df, left_on = ['album_title', 'title'], right_on = ['Album', 'Title'], how='inner')

In [75]:
# Persist the merged dataset; index=False keeps the row index out of the file.
taylor_swift_sentiment_analysis_df.to_csv('TAYLOR_SWIFT_FINAL_DATASET.csv', index = False)

In [76]:
# Display the merged song + sentiment dataset.
taylor_swift_sentiment_analysis_df

Unnamed: 0,song_id,title,album_id,lyrics,album_title,release_date,Artist,SpotifyID,Tempo,Loudness,...,Liveness,Duration,Popularity,Valence,Title,Album,Positive,Neutral,Negative,Compound
0,1,Blank Space,1,"Nice to meet you, where you been?\nI could sho...",1989,2014-10-27,Taylor Swift,1p80LdxRV74UKvL8gnD7ky,96.006,-5.421,...,0.13,231.827,81,0.583,Blank Space,1989,0.164,0.688,0.149,0.8537
1,2,Style,1,"Midnight\nYou come and pick me up, no headligh...",1989,2014-10-27,Taylor Swift,4lIxdJw6W3Fg4vUIYCB0S5,95.019,-5.572,...,0.117,231.0,81,0.456,Style,1989,0.096,0.898,0.007,0.9894
2,3,Out Of The Woods,1,Looking at it now\nIt all seems so simple\nWe ...,1989,2014-10-27,Taylor Swift,5OndtwLGA9O6XHFcGm2H7r,91.991,-6.938,...,0.337,235.8,70,0.343,Out Of The Woods,1989,0.227,0.766,0.007,0.9995
3,4,All You Had to Do Was Stay,1,"(Hey, hey, hey)\n(Hey, hey, hey)\n(Hey, hey, h...",1989,2014-10-27,Taylor Swift,0dAb8TY433dl3ZfXYCLE19,96.969,-5.778,...,0.105,193.293,68,0.471,All You Had to Do Was Stay,1989,0.128,0.850,0.022,0.9932
4,5,Shake It Off,1,I stay out too late\nGot nothing in my brain\n...,1989,2014-10-27,Taylor Swift,5xTtaWoae3wi06K5WfVUUH,160.015,-5.414,...,0.148,219.2,77,0.943,Shake It Off,1989,0.089,0.500,0.411,-0.9998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,163,Bigger Than the Whole Sky,10,No words appear before me in the aftermath\nSa...,Midnights,2022-10-21,Taylor Swift,0BiqmkasE5FdrChwKfVp8X,165.71,-12.379,...,0.115,218.503,73,0.068,Bigger Than the Whole Sky,Midnights,0.000,0.957,0.043,-0.8665
161,164,Paris,10,Your ex-friend's sister met someone at a club ...,Midnights,2022-10-21,Taylor Swift,7712gjoih4QoDbXpljEk21,110.947,-10.547,...,0.137,196.259,72,0.345,Paris,Midnights,0.220,0.734,0.046,0.9971
162,165,Glitch,10,We were supposed to be just friends\nYou don't...,Midnights,2022-10-21,Taylor Swift,6wAFvJPpTZVirBKGZ4EnMW,140.864,-9.738,...,0.11,148.781,70,0.347,Glitch,Midnights,0.116,0.862,0.022,0.9750
163,166,Dear Reader,10,Dear reader\nIf it feels like a trap\nYou're a...,Midnights,2022-10-21,Taylor Swift,3QF5RsWzK1lCvf2o2cY65P,107.747,-12.088,...,0.117,225.194,70,0.159,Dear Reader,Midnights,0.205,0.729,0.066,0.9921


# 3. Word Count

In [77]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import re

In [78]:
# Fetch the NLTK resources used below: tokenizer models, the stop-word list and WordNet.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load
# the tokenizer data from it, while older releases keep using 'punkt'.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [79]:
# Reload the dataset saved by the sentiment-analysis section (same file name as its to_csv call).
df = pd.read_csv('TAYLOR_SWIFT_FINAL_DATASET.csv')

In [80]:
# Build a corpus-wide word-frequency table from the lyrics of every song.

# Skip missing lyrics so str.join never receives a NaN float.
text = ' '.join(df['lyrics'].dropna().astype(str))
lemmatizer = WordNetLemmatizer()

text = text.lower() #lower case all chars
# Remove numeric tokens (1, 4th, 1950s) BEFORE stripping non-letters: once the
# next substitution has deleted every digit this pattern can no longer match,
# and tokens such as "4th" would survive as the stub "th".
text = re.sub(r'\b\d+\b|\d+[a-zA-Z]+\w*', '', text)
text = re.sub(r'[^a-zA-Z\s]', '', text) #remove punctuation/special chars
# text = re.sub(r'(\w+)\1{1,}', r'\1', text) # Remove repeated sequences of characters (e.g., 'mmm', 'oh', 'ohoh')
words = word_tokenize(text)

# Drop stop words, plus a few contractions whose apostrophes were stripped
# above and which are filtered explicitly here.
stop_words = set(stopwords.words('english'))
stripped_contractions = {"im", "youre", "id", "dont", "ive"}
words = [
    word for word in words
    if word not in stop_words and word not in stripped_contractions
]

# Lemmatize each word twice: first with the lemmatizer's default part of
# speech, then again as a verb (pos='v').
words = [lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos='v') for word in words]

word_freq = Counter(words)

# Most frequent words first.
word_freq_df = pd.DataFrame(word_freq.items(), columns = ['Word', 'Frequency'])
word_freq_df = word_freq_df.sort_values(by='Frequency', ascending=False)

In [81]:
# Export the word-frequency table to an Excel workbook, without the row index.
word_freq_df.to_excel('word_freq.xlsx', index = False)

In [82]:
# word_freq_df[word_freq_df.Word == 'mmmmmmm']

# Notes

## Vader Sentiment





### What is it?

Valence Aware Dictionary and sEntiment Reasoner (VADER) is a pre-trained sentiment analysis tool designed to analyze text data and determine the sentiment or emotional tone expressed. VADER is specifically tailored for social media text and informal language.


## Key Features of VADER
1.   Valence awareness. It's sensitive to both the polarity (positive or negative) and the intensity of sentiment in text. It can recognize not only whether a statement is positive or negative but also the degree of sentiment intensity.
2.   Pre-trained lexicon. It contains thousands of words and phrases, each assigned a polarity and an intensity score. This allows VADER to analyze text quickly without the need for additional training.
3. Recognizes emoticons and punctuation!
4. Noisy text handling. It's robust in handling noisy text data, such as text with misspellings, grammatical errors, and informal language.

### Compound Score
A single numerical value that summarizes the sentiment of the text in a way that considers both positive and negative sentiments present.
*   A score greater than 0 suggests a positive sentiment (e.g., 0.2 indicates mild positive sentiment)
*   A score less than 0 suggests a negative sentiment (e.g., -0.3 indicates mild negative sentiment)
*   A score close to 0 suggests a relatively neutral sentiment (e.g., -0.1 to 0.1)

### Compound Score Calculation
The sum of the valence scores of every word in the text, adjusted according to VADER's rules (negation, intensifiers, punctuation, capitalization), and then normalized to fall within a range between -1 (most negative) and +1 (most positive).

## How VADER calculates scores
1.   For each word in the text, VADER checks if the word exists in its lexicon
2.   If the word is found, it considers the assigned polarity and intensity scores for that word.
3.   It aggregates the scores across all words in the text, considering their positions and grammatical modifiers.
4.   The positive, negative, and neutral scores are normalized to a range of 0 to 1.
5.   The compound score is calculated by normalizing the summed valence score $x$ with the formula $x / \sqrt{x^2 + \alpha}$ (with $\alpha = 15$ in the reference implementation), which maps it into the range -1 to +1.

## Spotify Metrics

* **Danceability**: Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.

* **Energy**: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.

* **Liveness**: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.

* **Loudness**: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB.

* **Tempo**: The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.

* **Valence**: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).

## Dashboarding Ideas

### the ideas...
Taylor Swift's Musical Evolution:

Use line charts to show how Taylor Swift's music has evolved over time in terms of tempo, loudness, and key.
Showcase her albums on the x-axis and metrics on the y-axis, allowing users to see trends and changes.


********************************
Sentiment Analysis Over Albums:

Create a bar chart or heatmap to display sentiment scores for each song within Taylor Swift's albums. Group songs by album and visualize how sentiment varies from one album to another.

*******************************
Song Characteristics vs. Sentiment:

Explore the relationship between song characteristics (e.g., tempo, energy) and sentiment scores.
Create scatter plots to see if there is any correlation between certain characteristics and sentiment.