In [2]:
# import the Pandas library
import pandas as pd

In [3]:
# load the Spotify track data from the CSV file into a DataFrame
csv_path = 'taylor_swift_spotify_data.csv'
df = pd.read_csv(csv_path)

In [4]:
# keep the DataFrame's column labels in `columns` (reused by the next cell
# to count them) and print them to see which fields are available
columns = df.columns
print(columns)

Index(['artist_name', 'artist_id', 'album_id', 'album_type',
       'album_release_date', 'album_release_year',
       'album_release_date_precision', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'track_id', 'analysis_url',
       'time_signature', 'disc_number', 'duration_ms', 'explicit',
       'track_href', 'is_local', 'track_name', 'track_preview_url',
       'track_number', 'type', 'track_uri', 'external_urls.spotify',
       'album_name', 'key_name', 'mode_name', 'key_mode'],
      dtype='object')


In [5]:
# how many columns are there, how many rows?
# df.shape is (rows, columns), so both counts come from a single lookup
n_rows, n_columns = df.shape
print('No. of columns:', n_columns)
print('No. of rows:', n_rows)

No. of columns: 36
No. of rows: 1265


In [6]:
# what's the average length of a Taylor Swift song?
durations_ms = df['duration_ms']
avg_duration = durations_ms.mean()
print(avg_duration)
# the value is in milliseconds (per the column name), which is hard to read as-is

230381.6181818182


In [7]:
# define a function to convert the millisecond timestamp into something more understandable
def convert_time(time_ms):
    """Format a duration given in milliseconds as 'M minutes and S seconds'.

    Both the minute count and the leftover seconds are truncated with
    ``int()``, so fractional seconds are dropped rather than rounded.
    """
    total_seconds = time_ms / 1000
    # whole minutes contained in the duration
    whole_minutes = int(total_seconds / 60)
    # seconds remaining after the last whole minute
    leftover_seconds = int(total_seconds % 60)
    return f'{whole_minutes} minutes and {leftover_seconds} seconds'

# render the average duration in minutes/seconds before printing it
avg_length_text = convert_time(avg_duration)
print('The average song length is ', avg_length_text)

The average song length is  3 minutes and 50 seconds


But how does this compare to the average length of recent pop songs? Well, the average in recent years is about... 3:50!
https://www.vox.com/2014/8/18/6003271/why-are-songs-3-minutes-long

How about the tempo, then? The average pop-song tempo is 116 bpm: https://www.washingtonpost.com/news/to-your-health/wp/2015/10/30/the-mathematical-formula-behind-feel-good-songs/

In [8]:
# mean of the 'tempo' column across all tracks
avg_tempo = df['tempo'].mean()
# a bit faster, but still very typical of the pop genre
avg_tempo

120.87697944664023

What are the top 3 keys that Taylor Swift likes to write in?

In [9]:
# value_counts() sorts by frequency (most common first), so the first
# three rows are the three most-used key/mode combinations
key_mode_counts = df['key_mode'].value_counts()
key_mode_counts.head(3)

G major    224
F major    156
C major    151
Name: key_mode, dtype: int64

# Data Cleaning
List all of her albums on Spotify

In [12]:
# distinct album names, in order of first appearance in the data
album_column = df['album_name']
album_column.unique()

array(['Midnights', 'evermore', 'folklore', 'Lover',
       'Taylor Swift Karaoke: reputation', 'reputation',
       'reputation Stadium Tour Surprise Song Playlist', '1989',
       'Taylor Swift Karaoke: 1989', 'Red', 'Taylor Swift Karaoke: Red',
       'Speak Now World Tour Live', 'Speak Now',
       'Taylor Swift Karaoke: Speak Now', 'Fearless', 'Fearless Karaoke',
       'Fearless Platinum Edition',
       'Live From Clear Channel Stripped 2008', 'Taylor Swift',
       'Taylor Swift Karaoke'], dtype=object)

It seems like many of the albums come in several versions. Can we clean up this data?


In [14]:
# Build a mask that keeps only rows whose album name has no "(" in it, i.e.
# drops any "(...)" variant releases. regex=False matches the parenthesis
# literally, which avoids the invalid '\(' escape sequence that a plain
# (non-raw) string literal triggers on modern Python.
# NOTE: none of the 20 album names listed above contains "(", so on this
# data the filter currently removes nothing.
mask = ~df['album_name'].str.contains('(', regex=False)
# .loc selects rows and column in one step instead of chained indexing
albums = df.loc[mask, 'album_name'].unique()
print(len(albums))
print(albums)

20
['Midnights' 'evermore' 'folklore' 'Lover'
 'Taylor Swift Karaoke: reputation' 'reputation'
 'reputation Stadium Tour Surprise Song Playlist' '1989'
 'Taylor Swift Karaoke: 1989' 'Red' 'Taylor Swift Karaoke: Red'
 'Speak Now World Tour Live' 'Speak Now' 'Taylor Swift Karaoke: Speak Now'
 'Fearless' 'Fearless Karaoke' 'Fearless Platinum Edition'
 'Live From Clear Channel Stripped 2008' 'Taylor Swift'
 'Taylor Swift Karaoke']


I am trying to open a club. What's Taylor Swift's most danceable song?

In [158]:
# find the top danceability score, then show every track that reaches it
max_dance = df['danceability'].max()
mask = df['danceability'].eq(max_dance)
df.loc[mask, ['album_name', 'track_name', 'danceability']]

Unnamed: 0,album_name,track_name,danceability
370,Lover,I Think He Knows,0.897
