# Spotify Streaming History Dashboard
## Part 2: Acquiring Track Audio Features and Refining Dataframe
Summary of process:
- Add column for track start time 
- Add column for listening session number (start of new listening session defined as being > 60 mins since previous track end time)
- Add column for day number (1-365)
- Add columns for month, day of week, and hour of day
- Add audio features for each track using spotipy library 
- Refine dataframe and normalize audio features where necessary
***
### Install and Import Required Libraries

In [1]:
!pip install spotipy
!pip install pandas
!pip install tqdm



In [2]:
import calendar
import os
import random
import time
from datetime import datetime, timedelta

import pandas as pd
import spotipy
from sklearn import preprocessing
from spotipy.oauth2 import SpotifyOAuth
from tqdm.notebook import tqdm_notebook

### Read CSV to Dataframe

In [3]:
# Load the CSV into the working dataframe used by every later cell
df = pd.read_csv(filepath_or_buffer='StreamingHistory_PostURI.csv')

### Add Columns for Start Time, Listening Session, and Various Time Intervals

In [4]:
# Parse the end timestamps once. The 'endTime' column itself is left untouched
# (strings in 'YYYY-MM-DD HH:MM' form) so it is written back out unchanged.
end_time = pd.to_datetime(df['endTime'], format='%Y-%m-%d %H:%M')

# Add Start Time column: end time minus the time actually played
df['startTime'] = end_time - pd.to_timedelta(df['msPlayed'], unit='ms')

# Add Listening Session column
# A new session begins when a track starts more than 60 minutes after the previous
# row's end time. The first row has no previous row, so its gap is NaT; NaT compares
# False, which leaves the first row in session 1.
gap_since_previous = df['startTime'] - end_time.shift(1)
df['listeningSession'] = (gap_since_previous > pd.Timedelta(minutes=60)).cumsum() + 1

# Add day, day of week, month, and hour columns
# 'day' numbers the distinct start dates in chronological order, starting at 1.
# Numbering by full calendar date (not day-of-month, and not by comparing with the
# previous row) keeps the counter correct when two consecutive rows fall on the same
# day-of-month in different months, and when start times are slightly out of order
# around midnight (rows follow end-time order, not start-time order).
start_date = df['startTime'].dt.normalize()
df['day'] = pd.factorize(start_date, sort=True)[0] + 1
df['dayofweek'] = df['startTime'].dt.day_name()
df['month'] = df['startTime'].dt.month_name()
df['hour'] = df['startTime'].dt.hour

df.tail()

Unnamed: 0,endTime,artistName,trackName,msPlayed,uri,startTime,listeningSession,day,dayofweek,month,hour
41377,2022-05-20 23:10,Your Smith,Ooh Wee,110189,3NzqnpBP4gT27QMD73GwSM,2022-05-20 23:08:09.811,1057,365,Friday,May,23
41378,2022-05-20 23:10,Mac Miller,Ladders,17904,39NDBdU5Xkm5pCFGa5kZtI,2022-05-20 23:09:42.096,1057,365,Friday,May,23
41379,2022-05-20 23:15,Jessie Reyez,"Figures, a Reprise",4110,0qynxleiY0wavOijAW9trC,2022-05-20 23:14:55.890,1057,365,Friday,May,23
41380,2022-05-20 23:15,Daniel Caesar,OPEN UP,266226,4QTwWQDW89udp0JDO2lmPI,2022-05-20 23:10:33.774,1057,365,Friday,May,23
41381,2022-05-20 23:30,TENDER,Can't Show My Face,87796,456UdZSHZco8pr5Mau8w3e,2022-05-20 23:28:32.204,1057,365,Friday,May,23


### Retrieve Track Audio Features Using Spotipy Library

In [6]:
# Access Spotify Web API
# Credentials are read from environment variables instead of being written into the
# notebook, so they are not published along with it. Set SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting the kernel; a missing variable raises KeyError.
# NOTE(review): a client id/secret pair was previously committed in this cell and
# should be rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
# The redirect URI is not secret; fall back to the value this notebook has always used
redirect_uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:5000/callback')
scope = 'user-read-recently-played'

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri=redirect_uri,
                                               scope=scope))

In [10]:
# Prepare empty lists to be populated and added to df
danceability = []
energy = []
loudness = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
duration_ms = []

# Key: field name looked up in each audio-features result.
# Value: the list collecting that field, one entry per df row, in row order.
feature_lists = {
    'danceability': danceability,
    'energy': energy,
    'loudness': loudness,
    'speechiness': speechiness,
    'acousticness': acousticness,
    'instrumentalness': instrumentalness,
    'liveness': liveness,
    'valence': valence,
    'tempo': tempo,
    'duration_ms': duration_ms,
}

# Split df into 2 sections due to volume of requests
df1 = df.iloc[0:25000]
df2 = df.iloc[25000:]

# Populate lists with audio features (first half)
# Tracks are requested in batches rather than one request per row: sp.audio_features
# accepts a list of ids (documented maximum 100) and returns one entry per id.
batch_size = 100
uris = df1['uri'].tolist()
batch_starts = range(0, len(uris), batch_size)
for batch_no, start in enumerate(tqdm_notebook(batch_starts, desc='Audio features (first half)')):
    if batch_no % 2 == 0:
        # Short random pause every 200 tracks to avoid hammering the API
        time.sleep(random.uniform(3, 6))
    batch = uris[start:start + batch_size]
    results = sp.audio_features(batch)
    if len(results) != len(batch):
        # Results must line up one-to-one with the rows that were requested
        raise RuntimeError(f'Expected {len(batch)} audio-feature results, got {len(results)}')
    for features in results:
        for name, values in feature_lists.items():
            # A track without audio features comes back as a non-dict entry (None);
            # record 'NA' for it, and for any single missing field, so the row can be
            # found and removed later.
            values.append(features.get(name, 'NA') if isinstance(features, dict) else 'NA')

0it [00:00, ?it/s]

In [11]:
# Populate lists with audio features (second half)
# Key: field name looked up in each audio-features result.
# Value: the existing list that already holds the first-half values for that field.
feature_lists = {
    'danceability': danceability,
    'energy': energy,
    'loudness': loudness,
    'speechiness': speechiness,
    'acousticness': acousticness,
    'instrumentalness': instrumentalness,
    'liveness': liveness,
    'valence': valence,
    'tempo': tempo,
    'duration_ms': duration_ms,
}

# Every list must already hold one value per df1 row. Anything beyond that is left
# over from an earlier run of this cell and is discarded, so re-running the cell does
# not make the lists longer than df.
first_half_count = len(df1)
for name, values in feature_lists.items():
    if len(values) < first_half_count:
        raise RuntimeError(f"'{name}' has {len(values)} values; run the first-half cell to completion first")
    del values[first_half_count:]

# Tracks are requested in batches rather than one request per row: sp.audio_features
# accepts a list of ids (documented maximum 100) and returns one entry per id.
batch_size = 100
uris = df2['uri'].tolist()
batch_starts = range(0, len(uris), batch_size)
for batch_no, start in enumerate(tqdm_notebook(batch_starts, desc='Audio features (second half)')):
    if batch_no % 2 == 0:
        # Short random pause every 200 tracks to avoid hammering the API
        time.sleep(random.uniform(3, 6))
    batch = uris[start:start + batch_size]
    results = sp.audio_features(batch)
    if len(results) != len(batch):
        # Results must line up one-to-one with the rows that were requested
        raise RuntimeError(f'Expected {len(batch)} audio-feature results, got {len(results)}')
    for features in results:
        for name, values in feature_lists.items():
            # A track without audio features comes back as a non-dict entry (None);
            # record 'NA' for it, and for any single missing field, so the row can be
            # found and removed later.
            values.append(features.get(name, 'NA') if isinstance(features, dict) else 'NA')

0it [00:00, ?it/s]

In [12]:
# Attach each collected audio-feature list to df as a column of the same name.
# Every list must have exactly one entry per df row.
df = df.assign(
    danceability=danceability,
    energy=energy,
    loudness=loudness,
    speechiness=speechiness,
    acousticness=acousticness,
    instrumentalness=instrumentalness,
    liveness=liveness,
    valence=valence,
    tempo=tempo,
    duration_ms=duration_ms,
)

df.tail()

Unnamed: 0,endTime,artistName,trackName,msPlayed,uri,startTime,listeningSession,day,dayofweek,month,...,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
41377,2022-05-20 23:10,Your Smith,Ooh Wee,110189,3NzqnpBP4gT27QMD73GwSM,2022-05-20 23:08:09.811,1057,365,Friday,May,...,0.748,0.645,-8.08,0.03,0.15,0.0111,0.0929,0.634,95.993,251217
41378,2022-05-20 23:10,Mac Miller,Ladders,17904,39NDBdU5Xkm5pCFGa5kZtI,2022-05-20 23:09:42.096,1057,365,Friday,May,...,0.802,0.463,-8.379,0.162,0.236,0.00531,0.105,0.291,103.961,287040
41379,2022-05-20 23:15,Jessie Reyez,"Figures, a Reprise",4110,0qynxleiY0wavOijAW9trC,2022-05-20 23:14:55.890,1057,365,Friday,May,...,0.506,0.412,-5.64,0.137,0.667,0.0,0.0983,0.195,173.115,192827
41380,2022-05-20 23:15,Daniel Caesar,OPEN UP,266226,4QTwWQDW89udp0JDO2lmPI,2022-05-20 23:10:33.774,1057,365,Friday,May,...,0.84,0.192,-12.843,0.0514,0.409,6e-06,0.101,0.31,114.93,266227
41381,2022-05-20 23:30,TENDER,Can't Show My Face,87796,456UdZSHZco8pr5Mau8w3e,2022-05-20 23:28:32.204,1057,365,Friday,May,...,0.268,0.405,-9.606,0.0452,0.0946,0.297,0.104,0.0524,107.059,249120


### Refine Dataframe by Removing Errors

In [13]:
# Collect the rows where at least one audio-feature column holds the 'NA' placeholder
df_NA = df[
    df[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']]
    .eq('NA')
    .any(axis=1)
]
df_NA

Unnamed: 0,endTime,artistName,trackName,msPlayed,uri,startTime,listeningSession,day,dayofweek,month,...,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
33103,2022-03-06 03:08,Natasha Bedingfield,These Words,216360,6MFQeWtk7kxWGydnJB2y36,2022-03-06 03:04:23.640,853,288,Sunday,March,...,,,,,,,,,,


In [14]:
# Remove rows where audio features weren't found
# Rows are selected by content (any feature equal to the 'NA' placeholder) rather than
# by previously saved index labels. Because the index is reset below, saved labels
# would point at different, valid rows if this cell were run a second time; selecting
# by content makes a re-run a no-op.
feature_cols = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
has_missing_feature = df[feature_cols].eq('NA').any(axis=1)
df = df.loc[~has_missing_feature].reset_index(drop=True)

# Update datatype for numeric columns
df = df.astype({'listeningSession': int, 'msPlayed': int, 'danceability': float, 'energy': float,
                'loudness': float, 'speechiness': float, 'acousticness': float, 'instrumentalness': float,
                'liveness': float, 'valence': float, 'tempo': float, 'duration_ms': int, 'day': int, 'hour': int})

### Normalize Audio Features Using sklearn Preprocessing

In [15]:
# Loudness and tempo are the only audio features not already on a 0-to-1 scale.
# Min-max scale both and store the results in companion '*_norm' columns, leaving the
# raw values in place.
min_max_scaler = preprocessing.MinMaxScaler()
cols_to_norm = ['loudness', 'tempo']
scaled_values = min_max_scaler.fit_transform(df[cols_to_norm])
df['loudness_norm'] = scaled_values[:, 0]  # column 0 corresponds to 'loudness'
df['tempo_norm'] = scaled_values[:, 1]     # column 1 corresponds to 'tempo'

# Reorganize columns: time fields, then track fields, then audio features
time_cols = ['day', 'dayofweek', 'month', 'hour', 'listeningSession', 'startTime', 'endTime']
track_cols = ['msPlayed', 'duration_ms', 'trackName', 'artistName', 'uri']
audio_cols = ['danceability', 'energy', 'loudness', 'loudness_norm', 'speechiness', 'acousticness',
              'instrumentalness', 'liveness', 'valence', 'tempo', 'tempo_norm']
col_order = time_cols + track_cols + audio_cols
df = df.loc[:, col_order]

df.tail()

Unnamed: 0,day,dayofweek,month,hour,listeningSession,startTime,endTime,msPlayed,duration_ms,trackName,...,energy,loudness,loudness_norm,speechiness,acousticness,instrumentalness,liveness,valence,tempo,tempo_norm
41376,365,Friday,May,23,1057,2022-05-20 23:08:09.811,2022-05-20 23:10,110189,251217,Ooh Wee,...,0.645,-8.08,0.796663,0.03,0.15,0.0111,0.0929,0.634,95.993,0.436498
41377,365,Friday,May,23,1057,2022-05-20 23:09:42.096,2022-05-20 23:10,17904,287040,Ladders,...,0.463,-8.379,0.789577,0.162,0.236,0.00531,0.105,0.291,103.961,0.47273
41378,365,Friday,May,23,1057,2022-05-20 23:14:55.890,2022-05-20 23:15,4110,192827,"Figures, a Reprise",...,0.412,-5.64,0.854489,0.137,0.667,0.0,0.0983,0.195,173.115,0.787187
41379,365,Friday,May,23,1057,2022-05-20 23:10:33.774,2022-05-20 23:15,266226,266227,OPEN UP,...,0.192,-12.843,0.683785,0.0514,0.409,6e-06,0.101,0.31,114.93,0.522609
41380,365,Friday,May,23,1057,2022-05-20 23:28:32.204,2022-05-20 23:30,87796,249120,Can't Show My Face,...,0.405,-9.606,0.760499,0.0452,0.0946,0.297,0.104,0.0524,107.059,0.486818


### Save to CSV

In [16]:
# Write the final dataframe to CSV, leaving out the row index
df.to_csv(path_or_buf='StreamingHistory_Final.csv', index=False)