# This section of the code calls the Covid-19 John Hopkins API and merges the cases dataset to the enriched Spotify Top 50 dataset (that contains the info from the Spotify API) by date.

In [99]:
# Import necessary packages for API requests and manipulation

import requests
import pandas as pd
import datetime
import numpy as np
import os

In [68]:
# Get the API data for the past 467 days (or however many days to January 1st 2020 as of today)

response = requests.get("https://disease.sh/v3/covid-19/historical/all?lastdays=467")

In [69]:
# Test API connection

if response.status_code == 200:
    print("API Successful")
else:
    print("API Query Failed")

API Successful


In [70]:
# Load data as JSON

import json
covid_data = json.loads(response.text)

In [71]:
# Isolate only cases key of the larger nested dictionary

covid_cases = covid_data['cases']

In [72]:
# Check data type

type(covid_cases)

dict

In [86]:
# Iteratively move through dictionary and compile to list

df_rows = []
for k,v in covid_cases.items():
    print('key', k)
    print('value',v)
    df_rows.append([k,v])

key 1/22/20
value 557
key 1/23/20
value 655
key 1/24/20
value 941
key 1/25/20
value 1433
key 1/26/20
value 2118
key 1/27/20
value 2927
key 1/28/20
value 5578
key 1/29/20
value 6167
key 1/30/20
value 8235
key 1/31/20
value 9927
key 2/1/20
value 12038
key 2/2/20
value 16787
key 2/3/20
value 19887
key 2/4/20
value 23898
key 2/5/20
value 27643
key 2/6/20
value 30803
key 2/7/20
value 34396
key 2/8/20
value 37130
key 2/9/20
value 40160
key 2/10/20
value 42769
key 2/11/20
value 44811
key 2/12/20
value 45229
key 2/13/20
value 60382
key 2/14/20
value 66909
key 2/15/20
value 69052
key 2/16/20
value 71235
key 2/17/20
value 73270
key 2/18/20
value 75152
key 2/19/20
value 75652
key 2/20/20
value 76212
key 2/21/20
value 76841
key 2/22/20
value 78602
key 2/23/20
value 78982
key 2/24/20
value 79546
key 2/25/20
value 80399
key 2/26/20
value 81376
key 2/27/20
value 82736
key 2/28/20
value 84122
key 2/29/20
value 86013
key 3/1/20
value 88394
key 3/2/20
value 90377
key 3/3/20
value 92971
key 3/4/20
value 

In [87]:
# Make dataframe

covid_data = pd.DataFrame(df_rows, columns=['Date','cases'])

In [88]:
# Convert string date to datetime

covid_data['Date']= pd.to_datetime(covid_data['Date'])

In [92]:
# Check subsample of data to see if formatting is correct

covid_data[(covid_data['Date'] > '2020-12-01') & (covid_data['Date'] < '2020-12-31')]

Unnamed: 0,Date,cases
315,2020-12-02,64673997
316,2020-12-03,65368134
317,2020-12-04,66051795
318,2020-12-05,66693750
319,2020-12-06,67232715
320,2020-12-07,67752737
321,2020-12-08,68396263
322,2020-12-09,69066170
323,2020-12-10,70564179
324,2020-12-11,71268458


In [None]:
# Change directory for this script as well

os.chdir('F:\OneDrive - Central European University\Courses\Winter_Term\Coding 3\Term Project\spotify-covid-analysis\data')

In [64]:
# Read in the enriched Spotify dataset

spotify = pd.read_csv('all_spotify_data.csv')

In [65]:
# Make date field to datetime in the Spotify enriched dataset

spotify['Date']= pd.to_datetime(spotify['Date'])

In [103]:
# Merge Spotify data with Covid data by Date columns --> check result

df = pd.merge(spotify,covid_data,on='Date',how='left')
df

Unnamed: 0.1,Unnamed: 0,Position,Track Name,Artist,Date,trackID,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cases
0,0,1,Pura Pura Lupa,Mahen,2020-01-02,6Aw5k0jkFceYayaJoRupLT,0.529,0.4460,11,-9.167,1,0.0294,0.75700,0.000764,0.1460,0.140,111.903,246964,4,
1,1,1,Pura Pura Lupa,Mahen,2020-01-09,6Aw5k0jkFceYayaJoRupLT,0.529,0.4460,11,-9.167,1,0.0294,0.75700,0.000764,0.1460,0.140,111.903,246964,4,
2,2,1,Pura Pura Lupa,Mahen,2020-01-16,6Aw5k0jkFceYayaJoRupLT,0.529,0.4460,11,-9.167,1,0.0294,0.75700,0.000764,0.1460,0.140,111.903,246964,4,
3,3,2,Pura Pura Lupa,Mahen,2020-01-23,6Aw5k0jkFceYayaJoRupLT,0.529,0.4460,11,-9.167,1,0.0294,0.75700,0.000764,0.1460,0.140,111.903,246964,4,655.0
4,4,4,Pura Pura Lupa,Mahen,2020-01-30,6Aw5k0jkFceYayaJoRupLT,0.529,0.4460,11,-9.167,1,0.0294,0.75700,0.000764,0.1460,0.140,111.903,246964,4,8235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2645,2645,38,Nuages ​​Creux,Lydia Baskow Trio,2020-12-31,4Bv9PjhIhp2sQE4S7dEAxR,0.279,0.0331,0,-21.254,1,0.0410,0.77500,0.008630,0.0938,0.212,98.321,157500,4,83558756.0
2646,2646,42,Let Me In (20 CUBE),ENHYPEN,2020-12-31,4eOcd47wPU3ixxYw0spREl,0.483,0.7950,8,-4.338,0,0.1810,0.00258,0.000000,0.0912,0.685,171.853,189859,4,83558756.0
2647,2647,43,Dance With You,Skusta Clee,2020-12-31,7bUvgvAf0tdQIty95lV5wf,0.771,0.6290,8,-7.100,1,0.0833,0.03750,0.000000,0.1080,0.552,116.000,244321,4,83558756.0
2648,2648,45,Melek,Reynmen,2020-12-31,6XbuJHhqugUVdJbTdshUEL,0.569,0.6420,11,-5.160,1,0.0550,0.15200,0.000000,0.2360,0.285,180.076,154923,4,83558756.0


In [105]:
# Correctly label missing values as 0 Covid cases (this is our benchmark period - i.e. early January 2020)

df['cases'] = np.where(np.isnan(df['cases']), 0, df['cases'])
df

In [107]:
# Write out the final dataset used for analytics to a CSV

df.to_csv('spotify_covid_data.csv')