In [19]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint

In [20]:
import json

In [21]:
# Import API key
# from api_keys import audio_api

In [22]:
#create an empty list to populate with artist names
artist_names=[]

In [23]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
#https://numpy.org/doc/stable/reference/random/generated/numpy.random.randint.html
random_nums = np.random.randint(low=100000, high=170000, size=800)

#print(random_nums)

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [24]:
#create a base url
#example: https://theaudiodb.com/api/v1/json/1/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/1/artist.php?i="

#create a loop that uses random number
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
    
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    #https://docs.python.org/3/tutorial/errors.html
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        artist_names.append(info['artists'][0]['strArtist'])
    
#print(artist_names)

In [25]:
len(artist_names)

602

In [26]:
#print(artist_names)

In [27]:
#alternative to "if not in list then .append"
#convert list to set to remove redundant artists
#https://java2blog.com/python-list-to-set/


#unique_artist_ids = set(artist_names)
#print(unique_artist_ids)

In [28]:
#create loop to pull artist information from the database
#using populated artist_names list
#example url: http://theaudiodb.com/api/v1/json/1/search.php?s=coldplay
#tutorial: https://www.youtube.com/watch?v=pxofwuWTs7c

#create empty dictionary to fill with values
audio_data = {'artist_id':[],
              'artist_name':[],
              'gender':[],
              'members':[],
              'style':[], 
              'genre':[],
             'year_formed':[],
             'year_disbanded':[],
             'location':[]}

#for each artist in the artist list, 
for name in artist_names:
    
    #add the artist name to the base url
    base_url = "http://theaudiodb.com/api/v1/json/1/search.php?s="
    req = requests.get(base_url + f'{name}')
    
    #convert request information into json as a variable
    data = req.json()
    
    #add json values to dictionary
    try:
        audio_data['artist_id'].append(data['artists'][0]['idArtist'])
        audio_data['artist_name'].append(data['artists'][0]['strArtist'])
        audio_data['gender'].append(data['artists'][0]["strGender"])
        audio_data['members'].append(data['artists'][0]["intMembers"])
        audio_data['style'].append(data['artists'][0]["strStyle"])
        audio_data['genre'].append(data['artists'][0]["strGenre"])
        audio_data['year_formed'].append(data['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(data['artists'][0]["strDisbanded"])
        audio_data['location'].append(data['artists'][0]["strCountry"])
    
    #restart loop if TypeError found
    except (TypeError):
        continue

In [29]:
#print(audio_data)

In [30]:
audio_df = pd.DataFrame(audio_data)
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,164050,Josef Mohr,Male,1.0,,,,Yes,
1,123555,The Machine,,1.0,,,0.0,,
2,152953,MOJA,Mixed,1.0,,,2013.0,,"Nantes, France"
3,111560,Monkey,Male,3.0,,,,,
4,152865,Freya Josephine Hollick,Female,1.0,Country,Country,0.0,,
5,131624,Chrome,Male,2.0,,Rock,1976.0,,"San Francisco, America"
6,116518,Sleep,Male,3.0,Metal,Doom Metal,1991.0,,
7,163885,Taunusheim,,,,,1999.0,,
8,135847,Howard Shelley,Male,1.0,,Classical,,,England
9,155652,Anton Webern,Male,1.0,Classical,Composer,,,Austrian


In [31]:
# Cleaning the data: replace the 0s, None, and empty strings with null. Year disbanded: replaced None to no, meaning that they haven't disbanded
#To only replace empty values for one column, specify the column name for the DataFrame:
#df["Calories"].fillna(130, inplace = True)

audio_df['year_disbanded'].replace({2021 : 'No'}, inplace = True)
#audio_df_nan = audio_df.replace(0, np.nan)
audio_df['year_formed'].replace({None: 'Null', '0' : np.nan}, inplace = True) 
audio_df['gender'].replace({None: 'Null', '' : 'Null'}, inplace = True) 
audio_df['location'].replace({None: 'Null', '' : 'Null'}, inplace = True)
audio_df['style'].replace({'' : 'Null', None : 'Null'}, inplace = True)

#cols = ['year_formed',"gender","BootSize","SuitSize","Type"]
#df2[cols] = df2[cols].replace({'0':np.nan, 0:np.nan})
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,164050,Josef Mohr,Male,1.0,Null,,Null,Yes,Null
1,123555,The Machine,Null,1.0,Null,,,,Null
2,152953,MOJA,Mixed,1.0,Null,,2013,,"Nantes, France"
3,111560,Monkey,Male,3.0,Null,,Null,,Null
4,152865,Freya Josephine Hollick,Female,1.0,Country,Country,,,Null
5,131624,Chrome,Male,2.0,Null,Rock,1976,,"San Francisco, America"
6,116518,Sleep,Male,3.0,Metal,Doom Metal,1991,,Null
7,163885,Taunusheim,Null,,Null,,1999,,Null
8,135847,Howard Shelley,Male,1.0,Null,Classical,Null,,England
9,155652,Anton Webern,Male,1.0,Classical,Composer,Null,,Austrian


In [43]:
##replacing and correcting location strings
#replace strings that contain a given value with an entire string

audio_df.loc[audio_df['location'].str.contains
             ('US|American|New York|Sacramento|Colorado|Missouri|California|Florida|Indiana|Texas|Los Angeles|Arizona|Arlington', 
              na=False, case=False, regex=True), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains
             ('UK|England|English|Edinburgh|London|Scotland|Ireland|United Kingdon|Nottingham|Croydon|Liverpool', 
              na=False, case=False, regex=True), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains
             ('German|Deutsch', 
              na=False, case=False, regex=True), 'location'] = 'Germany'
audio_df.loc[audio_df['location'].str.contains
             ('France|French', 
              na=False, case=False, regex=True), 'location'] = 'France'
audio_df.loc[audio_df['location'].str.contains
             ('Belgium', 
              na=False, case=False, regex=True), 'location'] = 'Belgium'
audio_df.loc[audio_df['location'].str.contains
             ('Greece|Greek', 
              na=False, case=False, regex=True), 'location'] = 'Greece'
audio_df.loc[audio_df['location'].str.contains
             ('Switzerland|Swiss', 
              na=False, case=False, regex=True), 'location'] = 'Switzerland'
audio_df.loc[audio_df['location'].str.contains
             ('Canada|Canadian', 
              na=False, case=False, regex=True), 'location'] = 'Canada'
audio_df.loc[audio_df['location'].str.contains
             ('Melbourne|Australia', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Japan|Japanese', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Poznań|Poland', 
              na=False, case=False, regex=True), 'location'] = 'Poland'
audio_df.loc[audio_df['location'].str.contains
             ('Россия|Moscow', 
              na=False, case=False, regex=True), 'location'] = 'Russia'
audio_df.loc[audio_df['location'].str.contains
             ('Portugal', na=False, case=False, regex=True), 'location'] = 'Portugal'
audio_df.loc[audio_df['location'].str.contains
             ('Swede', na=False, case=False, regex=True), 'location'] = 'Sweden'
audio_df.loc[audio_df['location'].str.contains
             ('Odense', na=False, case=False, regex=True), 'location'] = 'Denmark'
audio_df.loc[audio_df['location'].str.contains
             ('Zealand', na=False, case=False, regex=True), 'location'] = 'New Zealand'
audio_df.loc[audio_df['location'].str.contains
             ('Rumania', na=False, case=False, regex=True), 'location'] = 'Romania'
audio_df.loc[audio_df['location'].str.contains
             ('Österreich|Austria', 
              na=False, case=False, regex=True), 'location'] = 'Austria'
#audio_df.loc[audio_df['location'].str.contains
#             ('text1|text2', 
#              na=False, case=False, regex=True), 'location'] = 'replace'

In [44]:
#location stats
#number of times each location appears in the dataframe
location_counts = audio_df['location'].value_counts()

#series
location_counts

Null                      333
USA                        82
UK                         39
Canada                     12
Sweden                      9
                         ... 
Kópavogur, Iceland          1
Baku, Azerbaijan            1
Poland                      1
Norway                      1
Zuqaq al-Blat, Lebanon      1
Name: location, Length: 92, dtype: int64

In [45]:
#proportion each location represents in the data
location_proportion = location_counts / location_counts.sum()

#series
location_proportion

Null                      0.566327
USA                       0.139456
UK                        0.066327
Canada                    0.020408
Sweden                    0.015306
                            ...   
Kópavogur, Iceland        0.001701
Baku, Azerbaijan          0.001701
Poland                    0.001701
Norway                    0.001701
Zuqaq al-Blat, Lebanon    0.001701
Name: location, Length: 92, dtype: float64

In [35]:
#genre stats
#count of unique genres
genre_counts = audio_df['genre'].value_counts()

#series
genre_counts

                          213
Pop                        34
Rock                       22
Jazz                       16
Folk                       16
                         ... 
Contemporary Christian      1
New Age                     1
Grime                       1
Industrial Metal            1
Rap                         1
Name: genre, Length: 85, dtype: int64

In [18]:
#proportion each genre represents in the data
genre_proportion = genre_counts / genre_counts.sum()

#series
genre_proportion

                       0.413081
Pop                    0.058520
Hip-Hop                0.032702
Rock                   0.030981
Folk                   0.025818
                         ...   
Indie Pop              0.001721
Street Punk            0.001721
Alternative Hip-Hop    0.001721
Avant-Garde            0.001721
Synthpop               0.001721
Name: genre, Length: 89, dtype: float64

In [None]:
#'m working on some code to remove null values 
#from genre and location so we get an accurate proportion
# -- Angela

In [None]:
# Percentage of artists how are Urban/R&B
#Count of artists per style
