In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint

In [2]:
import json

In [3]:
#create an empty list to populate with artist names

random_nums = np.random.randint(low=100000, high=170000, size=50)

#random_nums=[152689, 140338, 138577, 136662, 143824, 168060, 
#              134859, 153222, 156737, 138958, 132787, 150081]

artist_names=[]

In [4]:
#create a base url
#example: https://theaudiodb.com/api/v1/json/1/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/1/artist.php?i="

#create a loop that uses random number
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
    
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    #https://docs.python.org/3/tutorial/errors.html
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        artist_names.append(info['artists'][0]['strArtist'])
    
#print(artist_names)

In [5]:
audio_data = {'artist_id':[],
              'artist_name':[],
              'gender':[],
              'members':[],
              'style':[], 
              'genre':[],
             'year_formed':[],
             'year_disbanded':[],
             'location':[]}

#for each artist in the artist list, 
for name in artist_names:
    
    #add the artist name to the base url
    base_url = "http://theaudiodb.com/api/v1/json/1/search.php?s="
    req = requests.get(base_url + f'{name}')
    
    #convert request information into json as a variable
    data = req.json()
    
    #add json values to dictionary
    try:
        audio_data['artist_id'].append(data['artists'][0]['idArtist'])
        audio_data['artist_name'].append(data['artists'][0]['strArtist'])
        audio_data['gender'].append(data['artists'][0]["strGender"])
        audio_data['members'].append(data['artists'][0]["intMembers"])
        audio_data['style'].append(data['artists'][0]["strStyle"])
        audio_data['genre'].append(data['artists'][0]["strGenre"])
        audio_data['year_formed'].append(data['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(data['artists'][0]["strDisbanded"])
        audio_data['location'].append(data['artists'][0]["strCountry"])
    
    #restart loop if TypeError found
    except (TypeError):
        continue

In [6]:
audio_df = pd.DataFrame(audio_data)
audio_df.head(3)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,117080,Bob Brookmeyer,Male,1.0,,Jazz,,,American
1,145030,The Cosmic Jokers,,1.0,,Avant-Garde,1973.0,Yes,
2,162744,大門弥生,,,,,1991.0,,


In [7]:
# Cleaning the data: replace the 0s, None, and empty strings with null. Year disbanded: replaced None to no, meaning that they haven't disbanded
#To only replace empty values for one column, specify the column name for the DataFrame:
#df["Calories"].fillna(130, inplace = True)

audio_df['year_disbanded'].replace({2021 : 'No'}, inplace = True)
#audio_df_nan = audio_df.replace(0, np.nan)
audio_df['year_formed'].replace({None: 'Null', '0' : np.nan}, inplace = True) 
audio_df['gender'].replace({None: 'Null', '' : 'Null'}, inplace = True) 
audio_df['location'].replace({None: 'Null', '' : 'Null'}, inplace = True)
audio_df['style'].replace({'' : 'Null', None : 'Null'}, inplace = True)

#cols = ['year_formed',"gender","BootSize","SuitSize","Type"]
#df2[cols] = df2[cols].replace({'0':np.nan, 0:np.nan})
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,117080,Bob Brookmeyer,Male,1.0,Null,Jazz,Null,,American
1,145030,The Cosmic Jokers,Null,1.0,Null,Avant-Garde,1973,Yes,Null
2,162744,大門弥生,Null,,Null,,1991,,Null
3,155663,Human Vivisection,Male,5.0,Null,,2012,,"Bree, Limburg"
4,123727,Tiga,Male,1.0,Electronic,Electronic,1974,,Null
5,146973,Nargaroth,Null,1.0,Null,Black Metal,1996,,Null
6,135888,Inculto,Male,5.0,Null,Ska,2003,Yes,Null
7,162189,Ludvig Forssell,Male,1.0,Null,OST,,,Null
8,149351,Tisíc let od ráje,Null,,Null,,1994,,Null
9,124302,The Black-Eyed Snakes,Null,,Null,,,,Null


In [8]:
audio_df.describe()

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
count,41,41,41,29,41,37.0,32,3,41
unique,41,41,3,5,6,21.0,25,1,13
top,117080,Bob Brookmeyer,Male,1,Null,,Null,Yes,Null
freq,1,1,19,20,29,14.0,5,3,29


In [18]:
#replacing and correcting location strings
#replace strings that contain a given value with an entire string

audio_df.loc[audio_df['location'].str.contains
             ('US|American|New York|Sacramento|Colorado|Missouri|California|Florida|Indiana|Texas|Los Angeles|Arizona|Arlington', 
              na=False, case=False, regex=True), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains
             ('UK|England|English|Edinburgh|London|Scotland|Ireland|United Kingdon|Nottingham|Croydon|Liverpool', 
              na=False, case=False, regex=True), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains
             ('German|Deutsch', 
              na=False, case=False, regex=True), 'location'] = 'Germany'
audio_df.loc[audio_df['location'].str.contains
             ('France|French', 
              na=False, case=False, regex=True), 'location'] = 'France'
audio_df.loc[audio_df['location'].str.contains
             ('Belgium', 
              na=False, case=False, regex=True), 'location'] = 'Belgium'
audio_df.loc[audio_df['location'].str.contains
             ('Greece|Greek', 
              na=False, case=False, regex=True), 'location'] = 'Greece'
audio_df.loc[audio_df['location'].str.contains
             ('Switzerland|Swiss', 
              na=False, case=False, regex=True), 'location'] = 'Switzerland'
audio_df.loc[audio_df['location'].str.contains
             ('Canada|Canadian', 
              na=False, case=False, regex=True), 'location'] = 'Canada'
audio_df.loc[audio_df['location'].str.contains
             ('Melbourne|Australia', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Japan|Japanese', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Poznań|Poland', 
              na=False, case=False, regex=True), 'location'] = 'Poland'
audio_df.loc[audio_df['location'].str.contains
             ('Россия|Moscow', 
              na=False, case=False, regex=True), 'location'] = 'Russia'

audio_df.loc[audio_df['location'].str.contains('Portugal', na=False), 'location'] = 'Portugal'
audio_df.loc[audio_df['location'].str.contains('Swede', na=False), 'location'] = 'Sweden'
audio_df.loc[audio_df['location'].str.contains('Odense', na=False), 'location'] = 'Denmark'
audio_df.loc[audio_df['location'].str.contains('Zealand', na=False), 'location'] = 'New Zealand'
audio_df.loc[audio_df['location'].str.contains('Rumania', na=False), 'location'] = 'Romania'

audio_df

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,117080,Bob Brookmeyer,Male,1.0,Null,Jazz,Null,,USA
1,145030,The Cosmic Jokers,Null,1.0,Null,Avant-Garde,1973,Yes,Null
2,162744,大門弥生,Null,,Null,,1991,,Null
3,155663,Human Vivisection,Male,5.0,Null,,2012,,"Bree, Limburg"
4,123727,Tiga,Male,1.0,Electronic,Electronic,1974,,Null
5,146973,Nargaroth,Null,1.0,Null,Black Metal,1996,,Null
6,135888,Inculto,Male,5.0,Null,Ska,2003,Yes,Null
7,162189,Ludvig Forssell,Male,1.0,Null,OST,,,Null
8,149351,Tisíc let od ráje,Null,,Null,,1994,,Null
9,124302,The Black-Eyed Snakes,Null,,Null,,,,Null


In [18]:
#location stats
#number of times each location appears in the dataframe
location_counts = audio_df['location'].value_counts()

#series
location_counts

Null                                    21
USA                                      6
Constanta, Romania                       1
France                                   1
Johannesburg, South Africa               1
UK                                       1
Germany                                  1
Boston                                   1
Palermo, Italia                          1
    Caxias do Sul, Rio Grande do Sul     1
Canada                                   1
Sweden                                   1
Name: location, dtype: int64

In [11]:
#proportion each location represents in the data
location_proportion = location_counts / location_counts.sum()

#series
location_proportion

Null                                    0.567568
USA                                     0.135135
Constanta, Romania                      0.027027
Raleigh NC                              0.027027
France                                  0.027027
Johannesburg, South Africa              0.027027
UK                                      0.027027
Germany                                 0.027027
Boston                                  0.027027
Palermo, Italia                         0.027027
    Caxias do Sul, Rio Grande do Sul    0.027027
Canada                                  0.027027
Sweden                                  0.027027
Name: location, dtype: float64

In [12]:
#genre stats

#replace empty strings in genre column with NaN 
genre_na = audio_df.replace({'genre': r''}, {'genre': np.nan}, regex=True)

#count of unique genres, excluding null values
genre_counts = genre_na['genre'].value_counts()

#series
genre_counts

Rock                   3
Pop                    2
Dance                  1
Hip-Hop                1
House                  1
Lounge                 1
Heavy Metal            1
Power Metal            1
Latin                  1
Psychedelic Rock       1
Classical              1
Jazz                   1
Composer               1
Alternative Country    1
Pop-Rock               1
New Wave               1
Trance                 1
Name: genre, dtype: int64

In [13]:
#proportion each genre represents in the data
genre_proportion = genre_counts / genre_counts.sum()

#series
genre_proportion

Rock                   0.15
Pop                    0.10
Dance                  0.05
Hip-Hop                0.05
House                  0.05
Lounge                 0.05
Heavy Metal            0.05
Power Metal            0.05
Latin                  0.05
Psychedelic Rock       0.05
Classical              0.05
Jazz                   0.05
Composer               0.05
Alternative Country    0.05
Pop-Rock               0.05
New Wave               0.05
Trance                 0.05
Name: genre, dtype: float64

In [14]:
#genre_na1 = audio_df.replace(to_replace=r'', value=np.nan, regex=True)
#genre_na = audio_df.replace(regex=[r'', 'None '], value=np.nan)
genre_na = audio_df.replace({'genre': r''}, {'genre': np.nan}, regex=True)
#genre_na2 = genre_na.replace({'genre': 'None'}, {'genre': np.nan}, regex=True)
genre_na2 = genre_na.replace(to_replace='None', value=np.nan, method='pad')
genre_na2

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,132030,Andre Ola,Female,1.0,Rock/Pop,Dance,1996,,"Constanta, Romania"
1,161175,Luna Halo,Null,1.0,Null,Rock,1999,,Null
2,132534,Streets of Laredo,Null,1.0,Null,,,,Null
3,133697,Cough Cool,Null,,Null,,,,Null
4,114916,DEVO,Male,5.0,Rock/Pop,New Wave,1972,,USA
5,142814,Gary Portnoy,Null,,Null,,1956,,Null
6,124992,Adrenaline Factor,Male,4.0,Null,Pop-Rock,,,USA
7,153988,Jan Driver,Null,,Null,,1977,,Null
8,152120,Guadalupe Pineda,Null,,Null,,1955,,Null
9,152491,American Aquarium,Male,5.0,Null,Alternative Country,2005,,Raleigh NC


In [15]:
genre_na2 = genre_na1.replace(to_replace=r'None', value=np.nan, regex=True)
genre_na2

NameError: name 'genre_na1' is not defined

In [None]:
#remove null values from genre data and find proportion
genre_dropna = genre_na.dropna(axis='rows', subset = ['genre'])
#genre_dropna_counts = genre_dropna['genre'].valuecounts()
#genre_dropna_proportion = 
genre_dropna

In [None]:

genre_by_location = genre_grouped['location'].value_counts()
genre_by_location