In [32]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint

In [33]:
import json

In [34]:
#create an empty list to populate with artist names

random_nums = np.random.randint(low=100000, high=170000, size=50)

#random_nums=[152689, 140338, 138577, 136662, 143824, 168060, 
#              134859, 153222, 156737, 138958, 132787, 150081]

artist_names=[]

In [35]:
#create a base url
#example: https://theaudiodb.com/api/v1/json/1/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/1/artist.php?i="

#create a loop that uses random number
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
    
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    #https://docs.python.org/3/tutorial/errors.html
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        artist_names.append(info['artists'][0]['strArtist'])
    
#print(artist_names)

In [36]:
audio_data = {'artist_id':[],
              'artist_name':[],
              'gender':[],
              'members':[],
              'style':[], 
              'genre':[],
             'year_formed':[],
             'year_disbanded':[],
             'location':[]}

#for each artist in the artist list, 
for name in artist_names:
    
    #add the artist name to the base url
    base_url = "http://theaudiodb.com/api/v1/json/1/search.php?s="
    req = requests.get(base_url + f'{name}')
    
    #convert request information into json as a variable
    data = req.json()
    
    #add json values to dictionary
    try:
        audio_data['artist_id'].append(data['artists'][0]['idArtist'])
        audio_data['artist_name'].append(data['artists'][0]['strArtist'])
        audio_data['gender'].append(data['artists'][0]["strGender"])
        audio_data['members'].append(data['artists'][0]["intMembers"])
        audio_data['style'].append(data['artists'][0]["strStyle"])
        audio_data['genre'].append(data['artists'][0]["strGenre"])
        audio_data['year_formed'].append(data['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(data['artists'][0]["strDisbanded"])
        audio_data['location'].append(data['artists'][0]["strCountry"])
    
    #restart loop if TypeError found
    except (TypeError):
        continue

In [37]:
audio_df = pd.DataFrame(audio_data)
audio_df.head(3)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,166587,Benny Cristo,,,,,1987,,
1,141215,Tom Sancton,,,,,1949,,
2,128837,Maya Jane Coles,Female,1.0,,House,0,,


In [60]:
# Cleaning the data: replace the 0s, None, and empty strings with null. Year disbanded: replaced None to no, meaning that they haven't disbanded
#To only replace empty values for one column, specify the column name for the DataFrame:
#df["Calories"].fillna(130, inplace = True)

audio_df['year_disbanded'].replace({2021 : 'No'}, inplace = True)
#audio_df_nan = audio_df.replace(0, np.nan)
audio_df['year_formed'].replace({None: 'Null', '0' : np.nan}, inplace = True) 
audio_df['gender'].replace({None: 'Null', '' : 'Null'}, inplace = True) 
audio_df['location'].replace({None: 'Null', '' : 'Null'}, inplace = True)
audio_df['style'].replace({'' : 'Null', None : 'Null'}, inplace = True)
audio_df['genre'].replace({'' : 'Null', None : 'Null'}, inplace = True)

#cols = ['year_formed',"gender","BootSize","SuitSize","Type"]
#df2[cols] = df2[cols].replace({'0':np.nan, 0:np.nan})
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,166587,Benny Cristo,Null,,Null,Null,1987,,Null
1,141215,Tom Sancton,Null,,Null,Null,1949,,Null
2,128837,Maya Jane Coles,Female,1.0,Null,House,Null,,Null
3,143760,夢みるアドレセンス,Female,5.0,Null,J-Pop,2012,,Null
4,141083,Orquestra Jazz de Matosinhos,Null,,Null,Null,Null,,Null
5,116044,Krymplings,Male,5.0,Rock/Pop,Punk Rock,1993,Yes,Null
6,129190,Left Boy,Male,1.0,Null,Null,Null,,Null
7,152244,Ryan Amon,Male,1.0,Null,Composer,Null,,Null
8,160814,Thinkman,Male,1.0,Null,Null,Null,,Null
9,134601,A Whisper in the Noise,Null,,Null,Null,2002,,Null


In [39]:
audio_df.describe()

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
count,39,39,39,27,39,36.0,28,4,39
unique,39,39,4,5,5,15.0,23,1,15
top,166587,Benny Cristo,Male,1,Null,,Null,Yes,Null
freq,1,1,17,17,30,18.0,4,4,25


In [42]:
#replacing and correcting location strings
#replace strings that contain a given value with an entire string

audio_df.loc[audio_df['location'].str.contains
             ('US|American|New York|Sacramento|Colorado|Missouri|California|Florida|Indiana|Texas|Los Angeles|Elk Grove Village|Arizona|Arlington', 
              na=False, case=False, regex=True), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains
             ('UK|England|English|Edinburgh|London|Scotland|Irish|Ireland|Manchester|United Kingdom|Nottingham|Croydon|Liverpool', 
              na=False, case=False, regex=True), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains
             ('German|Deutsch|Deutch', 
              na=False, case=False, regex=True), 'location'] = 'Germany'
audio_df.loc[audio_df['location'].str.contains
             ('Italia|Italy', 
              na=False, case=False, regex=True), 'location'] = 'Italy'
audio_df.loc[audio_df['location'].str.contains
             ('France|French', 
              na=False, case=False, regex=True), 'location'] = 'France'
audio_df.loc[audio_df['location'].str.contains
             ('Belgium', 
              na=False, case=False, regex=True), 'location'] = 'Belgium'
audio_df.loc[audio_df['location'].str.contains
             ('Greece|Greek', 
              na=False, case=False, regex=True), 'location'] = 'Greece'
audio_df.loc[audio_df['location'].str.contains
             ('Switzerland|Swiss', 
              na=False, case=False, regex=True), 'location'] = 'Switzerland'
audio_df.loc[audio_df['location'].str.contains
             ('Canada|Canadian', 
              na=False, case=False, regex=True), 'location'] = 'Canada'
audio_df.loc[audio_df['location'].str.contains
             ('Melbourne|Australia', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Japan|Japanese', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Poznań|Poland', 
              na=False, case=False, regex=True), 'location'] = 'Poland'
audio_df.loc[audio_df['location'].str.contains
             ('Россия|Moscow', 
              na=False, case=False, regex=True), 'location'] = 'Russia'
audio_df.loc[audio_df['location'].str.contains
             ('Portugal', 
              na=False, case=False, regex=True), 'location'] = 'Portugal'
audio_df.loc[audio_df['location'].str.contains
             ('Swede|Gothenburg', 
              na=False, case=False, regex=True), 'location'] = 'Sweden'
audio_df.loc[audio_df['location'].str.contains
             ('Odense', 
              na=False, case=False, regex=True), 'location'] = 'Denmark'
audio_df.loc[audio_df['location'].str.contains
             ('Dutch|Netherlands', 
              na=False, case=False, regex=True), 'location'] = 'Netherlands'
audio_df.loc[audio_df['location'].str.contains
             ('Zealand|Wellington', 
              na=False, case=False, regex=True), 'location'] = 'New Zealand'
audio_df.loc[audio_df['location'].str.contains
             ('Rumania', 
              na=False, case=False, regex=True), 'location'] = 'Romania'
audio_df.loc[audio_df['location'].str.contains
             ('South Africa', 
              na=False, case=False, regex=True), 'location'] = 'South Africa'
audio_df.loc[audio_df['location'].str.contains
             ('Argentina', 
              na=False, case=False, regex=True), 'location'] = 'Argentina'
audio_df.loc[audio_df['location'].str.contains
             ('Mexico', 
              na=False, case=False, regex=True), 'location'] = 'Mexico'

audio_df

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,166587,Benny Cristo,Null,,Null,,1987,,Null
1,141215,Tom Sancton,Null,,Null,,1949,,Null
2,128837,Maya Jane Coles,Female,1.0,Null,House,,,Null
3,143760,夢みるアドレセンス,Female,5.0,Null,J-Pop,2012,,Null
4,141083,Orquestra Jazz de Matosinhos,Null,,Null,,,,Null
5,116044,Krymplings,Male,5.0,Rock/Pop,Punk Rock,1993,Yes,Null
6,129190,Left Boy,Male,1.0,Null,,Null,,Null
7,152244,Ryan Amon,Male,1.0,Null,Composer,,,Null
8,160814,Thinkman,Male,1.0,Null,,,,Null
9,134601,A Whisper in the Noise,Null,,Null,,2002,,Null


In [63]:
#REPLACE AND DROP NAN FOR INDIVIDUAL COLUMNS
#assign individual variables for individual columns with null dropped

#replace 'Null' with NaN for all values in location column, drop NaN
location_na = audio_df.replace({'location': r'Null'}, {'location': np.nan}, regex=True)
location_dropped = location_na.dropna(axis=0, subset=['location'], inplace=False)
location_dropped

#replace 'Null' with NaN for all values in genre column, drop NaN
genre_na = audio_df.replace({'genre': r'Null'}, {'genre': np.nan}, regex=True)
genre_dropped = genre_na.dropna(axis=0, subset=['genre'], inplace=False)
genre_dropped

#replace 'Null' with NaN for all values in genre column, drop NaN
gender_na = audio_df.replace({'gender': r'Null'}, {'gender': np.nan}, regex=True)
gender_dropped = gender_na.dropna(axis=0, subset=['gender'], inplace=False)
gender_dropped

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
2,128837,Maya Jane Coles,Female,1,Null,House,Null,,Null
3,143760,夢みるアドレセンス,Female,5,Null,J-Pop,2012,,Null
5,116044,Krymplings,Male,5,Rock/Pop,Punk Rock,1993,Yes,Null
6,129190,Left Boy,Male,1,Null,Null,Null,,Null
7,152244,Ryan Amon,Male,1,Null,Composer,Null,,Null
8,160814,Thinkman,Male,1,Null,Null,Null,,Null
10,131491,Allah-Las,Male,3,Rock/Pop,Null,Null,,Null
11,112619,Thijs van Leer,Male,4,Progressive Rock,Null,1969,,Netherlands
13,113154,Jerry Cantrell,Male,1,Null,Grunge,1984,,USA
15,145039,Erich Leinsdorf,Male,1,Null,Classical,Null,Yes,USA


In [52]:
#LOCATION COUNT - NULL INCLUDED

#number of times each location appears in the dataframe
location_counts = audio_df['location'].value_counts()

#series
location_counts

Null                                25
USA                                  4
UK                                   2
Netherlands                          1
Tbilisi, Georgia                     1
Montevideo, Uruguay                  1
Oulu, Finland                        1
Stara Pazova, Serbia                 1
Germany                              1
Santa Maria                          1
Sarajevo, Bosnia and Herzegovina     1
Name: location, dtype: int64

In [53]:
#LOCATION COUNT - NULL DROPPED

#number of times each location appears in the dataframe
location_counts_dropped = location_dropped['location'].value_counts()

#series
location_counts_dropped

USA                                 4
UK                                  2
Netherlands                         1
Tbilisi, Georgia                    1
Montevideo, Uruguay                 1
Oulu, Finland                       1
Stara Pazova, Serbia                1
Germany                             1
Santa Maria                         1
Sarajevo, Bosnia and Herzegovina    1
Name: location, dtype: int64

In [90]:
#LOCATION PROPORTION - NULL INCLUDED

#proportion each location represents in the data
location_proportion = round(location_counts / location_counts.sum(), 2)

#series
location_proportion

Null                                0.64
USA                                 0.10
UK                                  0.05
Netherlands                         0.03
Tbilisi, Georgia                    0.03
Montevideo, Uruguay                 0.03
Oulu, Finland                       0.03
Stara Pazova, Serbia                0.03
Germany                             0.03
Santa Maria                         0.03
Sarajevo, Bosnia and Herzegovina    0.03
Name: location, dtype: float64

In [89]:
#LOCATION PROPORTION - NULL DROPPED

#proportion each location represents in the data
location_proportion_dropped = round(location_counts_dropped / location_counts_dropped.sum(), 2)

#series
location_proportion_dropped

USA                                 0.29
UK                                  0.14
Netherlands                         0.07
Tbilisi, Georgia                    0.07
Montevideo, Uruguay                 0.07
Oulu, Finland                       0.07
Stara Pazova, Serbia                0.07
Germany                             0.07
Santa Maria                         0.07
Sarajevo, Bosnia and Herzegovina    0.07
Name: location, dtype: float64

In [64]:
#GENRE COUNT - NULL INCLUDED

#count of unique genres, excluding null values
genre_counts = audio_df['genre'].value_counts()

#series
genre_counts

Null                  21
Classical              3
House                  2
Rock                   2
J-Pop                  1
Punk Rock              1
Composer               1
Grunge                 1
Indie                  1
Pop                    1
Alternative Rock       1
Electronic             1
Heavy Metal            1
Progressive Trance     1
R&B                    1
Name: genre, dtype: int64

In [66]:
#GENRE COUNT - NULL DROPPED

#count of unique genres, excluding null values
genre_counts_dropped = genre_dropped['genre'].value_counts()

#series
genre_counts_dropped

Classical             3
House                 2
Rock                  2
J-Pop                 1
Punk Rock             1
Composer              1
Grunge                1
Indie                 1
Pop                   1
Alternative Rock      1
Electronic            1
Heavy Metal           1
Progressive Trance    1
R&B                   1
Name: genre, dtype: int64

In [88]:
#GENRE PROPORTION - NULL INCLUDED

#proportion each genre represents in the data
genre_proportion = round(genre_counts / genre_counts.sum(), 2)

#series
genre_proportion

Null                  0.54
Classical             0.08
House                 0.05
Rock                  0.05
J-Pop                 0.03
Punk Rock             0.03
Composer              0.03
Grunge                0.03
Indie                 0.03
Pop                   0.03
Alternative Rock      0.03
Electronic            0.03
Heavy Metal           0.03
Progressive Trance    0.03
R&B                   0.03
Name: genre, dtype: float64

In [87]:
#GENRE PROPORTION - NULL DROPPED

#proportion each genre represents in the data
genre_proportion_dropped = round(genre_counts_dropped / genre_counts_dropped.sum(), 2)

#series
genre_proportion_dropped

Classical             0.17
House                 0.11
Rock                  0.11
J-Pop                 0.06
Punk Rock             0.06
Composer              0.06
Grunge                0.06
Indie                 0.06
Pop                   0.06
Alternative Rock      0.06
Electronic            0.06
Heavy Metal           0.06
Progressive Trance    0.06
R&B                   0.06
Name: genre, dtype: float64

In [86]:
#GENDER COUNT - NULL INCLUDED

#count of unique genres, excluding null values
gender_counts = audio_df['gender'].value_counts()

#series
gender_counts

Male      17
Null      16
Female     5
Mixed      1
Name: gender, dtype: int64

In [84]:
#GENDER COUNT - NULL DROPPED

#count of unique genres, excluding null values
gender_counts_dropped = gender_dropped['gender'].value_counts()

#series
gender_counts_dropped

Male      17
Female     5
Mixed      1
Name: gender, dtype: int64

In [83]:
#GENDER PROPORTION - NULL INCLUDED

#proportion each genre represents in the data
gender_proportion = round(gender_counts / gender_counts.sum(), 2)

#series
gender_proportion

Male      0.44
Null      0.41
Female    0.13
Mixed     0.03
Name: gender, dtype: float64

In [82]:
#GENDER PROPORTION - NULL DROPPED

#proportion each genre represents in the data
gender_proportion_dropped = round(gender_counts_dropped / gender_counts_dropped.sum(), 2)

#series
gender_proportion_dropped

Male      0.74
Female    0.22
Mixed     0.04
Name: gender, dtype: float64

In [74]:
#GROUPBYS WITH NULL VALUES DROPPED
#genre and location columns with Null values dropped

#FROM ABOVE
gender_na = audio_df.replace({'gender': r'Null'}, {'gender': np.nan}, regex=True)
gender_dropped = gender_na.dropna(axis=0, subset=['gender'], inplace=False)
gender_dropped

#create a variable with gender and location null values dropped
genre_grouped = genre_dropped.groupby('genre')

genre_by_location_ = genre_grouped['location'].value_counts()

genre_by_location

genre               location            
Alternative Rock    USA                     1
Classical           Montevideo, Uruguay     1
                    Null                    1
                    USA                     1
Composer            Null                    1
Electronic          UK                      1
Grunge              USA                     1
Heavy Metal         Oulu, Finland           1
House               Null                    2
Indie               UK                      1
J-Pop               Null                    1
Pop                 Tbilisi, Georgia        1
Progressive Trance  Stara Pazova, Serbia    1
Punk Rock           Null                    1
R&B                 Null                    1
Rock                Germany                 1
                    USA                     1
Name: location, dtype: int64