In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint

In [None]:
import json

In [None]:
# Draw 50 random candidate artist IDs from the range [100000, 170000).
random_nums = np.random.randint(100000, 170000, 50)

# Fixed sample of IDs, kept for reference when a repeatable run is wanted:
# random_nums = [152689, 140338, 138577, 136662, 143824, 168060,
#                134859, 153222, 156737, 138958, 132787, 150081]

# Starts empty; the lookup loop fills it with artist names.
artist_names = list()

In [None]:
# Look up every random ID on TheAudioDB and collect the distinct artist names.
# Example: https://theaudiodb.com/api/v1/json/1/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/1/artist.php?i="

for num in random_nums:
    unique_url = id_url + f'{num}'

    # A timeout stops one stalled connection from hanging the whole cell;
    # a network failure skips this ID instead of aborting the run.
    try:
        request = requests.get(unique_url, timeout=10)
    except requests.exceptions.RequestException:
        continue

    # A body that is not JSON raises a JSONDecodeError, which is a ValueError
    # subclass whichever JSON backend requests is using.
    try:
        info = request.json()
    except ValueError:
        continue

    # Unused IDs answer with {'artists': None} (TypeError). A missing key or
    # an empty list is skipped too, so one odd payload cannot stop the loop.
    try:
        artist = info['artists'][0]['strArtist']
    except (TypeError, KeyError, IndexError):
        continue

    # Keep each artist once, in first-seen order.
    if artist not in artist_names:
        artist_names.append(artist)

In [None]:
# Column-oriented store for the artist table: one list per column, all kept
# the same length so the dict can be handed straight to pd.DataFrame.
audio_data = {'artist_id':[],
              'artist_name':[],
              'gender':[],
              'members':[],
              'style':[],
              'genre':[],
              'year_formed':[],
              'year_disbanded':[],
              'location':[]}

# audio_data column -> field name in the API's artist record.
field_map = {'artist_id': 'idArtist',
             'artist_name': 'strArtist',
             'gender': 'strGender',
             'members': 'intMembers',
             'style': 'strStyle',
             'genre': 'strGenre',
             'year_formed': 'intFormedYear',
             'year_disbanded': 'strDisbanded',
             'location': 'strCountry'}

# Search endpoint; the artist name is sent as the `s` query parameter.
base_url = "https://theaudiodb.com/api/v1/json/1/search.php"

# For each artist in the artist list,
for name in artist_names:

    # Let requests build the query string so names containing '&', '#',
    # '+' or spaces are encoded instead of corrupting the URL.
    try:
        req = requests.get(base_url, params={'s': name}, timeout=10)
        data = req.json()
    except (requests.exceptions.RequestException, ValueError):
        # Network failure or non-JSON body: skip this artist.
        continue

    # Read every field before appending anything, so a bad record can never
    # leave the columns with different lengths.
    try:
        record = data['artists'][0]
        row = {column: record[field] for column, field in field_map.items()}
    except (TypeError, KeyError, IndexError):
        # {'artists': None}, an empty result list, or a missing field.
        continue

    for column, value in row.items():
        audio_data[column].append(value)

In [None]:
# Turn the column lists into a table: one column per audio_data key.
audio_df = pd.DataFrame.from_dict(audio_data)

# Preview the first three rows.
audio_df.head(n=3)

In [None]:
# Cleaning the data: fill in missing values column by column.
#
# Each column is reassigned instead of being modified through a chained
# `audio_df[col].replace(..., inplace=True)`; that form operates on a
# temporary Series and does not reliably write back to audio_df in current
# pandas.

# year_disbanded: a missing value means the artist has not disbanded.
# (The previous code replaced the integer 2021, which never matched.)
audio_df['year_disbanded'] = audio_df['year_disbanded'].fillna('No')

# year_formed: missing -> 'Null', the placeholder year '0' -> NaN.
# NOTE(review): two different "unknown" markers end up in one column —
# confirm which one downstream code expects.
audio_df['year_formed'] = audio_df['year_formed'].fillna('Null').replace('0', np.nan)

# gender / location / style: missing or empty string -> 'Null'.
for column in ['gender', 'location', 'style']:
    audio_df[column] = audio_df[column].fillna('Null').replace('', 'Null')

audio_df.head(20)

In [None]:
# Summary statistics of audio_df, using DataFrame.describe() with its defaults.
audio_df.describe()

In [None]:
# Normalise the free-text 'location' column to one country name per row.
#
# Rules run top to bottom. A rule overwrites the whole value of every row
# whose location contains its pattern, so later rules see the results of
# earlier ones.
#
# Each entry is (regex, country, ignore_case).
# - 'US' and 'UK' are anchored with \b: as bare case-insensitive substrings
#   they also matched "Australia", "Russia", "Austria", "Ukraine", ...
# - Melbourne/Australia now maps to 'Australia' (it was mapped to 'Japan').
# - 'United Kingdon' was a typo and could never match "United Kingdom".
# NOTE(review): 'Ireland' is folded into 'UK' as before — confirm that the
# Republic of Ireland should not be kept separate.
location_rules = [
    (r'\bUSA?\b|United States|American|New York|Sacramento|Colorado|Missouri'
     r'|California|Florida|Indiana|Texas|Los Angeles|Arizona|Arlington',
     'USA', True),
    (r'\bUK\b|England|English|Edinburgh|London|Scotland|Ireland|United Kingdom'
     r'|Nottingham|Croydon|Liverpool',
     'UK', True),
    (r'German|Deutsch', 'Germany', True),
    (r'France|French', 'France', True),
    (r'Belgium', 'Belgium', True),
    (r'Greece|Greek', 'Greece', True),
    (r'Switzerland|Swiss', 'Switzerland', True),
    (r'Canada|Canadian', 'Canada', True),
    (r'Melbourne|Australia', 'Australia', True),
    (r'Japan|Japanese', 'Japan', True),
    (r'Poznań|Poland', 'Poland', True),
    (r'Россия|Moscow|Russia', 'Russia', True),
    # The remaining rules were case-sensitive in the original and stay so.
    (r'Portugal', 'Portugal', False),
    (r'Swede', 'Sweden', False),
    (r'Odense', 'Denmark', False),
    (r'Zealand', 'New Zealand', False),
    (r'Rumania', 'Romania', False),
]

for pattern, country, ignore_case in location_rules:
    # na=False: rows without a string location never match a rule.
    matches = audio_df['location'].str.contains(
        pattern, na=False, case=not ignore_case, regex=True)
    audio_df.loc[matches, 'location'] = country

audio_df

In [None]:
# Location stats: how many rows carry each distinct location value.
location_counts = audio_df.loc[:, 'location'].value_counts()

# Display the resulting Series.
location_counts

In [None]:
# Share of all counted rows that each location accounts for.
total_locations = location_counts.sum()
location_proportion = location_counts.div(total_locations)

# Display the resulting Series.
location_proportion

In [None]:
# Genre stats.

# Turn empty-string genres into NaN so they are left out of the counts.
# The comparison is deliberately literal: an empty *regex* matches every
# string, so `replace(..., regex=True)` with '' depends on how the installed
# pandas treats empty patterns and can blank the entire column.
genre_na = audio_df.assign(
    genre=audio_df['genre'].mask(audio_df['genre'].eq('')))

# Occurrences of each genre; value_counts skips null entries by default.
genre_counts = genre_na['genre'].value_counts()

# Display the resulting Series.
genre_counts

In [None]:
# Share of all counted rows that each genre accounts for.
total_genres = genre_counts.sum()
genre_proportion = genre_counts.div(total_genres)

# Display the resulting Series.
genre_proportion

In [None]:
# Variant of the table that also treats the literal string 'None' as missing.

# Empty-string genres -> NaN, compared literally (an empty regex would match
# every string).
genre_na = audio_df.assign(
    genre=audio_df['genre'].mask(audio_df['genre'].eq('')))

# Replace the text 'None' with NaN across the frame. The old `method='pad'`
# argument is dropped: it is ignored whenever `value` is supplied and the
# keyword is deprecated since pandas 2.1.
genre_na2 = genre_na.replace(to_replace='None', value=np.nan)
genre_na2

In [None]:
# Keep only the rows whose genre is present.
# TODO: value counts / proportion of the remaining genres are not computed yet.
has_genre = genre_na['genre'].notna()
genre_dropna = genre_na.loc[has_genre]
genre_dropna

In [None]:

# Count, within each genre, how often every location occurs.
# `genre_grouped` was used here without being defined in any earlier cell,
# so the cell raised NameError on a fresh kernel. It is built here from the
# rows that have a genre.
# NOTE(review): grouping by 'genre' is inferred from the variable name — confirm.
genre_grouped = genre_dropna.groupby('genre')
genre_by_location = genre_grouped['location'].value_counts()
genre_by_location