In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint

In [4]:
import json

In [5]:
# Import API key
# from api_keys import audio_api

In [6]:
#create an empty list to populate with artist names
artist_names=[]

In [7]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
#https://numpy.org/doc/stable/reference/random/generated/numpy.random.randint.html
random_nums = np.random.randint(low=100000, high=170000, size=800)

#print(random_nums)

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [8]:
#create a base url
#example: https://theaudiodb.com/api/v1/json/1/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/1/artist.php?i="

#create a loop that uses random number
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
    
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    #https://docs.python.org/3/tutorial/errors.html
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        artist_names.append(info['artists'][0]['strArtist'])
    
#print(artist_names)

In [9]:
len(artist_names)

608

In [10]:
#print(artist_names)

In [11]:
#alternative to "if not in list then .append"
#convert list to set to remove redundant artists
#https://java2blog.com/python-list-to-set/


#unique_artist_ids = set(artist_names)
#print(unique_artist_ids)

In [12]:
#create loop to pull artist information from the database
#using populated artist_names list
#example url: http://theaudiodb.com/api/v1/json/1/search.php?s=coldplay
#tutorial: https://www.youtube.com/watch?v=pxofwuWTs7c

#create empty dictionary to fill with values
audio_data = {'artist_id':[],
              'artist_name':[],
              'gender':[],
              'members':[],
              'style':[], 
              'genre':[],
             'year_formed':[],
             'year_disbanded':[],
             'location':[]}

#for each artist in the artist list, 
for name in artist_names:
    
    #add the artist name to the base url
    base_url = "http://theaudiodb.com/api/v1/json/1/search.php?s="
    req = requests.get(base_url + f'{name}')
    
    #convert request information into json as a variable
    data = req.json()
    
    #add json values to dictionary
    try:
        audio_data['artist_id'].append(data['artists'][0]['idArtist'])
        audio_data['artist_name'].append(data['artists'][0]['strArtist'])
        audio_data['gender'].append(data['artists'][0]["strGender"])
        audio_data['members'].append(data['artists'][0]["intMembers"])
        audio_data['style'].append(data['artists'][0]["strStyle"])
        audio_data['genre'].append(data['artists'][0]["strGenre"])
        audio_data['year_formed'].append(data['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(data['artists'][0]["strDisbanded"])
        audio_data['location'].append(data['artists'][0]["strCountry"])
    
    #restart loop if TypeError found
    except (TypeError):
        continue

In [13]:
#print(audio_data)

In [14]:
audio_df = pd.DataFrame(audio_data)
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,130042,Alex C.,Male,1.0,,Dance,,,
1,153271,John Conlee,,,,Country,1946.0,,
2,162769,slip,,,,,0.0,,
3,113602,Bill Withers,Male,1.0,Urban/R&B,Soul,1970.0,Yes,"Virginia, USA"
4,112467,Shooter Jennings,Male,1.0,,Country Rock,1997.0,,
5,141254,“Fats” Sadi,,,,,1927.0,,
6,142875,12 Gauge,Male,1.0,,Rap,,,"Augusta, Georgia"
7,144550,Steffen Brandt,Male,1.0,,,,,Aarhus
8,159712,Paloma San Basilio,,,,,1950.0,,
9,117511,54-40,Male,4.0,Rock/Pop,Alternative Rock,1981.0,,"Tsawwassen, British Columbia, Canada"


In [15]:
# Cleaning the data: replace the 0s, None, and empty strings with null. Year disbanded: replaced None to no, meaning that they haven't disbanded
#To only replace empty values for one column, specify the column name for the DataFrame:
#df["Calories"].fillna(130, inplace = True)

audio_df['year_disbanded'].replace({2021 : 'No'}, inplace = True)
#audio_df_nan = audio_df.replace(0, np.nan)
audio_df['year_formed'].replace({None: 'Null', '0' : np.nan}, inplace = True) 
audio_df['gender'].replace({None: 'Null', '' : 'Null'}, inplace = True) 
audio_df['location'].replace({None: 'Null', '' : 'Null'}, inplace = True)
audio_df['style'].replace({'' : 'Null', None : 'Null'}, inplace = True)

#cols = ['year_formed',"gender","BootSize","SuitSize","Type"]
#df2[cols] = df2[cols].replace({'0':np.nan, 0:np.nan})
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,130042,Alex C.,Male,1.0,Null,Dance,Null,,Null
1,153271,John Conlee,Null,,Null,Country,1946,,Null
2,162769,slip,Null,,Null,,,,Null
3,113602,Bill Withers,Male,1.0,Urban/R&B,Soul,1970,Yes,"Virginia, USA"
4,112467,Shooter Jennings,Male,1.0,Null,Country Rock,1997,,Null
5,141254,“Fats” Sadi,Null,,Null,,1927,,Null
6,142875,12 Gauge,Male,1.0,Null,Rap,Null,,"Augusta, Georgia"
7,144550,Steffen Brandt,Male,1.0,Null,,Null,,Aarhus
8,159712,Paloma San Basilio,Null,,Null,,1950,,Null
9,117511,54-40,Male,4.0,Rock/Pop,Alternative Rock,1981,,"Tsawwassen, British Columbia, Canada"


In [44]:
#replacing and correcting location strings
#replace strings that contain a given value with an entire string
audio_df.loc[audio_df['location'].str.contains('US', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('America', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('Missour', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('Colorado', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('California', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('Florida', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('Indiana', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('Texas', na=False), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains('Los Angeles', na=False), 'location'] = 'USA'

audio_df.loc[audio_df['location'].str.contains('UK', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('Eng', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('Edinburgh', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('London', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('Scotland', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('Ireland', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('United Kingdom', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('Nottingham', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('Croydon', na=False), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains('Liverpool', na=False), 'location'] = 'UK'

audio_df.loc[audio_df['location'].str.contains('German', na=False), 'location'] = 'Germany'
audio_df.loc[audio_df['location'].str.contains('Deutsch', na=False), 'location'] = 'Germany'

audio_df.loc[audio_df['location'].str.contains('Canad', na=False), 'location'] = 'Canada'

audio_df.loc[audio_df['location'].str.contains('Japan', na=False), 'location'] = 'Japan'

audio_df.loc[audio_df['location'].str.contains('French', na=False), 'location'] = 'France'
audio_df.loc[audio_df['location'].str.contains('France', na=False), 'location'] = 'France'
audio_df.loc[audio_df['location'].str.contains('Belgium', na=False), 'location'] = 'Belgium'

audio_df.loc[audio_df['location'].str.contains('Swede', na=False), 'location'] = 'Sweden'
audio_df.loc[audio_df['location'].str.contains('Odense', na=False), 'location'] = 'Denmark'

audio_df.loc[audio_df['location'].str.contains('Switzerland', na=False), 'location'] = 'Switzerland'
audio_df.loc[audio_df['location'].str.contains('Swiss', na=False), 'location'] = 'Switzerland'

audio_df.loc[audio_df['location'].str.contains('Melbourne', na=False), 'location'] = 'Australia'
audio_df.loc[audio_df['location'].str.contains('Australia', na=False), 'location'] = 'Australia'
audio_df.loc[audio_df['location'].str.contains('Zealand', na=False), 'location'] = 'New Zealand'

audio_df.loc[audio_df['location'].str.contains('São Paulo', na=False), 'location'] = 'Brazil'
audio_df.loc[audio_df['location'].str.contains('Brazil', na=False), 'location'] = 'Brazil'

audio_df.loc[audio_df['location'].str.contains('Poznań', na=False), 'location'] = 'Poland'
audio_df.loc[audio_df['location'].str.contains('Portug', na=False), 'location'] = 'Portugal'
audio_df.loc[audio_df['location'].str.contains('Rumania', na=False), 'location'] = 'Romania'
audio_df.loc[audio_df['location'].str.contains('Россия', na=False), 'location'] = 'Russia'
audio_df.loc[audio_df['location'].str.contains('Greece', na=False), 'location'] = 'Greece'



In [46]:
#location stats
#number of times each location appears in the dataframe
location_counts = audio_df['location'].value_counts()

#series
location_counts

Null           312
USA             70
UK              50
Canada          13
Germany         11
              ... 
Chile            1
Italy            1
Greek            1
Denmark          1
New Zealand      1
Name: location, Length: 105, dtype: int64

In [47]:
#proportion each location represents in the data
location_proportion = location_counts / location_counts.sum()

#series
location_proportion

Null           0.524370
USA            0.117647
UK             0.084034
Canada         0.021849
Germany        0.018487
                 ...   
Chile          0.001681
Italy          0.001681
Greek          0.001681
Denmark        0.001681
New Zealand    0.001681
Name: location, Length: 105, dtype: float64

In [48]:
#genre stats
#count of unique genres
genre_counts = audio_df['genre'].value_counts()

#series
genre_counts

                     215
Pop                   28
Rock                  19
Jazz                  19
Folk                  18
                    ... 
Fusion                 1
Hardcore Punk          1
Alternative Metal      1
Progressive Rock       1
Drum & Bass            1
Name: genre, Length: 87, dtype: int64

In [49]:
#proportion each genre represents in the data
genre_proportion = genre_counts / genre_counts.sum()

#series
genre_proportion

                     0.378521
Pop                  0.049296
Rock                 0.033451
Jazz                 0.033451
Folk                 0.031690
                       ...   
Fusion               0.001761
Hardcore Punk        0.001761
Alternative Metal    0.001761
Progressive Rock     0.001761
Drum & Bass          0.001761
Name: genre, Length: 87, dtype: float64

In [None]:
#'m working on some code to remove null values 
#from genre and location so we get an accurate proportion
# -- Angela

In [None]:
# Percentage of artists how are Urban/R&B
#Count of artists per style
