In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint
import json

In [2]:
#modules for statistics
import math
import statistics
from scipy import stats

# Generate Artists List

In [3]:
# Find a code to get all the artists from this api
#https://www.theaudiodb.com/api_guide.php
# theaudiodb.com/api/v1/json/1/search.php?s=coldplay
artist_names = []

In [4]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
random_nums = np.random.randint(low=100000, high=170000, size=800)
#print(ran_ids)

#test with known IDs and intentional erroes
#random_nums = [112024, 0, 100000, 114364]

In [5]:
#create a base url
#example: https://theaudiodb.com/api/v1/json/1/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/1/artist.php?i="

#create a loop that uses random number
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
    
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        artist_names.append(info['artists'][0]['strArtist'])
    
    #print(artist_names)

In [6]:
len(artist_names)

625

In [7]:
#print(artist_names)

In [8]:
#alternative to "if not in list then .append"
#convert list to set to remove redundant artists

#unique_artist_ids = set(artist_ids)
#print(unique_artist_ids)

In [9]:
#create loop to pull artist information from the database
#using populated artist_names list
#example url: http://theaudiodb.com/api/v1/json/1/search.php?s=coldplay
#tutorial: https://www.youtube.com/watch?v=pxofwuWTs7c

#create empty dictionary to fill with values
audio_data = {'artist_id':[],
              'artist_name':[],
              'gender':[],
              'members':[],
              'style':[], 
              'genre':[],
             'year_formed':[],
             'year_disbanded':[],
             'location':[]}

#for each artist in the artist list, 
for name in artist_names:
    
    #add the artist name to the base url
    base_url = "http://theaudiodb.com/api/v1/json/1/search.php?s="
    req = requests.get(base_url + f'{name}')
    
    #convert request information into json as a variable
    data = req.json()
    
    #add json values to dictionary
    try:
        audio_data['artist_id'].append(data['artists'][0]['idArtist'])
        audio_data['artist_name'].append(data['artists'][0]['strArtist'])
        audio_data['gender'].append(data['artists'][0]["strGender"])
        audio_data['members'].append(data['artists'][0]["intMembers"])
        audio_data['style'].append(data['artists'][0]["strStyle"])
        audio_data['genre'].append(data['artists'][0]["strGenre"])
        audio_data['year_formed'].append(data['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(data['artists'][0]["strDisbanded"])
        audio_data['location'].append(data['artists'][0]["strCountry"])
    
    #restart loop if TypeError found
    except (TypeError):
        continue

In [10]:
#print(audio_data)

In [11]:
audio_df = pd.DataFrame(audio_data)
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,129340,Nate James,Male,1.0,Urban/R&B,Soul,2005.0,,
1,141072,Olu Dara,,,,,1941.0,,
2,132170,Jack Savoretti,Male,1.0,,Acoustic,1999.0,,"London, UK"
3,127221,Pappo,Male,1.0,Blues,Blues,1966.0,Yes,Buenos Aires
4,145283,Craig Cardiff,Male,1.0,,Folk,1976.0,,"Kitchener/Waterloo, Canada"
5,126635,HB,Mixed,4.0,Metal,Symphonic Metal,2002.0,,
6,137241,Plebe rude,Male,4.0,,,1981.0,,
7,132755,Christophe Rousset,,1.0,,,,,
8,120847,Gire,,1.0,Metal,Avant-Garde,1995.0,Yes,Makó
9,131529,Arpeggio,Mixed,4.0,Electronic,Disco,1977.0,Yes,


In [12]:
# Cleaning the data: replace the 0s, None, and empty strings with null. Year disbanded: replaced None to no, meaning that they haven't disbanded
#To only replace empty values for one column, specify the column name for the DataFrame:
#df["Calories"].fillna(130, inplace = True)

audio_df['year_disbanded'].replace({2021 : 'No'}, inplace = True)
#audio_df_nan = audio_df.replace(0, np.nan)
audio_df['year_formed'].replace({None: 'Null', '0' : np.nan}, inplace = True) 
audio_df['gender'].replace({None: 'Null', '' : 'Null'}, inplace = True) 
audio_df['location'].replace({None: 'Null', '' : 'Null'}, inplace = True)
audio_df['style'].replace({'' : 'Null', None : 'Null'}, inplace = True)
audio_df['genre'].replace({'' : 'Null'}, inplace = True)


#cols = ['year_formed',"gender","BootSize","SuitSize","Type"]
#df2[cols] = df2[cols].replace({'0':np.nan, 0:np.nan})
audio_df.head(20)

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,129340,Nate James,Male,1.0,Urban/R&B,Soul,2005,,Null
1,141072,Olu Dara,Null,,Null,Null,1941,,Null
2,132170,Jack Savoretti,Male,1.0,Null,Acoustic,1999,,"London, UK"
3,127221,Pappo,Male,1.0,Blues,Blues,1966,Yes,Buenos Aires
4,145283,Craig Cardiff,Male,1.0,Null,Folk,1976,,"Kitchener/Waterloo, Canada"
5,126635,HB,Mixed,4.0,Metal,Symphonic Metal,2002,,Null
6,137241,Plebe rude,Male,4.0,Null,Null,1981,,Null
7,132755,Christophe Rousset,Null,1.0,Null,Null,Null,,Null
8,120847,Gire,Null,1.0,Metal,Avant-Garde,1995,Yes,Makó
9,131529,Arpeggio,Mixed,4.0,Electronic,Disco,1977,Yes,Null


In [13]:
#replacing and correcting location strings
#replace strings that contain a given value with an entire string

audio_df.loc[audio_df['location'].str.contains
             ('US|American|New York|Sacramento|Colorado|Missouri|California|Florida|Indiana|Texas|Los Angeles|Elk Grove Village|Arizona|Arlington', 
              na=False, case=False, regex=True), 'location'] = 'USA'
audio_df.loc[audio_df['location'].str.contains
             ('UK|England|English|Edinburgh|London|Scotland|Irish|Ireland|Manchester|United Kingdom|Nottingham|Croydon|Liverpool', 
              na=False, case=False, regex=True), 'location'] = 'UK'
audio_df.loc[audio_df['location'].str.contains
             ('German|Deutsch|Deutch', 
              na=False, case=False, regex=True), 'location'] = 'Germany'
audio_df.loc[audio_df['location'].str.contains
             ('Italia|Italy', 
              na=False, case=False, regex=True), 'location'] = 'Italy'
audio_df.loc[audio_df['location'].str.contains
             ('France|French', 
              na=False, case=False, regex=True), 'location'] = 'France'
audio_df.loc[audio_df['location'].str.contains
             ('Belgium', 
              na=False, case=False, regex=True), 'location'] = 'Belgium'
audio_df.loc[audio_df['location'].str.contains
             ('Greece|Greek', 
              na=False, case=False, regex=True), 'location'] = 'Greece'
audio_df.loc[audio_df['location'].str.contains
             ('Switzerland|Swiss', 
              na=False, case=False, regex=True), 'location'] = 'Switzerland'
audio_df.loc[audio_df['location'].str.contains
             ('Canada|Canadian', 
              na=False, case=False, regex=True), 'location'] = 'Canada'
audio_df.loc[audio_df['location'].str.contains
             ('Melbourne|Australia', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Japan|Japanese', 
              na=False, case=False, regex=True), 'location'] = 'Japan'
audio_df.loc[audio_df['location'].str.contains
             ('Poznań|Poland', 
              na=False, case=False, regex=True), 'location'] = 'Poland'
audio_df.loc[audio_df['location'].str.contains
             ('Россия|Moscow', 
              na=False, case=False, regex=True), 'location'] = 'Russia'
audio_df.loc[audio_df['location'].str.contains
             ('Portugal', 
              na=False, case=False, regex=True), 'location'] = 'Portugal'
audio_df.loc[audio_df['location'].str.contains
             ('Swede|Gothenburg', 
              na=False, case=False, regex=True), 'location'] = 'Sweden'
audio_df.loc[audio_df['location'].str.contains
             ('Odense', 
              na=False, case=False, regex=True), 'location'] = 'Denmark'
audio_df.loc[audio_df['location'].str.contains
             ('Dutch|Netherlands', 
              na=False, case=False, regex=True), 'location'] = 'Netherlands'
audio_df.loc[audio_df['location'].str.contains
             ('Zealand|Wellington', 
              na=False, case=False, regex=True), 'location'] = 'New Zealand'
audio_df.loc[audio_df['location'].str.contains
             ('Rumania', 
              na=False, case=False, regex=True), 'location'] = 'Romania'
audio_df.loc[audio_df['location'].str.contains
             ('South Africa', 
              na=False, case=False, regex=True), 'location'] = 'South Africa'
audio_df.loc[audio_df['location'].str.contains
             ('Argentina', 
              na=False, case=False, regex=True), 'location'] = 'Argentina'
audio_df.loc[audio_df['location'].str.contains
             ('Mexico', 
              na=False, case=False, regex=True), 'location'] = 'Mexico'


In [14]:
#REPLACE AND DROP NAN FOR INDIVIDUAL COLUMNS
#assign individual variables for individual columns with null dropped

#replace 'Null' with NaN for all values in location column, drop NaN
location_na = audio_df.replace({'location': r'Null'}, {'location': np.nan}, regex=True)
location_dropped = location_na.dropna(axis=0, subset=['location'], inplace=False)
location_dropped

#replace 'Null' with NaN for all values in genre column, drop NaN
genre_na = audio_df.replace({'genre': r'Null'}, {'genre': np.nan}, regex=True)
genre_dropped = genre_na.dropna(axis=0, subset=['genre'], inplace=False)
genre_dropped

#replace 'Null' with NaN for all values in genre column, drop NaN
gender_na = audio_df.replace({'gender': r'Null'}, {'gender': np.nan}, regex=True)
gender_dropped = gender_na.dropna(axis=0, subset=['gender'], inplace=False)
gender_dropped

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location
0,129340,Nate James,Male,1,Urban/R&B,Soul,2005,,Null
2,132170,Jack Savoretti,Male,1,Null,Acoustic,1999,,UK
3,127221,Pappo,Male,1,Blues,Blues,1966,Yes,Buenos Aires
4,145283,Craig Cardiff,Male,1,Null,Folk,1976,,Canada
5,126635,HB,Mixed,4,Metal,Symphonic Metal,2002,,Null
...,...,...,...,...,...,...,...,...,...
605,148395,Theuns Jordaan,Male,1,Null,Null,Null,,South Africa
606,133621,Audio Karate,Male,4,Null,Pop-Punk,1995,,USA
608,166683,Carter Ace,Male,1,Urban/R&B,Hip-Hop,1998,,Null
609,112916,Megadeth,Male,4,Metal,Thrash Metal,1983,,USA


### Location

In [15]:
#LOCATION COUNT - NULL INCLUDED

#number of times each location appears in the dataframe
location_counts = audio_df['location'].value_counts()

#series
location_counts

Null               306
USA                 91
UK                  52
Germany             17
France              11
                  ... 
Macaé, BR            1
Düsseldorf, DE       1
Angers               1
 Ravensburg, DE      1
South Africa         1
Name: location, Length: 108, dtype: int64

In [16]:
#LOCATION COUNT - NULL DROPPED

#number of times each location appears in the dataframe
location_counts_dropped = location_dropped['location'].value_counts()

#series
location_counts_dropped

USA               91
UK                52
Germany           17
France            11
Canada             9
                  ..
Copenhagen         1
Macaé, BR          1
Düsseldorf, DE     1
Angers             1
South Africa       1
Name: location, Length: 107, dtype: int64

In [17]:
#LOCATION PROPORTION - NULL INCLUDED

#proportion each location represents in the data
location_proportion = round(location_counts / location_counts.sum(), 2)

#series
location_proportion

Null               0.50
USA                0.15
UK                 0.09
Germany            0.03
France             0.02
                   ... 
Macaé, BR          0.00
Düsseldorf, DE     0.00
Angers             0.00
 Ravensburg, DE    0.00
South Africa       0.00
Name: location, Length: 108, dtype: float64

In [18]:
#LOCATION PROPORTION - NULL DROPPED

#proportion each location represents in the data
location_proportion_dropped = round(location_counts_dropped / location_counts_dropped.sum(), 2)

#series
location_proportion_dropped

USA               0.30
UK                0.17
Germany           0.06
France            0.04
Canada            0.03
                  ... 
Copenhagen        0.00
Macaé, BR         0.00
Düsseldorf, DE    0.00
Angers            0.00
South Africa      0.00
Name: location, Length: 107, dtype: float64

### Genre

In [19]:
#GENRE COUNT - NULL INCLUDED

#count of unique genres, excluding null values
genre_counts = audio_df['genre'].value_counts()

#series
genre_counts

Null                208
Pop                  28
Jazz                 27
Folk                 20
Rock                 17
                   ... 
Opera                 1
Folk Metal            1
Psychedelic Rock      1
Latin                 1
Country Rock          1
Name: genre, Length: 84, dtype: int64

In [20]:
#GENRE COUNT - NULL DROPPED

#count of unique genres, excluding null values
genre_counts_dropped = genre_dropped['genre'].value_counts()

#series
genre_counts_dropped

Pop                 28
Jazz                27
Folk                20
Rock                17
Indie               15
                    ..
Opera                1
Folk Metal           1
Psychedelic Rock     1
Latin                1
Country Rock         1
Name: genre, Length: 83, dtype: int64

In [24]:
#GENRE PROPORTION - NULL INCLUDED

#proportion each genre represents in the data
genre_proportion = round(genre_counts / genre_counts.sum(), 4)

#series
genre_proportion

Null                0.3568
Pop                 0.0480
Jazz                0.0463
Folk                0.0343
Rock                0.0292
                     ...  
Opera               0.0017
Folk Metal          0.0017
Psychedelic Rock    0.0017
Latin               0.0017
Country Rock        0.0017
Name: genre, Length: 84, dtype: float64

In [23]:
#GENRE PROPORTION - NULL DROPPED

#proportion each genre represents in the data
genre_proportion_dropped = round(genre_counts_dropped / genre_counts_dropped.sum(), 4)

#series
genre_proportion_dropped

Pop                 0.0747
Jazz                0.0720
Folk                0.0533
Rock                0.0453
Indie               0.0400
                     ...  
Opera               0.0027
Folk Metal          0.0027
Psychedelic Rock    0.0027
Latin               0.0027
Country Rock        0.0027
Name: genre, Length: 83, dtype: float64

In [30]:
# Artist and genre analysis
sty_analysis = {"Artist per Music Style": genre_counts_dropped}
music_analysis = pd.DataFrame(sty_analysis)
music_analysis

Unnamed: 0,Artist per Music Style
Pop,28
Jazz,27
Folk,20
Rock,17
Indie,15
...,...
Opera,1
Folk Metal,1
Psychedelic Rock,1
Latin,1


In [31]:
# .value_counts() returns an array containing the # of times each unique value occurs in a given column
# Returns the value counts of each unique value in the purchase_category column

#print(audio_df["style"].value_counts())
statistics.mode(genre_dropped['genre'])

'Pop'

### Gender

In [32]:
#GENDER COUNT - NULL INCLUDED

#count of unique genres, excluding null values
gender_counts = audio_df['gender'].value_counts()

#series
gender_counts

Male      259
Null      240
Female     74
Mixed      38
Name: gender, dtype: int64

In [33]:
#GENDER COUNT - NULL DROPPED

#count of unique genres, excluding null values
gender_counts_dropped = gender_dropped['gender'].value_counts()

#series
gender_counts_dropped

Male      259
Female     74
Mixed      38
Name: gender, dtype: int64

In [34]:
#GENDER PROPORTION - NULL INCLUDED

#proportion each genre represents in the data
gender_proportion = round(gender_counts / gender_counts.sum(), 2)

#series
gender_proportion

Male      0.42
Null      0.39
Female    0.12
Mixed     0.06
Name: gender, dtype: float64

In [35]:
#GENDER PROPORTION - NULL DROPPED

#proportion each genre represents in the data
gender_proportion_dropped = round(gender_counts_dropped / gender_counts_dropped.sum(), 2)

#series
gender_proportion_dropped

Male      0.7
Female    0.2
Mixed     0.1
Name: gender, dtype: float64

In [36]:
statistics.mode(audio_df['gender'])

'Male'

In [40]:
# How many of our artists are male, female, mixed? Create a count plot

import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(x = 'gender', data = gender_counts_dropped)
plt.show()

ValueError: Could not interpret input 'gender'

### Giving location numerical values

In [41]:
audio_df['Code'] = pd.factorize(audio_df.location)[0]
audio_df.head()

Unnamed: 0,artist_id,artist_name,gender,members,style,genre,year_formed,year_disbanded,location,Code
0,129340,Nate James,Male,1.0,Urban/R&B,Soul,2005,,Null,0
1,141072,Olu Dara,Null,,Null,Null,1941,,Null,0
2,132170,Jack Savoretti,Male,1.0,Null,Acoustic,1999,,UK,1
3,127221,Pappo,Male,1.0,Blues,Blues,1966,Yes,Buenos Aires,2
4,145283,Craig Cardiff,Male,1.0,Null,Folk,1976,,Canada,3


In [42]:
audio_df.Code.value_counts()

0      306
7       91
1       52
9       17
25      11
      ... 
37       1
36       1
35       1
34       1
107      1
Name: Code, Length: 108, dtype: int64

### Genre by Location

In [43]:
# Style and location

# I'm stuck right here

where = audio_df.groupby(['location', 'style'])
most_pop = where['location'].count()
most_pop

location                  style    
 Budapest                 Metal         1
 Ravensburg, DE           Null          1
 Szombathely / Amsterdam  Metal         1
Ajido, Nigeria            Null          1
Angers                    Null          1
                                       ..
USA                       Rock/Pop     25
                          Urban/R&B    14
Windsor                   Rock/Pop      1
Wolverhampton             Null          1
Česko                     Jazz          1
Name: location, Length: 166, dtype: int64

In [44]:
#GROUPBYS WITH NULL VALUES DROPPED

#create new vartiables = one df with null values in 'genre' and 'location' dropped
gender_location_na = location_na.replace({'gender': r'Null'}, {'gender': np.nan}, regex=True)
genre_location_dropped = location_dropped.dropna(axis=0, subset=['gender'], inplace=False)
genre_location_dropped

#groupby location
genre_location_grouped = genre_location_dropped.groupby('location')

#count location by genre
genre_by_location = genre_location_grouped['genre'].value_counts()

genre_by_location

location                  genre              
 Budapest                 Heavy Metal            1
 Ravensburg, DE           Singer Songwriter      1
 Szombathely / Amsterdam  Black Metal            1
Ajido, Nigeria            World/Ethnic           1
Angers                    Null                   1
                                                ..
USA                       Sludge Metal           1
                          Thrash Metal           1
Windsor                   Black Metal            1
Wolverhampton             Classical Crossover    1
Česko                     Alternative Rock       1
Name: genre, Length: 222, dtype: int64

### Year Analysis

In [45]:
#find the max and min year
#year_bins = [0,9,19,29,39,49]
#year_groups = []

Notes for questions and stats to run:
1. This data is categorical and nominal (unordered)
2. Best to use counts and bar graphs for categorical data
3. Measure of center: Mode - used to categorical data b/c the variables don't have a numerical representation and use the function in stats module: statistics.mode

Questions:

Which artist released the most albums?
Which genre has the most artists?
Is there a relationship between genre and location?

Maybe create a second dataframe with discography api. If so, more questions that could be answered? Year and genre?

- Do groups or individual artist release more albums?
- Which category do the albums fall into?