# Week 15 Project


In [1]:
#Dependencies
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint
import json
import random

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#modules for statistics
import math
import statistics
from scipy import stats

# Generate Dictionary of Artist Data

In [3]:
# Find a code to get all the artists from this api
# https://www.theaudiodb.com/api_guide.php
# theaudiodb.com/api/v1/json/1/search.php?s=coldplay

# empty list to be populated with artist info
artist_names = []

audio_data = {'artist_id':[],
               'artist_name':[],
               'gender':[],
               'member_count':[],
               'style':[], 
               'genre':[],
               'year_formed':[],
               'year_disbanded':[],
               'country_code':[],
               'label':[]}

In [4]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
random.seed(66)
random_nums = [random.randint(100000, 170000) for i in range(1200)]

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [5]:
#this artist list uses a random number generator to test the database for artist IDs
#if an ID corresponds to an artist, it append the artist's information to info lists
#create a base url
#example: https://theaudiodb.com/api/v1/json/2/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/2/artist.php?i="

#create a loop that uses random numbers list
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
        
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        audio_data['artist_id'].append(info['artists'][0]['idArtist'])
        audio_data['artist_name'].append(info['artists'][0]['strArtist'])
        audio_data['gender'].append(info['artists'][0]["strGender"])
        audio_data['member_count'].append(info['artists'][0]["intMembers"])
        audio_data['style'].append(info['artists'][0]["strStyle"])
        audio_data['genre'].append(info['artists'][0]["strGenre"])
        audio_data['year_formed'].append(info['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(info['artists'][0]["strDisbanded"])
        audio_data['country_code'].append(info['artists'][0]['strCountryCode'])
        audio_data['label'].append(info['artists'][0]['strLabel'])


#### Convert Dictionary to DataFrame

In [6]:
audio_df = pd.DataFrame(audio_data)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,style,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,,Metalcore,2006,,US,
1,132276,Savage,Male,1,Electronic,Synthpop,1983,,IT,
2,158416,Jason Hawk Harris,,,,,0,,,
3,133565,Violent Work of Art,Mixed,4,,Industrial Metal,1994,,SE,
4,112476,James Horner,Male,1,Classical,OST,1979,,US,
...,...,...,...,...,...,...,...,...,...,...
928,130132,Marea,Male,5,Rock/Pop,Rock,1997,,ES,
929,168652,Marlene Dietrich,Female,1,,,,,DE,
930,113975,Xandria,Mixed,4,Metal,Symphonic Metal,1997,,DE,
931,162484,Moaning,,,,,0,,,


In [None]:
# split data into multiple data frames for tables
artist_df = audio_df.iloc[:,:3]
info_df = audio_df.iloc[:, [0,8,6,7,3]]
location_df = audio_df.iloc[]

# Preprocessing

#### Converting to NaN
- Cleaning the data to ensure that all null values are represented in the same way.  
- This data contains NaN, Null, None, and empty string '' --all representing missing values.  
- We replaced these missing values with Null with null. For year disbanded, replaced None with no, meaning that they haven't disbanded.

In [7]:
#sum of null values in each column
print(audio_df.isnull().sum())

#style has more null values than genre, drop style column
audio_df.drop(['style'], axis=1, inplace=True)

artist_id           0
artist_name         0
gender            276
member_count      276
style             281
genre              48
year_formed       113
year_disbanded    879
country_code        0
label             882
dtype: int64


In [8]:
#convert empty strings and Empty values into NaN

#audio_df['gender'].replace({None: np.nan, '' : np.nan}, inplace = True) 
#audio_df['member_count'].replace({None: np.nan, '' : np.nan}, inplace = True)
#audio_df['genre'].replace({'':np.nan, None:np.nan}, inplace = True)
#artist_df['year_formed'].replace({'0':np.nan, 'None':np.nan}, inplace = True) 
#audio_df['year_disbanded'].replace({'0':np.nan, 'None':np.nan}, inplace = True)

audio_df.replace({'0':np.nan,
                  0:np.nan,
                  'None':np.nan, 
                  None:np.nan, 
                  '':np.nan}, inplace = True)

In [9]:
audio_df.isnull().sum()

artist_id           0
artist_name         0
gender            406
member_count      276
genre             422
year_formed       354
year_disbanded    880
country_code      179
label             882
dtype: int64

#### Converting datatypes

In [10]:
audio_df.dtypes

artist_id         object
artist_name       object
gender            object
member_count      object
genre             object
year_formed       object
year_disbanded    object
country_code      object
label             object
dtype: object

In [43]:
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].convert_dtypes(convert_integer=True)

#.apply(pd.to_numeric,errors='coerce',downcast='integer')
audio_df.dtypes

artist_id          Int32
artist_name       object
gender            object
member_count       Int64
genre             object
year_formed        Int64
year_disbanded     Int64
country_code      object
label             object
dtype: object

In [34]:
#convert year_disbanded and year_formed to datetime

#pd.to_datetime(audio_df['year_formed'], 
#               format= '%Y', errors='coerce').dt.year.astype('Int64')


#audio_df['year_formed2'] = pd.DatetimeIndex(audio_df['year_formed']).year
#audio_df

#### Feature Engineering

In [51]:
alpha2_codes = pd.read_csv('../Datasets/alpha2_codes.csv')
alpha2 = []
country_code = audio_df['country_code']

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe7 in position 958: invalid continuation byte

In [44]:
audio_df['country_code'].value_counts()

US    229
GB     80
DE     54
FR     52
IT     28
     ... 
RS      1
CO      1
IR      1
GH      1
LT      1
Name: country_code, Length: 63, dtype: int64

In [None]:
# use country code to create expanded country column
audio_df.loc[audio_df['country_code'].str.contains
             ('France|French|Livorno|Paris', 
              na=False, case=False, regex=True), 'location'] = 'France'

#### Dropping Null Data
We created variables to represent dataframes with the null values removed for a single column for further analysis

In [None]:
#REPLACE AND DROP NAN FOR INDIVIDUAL COLUMNS
#assign individual variables for individual columns with null dropped

#replace 'Null' with NaN for all values in location column, drop NaN
location_na = audio_df.replace({'location': r'Null'}, {'location': np.nan}, regex=True)
location_dropped = location_na.dropna(axis=0, subset=['location'], inplace=False)
#location_dropped

#replace 'Null' with NaN for all values in genre column, drop NaN
genre_na = audio_df.replace({'genre': r'Null'}, {'genre': np.nan}, regex=True)
genre_dropped = genre_na.dropna(axis=0, subset=['genre'], inplace=False)
#genre_dropped

#replace 'Null' with NaN for all values in genre column, drop NaN
gender_na = audio_df.replace({'gender': r'Null'}, {'gender': np.nan}, regex=True)
gender_dropped = gender_na.dropna(axis=0, subset=['gender'], inplace=False)
#gender_dropped

#replace 'Null' with NaN for all values in year_formed column, drop NaN
year_formed_na = audio_df.replace({'year_formed': r'Null'}, {'year_formed': np.nan}, regex=True)
year_formed_dropped = year_formed_na.dropna(axis=0, subset=['year_formed'], inplace=False)
#year_formed_dropped

# Analysis of the Data

## Location Analysis

Find the count and proportion of artists by location.

#### Location Count

In [None]:
#LOCATION COUNT - NULL DROPPED

#number of times each location appears in the dataframe
location_counts_dropped = location_dropped['location'].value_counts()

#series
location_counts_dropped

#create dictionary and convert to df
location_counts_dict = {'Artist Location':location_counts_dropped}
location_counts_df = pd.DataFrame(location_counts_dict)

#top 5 artist locations
location_top5 = location_counts_df.head(5)
location_top5

#### Location Proportion

In [None]:
#LOCATION PROPORTION - NULL DROPPED

#proportion each location represents in the data
location_proportion_dropped = round(location_counts_dropped / location_counts_dropped.sum(), 3)

#series
location_proportion_dropped

#create dictionary and convert to df
location_proportion_dict = {'Artist Location Proportion':location_proportion_dropped}
location_proportion_df = pd.DataFrame(location_proportion_dict)

#top 5 by proportion
location_proportion_top5 = location_proportion_df.head(5)
location_proportion_top5

In [None]:
x = np.array(["USA", "UK", "Germany", "Canada", "Italy"])
y = np.array([113, 44, 18, 12, 11])

plt.bar(x,y)
plt.title('Top Locations for Artist')
plt.xlabel('Countries')
plt.ylabel('Number of Artists')
plt.xticks(rotation='vertical')
plt.show()


In [None]:
x = [113, 44, 18, 12, 11]
labels = ["USA", "UK", "Germany", "Canada", "Italy"]
myexplode = [0.2, 0, 0, 0, 0]

fig, ax = plt.subplots()
ax.pie(x, labels=labels, explode = myexplode, shadow = True)
ax.set_title('Top Location for Artist', y = 1.12)
plt.tight_layout()
plt.show()


## Genre Analysis

Find the mode, count, proportion of artists by genre.

#### Genre Mode

The value that appears most frequently in the genre column is Pop.

In [None]:
#The mode shows the most frequently occuring value in a column

statistics.mode(genre_dropped['genre'])

#### Genre Count

In [None]:
#GENRE COUNT - NULL DROPPED

#count of unique genres, excluding null values
genre_counts_dropped = genre_dropped['genre'].value_counts()

#series
genre_counts_dropped

#create dictionary and convert to df
genre_counts_dict = {'Genre Count':genre_counts_dropped}
genre_counts_df = pd.DataFrame(genre_counts_dict)

#top 10 by proportion
genre_counts_top10 = genre_counts_df.head(10)
genre_counts_top10

#### Genre Proportion

In [None]:
#GENRE PROPORTION - NULL DROPPED

#proportion each genre represents in the data
genre_proportion_dropped = round(genre_counts_dropped / genre_counts_dropped.sum(), 3)

#series
genre_proportion_dropped

#create dictionary and convert to df
genre_proportion_dict = {'Genre Proportion':genre_proportion_dropped}
genre_proportion_df = pd.DataFrame(genre_proportion_dict)

#top 10 by proportion
genre_proportion_top10 = genre_proportion_df.head(10)
genre_proportion_top10

In [None]:
x = [37, 22, 22, 19, 16]
labels = ["Pop", "Jazz", "Rock", "Hip-Hop", "Classical"]
myexplode = [0.2, 0, 0, 0, 0]


fig, ax = plt.subplots()
ax.pie(x, labels=labels, explode = myexplode, shadow = True)
ax.set_title('Popular Genres among Artist')
plt.tight_layout()
plt.show()

# Pop music is the most popular among the artists

In [None]:
x = ["Pop", "Jazz", "Rock", "Hip-Hop", "Classical"]
y = [37, 22, 22, 19, 16]

plt.bar(x, y, color = "purple")
plt.title('Popular Genres among Artists')
plt.xlabel('Genre')
plt.ylabel('Number of Artists')
plt.show()

## Gender 

Find the mode, count, and proportion of artists by gender.

#### Gender Mode

Male artists and groups are more common that female or mixed gender groups.

In [None]:
statistics.mode(gender_dropped['gender'])

#### Gender Count

In [None]:
#GENDER COUNT - NULL DROPPED

#count of unique genres, excluding null values
gender_counts_dropped = gender_dropped['gender'].value_counts()

#series
gender_counts_dropped

#create dictionary and convert to df
gender_counts_dict = {'Gender Count':gender_counts_dropped}
gender_counts_df = pd.DataFrame(gender_counts_dict)
gender_counts_df

#### Gender Proportion

In [None]:
#GENDER PROPORTION - NULL DROPPED

#proportion each genre represents in the data
gender_proportion_dropped = round(gender_counts_dropped / gender_counts_dropped.sum(), 2)

#series
gender_proportion_dropped

#create dictionary and convert to df
gender_proportion_dict = {'Gender Proportion':gender_proportion_dropped}
gender_proportion_df = pd.DataFrame(gender_proportion_dict)
gender_proportion_df

In [None]:
category_order = ['Male', 'Female', 'Mixed']

sns.countplot(x = gender_dropped['gender'], data = audio_df, order = category_order)
plt.title('Gender of Artists/Bands')
plt.show()

In [None]:
x = [307, 80, 37]
labels = ['Male', 'Female', 'Mixed']
myexplode = [0.2, 0, 0]

fig, ax = plt.subplots()
ax.pie(x, labels=labels, explode = myexplode, shadow = True)
ax.set_title('Gender of Artists/Bands')
plt.tight_layout()
plt.show()

## Year Formed

Find the minimum, maximum, and number of artists/bands formed per year.

In [None]:
#Find the max year
max_year_formed = year_formed_dropped['year_formed'].max()
max_year_formed

In [None]:
#Find the max year
min_year_formed = year_formed_dropped['year_formed'].min()
min_year_formed

In [None]:
#Find the number of band formed each year
year_count = year_formed_dropped.groupby('year_formed').size()

#create dictionary and convert to df
year_count_dict = {'Year Count':year_count}
year_count_df = pd.DataFrame(year_count_dict)
year_count_df

## Analysis on Multiple Columns

Find the genres for the top 3 locations.

Find the years artists/bands were formed for the top 3 genres.

#### Genres by Top 3 Locations

An analysis of the genres in the top 3 countries

In [None]:
#Genre by Location
#GROUPBYS WITH NULL VALUES DROPPED

#create new vartiables = one df with null values in 'genre' and 'location' dropped
genre_location_na = location_na.replace({'genre': r'Null'}, {'genre': np.nan}, regex=True)
genre_location_dropped = genre_location_na.dropna(axis=0, subset=['genre'], inplace=False)
genre_location_na.dropna(axis=0, subset=['location'], inplace=False)
genre_location_dropped

#groupby location
genre_location_grouped = genre_location_dropped.groupby('location')

#count location by genre
genre_by_location = genre_location_grouped['genre'].value_counts()

#sort values in descending order
genre_by_location_sorted = genre_by_location.sort_values(ascending=False)

#isolate top 3 locations
genre_by_location_top3 = genre_by_location_sorted.loc[['USA', 'UK', 'Germany']]

genre_by_location_top3

#create dictionary and convert to df
genre_by_location_top3_dict = {'Genre by Location':genre_by_location_top3}
genre_by_location_top3_df = pd.DataFrame(genre_by_location_top3_dict)
genre_by_location_top3_df

#### Genres by Year

An analysis of the year that bands formed and the genre they play in.

In [None]:
#Genre by Year
#GROUPBYS WITH NULL VALUES DROPPED

#create new vartiables = one df with null values in 'genre' and 'year_formed' dropped
genre_year_na = genre_na.replace({'year_formed': r'Null'}, {'year_formed': np.nan}, regex=True)
genre_year_dropped = genre_year_na.dropna(axis=0, subset=['year_formed'], inplace=False)
genre_year_dropped

#groupby year formed
year_genre_grouped = genre_year_dropped.groupby('genre')

#count genre on year formed groupby
year_by_genre = year_genre_grouped['year_formed'].value_counts()

year_by_genre_top3 = year_by_genre.loc[['Pop', 'Jazz', 'Rock']]

#create dictionary and convert to df
year_by_genre_top3_dict = {'Genre by Year':year_by_genre_top3}
year_by_genre_top3_df = pd.DataFrame(year_by_genre_top3_dict)
year_by_genre_top3_df