# Week 15 Project


In [1]:
#Dependencies
import pandas as pd
import numpy as np
import requests
import time
from pprint import pprint
import json
import random

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#modules for statistics
#import math
#import statistics
#from scipy import stats

#### Generate Dictionary of Artist Data from Web API

In [3]:
# Find a code to get all the artists from this api
# https://www.theaudiodb.com/api_guide.php
# theaudiodb.com/api/v1/json/1/search.php?s=coldplay

# empty list to be populated with artist info
artist_names = []

audio_data = {'artist_id':[],
               'artist_name':[],
               'gender':[],
               'member_count':[],
               'style':[], 
               'genre':[],
               'year_formed':[],
               'year_disbanded':[],
               'country_code':[],
               'label':[]}

In [4]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
random.seed(66)
random_nums = [random.randint(100000, 170000) for i in range(1200)]

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [5]:
#this artist list uses a random number generator to test the database for artist IDs
#if an ID corresponds to an artist, it append the artist's information to info lists
#create a base url
#example: https://theaudiodb.com/api/v1/json/2/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/2/artist.php?i="

#create a loop that uses random numbers list
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
        
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        audio_data['artist_id'].append(info['artists'][0]['idArtist'])
        audio_data['artist_name'].append(info['artists'][0]['strArtist'])
        audio_data['gender'].append(info['artists'][0]["strGender"])
        audio_data['member_count'].append(info['artists'][0]["intMembers"])
        audio_data['style'].append(info['artists'][0]["strStyle"])
        audio_data['genre'].append(info['artists'][0]["strGenre"])
        audio_data['year_formed'].append(info['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(info['artists'][0]["strDisbanded"])
        audio_data['country_code'].append(info['artists'][0]['strCountryCode'])
        audio_data['label'].append(info['artists'][0]['strLabel'])


#### Convert Dictionary to DataFrame

In [6]:
audio_df = pd.DataFrame(audio_data)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,style,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,,Metalcore,2006,,US,
1,132276,Savage,Male,1,Electronic,Synthpop,1983,,IT,
2,158416,Jason Hawk Harris,,,,,0,,,
3,133565,Violent Work of Art,Mixed,4,,Industrial Metal,1994,,SE,
4,112476,James Horner,Male,1,Classical,OST,1979,,US,
...,...,...,...,...,...,...,...,...,...,...
928,130132,Marea,Male,5,Rock/Pop,Rock,1997,,ES,
929,168652,Marlene Dietrich,Female,1,,,,,DE,
930,113975,Xandria,Mixed,4,Metal,Symphonic Metal,1997,,DE,
931,162484,Moaning,,,,,0,,,


# Preprocessing

#### Converting to NaN
- Cleaning the data to ensure that all null values are represented in the same way.  
- This data contains NaN, Null, None, and empty string '' --all representing missing values.  
- We replaced these missing values with Null with null. For year disbanded, replaced None with no, meaning that they haven't disbanded.

In [8]:
#sum of null values in each column
print(audio_df.isnull().sum())

#style has more null values than genre, drop style column
audio_df.drop(['style'], axis=1, inplace=True)

artist_id           0
artist_name         0
gender            276
member_count      276
style             281
genre              48
year_formed       113
year_disbanded    879
country_code        0
label             882
dtype: int64


In [9]:
#convert empty strings and null values to NaN
audio_df.replace({'0':np.nan,
                  0:np.nan,
                  'None':np.nan, 
                  None:np.nan, 
                  '':np.nan}, inplace = True)

In [10]:
audio_df.isnull().sum()

artist_id           0
artist_name         0
gender            406
member_count      276
genre             422
year_formed       354
year_disbanded    880
country_code      179
label             882
dtype: int64

#### Converting datatypes

In [11]:
audio_df.dtypes

artist_id         object
artist_name       object
gender            object
member_count      object
genre             object
year_formed       object
year_disbanded    object
country_code      object
label             object
dtype: object

In [16]:
#convert dtypes from string to numeric dtypes
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].apply(pd.to_numeric,errors='coerce',downcast='integer')
#convert float to integer
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].convert_dtypes(convert_integer=True, convert_string=False)
audio_df.dtypes

artist_id          Int32
artist_name       object
gender            object
member_count        Int8
genre             object
year_formed        Int16
year_disbanded      Int8
country_code      object
label             object
dtype: object

In [26]:
#convert year_disbanded and year_formed to datetime

pd.to_datetime(audio_df['year_formed'], 
               format= '%Y', errors='coerce')
#.astype('Int64')

#audio_df['year_formed'].dt.year
#audio_df['formed_year'] = pd.DatetimeIndex(audio_df['year_formed']).year
#audio_df['formed_year']


#Angela- still working on getting the datetime properly converted

0     2006-01-01
1     1983-01-01
2            NaT
3     1994-01-01
4     1979-01-01
         ...    
928   1997-01-01
929          NaT
930   1997-01-01
931          NaT
932   2006-01-01
Name: year_formed, Length: 933, dtype: datetime64[ns]

#### Feature Engineering

In [44]:
audio_df['country_code'].value_counts()

US    229
GB     80
DE     54
FR     52
IT     28
     ... 
RS      1
CO      1
IR      1
GH      1
LT      1
Name: country_code, Length: 63, dtype: int64

In [28]:
alpha2_codes = pd.read_csv('../Datasets/alpha2_codes.csv')
#alpha2 = []
#country_code = audio_df['country_code']

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe7 in position 958: invalid continuation byte

In [None]:
# use country code to create expanded country column
audio_df.loc[audio_df['country_code'].str.contains
             ('France|French|Livorno|Paris', 
              na=False, case=False, regex=True), 'location'] = 'France'

In [None]:
# Angela - still working here on creating an additional column of expanded country names

In [34]:
#convert <NA> strings created by data cleaning into NaN
audio_df.replace({'<NA>':np.nan}, inplace = True)

## Creating Tables

Split dataframe into multiple dataframes corresponding to 3 database tables

In [35]:
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,genre,year_formed,year_disbanded,country_code,label,formed_year
0,140892,Ice Nine Kills,Male,4.0,Metalcore,2006.0,,US,,1970.0
1,132276,Savage,Male,1.0,Synthpop,1983.0,,IT,,1970.0
2,158416,Jason Hawk Harris,,,,,,,,
3,133565,Violent Work of Art,Mixed,4.0,Industrial Metal,1994.0,,SE,,1970.0
4,112476,James Horner,Male,1.0,OST,1979.0,,US,,1970.0
...,...,...,...,...,...,...,...,...,...,...
928,130132,Marea,Male,5.0,Rock,1997.0,,ES,,1970.0
929,168652,Marlene Dietrich,Female,1.0,,,,DE,,
930,113975,Xandria,Mixed,4.0,Symphonic Metal,1997.0,,DE,,1970.0
931,162484,Moaning,,,,,,,,


In [None]:
# split data into multiple data frames for tables
artist_df = audio_df.iloc[:,:3]
info_df = audio_df.iloc[:, [0,8,6,7,3]]
location_df = audio_df.iloc[]