# Week 15 ETL Project
Angela Spencer

Clarine Esperance

Ayesha Lastname?

In [1]:
#Dependencies
import pandas as pd
import numpy as np
import requests
import json
import random

#### Generate Dictionary of Artist Data from Web API

In [2]:
# Find a code to get all the artists from this api
# https://www.theaudiodb.com/api_guide.php
# theaudiodb.com/api/v1/json/1/search.php?s=coldplay

# empty list to be populated with artist info
artist_names = []

audio_data = {'artist_id':[],
               'artist_name':[],
               'gender':[],
               'member_count':[],
               'style':[], 
               'genre':[],
               'year_formed':[],
               'year_disbanded':[],
               'country_code':[],
               'label':[]}

In [3]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
random.seed(66)
random_nums = [random.randint(100000, 170000) for i in range(1500)]

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [4]:
#this artist list uses a random number generator to test the database for artist IDs
#if an ID corresponds to an artist, it append the artist's information to info lists
#create a base url
#example: https://theaudiodb.com/api/v1/json/2/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/2/artist.php?i="

#create a loop that uses random numbers list
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
        
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        audio_data['artist_id'].append(info['artists'][0]['idArtist'])
        audio_data['artist_name'].append(info['artists'][0]['strArtist'])
        audio_data['gender'].append(info['artists'][0]["strGender"])
        audio_data['member_count'].append(info['artists'][0]["intMembers"])
        audio_data['style'].append(info['artists'][0]["strStyle"])
        audio_data['genre'].append(info['artists'][0]["strGenre"])
        audio_data['year_formed'].append(info['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(info['artists'][0]["strDisbanded"])
        audio_data['country_code'].append(info['artists'][0]['strCountryCode'])
        audio_data['label'].append(info['artists'][0]['strLabel'])


#### Convert Dictionary to DataFrame

In [5]:
audio_df = pd.DataFrame(audio_data)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,style,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,,Metalcore,2006,,US,
1,132276,Savage,Male,1,Electronic,Synthpop,1983,,IT,
2,158416,Jason Hawk Harris,,,,,0,,,
3,133565,Violent Work of Art,Mixed,4,,Industrial Metal,1994,,SE,
4,112476,James Horner,Male,1,Classical,OST,1979,,US,
...,...,...,...,...,...,...,...,...,...,...
1162,113477,Terror,Male,5,Rock/Pop,Hardcore,2000,,US,
1163,157114,DJ Baur,,,,,0,,,
1164,114531,Toots & The Maytals,,1,Reggae,Reggae,1963,,JM,
1165,117428,Gendai Kano,Male,1,Classical,Musical,1938,,JP,


# Preprocessing

#### Converting to Null Values
- Cleaning the data to ensure that all null values are represented in the same way.  
- All missing values (NaN, Null, None, and empty string) were replaced with 'None' to indicate that there is no available data.

In [6]:
#sum of null values in each column
print(audio_df.isnull().sum())

#style has more null values than genre, drop style column
audio_df.drop(['style'], axis=1, inplace=True)

artist_id            0
artist_name          0
gender             350
member_count       350
style              356
genre               62
year_formed        153
year_disbanded    1097
country_code         0
label             1102
dtype: int64


In [7]:
#convert empty strings and null values to NaN
audio_df.replace({'0':'None',
                  0:'None', 
                  None:'None', 
                  '':'None',
                  np.nan:'None',
                  'NaN':'None'}, inplace = True)

In [8]:
audio_df.isnull().sum()

artist_id         0
artist_name       0
gender            0
member_count      0
genre             0
year_formed       0
year_disbanded    0
country_code      0
label             0
dtype: int64

#### Converting datatypes

In [9]:
audio_df.dtypes

artist_id         object
artist_name       object
gender            object
member_count      object
genre             object
year_formed       object
year_disbanded    object
country_code      object
label             object
dtype: object

In [10]:
#convert dtypes from string to numeric dtypes
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].apply(pd.to_numeric,errors='coerce',downcast='integer')
#convert float to integer
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].convert_dtypes(convert_integer=True, convert_string=False)
audio_df.dtypes

artist_id          Int32
artist_name       object
gender            object
member_count       Int64
genre             object
year_formed        Int64
year_disbanded     Int64
country_code      object
label             object
dtype: object

In [11]:
#convert year_disbanded and year_formed to datetime

pd.to_datetime(audio_df['year_formed'], 
               format= '%Y', errors='coerce')

0      2006-01-01
1      1983-01-01
2             NaT
3      1994-01-01
4      1979-01-01
          ...    
1162   2000-01-01
1163          NaT
1164   1963-01-01
1165   1938-01-01
1166   2016-01-01
Name: year_formed, Length: 1167, dtype: datetime64[ns]

In [12]:
#convert NaN created by numeric conversion into 'None'
audio_df.replace({np.nan:'None'}, inplace = True)
audio_df.isna().sum()

artist_id         0
artist_name       0
gender            0
member_count      0
genre             0
year_formed       0
year_disbanded    0
country_code      0
label             0
dtype: int64

#### Feature Engineering
Creating a new colum of expanded country names corresponding to country codes

In [13]:
audio_df['country_code'].value_counts()

US      292
None    221
GB      110
DE       65
FR       61
       ... 
MY        1
CO        1
GH        1
IR        1
SW        1
Name: country_code, Length: 72, dtype: int64

In [30]:
#import csv containing standard 2 digit alpha codes for countries
alpha2_codes = pd.read_csv('alpha2_codes.csv')

#convert alpha2 codes to dataframe
countries_df = pd.DataFrame(alpha2_codes)
countries_df

Unnamed: 0,country_name,country_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,American Samoa,AS
4,Andorra,AD
...,...,...
245,Yemen,YE
246,Zambia,ZM
247,Zimbabwe,ZW
248,Aland Islands,AX


In [18]:
#merge list of country codes onto audio_df to create new column of country names
audio_df = pd.merge(audio_df, countries_df)

## Creating Tables

Split dataframe into multiple dataframes corresponding to 3 database tables
- artist_df
- info_df
- location_df

In [20]:
audio_df.head(3)

Unnamed: 0,artist_id,artist_name,gender,member_count,genre,year_formed,year_disbanded,country_code,label,country_name
0,140892,Ice Nine Kills,Male,4,Metalcore,2006.0,,US,,United States of America (the)
1,112476,James Horner,Male,1,OST,1979.0,,US,,United States of America (the)
2,160023,Mickey Petralia,Male,1,,,,US,,United States of America (the)


In [22]:
# split data into multiple data frames for tables
artist_df = audio_df.iloc[:,:3]
artist_df

Unnamed: 0,artist_id,artist_name,gender
0,140892,Ice Nine Kills,Male
1,112476,James Horner,Male
2,160023,Mickey Petralia,Male
3,169251,3D Natee,Female
4,150340,Dick Schory's New Percussion Ensemble,Male
...,...,...,...
1095,130853,İlhan Erşahin,
1096,167192,Veronika Povilioniene,Female
1097,149602,李雲迪,Male
1098,167328,ريم بنا,Female


In [27]:
info_df = audio_df.iloc[:, [0,7,4,3,5,6,8]]
info_df

Unnamed: 0,artist_id,country_code,genre,member_count,year_formed,year_disbanded,label
0,140892,US,Metalcore,4,2006,,
1,112476,US,OST,1,1979,,
2,160023,US,,1,,,
3,169251,US,Rap,1,,,
4,150340,US,Acoustic,1,1958,,
...,...,...,...,...,...,...,...
1095,130853,TR,,1,1965,,
1096,167192,LT,Folk,1,1987,,
1097,149602,CN,Pianist,1,,,
1098,167328,PS,World/Ethnic,1,,,


In [29]:
location_df = audio_df.iloc[:,[-3, -1]]
location_df

Unnamed: 0,country_code,country_name
0,US,United States of America (the)
1,US,United States of America (the)
2,US,United States of America (the)
3,US,United States of America (the)
4,US,United States of America (the)
...,...,...
1095,TR,Turkey
1096,LT,Lithuania
1097,CN,China
1098,PS,"Palestine, State of"


## Connecting to PostgreSQL