# ETL Project - Angela's Workbook


In [1]:
#Dependencies
import pandas as pd
import numpy as np
import requests
#import time
#from pprint import pprint
import json
import random
import psycopg2
from password import password
from sqlalchemy import create_engine

#import matplotlib.pyplot as plt
#import seaborn as sns

In [2]:
#modules for statistics
#import math
#import statistics
#from scipy import stats

#### Generate Dictionary of Artist Data from Web API

In [3]:
# Source of the artist records: TheAudioDB web API
# API guide: https://www.theaudiodb.com/api_guide.php
# example search endpoint: theaudiodb.com/api/v1/json/1/search.php?s=coldplay

# names of the artists recorded so far (starts empty)
artist_names = list()

# column-oriented store: one independent empty list per artist attribute,
# filled in parallel by the fetch loop and later handed to pd.DataFrame
audio_data = {
    column: []
    for column in (
        'artist_id',
        'artist_name',
        'gender',
        'member_count',
        'style',
        'genre',
        'year_formed',
        'year_disbanded',
        'country_code',
        'label',
    )
}

In [4]:
#draw 150 pseudo-random candidate artist IDs between 100000 and 170000 (both inclusive)
#the fixed seed makes the same candidates come out on every run
random.seed(66)
random_nums = [random.randrange(100000, 170001) for _ in range(150)]

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [5]:
#probe the API with each random candidate ID; whenever an ID resolves to an artist
#that has not been recorded yet, append that artist's attributes to audio_data
#example: https://theaudiodb.com/api/v1/json/2/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/2/artist.php?i="

#maps each audio_data column to the API field it is read from
#response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
field_map = {'artist_id': 'idArtist',
             'artist_name': 'strArtist',
             'gender': 'strGender',
             'member_count': 'intMembers',
             'style': 'strStyle',
             'genre': 'strGenre',
             'year_formed': 'intFormedYear',
             'year_disbanded': 'strDisbanded',
             'country_code': 'strCountryCode',
             'label': 'strLabel'}

for num in random_nums:
    unique_url = id_url + f'{num}'

    #call api url; the timeout keeps one stalled request from hanging the whole loop
    try:
        request = requests.get(unique_url, timeout=10)
    except requests.exceptions.RequestException as error:
        #report the failed lookup and move on to the next candidate ID
        print(f'skipping {num}: {error}')
        continue

    #a body that is not JSON raises a ValueError subclass (which one depends on the
    #requests version / json backend), so the shared base class is caught
    try:
        info = request.json()
    except ValueError:
        continue

    #{'artists': None} -> TypeError; missing key -> KeyError; empty list -> IndexError
    try:
        record = info['artists'][0]
        artist = record['strArtist']
    except (TypeError, KeyError, IndexError):
        continue

    #skip artists that were already recorded
    if artist in artist_names:
        continue

    #remember the name so that a repeat of the same artist is filtered out above
    artist_names.append(artist)

    #.get keeps all column lists the same length even if a field is absent
    for column, api_field in field_map.items():
        audio_data[column].append(record.get(api_field))


#### Convert Dictionary to DataFrame

In [6]:
#turn the column lists into a DataFrame: one column per audio_data key
audio_df = pd.DataFrame.from_dict(audio_data)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,style,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,,Metalcore,2006,,US,
1,132276,Savage,Male,1,Electronic,Synthpop,1983,,IT,
2,158416,Jason Hawk Harris,,,,,0,,,
3,133565,Violent Work of Art,Mixed,4,,Industrial Metal,1994,,SE,
4,112476,James Horner,Male,1,Classical,OST,1979,,US,
...,...,...,...,...,...,...,...,...,...,...
109,147463,The Poni-Tails,,,,,0,,US,
110,132545,Nervosa,Female,3,,Thrash Metal,2010,,BR,
111,153229,Sálvate si puedes,Male,5,Punk,Punk Rock,1990,Yes,ES,
112,128272,Archie Bronson Outfit,Male,3,,,0,,GB,


# Preprocessing

#### Converting to Null values
- Cleaning the data to ensure that all null values are represented in the same way.  
- All missing values (NaN, Null, None, and empty string) were replaced with 'None' to indicate that there is no available data.

In [7]:
#count the missing values per column
print(audio_df.isna().sum())

#style has more null values than genre, so the style column is discarded
audio_df.drop(columns=['style'], inplace=True)

artist_id           0
artist_name         0
gender             23
member_count       23
style              23
genre               5
year_formed        11
year_disbanded     99
country_code        0
label             103
dtype: int64


In [8]:
#normalise every "missing" marker to the literal string 'None' (not to NaN):
#zeros (as number and as text), None, empty strings, NaN and the text 'NaN'
#the frame is modified in place
audio_df.replace({'0':'None',
                  0:'None', 
                  None:'None', 
                  '':'None',
                  np.nan:'None',
                  'NaN':'None'}, inplace = True)

In [9]:
#re-count missing values per column after the replacement
audio_df.isna().sum()

artist_id         0
artist_name       0
gender            0
member_count      0
genre             0
year_formed       0
year_disbanded    0
country_code      0
label             0
dtype: int64

#### Converting datatypes

In [10]:
#inspect the column dtypes before any conversion
audio_df.dtypes

artist_id         object
artist_name       object
gender            object
member_count      object
genre             object
year_formed       object
year_disbanded    object
country_code      object
label             object
dtype: object

In [11]:
#columns whose values are numbers stored as text
numeric_columns = ['artist_id', 'member_count', 'year_formed', 'year_disbanded']

#step 1: parse the text to numbers; entries that cannot be parsed (such as 'None')
#are coerced to NaN, and integer downcasting is requested where possible
parsed_numbers = audio_df[numeric_columns].apply(pd.to_numeric, errors='coerce', downcast='integer')
audio_df[numeric_columns] = parsed_numbers

#step 2: let pandas pick nullable integer dtypes for the parsed columns
nullable_numbers = audio_df[numeric_columns].convert_dtypes(convert_integer=True, convert_string=False)
audio_df[numeric_columns] = nullable_numbers
audio_df.dtypes

artist_id          Int32
artist_name       object
gender            object
member_count       Int64
genre             object
year_formed        Int64
year_disbanded     Int64
country_code      object
label             object
dtype: object

In [12]:
#preview year_formed parsed as a datetime (values that cannot be parsed become NaT)
#the result is only displayed; audio_df itself is not modified by this cell
pd.to_datetime(audio_df['year_formed'], format='%Y', errors='coerce')

0     2006-01-01
1     1983-01-01
2            NaT
3     1994-01-01
4     1979-01-01
         ...    
109          NaT
110   2010-01-01
111   1990-01-01
112          NaT
113   1981-01-01
Name: year_formed, Length: 114, dtype: datetime64[ns]

In [13]:
#convert NaN created by numeric conversion into 'None'
#audio_df.replace({np.nan:'None'}, inplace = True)
#audio_df.isna().sum()

In [17]:
#convert null values in numeric columns from NaN to 0 after numeric conversion in order to query later
#the mapping is applied to the whole frame in place, so from here on a 0 in
#member_count / year_formed / year_disbanded stands for "no data"
audio_df.replace({np.nan:0}, inplace = True)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,Metalcore,2006,0,US,
1,132276,Savage,Male,1,Synthpop,1983,0,IT,
2,158416,Jason Hawk Harris,,0,,0,0,,
3,133565,Violent Work of Art,Mixed,4,Industrial Metal,1994,0,SE,
4,112476,James Horner,Male,1,OST,1979,0,US,
...,...,...,...,...,...,...,...,...,...
109,147463,The Poni-Tails,,0,,0,0,US,
110,132545,Nervosa,Female,3,Thrash Metal,2010,0,BR,
111,153229,Sálvate si puedes,Male,5,Punk Rock,1990,0,ES,
112,128272,Archie Bronson Outfit,Male,3,,0,0,GB,


In [18]:
#check the column dtypes again after the NaN -> 0 replacement
audio_df.dtypes

artist_id          Int32
artist_name       object
gender            object
member_count       Int64
genre             object
year_formed        Int64
year_disbanded     Int64
country_code      object
label             object
dtype: object

#### Feature Engineering
Creating a new column of expanded country names corresponding to the country codes

In [19]:
#count how many artists carry each country code
audio_df['country_code'].value_counts()

US      33
None    18
IT       9
GB       8
FR       8
ES       6
DE       5
NL       5
CA       3
JP       3
SE       2
UK       2
BR       2
KR       1
IE       1
AU       1
NZ       1
CZ       1
DK       1
RU       1
CO       1
MX       1
NO       1
Name: country_code, dtype: int64

In [20]:
#import csv containing standard two-letter (alpha-2) codes for countries
#NOTE(review): with default settings read_csv treats the text 'NA' as a missing value;
#check how the row for Namibia (code NA) is loaded
alpha2_codes = pd.read_csv('../Datasets/alpha2_codes.csv')

#wrap the loaded table in a second DataFrame object named countries_df
#(read_csv already returns a DataFrame, so no type conversion happens here)
countries_df = pd.DataFrame(alpha2_codes)
countries_df

Unnamed: 0,country_name,country_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,American Samoa,AS
4,Andorra,AD
...,...,...
245,Yemen,YE
246,Zambia,ZM
247,Zimbabwe,ZW
248,Aland Islands,AX


In [21]:
#attach the country lookup to every artist row; with no keys given, pandas performs
#an inner join on the columns the two frames have in common
#NOTE(review): an inner join drops artists whose code is absent from the lookup
#(value_counts above shows a non-ISO 'UK') - confirm this is intended
audio_df = audio_df.merge(countries_df)

In [22]:
#keep only the first row for each artist_id, then renumber the rows from 0
audio_df = (
    audio_df
    .drop_duplicates(subset=['artist_id'])
    .reset_index(drop=True)
)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,genre,year_formed,year_disbanded,country_code,label,country_name
0,140892,Ice Nine Kills,Male,4,Metalcore,2006,0,US,,United States of America (the)
1,112476,James Horner,Male,1,OST,1979,0,US,,United States of America (the)
2,160023,Mickey Petralia,Male,1,,0,0,US,,United States of America (the)
3,169251,3D Natee,Female,1,Rap,0,0,US,,United States of America (the)
4,150340,Dick Schory's New Percussion Ensemble,Male,1,Acoustic,1958,0,US,,United States of America (the)
...,...,...,...,...,...,...,...,...,...,...
106,162001,Orion,Male,1,Psy Trance,1996,0,DK,,Denmark
107,115027,Cho Young-Wuk,Male,1,,1962,0,KR,,Korea (the Republic of)
108,121908,Princess Chelsea,Female,1,Indie Pop,0,0,NZ,,New Zealand
109,154662,OTT,Male,5,Pop,1995,0,IE,Sony Music Entertainment Downloads LLC,Ireland


## Creating Tables

Split dataframe into multiple dataframes corresponding to 3 database tables
- artist_df
- info_df
- location_df

In [23]:
# split data into multiple data frames for tables
# artist table: the first three columns of audio_df, selected by name
artist_df = audio_df[['artist_id', 'artist_name', 'gender']]
artist_df

Unnamed: 0,artist_id,artist_name,gender
0,140892,Ice Nine Kills,Male
1,112476,James Horner,Male
2,160023,Mickey Petralia,Male
3,169251,3D Natee,Female
4,150340,Dick Schory's New Percussion Ensemble,Male
...,...,...,...
106,162001,Orion,Male
107,115027,Cho Young-Wuk,Male
108,121908,Princess Chelsea,Female
109,154662,OTT,Male


In [24]:
# info table: artist_id plus the descriptive columns, selected by name
# in the order they should appear in the table
info_columns = ['artist_id',
                'country_code',
                'genre',
                'member_count',
                'year_formed',
                'year_disbanded',
                'label']
info_df = audio_df[info_columns]
info_df

Unnamed: 0,artist_id,country_code,genre,member_count,year_formed,year_disbanded,label
0,140892,US,Metalcore,4,2006,0,
1,112476,US,OST,1,1979,0,
2,160023,US,,1,0,0,
3,169251,US,Rap,1,0,0,
4,150340,US,Acoustic,1,1958,0,
...,...,...,...,...,...,...,...
106,162001,DK,Psy Trance,1,1996,0,
107,115027,KR,,1,1962,0,
108,121908,NZ,Indie Pop,1,0,0,
109,154662,IE,Pop,5,1995,0,Sony Music Entertainment Downloads LLC


In [25]:
# location table: the country lookup without the rows labelled 249 and 152,
# renumbered from 0
# NOTE(review): the two row labels are hard-coded; presumably they are entries that
# cannot serve as primary-key rows - confirm against the CSV
location_df = countries_df.drop(index=[249, 152]).reset_index(drop=True)
location_df

Unnamed: 0,country_name,country_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,American Samoa,AS
4,Andorra,AD
...,...,...
243,Western Sahara,EH
244,Yemen,YE
245,Zambia,ZM
246,Zimbabwe,ZW


## Connecting to PostgreSQL

In [42]:
def create_table(df, table_name):
    """Write a DataFrame to a table in the audio_data PostgreSQL database.

    Opens a psycopg2 connection, builds a SQLAlchemy engine for the same
    database and writes ``df`` with ``DataFrame.to_sql``, replacing any
    existing table of the same name.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to store; the index is not written.
    table_name : str
        Name of the table to create (or replace).

    Returns
    -------
    list of str or None
        Names of the tables in the database after the write, or None when an
        error occurred (the error is printed, not raised).
    """
    from urllib.parse import quote_plus
    from sqlalchemy import inspect

    #start with empty handles so the cleanup below only touches what was opened;
    #otherwise a failed connect would raise a NameError in `finally` and hide the real error
    conn = None
    engine = None

    try:
        #connect to postgres through psycopg2
        conn = psycopg2.connect(host="localhost", 
                                dbname="audio_data", 
                                user="postgres", 
                                password=password)
        
        #create an engine to communicate with postgres
        #the password is URL-escaped so special characters (@, /, :) cannot break the URL
        postgres_str = f'postgresql+psycopg2://postgres:{quote_plus(password)}@localhost:5432/audio_data'
        engine = create_engine(postgres_str)

        #use the to_sql function to create a table 
        df.to_sql(table_name, con=engine, index=False, if_exists='replace')
        
        #verify that table was created
        #(the inspector replaces the deprecated engine.table_names())
        return inspect(engine).get_table_names()
        
    #report errors in connection or function
    except Exception as error:
        print(error)
    
    #close connection and engine if success or error
    finally:
        if conn is not None:
            conn.close()
        if engine is not None:
            #dispose() has to be called; a bare `engine.dispose` does nothing
            engine.dispose()

In [43]:
#write the full artist frame to table 'audio' and the country lookup to 'location_info'
#the cell output is the table list returned by the last call
create_table(audio_df, 'audio')
create_table(location_df, 'location_info')

  return engine.table_names()


['audio', 'location_info', 'artist', 'artist_info']

In [44]:
def _primary_key_ddl(table, column):
    """Return the three statements that (re)create a primary key on table.column."""
    return [
        f"ALTER TABLE {table} ALTER COLUMN {column} SET NOT NULL;",
        f"ALTER TABLE {table} DROP CONSTRAINT IF EXISTS {table}_pkey;",
        f"ALTER TABLE {table} ADD PRIMARY KEY ({column});",
    ]


#DDL statements, in the order they are meant to be executed
DDL_queries = [
    #cast the key and year columns of the staging table to integer
    "ALTER TABLE audio ALTER COLUMN artist_id TYPE integer USING artist_id::integer;",
    "ALTER TABLE audio ALTER COLUMN year_formed TYPE integer USING year_formed::integer;",

    #artist table, copied from the staging table
    #(IF NOT EXISTS: an artist table that already exists is kept as it is)
    "CREATE TABLE IF NOT EXISTS artist AS SELECT artist_id, artist_name, gender, member_count FROM audio;",
    *_primary_key_ddl('artist', 'artist_id'),

    #artist_info table, copied from the staging table
    "CREATE TABLE IF NOT EXISTS artist_info AS SELECT artist_id, country_code, genre, year_formed, year_disbanded, label FROM audio;",
    *_primary_key_ddl('artist_info', 'artist_id'),

    #country lookup table is keyed by its country code
    *_primary_key_ddl('location_info', 'country_code'),

    #the staging table is removed at the end
    "DROP TABLE IF EXISTS audio;",
]


In [45]:
#start with empty handles so the cleanup at the bottom knows what was actually opened
cur = None
conn = None

try:
    #open a psycopg2 connection to the audio_data database
    conn = psycopg2.connect(host="localhost", dbname="audio_data", user="postgres", password=password)

    #cursor used to send the statements to postgres
    cur = conn.cursor()

    #run the DDL statements in list order, echoing each one once it has executed
    for query in DDL_queries:
        cur.execute(query)
        print(f'execute {query}')

    #make all executed statements permanent with a single commit
    conn.commit()
    print('queries committed')

#report (rather than raise) any error from the connection or the statements
except Exception as error:
    print(error)

#release the cursor first, then the connection, whether or not an error occurred
finally:
    for handle, message in ((cur, 'cursor closed'), (conn, 'connection closed')):
        if handle is not None:
            handle.close()
            print(message)


execute ALTER TABLE audio ALTER COLUMN artist_id TYPE integer USING artist_id::integer;
execute ALTER TABLE audio ALTER COLUMN year_formed TYPE integer USING year_formed::integer;
execute CREATE TABLE IF NOT EXISTS artist AS SELECT artist_id, artist_name, gender, member_count FROM audio;
execute ALTER TABLE artist ALTER COLUMN artist_id SET NOT NULL;
execute ALTER TABLE artist DROP CONSTRAINT IF EXISTS artist_pkey;
execute ALTER TABLE artist ADD PRIMARY KEY (artist_id);
execute CREATE TABLE IF NOT EXISTS artist_info AS SELECT artist_id, country_code, genre, year_formed, year_disbanded, label FROM audio;
execute ALTER TABLE artist_info ALTER COLUMN artist_id SET NOT NULL;
execute ALTER TABLE artist_info DROP CONSTRAINT IF EXISTS artist_info_pkey;
execute ALTER TABLE artist_info ADD PRIMARY KEY (artist_id);
execute ALTER TABLE location_info ALTER COLUMN country_code SET NOT NULL;
execute ALTER TABLE location_info DROP CONSTRAINT IF EXISTS location_info_pkey;
execute ALTER TABLE location_