# ETL Project - Angela's Workbook


In [1]:
#Dependencies
import pandas as pd
import numpy as np
import requests
#import time
#from pprint import pprint
import json
import random
import psycopg2
from password import password
from sqlalchemy import create_engine

#import matplotlib.pyplot as plt
#import seaborn as sns

In [2]:
#modules for statistics
#import math
#import statistics
#from scipy import stats

#### Generate Dictionary of Artist Data from Web API

In [3]:
# Find a code to get all the artists from this api
# https://www.theaudiodb.com/api_guide.php
# theaudiodb.com/api/v1/json/1/search.php?s=coldplay

# empty list to be populated with artist info
artist_names = []

audio_data = {'artist_id':[],
               'artist_name':[],
               'gender':[],
               'member_count':[],
               'style':[], 
               'genre':[],
               'year_formed':[],
               'year_disbanded':[],
               'country_code':[],
               'label':[]}

In [4]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
random.seed(66)
random_nums = [random.randint(100000, 170000) for i in range(100)]

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [5]:
#this artist list uses a random number generator to test the database for artist IDs
#if an ID corresponds to an artist, it append the artist's information to info lists
#create a base url
#example: https://theaudiodb.com/api/v1/json/2/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/2/artist.php?i="

#create a loop that uses random numbers list
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
        
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        audio_data['artist_id'].append(info['artists'][0]['idArtist'])
        audio_data['artist_name'].append(info['artists'][0]['strArtist'])
        audio_data['gender'].append(info['artists'][0]["strGender"])
        audio_data['member_count'].append(info['artists'][0]["intMembers"])
        audio_data['style'].append(info['artists'][0]["strStyle"])
        audio_data['genre'].append(info['artists'][0]["strGenre"])
        audio_data['year_formed'].append(info['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(info['artists'][0]["strDisbanded"])
        audio_data['country_code'].append(info['artists'][0]['strCountryCode'])
        audio_data['label'].append(info['artists'][0]['strLabel'])


#### Convert Dictionary to DataFrame

In [6]:
audio_df = pd.DataFrame(audio_data)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,style,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,,Metalcore,2006,,US,
1,132276,Savage,Male,1,Electronic,Synthpop,1983,,IT,
2,158416,Jason Hawk Harris,,,,,0,,,
3,133565,Violent Work of Art,Mixed,4,,Industrial Metal,1994,,SE,
4,112476,James Horner,Male,1,Classical,OST,1979,,US,
...,...,...,...,...,...,...,...,...,...,...
69,164585,Benedetto Ferrari,,,,,0,,IT,
70,111651,B.o.B,Male,1,Urban/R&B,Hip-Hop,2010,,US,Grand Hustle
71,122786,Siedah Garrett,Female,1,Urban/R&B,R&B,1960,,US,
72,163063,Terry Scott Taylor,,1,,Folk Rock,1950,,US,


# Preprocessing

#### Converting to Null values
- Cleaning the data to ensure that all null values are represented in the same way.  
- All missing values (NaN, Null, None, and empty string) were replaced with 'None' to indicate that there is no available data.

In [7]:
#sum of null values in each column
print(audio_df.isnull().sum())

#style has more null values than genre, drop style column
audio_df.drop(['style'], axis=1, inplace=True)

artist_id          0
artist_name        0
gender            11
member_count      11
style             11
genre              3
year_formed        5
year_disbanded    65
country_code       0
label             65
dtype: int64


In [8]:
#convert empty strings and null values to NaN
audio_df.replace({'0':'None',
                  0:'None', 
                  None:'None', 
                  '':'None',
                  np.nan:'None',
                  'NaN':'None'}, inplace = True)

In [9]:
audio_df.isnull().sum()

artist_id         0
artist_name       0
gender            0
member_count      0
genre             0
year_formed       0
year_disbanded    0
country_code      0
label             0
dtype: int64

#### Converting datatypes

In [10]:
audio_df.dtypes

artist_id         object
artist_name       object
gender            object
member_count      object
genre             object
year_formed       object
year_disbanded    object
country_code      object
label             object
dtype: object

In [11]:
#convert dtypes from string to numeric dtypes
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].apply(pd.to_numeric,errors='coerce',downcast='integer')
#convert float to integer
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].convert_dtypes(convert_integer=True, convert_string=False)
audio_df.dtypes

artist_id          Int32
artist_name       object
gender            object
member_count       Int64
genre             object
year_formed        Int64
year_disbanded     Int64
country_code      object
label             object
dtype: object

In [12]:
#convert year_disbanded and year_formed to datetime

pd.to_datetime(audio_df['year_formed'], 
               format= '%Y', errors='coerce')

0    2006-01-01
1    1983-01-01
2           NaT
3    1994-01-01
4    1979-01-01
        ...    
69          NaT
70   2010-01-01
71   1960-01-01
72   1950-01-01
73   1993-01-01
Name: year_formed, Length: 74, dtype: datetime64[ns]

In [13]:
#convert NaN created by numeric conversion into 'None'
audio_df.replace({np.nan:'None'}, inplace = True)
audio_df.isna().sum()

artist_id         0
artist_name       0
gender            0
member_count      0
genre             0
year_formed       0
year_disbanded    0
country_code      0
label             0
dtype: int64

#### Feature Engineering
Creating a new colum of expanded country names corresponding to country codes

In [14]:
audio_df['country_code'].value_counts()

US      19
None     9
IT       8
GB       6
NL       5
DE       5
FR       4
ES       4
UK       2
JP       2
RU       1
KR       1
DK       1
BR       1
MX       1
CO       1
CZ       1
CA       1
SE       1
NZ       1
Name: country_code, dtype: int64

In [15]:
#import csv containing standard 2 digit alpha codes for countries
alpha2_codes = pd.read_csv('../Datasets/alpha2_codes.csv')

#convert alpha2 codes to dataframe
countries_df = pd.DataFrame(alpha2_codes)
countries_df

Unnamed: 0,country_name,country_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,American Samoa,AS
4,Andorra,AD
...,...,...
245,Yemen,YE
246,Zambia,ZM
247,Zimbabwe,ZW
248,Aland Islands,AX


In [16]:
audio_df = pd.merge(audio_df, countries_df)

## Creating Tables

Split dataframe into multiple dataframes corresponding to 3 database tables
- artist_df
- info_df
- location_df

In [17]:
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,genre,year_formed,year_disbanded,country_code,label,country_name
0,140892,Ice Nine Kills,Male,4,Metalcore,2006,,US,,United States of America (the)
1,112476,James Horner,Male,1,OST,1979,,US,,United States of America (the)
2,160023,Mickey Petralia,Male,1,,,,US,,United States of America (the)
3,169251,3D Natee,Female,1,Rap,,,US,,United States of America (the)
4,150340,Dick Schory's New Percussion Ensemble,Male,1,Acoustic,1958,,US,,United States of America (the)
...,...,...,...,...,...,...,...,...,...,...
67,124107,Владимир Кузьмин,Male,1,,1955,,RU,,Russian Federation (the)
68,137066,Bezerra da Silva,Male,1,,1927,,BR,,Brazil
69,162001,Orion,Male,1,Psy Trance,1996,,DK,,Denmark
70,115027,Cho Young-Wuk,Male,1,,1962,,KR,,Korea (the Republic of)


In [18]:
# split data into multiple data frames for tables
artist_df = audio_df.iloc[:,:3]
artist_df

Unnamed: 0,artist_id,artist_name,gender
0,140892,Ice Nine Kills,Male
1,112476,James Horner,Male
2,160023,Mickey Petralia,Male
3,169251,3D Natee,Female
4,150340,Dick Schory's New Percussion Ensemble,Male
...,...,...,...
67,124107,Владимир Кузьмин,Male
68,137066,Bezerra da Silva,Male
69,162001,Orion,Male
70,115027,Cho Young-Wuk,Male


In [19]:
info_df = audio_df.iloc[:, [0,7,4,3,5,6,8]]
info_df

Unnamed: 0,artist_id,country_code,genre,member_count,year_formed,year_disbanded,label
0,140892,US,Metalcore,4,2006,,
1,112476,US,OST,1,1979,,
2,160023,US,,1,,,
3,169251,US,Rap,1,,,
4,150340,US,Acoustic,1,1958,,
...,...,...,...,...,...,...,...
67,124107,RU,,1,1955,,
68,137066,BR,,1,1927,,
69,162001,DK,Psy Trance,1,1996,,
70,115027,KR,,1,1962,,


In [20]:
location_df = audio_df.iloc[:,[-3, -1]]
location_df

Unnamed: 0,country_code,country_name
0,US,United States of America (the)
1,US,United States of America (the)
2,US,United States of America (the)
3,US,United States of America (the)
4,US,United States of America (the)
...,...,...
67,RU,Russian Federation (the)
68,BR,Brazil
69,DK,Denmark
70,KR,Korea (the Republic of)


## Connecting to PostgreSQL

In [41]:
conn = psycopg2.connect(host="localhost", 
                        dbname="audio_data", 
                        user="postgres", 
                        password=password)
postgres_str = f'postgresql://postgres:'+password+'@localhost:5432/audio_data'
engine = create_engine(postgres_str)


In [43]:
audio_df.to_sql('artist', con=engine, index=False, if_exists='replace')

In [80]:
conn = None
cur = None
cols = "','".join([str(i) for i in artist_df.columns.tolist()])
cols

"artist_id','artist_name','gender"

In [39]:
#df_dict = {artist_df:'artist', info_df:'artist_info', location_df:'location'}
df_list = [artist_df, info_df, location_df]
name_list = ['artist', 'artist_info', 'location']

In [48]:


def create_table(df, table_name):
    try:
        conn = psycopg2.connect(host="localhost", 
                                dbname="audio_data", 
                                user="postgres", 
                                password=password)

        postgres_str = f'postgresql://postgres:'+password+'@localhost:5432/audio_data'
        engine = create_engine(postgres_str)

        df.to_sql(table_name, con=engine, index=False, if_exists='replace')

    except Exception as error:
        print(error)

    finally:
        conn.close()


In [49]:
create_table(artist_df, 'artist')
create_table(info_df, 'artist_info')
create_table(location_df, 'location')

In [40]:


#establish connection to database
try:
    conn = psycopg2.connect(host="localhost", 
                            dbname="audio_data", 
                            user="postgres", 
                            password=password)
    
    postgres_str = f'postgresql://postgres:'+password+'@localhost:5432/audio_data'
    engine = create_engine(postgres_str)


    # to interact with the database, you need cursors(commands) 
    #cur = conn.cursor()
    
    for df in df_list and name in name_list:
        df.to_sql(name, con=engine, index=False, if_exists='replace')

    #cur.execute(create_script)

    
    #conn.commit()
    
except Exception as error:
    print(error)

finally:
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()


name 'name' is not defined


In [85]:
conn = None
cur = None
cols = "','".join([str(i) for i in artist_df.columns.tolist()])

#establish connection to database
try:
    conn = psycopg2.connect(host="localhost", 
                            dbname="audio_data", 
                            user="postgres", 
                            password=password)
    postgres_str = f'postgresql://postgres:'+password+'@localhost:5432/audio_data'
    engine = create_engine(postgres_str)


    # to interact with the database, you need cursors(commands) 
    cur = conn.cursor()
    
    #engine = create_engine('postgresql+psycopg2://postgres:password@localhost:5432/audio_data', echo=False)
    #with engine.begin() as connection:
    
    create_script = '''CREATE TABLE IF NOT EXISTS artist (artist_id int PRIMARY KEY, artist_name varchar(40) NOT NULL)'''
    #create_script = artist_df.to_sql(name='artist', con=engine)
    cur.execute(create_script)
    
    #insert_script = 'INSERT INTO artist(artist_id, artist_name, gender) VALUES(%s, %s, %s, %s)'
    #insert_value = (140892, 'Ice Nine Kills', 'Male')
    #cur.execute(insert_script, insert_value)
    #conn.commit()
    
    for i,row in artist_df.iterrows():
        sql_script = '''INSERT INTO artist (cols) VALUES (" + "%s,"*(len(row)-1) + "%s)'''
        cur.execute(sql_script, tuple(row))
    
    conn.commit()
    
except Exception as error:
    print(error)

finally:
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()


not all arguments converted during string formatting


In [42]:
#with open('../Datasets/alpha2_codes.csv', 'r') as f:
 #   next(f)
  #  cur.copy_from(f, 'country codes', sep=',')
#conn.commit()

try:
   for query in list_of_querys:
       #query format => "INSERT INTO <database.table> VALUES (<values>)"
       cursor.execute(query)
       connection.commit()
except BaseException as e:
   connection.rollback()

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [None]:
cur.execute("SELECT artist_id")

In [None]:
#Verify tables
engine.table_names()