# Week 15 Project


In [2]:
#Dependencies
import pandas as pd
import numpy as np
import requests
import json
import random
import math
import statistics
from scipy import stats
import psycopg2
from sqlalchemy import create_engine
from password import password 
import warnings
warnings.filterwarnings('ignore')

In [3]:
#modules for statistics
import math
import statistics
from scipy import stats
import psycopg2
from password import password 
# The sql module now uses sqlalchemy to support different database flavors. 
#You can pass a sqlalchemy engine for a postgresql database 
# https://www.tutorialspoint.com/sqlalchemy/sqlalchemy_introduction.htm
from sqlalchemy import create_engine

# Generate Dictionary of Artist Data from Web API

In [4]:
# Find a code to get all the artists from this api
# https://www.theaudiodb.com/api_guide.php
# theaudiodb.com/api/v1/json/1/search.php?s=coldplay

# empty list to be populated with artist info
artist_names = []

audio_data = {'artist_id':[],
               'artist_name':[],
               'gender':[],
               'member_count':[],
               'style':[], 
               'genre':[],
               'year_formed':[],
               'year_disbanded':[],
               'country_code':[],
               'label':[]}

In [5]:
#generate random numbers in the range of available artist IDs
#used to populate artist_names list
random.seed(66)
random_nums = [random.randint(100000, 170000) for i in range(1500)]

#test with known IDs and intentional errors
#random_nums = [112024, 0, 100000, 114364]

In [6]:
#this artist list uses a random number generator to test the database for artist IDs
#if an ID corresponds to an artist, it append the artist's information to info lists
#create a base url
#example: https://theaudiodb.com/api/v1/json/2/artist.php?i=112024
id_url = "https://theaudiodb.com/api/v1/json/2/artist.php?i="

#create a loop that uses random numbers list
for num in random_nums:
    unique_url = id_url + f'{num}'
    
    #call api url
    request = requests.get(unique_url)
    
    #for each attempt try to convert information to json
    try:
        #convert to json
        info = request.json()
        
    #exception json decode error
    #https://docs.python.org/3/tutorial/controlflow.html
    except json.JSONDecodeError:
        #end this iteration and continue new iteration of for loop
        continue
    #if json conversion successful then
    #try to create a variable for single artist to reference later
    try:
        artist = info['artists'][0]['strArtist']
        
    #TypeError exception if url responds with {'artists': None}
    except (TypeError):
        continue
        
    #if the artist is not already in the list then 
    if artist not in artist_names:
        
        #populate artist_names list using info(json)
        #response-> {'artists': [{'idArtist': '114364', 'strArtist': 'Beyoncé', ...
        #dictionary{'artists':['{dictionary}']}
        audio_data['artist_id'].append(info['artists'][0]['idArtist'])
        audio_data['artist_name'].append(info['artists'][0]['strArtist'])
        audio_data['gender'].append(info['artists'][0]["strGender"])
        audio_data['member_count'].append(info['artists'][0]["intMembers"])
        audio_data['style'].append(info['artists'][0]["strStyle"])
        audio_data['genre'].append(info['artists'][0]["strGenre"])
        audio_data['year_formed'].append(info['artists'][0]["intFormedYear"])
        audio_data['year_disbanded'].append(info['artists'][0]["strDisbanded"])
        audio_data['country_code'].append(info['artists'][0]['strCountryCode'])
        audio_data['label'].append(info['artists'][0]['strLabel'])

### Convert Dictionary to DataFrame

In [7]:
audio_df = pd.DataFrame(audio_data)
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,style,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,,Metalcore,2006,,US,
1,132276,Savage,Male,1,Electronic,Synthpop,1983,,IT,
2,158416,Jason Hawk Harris,,,,,0,,,
3,133565,Violent Work of Art,Mixed,4,,Industrial Metal,1994,,SE,
4,112476,James Horner,Male,1,Classical,OST,1979,,US,
...,...,...,...,...,...,...,...,...,...,...
1161,113477,Terror,Male,5,Rock/Pop,Hardcore,2000,,US,
1162,157114,DJ Baur,,,,,0,,,
1163,114531,Toots & The Maytals,,1,Reggae,Reggae,1963,,JM,
1164,117428,Gendai Kano,Male,1,Classical,Musical,1938,,JP,


# Preprocessing


#### Converting to Null Values
- Cleaning the data to ensure that all null values are represented in the same way.
- All missing values (NaN, Null, None, and empty string) were replaced with 'None' to indicate that there is no available data.

In [8]:
#sum of null values in each column
print(audio_df.isnull().sum())

#style has more null values than genre, drop style column
audio_df.drop(['style'], axis=1, inplace=True)

artist_id            0
artist_name          0
gender             350
member_count       350
style              356
genre               62
year_formed        153
year_disbanded    1096
country_code         0
label             1101
dtype: int64


In [9]:
#convert empty strings and null values to NaN
audio_df.replace({'0':'None',
                  0:'None', 
                  None:'None', 
                  '':'None',
                  np.nan:'None'}, inplace = True)

In [10]:
audio_df.isnull().sum()

artist_id         0
artist_name       0
gender            0
member_count      0
genre             0
year_formed       0
year_disbanded    0
country_code      0
label             0
dtype: int64

#### Converting datatypes

In [11]:
audio_df.dtypes

artist_id         object
artist_name       object
gender            object
member_count      object
genre             object
year_formed       object
year_disbanded    object
country_code      object
label             object
dtype: object

In [12]:
#convert dtypes from string to numeric dtypes
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].apply(pd.to_numeric,errors='coerce',downcast='integer')
#convert float to integer
audio_df[['artist_id', 
          'member_count', 
          'year_formed', 
          'year_disbanded']] = audio_df[['artist_id', 
                                         'member_count', 
                                         'year_formed', 
                                         'year_disbanded']].convert_dtypes(convert_integer=True, convert_string=False)
audio_df.dtypes

artist_id          Int32
artist_name       object
gender            object
member_count       Int64
genre             object
year_formed        Int64
year_disbanded     Int64
country_code      object
label             object
dtype: object

In [13]:
#convert year_disbanded and year_formed to datetime

pd.to_datetime(audio_df['year_formed'], 
               format= '%Y', errors='coerce')

0      2006-01-01
1      1983-01-01
2             NaT
3      1994-01-01
4      1979-01-01
          ...    
1161   2000-01-01
1162          NaT
1163   1963-01-01
1164   1938-01-01
1165   2016-01-01
Name: year_formed, Length: 1166, dtype: datetime64[ns]

In [14]:
##convert NaN created by numeric conversion into 0 in order to query later
audio_df.replace({np.nan:0}, inplace = True)
audio_df


#audio_df.replace({np.nan:'None'}, inplace = True)

Unnamed: 0,artist_id,artist_name,gender,member_count,genre,year_formed,year_disbanded,country_code,label
0,140892,Ice Nine Kills,Male,4,Metalcore,2006,0,US,
1,132276,Savage,Male,1,Synthpop,1983,0,IT,
2,158416,Jason Hawk Harris,,0,,0,0,,
3,133565,Violent Work of Art,Mixed,4,Industrial Metal,1994,0,SE,
4,112476,James Horner,Male,1,OST,1979,0,US,
...,...,...,...,...,...,...,...,...,...
1161,113477,Terror,Male,5,Hardcore,2000,0,US,
1162,157114,DJ Baur,,0,,0,0,,
1163,114531,Toots & The Maytals,,1,Reggae,1963,0,JM,
1164,117428,Gendai Kano,Male,1,Musical,1938,0,JP,


#### Featuring Engineering
Creating a new colum of expanded country names corresponding to country codes

In [15]:
audio_df['country_code'].value_counts()

US      291
None    221
GB      110
DE       65
FR       61
       ... 
MY        1
CO        1
GH        1
IR        1
SW        1
Name: country_code, Length: 72, dtype: int64

In [16]:
#import csv containing standard 2 digit alpha codes for countries
alpha2_codes = pd.read_csv('alpha2_codes.csv')

#convert alpha2 codes to dataframe
countries_df = pd.DataFrame(alpha2_codes)
countries_df

Unnamed: 0,country_name,country_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,American Samoa,AS
4,Andorra,AD
...,...,...
245,Yemen,YE
246,Zambia,ZM
247,Zimbabwe,ZW
248,Aland Islands,AX


In [17]:
#merge list of country codes onto audio_df to create new column of country names
audio_df = pd.merge(audio_df, countries_df)

In [18]:
audio_df

Unnamed: 0,artist_id,artist_name,gender,member_count,genre,year_formed,year_disbanded,country_code,label,country_name
0,140892,Ice Nine Kills,Male,4,Metalcore,2006,0,US,,United States of America (the)
1,112476,James Horner,Male,1,OST,1979,0,US,,United States of America (the)
2,160023,Mickey Petralia,Male,1,,0,0,US,,United States of America (the)
3,169251,3D Natee,Female,1,Rap,0,0,US,,United States of America (the)
4,150340,Dick Schory's New Percussion Ensemble,Male,1,Acoustic,1958,0,US,,United States of America (the)
...,...,...,...,...,...,...,...,...,...,...
1094,130853,İlhan Erşahin,,1,,1965,0,TR,,Turkey
1095,167192,Veronika Povilioniene,Female,1,Folk,1987,0,LT,,Lithuania
1096,149602,李雲迪,Male,1,Pianist,0,0,CN,,China
1097,167328,ريم بنا,Female,1,World/Ethnic,0,0,PS,,"Palestine, State of"


### Drop duplicate artists_ids and country codes

In [19]:
#drop 5 duplicate artist_ids that were discovered
audio_df.drop_duplicates(subset=['artist_id'], inplace=True)
audio_df.reset_index(inplace=True, drop=True)

In [20]:
#countries_df contains only unique values
# drop two discovered codes that had null values
location_df = countries_df.drop([249, 152])
location_df.reset_index(inplace = True, drop=True)

## Creating Tables

Split dataframe into multiple dataframes corresponding to 3 database tables

In [21]:
artist_df = audio_df.iloc[:,:3]
artist_df

Unnamed: 0,artist_id,artist_name,gender
0,140892,Ice Nine Kills,Male
1,112476,James Horner,Male
2,160023,Mickey Petralia,Male
3,169251,3D Natee,Female
4,150340,Dick Schory's New Percussion Ensemble,Male
...,...,...,...
1089,130853,İlhan Erşahin,
1090,167192,Veronika Povilioniene,Female
1091,149602,李雲迪,Male
1092,167328,ريم بنا,Female


In [22]:
info_df = audio_df.iloc[:, [0,7,4,3,5,6,8]]
info_df

Unnamed: 0,artist_id,country_code,genre,member_count,year_formed,year_disbanded,label
0,140892,US,Metalcore,4,2006,0,
1,112476,US,OST,1,1979,0,
2,160023,US,,1,0,0,
3,169251,US,Rap,1,0,0,
4,150340,US,Acoustic,1,1958,0,
...,...,...,...,...,...,...,...
1089,130853,TR,,1,1965,0,
1090,167192,LT,Folk,1,1987,0,
1091,149602,CN,Pianist,1,0,0,
1092,167328,PS,World/Ethnic,1,0,0,


In [23]:
location_df

Unnamed: 0,country_name,country_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,American Samoa,AS
4,Andorra,AD
...,...,...
243,Western Sahara,EH
244,Yemen,YE
245,Zambia,ZM
246,Zimbabwe,ZW


## Connecting ot PostgreSQL
#### Convert dataframes to SQL tables

#### Connectiing to PostgreSQL

The cur object calls the execute method and if successful, will return none
To get the values from my query, need to call fetchone() - returns the first row or None, or fetchall() - returns a list of each row in the table or empty list [] if there are no rows. 

In [24]:
# define a function that will take in a dataframe and a table_name 
# function connect tp postgres through psycopg2, creates a sqlalchemy engine
# and uses the parameters to create a table in the audio_data database
# using the to_sql function

def create_table(df, table_name):
    try:
        #connect to postgres through psycopg2
        conn = psycopg2.connect(host="localhost", 
                                dbname="audio_data", 
                                user="postgres", 
                                password=password)
        
        #create an engine to communicate with postgres
        postgres_str = f'postgresql+psycopg2://postgres:'+password+'@localhost:5432/audio_data'
        engine = create_engine(postgres_str)

        #use the to_sql function to create a table 
        df.to_sql(table_name, con=engine, index=False, if_exists='replace')
        
        #verify that table was created
        return engine.table_names()
        
    # raise errors in connection or function
    except Exception as error:
        print(error)
    
    #close connection and engine if success or error
    finally:
        conn.close()
        engine.dispose

#### Create a table containing all data

In [25]:
create_table(audio_df, 'audio')
create_table(artist_df,'artist')
create_table(info_df, 'artist_info')
create_table(location_df, 'location_info')


['audio', 'artist', 'artist_info', 'location_info']

#### Creating and altering tables with SQL DDL

In [26]:
DDL_queries = [
    #audio table DDL
    "ALTER TABLE audio ALTER COLUMN artist_id TYPE integer USING artist_id::integer;",
    "ALTER TABLE audio ALTER COLUMN year_formed TYPE integer USING year_formed::integer;",
    "ALTER TABLE artist ALTER COLUMN artist_id SET NOT NULL;",
    
    #location_info table DDL
    "ALTER TABLE location_info ALTER COLUMN country_code SET NOT NULL;",
    "ALTER TABLE location_info DROP CONSTRAINT IF EXISTS location_info_pkey;",
    "ALTER TABLE location_info ADD PRIMARY KEY (country_code);",
    
    #artist table DDL
    "CREATE TABLE IF NOT EXISTS artist AS SELECT artist_id, artist_name, gender, member_count FROM audio;",  
    "ALTER TABLE artist DROP CONSTRAINT IF EXISTS artist_pkey;",
    "ALTER TABLE artist ADD PRIMARY KEY (artist_id);",

    #artist_info table DDL
    "CREATE TABLE IF NOT EXISTS artist_info AS SELECT artist_id, country_code, genre, year_formed, year_disbanded, label FROM audio;",
    "ALTER TABLE artist_info DROP CONSTRAINT IF EXISTS artist_info_pkey;",
    "ALTER TABLE artist_info ADD PRIMARY KEY (artist_id);",

    #drop redundant audio table
    "DROP TABLE IF EXISTS audio;"
   ]

In [27]:
cur = None
conn = None

#connect to postgres through psycopg2
try:
    conn = psycopg2.connect(host="localhost", 
                            dbname="audio_data", 
                            user="postgres", 
                            password=password)

    #create cursor to give commands to postgres
    cur = conn.cursor()
    
    #create a loop to execute each DDL 
    for query in DDL_queries:
        
        #execute each query in DDL_queries list
        cur.execute(query)
        
        #string explanation 
        print('execute '+query)
        
    #this commits any changes/transactions to the database
    conn.commit()
    print('queries committed')
    
# raise errors in connection or function
except Exception as error:
    print(error)

#close cursor, connection and engine if success or error
finally:
    if cur is not None:
        cur.close()
        print('cursor closed')
    if conn is not None:
        conn.close()
        print('connection closed')

execute ALTER TABLE audio ALTER COLUMN artist_id TYPE integer USING artist_id::integer;
execute ALTER TABLE audio ALTER COLUMN year_formed TYPE integer USING year_formed::integer;
execute ALTER TABLE artist ALTER COLUMN artist_id SET NOT NULL;
execute ALTER TABLE location_info ALTER COLUMN country_code SET NOT NULL;
execute ALTER TABLE location_info DROP CONSTRAINT IF EXISTS location_info_pkey;
execute ALTER TABLE location_info ADD PRIMARY KEY (country_code);
execute CREATE TABLE IF NOT EXISTS artist AS SELECT artist_id, artist_name, gender, member_count FROM audio;
execute ALTER TABLE artist DROP CONSTRAINT IF EXISTS artist_pkey;
execute ALTER TABLE artist ADD PRIMARY KEY (artist_id);
execute CREATE TABLE IF NOT EXISTS artist_info AS SELECT artist_id, country_code, genre, year_formed, year_disbanded, label FROM audio;
execute ALTER TABLE artist_info DROP CONSTRAINT IF EXISTS artist_info_pkey;
execute ALTER TABLE artist_info ADD PRIMARY KEY (artist_id);
execute DROP TABLE IF EXISTS aud

### Run queries to answer some of the project questions

In [36]:
#establish connection through psycopg2
conn = psycopg2.connect(host="localhost", 
                        dbname="audio_data", 
                        user="postgres", 
                        password=password)

#create cursor to give commands to postgres
cur = conn.cursor()

**1) Which genres are popular in specific countries?**

In [37]:
query1 = "SELECT country_name, genre, COUNT(genre) AS genre_count FROM location_info INNER JOIN artist_info ON location_info.country_code = artist_info.country_code GROUP BY genre, country_name ORDER BY genre_count DESC;"

#execute each query in DDL_queries list
cur.execute(query1)

#fetch results and store in results variable
results_query1 = cur.fetchall()

#create and display data frame
query1_df = pd.DataFrame(results_query1, columns = ['country_name', 'genre', 'genre_count'])
display(query1_df)

Unnamed: 0,country_name,genre,genre_count
0,United States of America (the),,110
1,United Kingdom of Great Britain and Northern I...,,32
2,Germany,,26
3,France,,20
4,United States of America (the),Jazz,20
...,...,...,...
355,Sweden,Classic Rock,1
356,Denmark,Electronic,1
357,United Kingdom of Great Britain and Northern I...,Ambient,1
358,Norway,Electronic,1


**2) On avg how many bands were formed from 2009-2019? In 2020?**

In [38]:
query2 = "SELECT COUNT(artist_name) AS artist_count, year_formed FROM artist INNER JOIN artist_info ON artist.artist_id = artist_info.artist_id GROUP BY year_formed ORDER BY artist_count DESC;"

**3) Which artists were formed in the 90s? --> 1990-1999**

In [39]:
query3 = "SELECT DISTINCT artist_name, year_formed FROM artist INNER JOIN artist_info ON artist.artist_id = artist_info.artist_id WHERE year_formed BETWEEN 1990 AND 1999 ORDER BY year_formed DESC;"

#execute each query in DDL_queries list
cur.execute(query3)

#fetch results and store in results variable
results_query3 = cur.fetchall()

#create and display data frame
query3_df = pd.DataFrame(results_query3, columns = ['artist_name', 'year_formed'])
display(query3_df)

Unnamed: 0,artist_name,year_formed
0,BAZRA,1999
1,D-Ground,1999
2,Damien Saez,1999
3,El Chojin,1999
4,Grafvolluth,1999
...,...,...
141,Ai Ninomiya,1990
142,O.C.,1990
143,Sálvate si puedes,1990
144,The Lord Weird Slough Feg,1990


### Additional Queries to Run

**Show all artists whose name begins with B. from the year 2000 this dataset?**

In [32]:
query4 = "SELECT artist_name, year_formed FROM artist INNER JOIN artist_info ON artist.artist_id = artist_info.artist_id WHERE artist_name LIKE 'B%' AND year_formed = '2000'"

#execute each query in DDL_queries list
cur.execute(query4)

#fetch results and store in results variable
results_query4 = cur.fetchall()

#create and display data frame
query4_df = pd.DataFrame(results_query4, columns = ['artist_name_B', 'year_formed'])
display(query4_df)


Unnamed: 0,artist_name_B,year_formed
0,Blitzen Trapper,2000
1,Belanova,2000


**What are the top three popular genres among artists in this dataset?**

In [41]:
query5 = "SELECT genre, COUNT(*) AS amount FROM artist_info GROUP BY genre ORDER BY amount DESC LIMIT 4;"

#execute each query in DDL_queries list
cur.execute(query5)

#fetch results and store in results variable
results_query5 = cur.fetchall()

#create and display data frame
query5_df = pd.DataFrame(results_query5, columns = ['genre', 'amount'])
display(query5_df)

Unnamed: 0,genre,amount
0,,501
1,Pop,46
2,Jazz,40
3,Rock,28


**Get the names of artists from the US who were formed in 2000.**

In [40]:
query6= "SELECT artist_name, li.country_code, year_formed FROM artist AS ar INNER JOIN artist_info AS ai ON ar.artist_id = ai.artist_id INNER JOIN location_info AS li ON ai.country_code = li.country_code WHERE year_formed = 2000 AND li.country_code = 'US';"

#execute each query in DDL_queries list
cur.execute(query6)

#fetch results and store in results variable
results_query6 = cur.fetchall()

#create and display data frame
query6_df = pd.DataFrame(results_query6, columns = ['artists', 'country', 'year formed'])
display(query6_df)

Unnamed: 0,artists,country,year formed
0,Blitzen Trapper,US,2000
1,The Fiery Furnaces,US,2000
2,Ms. Toi,US,2000
3,Terror,US,2000
