In [1]:
from time import time
import configparser
import matplotlib.pyplot as plt
import pandas as pd
import boto3
import json
import sql_queries
import psycopg2
import create_tables
import etl

### I read the config file again to access and interact with the Redshift Cluster.

In [2]:
# Load the cluster settings from project_dwh.cfg (looked up relative to the
# notebook's working directory).
# read_file() is used on purpose: unlike ConfigParser.read(), it raises if the
# file is missing instead of silently producing an empty configuration.
# The `with` block makes sure the file handle is closed once parsing is done.
config = configparser.ConfigParser()
with open('project_dwh.cfg') as config_file:
    config.read_file(config_file)

# Name of the IAM role, from the [IAM_ROLE] section.
DWH_IAM_ROLE_NAME = config.get('IAM_ROLE', 'DWH_IAM_ROLE_NAME')

# Connection settings from the [CLUSTER] section. config.get() returns every
# value as a string, including DB_PORT.
DB_NAME = config.get('CLUSTER', 'DB_NAME')
HOST = config.get('CLUSTER', 'HOST')
DB_USER = config.get('CLUSTER', 'DB_USER')
DB_PASSWORD = config.get('CLUSTER', 'DB_PASSWORD')
DB_PORT = config.get('CLUSTER', 'DB_PORT')
# DWH_ENDPOINT is the value the connection cell passes as the host.
DWH_ENDPOINT = config.get('CLUSTER', 'DWH_ENDPOINT')

### I initiated the connection to the Redshift Cluster

In [3]:
# Open the connection to the Redshift cluster.
# The settings are passed as keyword arguments rather than being formatted
# into a single DSN string: psycopg2 then handles the quoting, so a password
# (or any other value) containing spaces, quotes or backslashes cannot break
# or alter the connection string.
conn = psycopg2.connect(
    host=DWH_ENDPOINT,
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    port=DB_PORT,
)
# Cursor shared by the table-creation and ETL cells below.
cur = conn.cursor()


### Here I drop and recreate the necessary tables: the 2 staging tables and the 5 star-schema tables (songplay, users, songs, artists, playtimes).

In [4]:
# Drop first, then recreate, so this cell can be re-run against a cluster that
# already holds the tables. Both helpers come from the project's create_tables
# module and report progress per table (see the output below).
create_tables.drop_tables(cur,conn)
create_tables.create_tables(cur,conn)

DROP TABLE IF EXISTS staging_events CASCADE table dropped
dropped
DROP TABLE IF EXISTS staging_songs CASCADE table dropped
dropped
DROP TABLE IF EXISTS songplay CASCADE table dropped
dropped
DROP TABLE IF EXISTS users CASCADE table dropped
dropped
DROP TABLE IF EXISTS songs CASCADE table dropped
dropped
DROP TABLE IF EXISTS artists CASCADE table dropped
dropped
DROP TABLE IF EXISTS playtimes CASCADE table dropped
dropped
table created
table created
table created
table created
table created
table created
table created


In [5]:
# Left disabled on purpose: the cells below still use `cur` and `conn`.
#conn.close()

### Here I load the 2 staging tables with the data from S3 buckets.
#### To save time, I only loaded the files under the A/A/ prefix of song_data in the S3 bucket.

In [6]:
# Fill the two staging tables from S3. Per the output below, this issues one
# COPY statement for staging_events and one for staging_songs.
etl.load_staging_tables(cur,conn)


    copy staging_events from 's3://udacity-dend/log_data/2018/11'
    credentials 'aws_iam_role=arn:aws:iam::672895613677:role/dwhRole'
    format json as 's3://udacity-dend/log_json_path.json'     
    dateformat 'auto';


    copy staging_songs from 's3://udacity-dend/song_data/A/A'
    credentials 'aws_iam_role=arn:aws:iam::672895613677:role/dwhRole'
    format as json 'auto' 
    region 'us-west-2';



### The data in the staging tables is loaded into the production tables inside the Redshift cluster.
#### Since this is a SQL-to-SQL ETL, I used a combination of INSERT INTO and SELECT statements. The output below shows which queries ran without any error messages.

In [7]:
# Populate the star-schema tables from the staging tables. Per the output
# below, this runs one INSERT INTO ... SELECT statement per target table.
etl.insert_tables(cur,conn)


	INSERT INTO songplay (start_time, user_id,level,song_id,artist_id,session_id,location,user_agent)
	SELECT 
		e.ts,
	    e.userid,
	    e.level,
	    s.song_id,
	    s.artist_id,
	    e.sessionid,
	    e.location,
	    e.useragent
	FROM staging_events e
	inner JOIN staging_songs s
	ON e.song = s.title


	INSERT INTO users (user_id,first_name,last_name,gender,level)
	SELECT distinct 
		userId,
		firstname,
		lastname,
		gender,
		level
	FROM staging_events


	INSERT INTO songs (song_id,title,artist_id,year,duration)
	SELECT distinct 
		song_id,
		title,
		artist_id,
		year,
		duration
	FROM staging_songs


	INSERT INTO artists (artist_id,name,location,latitude,longitude)
	SELECT distinct 
		artist_id,
		artist_name,
		artist_location,
		artist_latitude,
		artist_longitude
	FROM staging_songs


	INSERT INTO playtimes (start_time,hour,day,week,month,year,weekday)
	SELECT start_time,
		EXTRACT(HOUR from start_date) as hour,
		EXTRACT(DAY from start_date) as day,
		EXTRACT(WEEK from start_