# Entregable # 1

El primer entregable para CODER HOUSE debe extraer datos de una API pública y crear una tabla en Redshift. Una vez creada la tabla, se deben cargar los datos en Redshift.

In [1]:
from colorama import Back, Fore, Style
import pandas as pd
import requests
import psycopg2
import random
import os

In [2]:
# Function that requests over HTTP a JSON object from a given URL

JSON = int | str | float | bool | None | dict[str, "JSON"] | list["JSON"]
JSONObject = dict[str, JSON]

def http_get_sync(url: str, timeout: float = 30.0) -> JSONObject:
    """Synchronously performs an HTTP GET request and returns the JSON response.

    Args:
        url: Fully-qualified URL to request.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook forever.

    Returns:
        The decoded JSON body, or an empty dict when the request fails, the
        server answers with an HTTP error status, or the body is not valid JSON.
    """
    print(Back.BLACK + Fore.CYAN + "GET: " + url + Style.RESET_ALL)
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error payload.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as error:
        # ValueError covers JSON decoding errors on older versions of requests.
        print(Back.BLACK + Fore.RED + "ERROR: " + url + Style.RESET_ALL)
        print(error)
        return {}


In [3]:
# Configuration for the Napster API calls

# Read from the environment (None when the variable is unset). The value is
# interpolated verbatim right after ".../artists/top" when the URL is built,
# so it has to carry the "?apikey=" prefix itself.
API_KEY = os.environ.get("NAPSTER_API_KEY")
OFFSET = 1000  # Upper bound (inclusive) for the random artist page offset

In [4]:
# Build the request URL for one randomly chosen entry of the top-artists listing.
# API_KEY is appended directly after the endpoint path, so it must already
# contain the "?apikey=..." query-string prefix.

# artist_page_offset = 116 # johnny-cash 212 Bob Dylan
artist_page_offset = random.randint(0, OFFSET)
napster_url = (
    'https://napi-v2-2-cloud-run-b3gtd5nmxq-uw.a.run.app/v2.2/artists/top'
    f'{API_KEY}&limit=1&offset={artist_page_offset}'
)

In [5]:
napster_url

'https://napi-v2-2-cloud-run-b3gtd5nmxq-uw.a.run.app/v2.2/artists/top?apikey=<REDACTED>&limit=1&offset=11'

In [6]:
# Get the JSON object from the URL.
# http_get_sync returns an empty dict when the request fails, so the "artists"
# key is only present after a successful call.

napster_json = http_get_sync(napster_url)

[40m[36mGET: https://napi-v2-2-cloud-run-b3gtd5nmxq-uw.a.run.app/v2.2/artists/top?apikey=MjZkYmFhZTctMjFkZi00NjY3LWEwNGMtZDYzNmQ4YmM3OThi&limit=1&offset=11[0m


In [7]:
napster_json

{'artists': [{'type': 'artist',
   'id': 'art.9557743',
   'href': 'https://api.napster.com/v2.2/artists/art.9557743',
   'name': 'Miley Cyrus',
   'shortcut': 'miley-cyrus',
   'amg': '823418',
   'blurbs': ["Once best known as the cute kid star of Disney's Hannah Montana, Miley Cyrus reinvented herself as an all-grown pop star.",
    'Assisting in that reinvention? A naked photo session in Vanity Fair, a foam finger at the VMAs and an affinity for twerking.',
    "Miley's becoming as famous for her interpersonal relations as her music, including a 2013 feud with Sinead O'Connor.",
    'You may also have heard of her father, none other than Mr. "Achy Breaky Heart," Billy Ray Cyrus.',
    'Miley was born Destiny Hope Cyrus in Nashville. The name Miley came from her childhood nickname "Smiley."'],
   'bios': [{'title': 'Bebop Digital',
     'author': 'Bebop Digital',
     'publishDate': '',
     'bio': "Once best known as the cute kid star of Disney's <i>Hannah Montana</i>, Miley Cyrus 

In [8]:
# List the top-level fields of the first artist in the response

first_artist = napster_json['artists'][0]
for field_name in first_artist:
    print(field_name)

type
id
href
name
shortcut
amg
blurbs
bios
albumGroups
links


In [9]:
# List the fields of the "links" object of the first artist

artist_links = napster_json['artists'][0]['links']
for link_name in artist_links:
    print(link_name)

albums
images
posts
topTracks
genres
stations
contemporaries
followers
influences
relatedProjects


In [10]:
# List the fields of the "images" entry inside the first artist's links

images_link = napster_json['artists'][0]['links']['images']
for image_field in images_link:
    print(image_field)

href


# Load to Redshift

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

In [12]:
# Spark session & context

# Location of the PostgreSQL JDBC driver jar used by Spark for the JDBC write.
# It can be overridden with the POSTGRES_JDBC_JAR environment variable so the
# notebook is not tied to one machine; the previous hard-coded path remains
# the default.

driver_postgres_path = os.environ.get(
    "POSTGRES_JDBC_JAR",
    "/home/marm1984/github/data_engineering_coder_house/entregable_uno/jar_files/postgresql-42.6.0.jar",
)

# The jar is registered both as a Spark jar and on the driver class path.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pipeline_napster")
    .config("spark.jars", driver_postgres_path)
    .config("spark.driver.extraClassPath", driver_postgres_path)
    .getOrCreate()
)
        

your 131072x1 screen size is bogus. expect trouble
23/06/08 00:07:13 WARN Utils: Your hostname, Geomario-Desktop resolves to a loopback address: 127.0.1.1; using 192.168.140.9 instead (on interface eth0)
23/06/08 00:07:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/06/08 00:07:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [13]:
spark

# Creating Table

The artist table stores one row per artist. Each artist has an id, type, name, href and shortcut.

## Artist Table 

![Artist Table](images/artist_table.png)

# Data Structures

id = string
type = string
href = string
name = string
shortcut = string

In [14]:
# Read the database connection settings from the environment.
# A missing variable raises KeyError, which stops the notebook early.
# NOTE(review): USER is commonly pre-set by the shell to the OS login name;
# confirm it really holds the database user here.

host, port, user, password, database = (
    os.environ[variable_name]
    for variable_name in ('HOST', 'PORT', 'USER', 'PASSWORD', 'DATABASE')
)

In [15]:
# Create a connection to the database

try:
    conn = psycopg2.connect(
        host=host,
        port=port,
        database=database,
        user=user,
        password=password,
    )
except psycopg2.Error as e:
    print(Back.BLACK + Fore.RED + "ERROR: Connection to database" + Style.RESET_ALL)
    print(e)
    # Re-raise so execution stops here: without a connection, `conn` is never
    # bound and every later cell would fail with an unrelated NameError.
    raise
else:
    print(Back.BLACK + Fore.GREEN + "SUCCESS: Connection to database" + Style.RESET_ALL)
    

[40m[32mSUCCESS: Connection to database[0m


In [16]:
# Check the connection and list the tables already present in the personal schema
personal_schema = os.environ['PERSONAL_SCHEMA']

# A cursor object is always truthy, so testing it says nothing about the
# connection. psycopg2 keeps `connection.closed` at 0 while the connection is
# open, so check that before asking for a cursor.
if conn.closed == 0:
    print("Connected to Redshift")
else:
    print("Connection failed")

cur = conn.cursor()

# List the tables of the personal schema. The schema name is sent as a bound
# parameter instead of being formatted into the SQL text.
cur.execute(
    "SELECT * FROM information_schema.tables WHERE table_schema = %s;",
    (personal_schema,),
)
print(cur.fetchall())


Connected to Redshift
[('data-engineer-database', 'marm1984_coderhouse', 'artist', 'BASE TABLE', None, None, None, None, None), ('data-engineer-database', 'marm1984_coderhouse', 'artist_napster', 'BASE TABLE', None, None, None, None, None)]


In [17]:
# Remove any previous version of the artist table so it can be created again

drop_artist_table_sql = f"DROP TABLE IF EXISTS {personal_schema}.artist_napster;"
cur.execute(drop_artist_table_sql)

In [18]:
# Create a table for artists

try:
    cur.execute(f"""
                CREATE TABLE IF NOT EXISTS {personal_schema}.artist_napster (
                    id VARCHAR(255) PRIMARY KEY,
                    name VARCHAR(255),
                    shortcut VARCHAR(255),
                    url VARCHAR(255),
                    type VARCHAR(255)
                );
                """)
    # psycopg2 runs statements inside a transaction; without a commit the DDL
    # issued on this connection is never persisted (and is rolled back when the
    # connection closes).
    conn.commit()
    print(Back.BLACK + Fore.GREEN + "SUCCESS: Table created" + Style.RESET_ALL)
except psycopg2.Error as e:
    # Leave the connection usable: a failed statement aborts the open
    # transaction until it is rolled back.
    conn.rollback()
    print(Back.BLACK + Fore.RED + "ERROR: Table not created" + Style.RESET_ALL)
    print(e)
    

[40m[32mSUCCESS: Table created[0m


In [19]:
napster_json['artists']

[{'type': 'artist',
  'id': 'art.9557743',
  'href': 'https://api.napster.com/v2.2/artists/art.9557743',
  'name': 'Miley Cyrus',
  'shortcut': 'miley-cyrus',
  'amg': '823418',
  'blurbs': ["Once best known as the cute kid star of Disney's Hannah Montana, Miley Cyrus reinvented herself as an all-grown pop star.",
   'Assisting in that reinvention? A naked photo session in Vanity Fair, a foam finger at the VMAs and an affinity for twerking.',
   "Miley's becoming as famous for her interpersonal relations as her music, including a 2013 feud with Sinead O'Connor.",
   'You may also have heard of her father, none other than Mr. "Achy Breaky Heart," Billy Ray Cyrus.',
   'Miley was born Destiny Hope Cyrus in Nashville. The name Miley came from her childhood nickname "Smiley."'],
  'bios': [{'title': 'Bebop Digital',
    'author': 'Bebop Digital',
    'publishDate': '',
    'bio': "Once best known as the cute kid star of Disney's <i>Hannah Montana</i>, Miley Cyrus reinvented herself as an a

# SPARK

In [20]:
# Create a Spark DataFrame from the list of artist dicts stored under the
# "artists" key (the request asks for limit=1, so a single row is expected).

df_artist = spark.createDataFrame(napster_json['artists'])

In [21]:
df_artist.show()

+--------------------+------+--------------------+--------------------+--------------------+-----------+--------------------+-----------+-----------+------+
|         albumGroups|   amg|                bios|              blurbs|                href|         id|               links|       name|   shortcut|  type|
+--------------------+------+--------------------+--------------------+--------------------+-----------+--------------------+-----------+-----------+------+
|{compilations -> ...|823418|[{publishDate -> ...|[Once best known ...|https://api.napst...|art.9557743|{relatedProjects ...|Miley Cyrus|miley-cyrus|artist|
+--------------------+------+--------------------+--------------------+--------------------+-----------+--------------------+-----------+-----------+------+



In [22]:
# Keep only the columns that belong in the artist table; drop everything else

columns_table = ['id', 'name', 'shortcut', 'href', 'type']

# Filter on the DataFrame's own column list so the existing column order is kept.
kept_columns = [name for name in df_artist.columns if name in columns_table]
df_artist = df_artist.select(kept_columns)

In [23]:
df_artist.show()

+--------------------+-----------+-----------+-----------+------+
|                href|         id|       name|   shortcut|  type|
+--------------------+-----------+-----------+-----------+------+
|https://api.napst...|art.9557743|Miley Cyrus|miley-cyrus|artist|
+--------------------+-----------+-----------+-----------+------+



In [24]:
df_artist.printSchema()

root
 |-- href: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- shortcut: string (nullable = true)
 |-- type: string (nullable = true)



In [25]:
# Write the trimmed artist DataFrame to the warehouse through the PostgreSQL JDBC driver.
# NOTE(review): with mode("overwrite") Spark's JDBC writer may drop and recreate
# the target table, discarding the column definitions of the CREATE TABLE cell.
# Confirm this is intended, or consider the "truncate" option.

jdbc_options = {
    "url": f"jdbc:postgresql://{host}:{port}/{database}",
    "dbtable": f"{personal_schema}.artist_napster",
    "user": user,
    "password": password,
    "driver": "org.postgresql.Driver",
}

(
    df_artist.write
    .format("jdbc")
    .options(**jdbc_options)
    .mode("overwrite")
    .save()
)
    

23/06/08 00:07:23 WARN PgConnection: Unsupported Server Version: 8.0.2
