# Process Song Data: Limited Subset

In [15]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek
from pyspark.sql.types import *
from tqdm import tqdm

In [2]:
# Load the AWS credentials from the local `dl.cfg` file ([AWS] section) and
# export them into this process's environment.
config = configparser.ConfigParser()
# Use a context manager so the config file handle is closed after parsing
# (a bare open() passed to read_file() is never closed explicitly).
with open('dl.cfg') as config_file:
    config.read_file(config_file)
os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
# Build (or reuse) the Spark session, requesting the hadoop-aws package.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

# Enable eager evaluation for the REPL/notebook on this session.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# S3 save location 

In [14]:
# Base S3 URI (s3a scheme) under which the output tables are saved;
# the write cells append a per-table sub-folder to it.
output_data = "s3a://salas-bucket/temp-save/"

# Check the limited `song_data` subset

In [5]:
import boto3

# Collect the key of every object stored under the sample prefix of the bucket.
# NOTE(review): this prefix is spelled "song-data" (hyphen) while the Spark read
# path further down uses "song_data" (underscore) — confirm both point at the
# same set of files.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('udacity-dend')
song_data_list = [
    object_summary.key
    for object_summary in my_bucket.objects.filter(Prefix="song-data/A/A/A/")
]

In [6]:
# Number of object keys collected in song_data_list.
len(song_data_list)

24

In [7]:
# Display the collected object keys for a visual check.
song_data_list

['song-data/A/A/A/TRAAAAK128F9318786.json',
 'song-data/A/A/A/TRAAAAV128F421A322.json',
 'song-data/A/A/A/TRAAABD128F429CF47.json',
 'song-data/A/A/A/TRAAACN128F9355673.json',
 'song-data/A/A/A/TRAAAEA128F935A30D.json',
 'song-data/A/A/A/TRAAAED128E0783FAB.json',
 'song-data/A/A/A/TRAAAEM128F93347B9.json',
 'song-data/A/A/A/TRAAAEW128F42930C0.json',
 'song-data/A/A/A/TRAAAFD128F92F423A.json',
 'song-data/A/A/A/TRAAAGR128F425B14B.json',
 'song-data/A/A/A/TRAAAHD128F42635A5.json',
 'song-data/A/A/A/TRAAAHJ128F931194C.json',
 'song-data/A/A/A/TRAAAHZ128E0799171.json',
 'song-data/A/A/A/TRAAAIR128F1480971.json',
 'song-data/A/A/A/TRAAAJN128F428E437.json',
 'song-data/A/A/A/TRAAAND12903CD1F1B.json',
 'song-data/A/A/A/TRAAANK128F428B515.json',
 'song-data/A/A/A/TRAAAOF128F429C156.json',
 'song-data/A/A/A/TRAAAPK128E0786D96.json',
 'song-data/A/A/A/TRAAAQN128F9353BA0.json',
 'song-data/A/A/A/TRAAAQO12903CD8E1C.json',
 'song-data/A/A/A/TRAAAUC128F428716F.json',
 'song-data/A/A/A/TRAAAUR128F428

# `song_data` (limited subset)

In [8]:
# Glob pattern selecting the limited song subset inside the input bucket.
# The parts are joined with an explicit "/" rather than os.path.join, which
# uses the host OS path separator and would produce an invalid S3 URI on
# Windows. rstrip() keeps the result correct whether or not input_data ends
# with a slash.
input_data = "s3a://udacity-dend/"
song_data = input_data.rstrip("/") + "/song_data/A/A/A/*.json"
song_data

's3a://udacity-dend/song_data/A/A/A/*.json'

# Create dataframe

In [9]:
%%time
# Read every JSON file matching the song_data glob into a single DataFrame.
# No schema is passed to the reader here.
song_df = spark.read.json(song_data)

CPU times: user 512 µs, sys: 7.42 ms, total: 7.93 ms
Wall time: 17.7 s


In [10]:
# Print the column names, types and nullability of song_df.
song_df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [11]:
# Bare expression: display song_df as the cell output.
song_df

artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
ARTC1LV1187B9A4858,51.4536,Goldsmith's Colle...,-0.01802,The Bonzo Dog Band,301.40036,1,SOAFBCP12A8C13CC7D,King Of Scurf (20...,1972
ARA23XO1187B9AF18F,40.57885,"Carteret, New Jersey",-74.21956,The Smithereens,192.522,1,SOKTJDS12AF72A25E5,Drown In My Own T...,0
ARSVTNL1187B992A91,51.50632,"London, England",-0.12714,Jonathan King,129.85424,1,SOEKAZG12AB018837E,I'll Slap Your Fa...,2001
AR73AIO1187B9AD57B,37.77916,"San Francisco, CA",-122.42005,Western Addiction,118.07302,1,SOQPWCR12A6D4FB2A3,A Poor Recipe For...,2005
ARXQBR11187B98A2CC,,"Liverpool, England",,Frankie Goes To H...,821.05424,1,SOBRKGM12A8C139EF6,Welcome to the Pl...,1985
ARSZ7L31187FB4E610,39.74001,"Denver, CO",-104.99226,Devotchka,337.81506,1,SORRNOC12AB017F52B,The Last Beat Of ...,2004
AR10USD1187B99F3F1,,"Burlington, Ontar...",,Tweeterfriendly M...,189.57016,1,SOHKNRJ12A6701D1F8,Drop of Rain,0
ARZ5H0P1187B98A1DD,33.76672,"Long Beach, CA",-118.1924,Snoop Dogg,230.42567,1,SOAPERH12A58A787DC,The One And Only ...,0
AR1KTV21187B9ACD72,34.05349,California - LA,-118.24532,Cristina,343.87546,1,SOSMJFC12A8C13DE0C,Is That All There...,0
ARCLYBR1187FB53913,37.54703,"San Mateo, CA",-122.31483,Neal Schon,304.56118,1,SOOVHYF12A8C134892,I'll Be Waiting,1989


---
# Create `songs_table`

In [12]:
# Keep only the song attributes, then drop rows that are exact duplicates.
song_columns = ["song_id", "title", "artist_id", "year", "duration"]
songs_table = song_df.select(song_columns).dropDuplicates()
songs_table

song_id,title,artist_id,year,duration
SODZYPO12A8C13A91E,Burn My Body (Alb...,AR1C2IX1187B99BF74,0,177.99791
SOIGHOD12A8C13B5A1,Indian Angel,ARY589G1187B9A9F4E,2004,171.57179
SOOVHYF12A8C134892,I'll Be Waiting,ARCLYBR1187FB53913,1989,304.56118
SOAPERH12A58A787DC,The One And Only ...,ARZ5H0P1187B98A1DD,0,230.42567
SOHKNRJ12A6701D1F8,Drop of Rain,AR10USD1187B99F3F1,0,189.57016
SOHOZBI12A8C132E3C,Smash It Up,AR0MWD61187B9B2B12,2000,195.39546
SOERIDA12A6D4F8506,I Want You (Album...,ARBZIN01187FB362CC,2006,192.28689
SOXZYWX12A6310ED0C,It's About Time,ARC1IHZ1187FB4E920,0,246.9873
SOBLFFE12AF72AA5BA,Scream,ARJNIUY12298900C91,2009,213.9424
SOTAZDY12AB0187616,Drillbit,ARZKCQM1257509D107,0,374.62159


# Save `songs_table`
Timing recorded from a previous run of the write cell below:
```text
CPU times: user 72.6 ms, sys: 21.6 ms, total: 94.3 ms
Wall time: 6min 7s
```

In [18]:
%%time
songs_table.write.parquet(output_data + "songs/", mode="overwrite", partitionBy=["year","artist_id"])

CPU times: user 72.6 ms, sys: 21.6 ms, total: 94.3 ms
Wall time: 6min 7s


---
# Create `artists_table`

In [16]:
# Keep only the artist attributes, then drop rows that are exact duplicates.
artist_columns = [
    "artist_id",
    "artist_name",
    "artist_location",
    "artist_latitude",
    "artist_longitude",
]
artists_table = song_df.select(artist_columns).dropDuplicates()
artists_table

artist_id,artist_name,artist_location,artist_latitude,artist_longitude
ARC1IHZ1187FB4E920,Jamie Cullum,,,
ARZKCQM1257509D107,Dataphiles,,,
AREWD471187FB49873,Son Kite,,,
ARGE7G11187FB37E05,Cyndi Lauper,"Brooklyn, NY",,
ARSVTNL1187B992A91,Jonathan King,"London, England",51.50632,-0.12714
AR9Q9YC1187FB5609B,Quest_ Pup_ Kevo,New Jersey,,
AR73AIO1187B9AD57B,Western Addiction,"San Francisco, CA",37.77916,-122.42005
AR0MWD61187B9B2B12,International Noi...,,,
ARMJAGH1187FB546F3,The Box Tops,"Memphis, TN",35.14968,-90.04892
AR1KTV21187B9ACD72,Cristina,California - LA,34.05349,-118.24532


# Save `artists_table`
Timing recorded from a previous run of the write cell below:
```text
CPU times: user 58.3 ms, sys: 22.4 ms, total: 80.7 ms
Wall time: 5min 38s
```

In [17]:
%%time
artists_table.write.parquet(output_data + "artists/", mode="overwrite")

0it [00:00, ?it/s]

CPU times: user 58.3 ms, sys: 22.4 ms, total: 80.7 ms
Wall time: 5min 38s


0it [00:00, ?it/s]