## Describe Project Data

### Imports & Configs

In [2]:
import configparser
import io
import json

import boto3
import pandas as pd

In [3]:
# Load the project settings file. Note: ConfigParser.read() skips files it
# cannot open instead of raising, so a missing 'dwh.cfg' goes unnoticed here.
# NOTE(review): `config` is not referenced by any cell visible in this notebook.
config = configparser.ConfigParser()
config.read('dwh.cfg')

# Shared S3 resource handle used by the helper functions and cells below.
s3_resource = boto3.resource('s3')

### Common Functions

In [4]:
def print_bucket_size(bucket, prefix):
    """Print how many '.json' objects live under an S3 prefix and their total size.

    Args:
        bucket: Name of the S3 bucket to inspect.
        prefix: Key prefix that restricts the listing.

    The size is reported in MiB (bytes / 1024 / 1024), rounded to one decimal.
    """
    # Collect the byte size of every listed object whose key ends in '.json'.
    json_sizes = [
        item.size
        for item in s3_resource.Bucket(bucket).objects.filter(Prefix=prefix)
        if item.key.endswith('.json')
    ]
    size_mib = sum(json_sizes) / 1024 / 1024
    print(f"Bucket 's3://{bucket}/{prefix}' json files:\n\tCount: {len(json_sizes)}\n\tTotal Size: {round(size_mib, 1)} MiB")

def get_nth_json_file(bucket, prefix, nth=1):
    """Return the key and decoded text of the nth '.json' object under a prefix.

    Objects are visited in the order the bucket listing yields them, and only
    keys ending in '.json' are counted.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix that restricts the listing.
        nth: 1-based position of the wanted '.json' object (default 1).

    Returns:
        A ``(key, content)`` tuple, where ``content`` is the object body
        decoded as UTF-8, or ``None`` when ``nth`` is below 1 or fewer than
        ``nth`` '.json' objects are listed.
    """
    if nth < 1:
        # No 1-based position can ever match, so avoid listing the whole
        # prefix (potentially a very large listing) just to return None.
        return None

    json_objects = (
        candidate
        for candidate in s3_resource.Bucket(bucket).objects.filter(Prefix=prefix)
        if candidate.key.endswith('.json')
    )
    for position, candidate in enumerate(json_objects, start=1):
        if position == nth:
            # Download only the selected object.
            return candidate.key, candidate.get()['Body'].read().decode('utf-8')

    # Fewer than nth matching objects were listed.
    return None

def describe_nth_json_file(bucket, prefix, nth):
    """Preview the nth '.json' object under an S3 prefix as a DataFrame.

    The object is fetched with ``get_nth_json_file``, parsed as line-delimited
    JSON, and its first rows plus the column summary are shown.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix that restricts the listing.
        nth: 1-based position of the '.json' object to describe.

    Raises:
        ValueError: If no nth '.json' object exists under the prefix.
    """
    found = get_nth_json_file(bucket, prefix, nth)
    if found is None:
        # get_nth_json_file returns None when nothing matches; fail with a
        # clear message instead of an opaque tuple-unpacking TypeError.
        raise ValueError(
            f"No JSON file #{nth} found under s3://{bucket}/{prefix}"
        )
    file_key, file_content = found

    # Wrap the text in a file-like object: passing a literal JSON string to
    # pd.read_json is deprecated in recent pandas releases.
    df = pd.read_json(io.StringIO(file_content), lines=True)

    # Report the position that was actually requested rather than "First".
    print(f"JSON file #{nth} {bucket}/{file_key}: \n")
    display(df.head())
    # DataFrame.info() prints its report and returns None, so it is called
    # directly; wrapping it in display() would render a stray "None".
    df.info()

### Project Files - Bucket Info & Stats

In [12]:
# Count the '.json' objects under log-data/ and report their combined size.
print_bucket_size('udacity-dend', 'log-data/')

Bucket 's3://udacity-dend/log-data/' json files:
	Count: 30
	Total Size: 3.6 MiB


In [None]:
print_bucket_size('udacity-dend', 'song-data/')
# The call above takes a long time for this prefix because it iterates over
# every listed object. Last recorded result:
#
# Bucket 's3://udacity-dend/song-data/' json files:
# 	Count: 385252
# 	Total Size: 98.7 MiB

### Project Files - Content & Field Types

In [5]:
# Preview the 2nd '.json' object listed under log-data/ (first rows + column summary).
describe_nth_json_file('udacity-dend', 'log-data/', 2)

First JSON file udacity-dend/log-data/2018/11/2018-11-02-events.json: 



Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,N.E.R.D. FEATURING MALICE,Logged In,Jayden,M,0,Fox,288.9922,free,"New Orleans-Metairie, LA",PUT,NextSong,1541033612796,184,Am I High (Feat. Malice),200,1541121934796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",101
1,,Logged In,Stefany,F,0,White,,free,"Lubbock, TX",GET,Home,1540708070796,82,,200,1541122176796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",83
2,Death Cab for Cutie,Logged In,Stefany,F,1,White,216.42404,free,"Lubbock, TX",PUT,NextSong,1540708070796,82,A Lack Of Color (Album Version),200,1541122241796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",83
3,Tracy Gang Pussy,Logged In,Stefany,F,2,White,221.33506,free,"Lubbock, TX",PUT,NextSong,1540708070796,82,I Have A Wish,200,1541122457796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",83
4,Skillet,Logged In,Kevin,M,0,Arellano,178.02404,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540006905796,153,Monster (Album Version),200,1541126568796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",66


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   artist         155 non-null    object 
 1   auth           171 non-null    object 
 2   firstName      171 non-null    object 
 3   gender         171 non-null    object 
 4   itemInSession  171 non-null    int64  
 5   lastName       171 non-null    object 
 6   length         155 non-null    float64
 7   level          171 non-null    object 
 8   location       171 non-null    object 
 9   method         171 non-null    object 
 10  page           171 non-null    object 
 11  registration   171 non-null    int64  
 12  sessionId      171 non-null    int64  
 13  song           155 non-null    object 
 14  status         171 non-null    int64  
 15  ts             171 non-null    int64  
 16  userAgent      171 non-null    object 
 17  userId         171 non-null    int64  
dtypes: float64(1), int64(6), object(11)

None

In [14]:
# Preview the 2nd '.json' object listed under song-data/ (first rows + column summary).
describe_nth_json_file('udacity-dend', 'song-data/', 2)

First JSON file udacity-dend/song-data/A/A/A/TRAAAAV128F421A322.json: 



Unnamed: 0,song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
0,SOQPWCR12A6D4FB2A3,1,A Poor Recipe For Civic Cohesion,Western Addiction,37.77916,2005,118.07302,AR73AIO1187B9AD57B,-122.42005,"San Francisco, CA"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   song_id           1 non-null      object 
 1   num_songs         1 non-null      int64  
 2   title             1 non-null      object 
 3   artist_name       1 non-null      object 
 4   artist_latitude   1 non-null      float64
 5   year              1 non-null      int64  
 6   duration          1 non-null      float64
 7   artist_id         1 non-null      object 
 8   artist_longitude  1 non-null      float64
 9   artist_location   1 non-null      object 
dtypes: float64(3), int64(2), object(5)
memory usage: 208.0+ bytes


None

In [6]:
# Content of: s3://udacity-dend/log_json_path.json
# The object body is downloaded, decoded as UTF-8 and parsed as JSON; the
# parsed value is the last expression, so it is shown as the cell output.
obj = s3_resource.Object('udacity-dend', 'log_json_path.json')
json.loads(obj.get()['Body'].read().decode('utf-8'))

{'jsonpaths': ["$['artist']",
  "$['auth']",
  "$['firstName']",
  "$['gender']",
  "$['itemInSession']",
  "$['lastName']",
  "$['length']",
  "$['level']",
  "$['location']",
  "$['method']",
  "$['page']",
  "$['registration']",
  "$['sessionId']",
  "$['song']",
  "$['status']",
  "$['ts']",
  "$['userAgent']",
  "$['userId']"]}