In [2]:
import boto3
import json
import os
import pandas as pd


#### Data Quality & Exploration Insights:

- Identified missing values in all playlist_details columns due to API rate limit(these values were not filled). Approximately 200 incomplete records were removed via dbt to ensure clean analysis.

- Detected and removed duplicate records via dbt across multiple tables to maintain data integrity.

- Explored relationships between playlist metrics: playlists with more tracks tend to have slightly higher follower counts, though the correlation is weak.

- Noted extreme values in podcast data—for example, some shows like IHeartRadio have over 10,000 episodes—highlighting the need for careful aggregation in analyses.

- NOTE : There is 1 artist named 'rock' which has diff artist id's but same artist name. Means they are different artists with the same name

In [None]:

s3 = boto3.client(
    "s3",
    region_name="us-east-1",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
)

bucket = "fpa-analysis"
keys = ["tracks.json", "playlists.json", "artists.json", "podcasts.json", "new_releases.json", "playlist_details.json", "podcast_details.json"]

data_frames = {}

for key in keys:
    # Fetch object from S3
    obj = s3.get_object(Bucket=bucket, Key=key)
    file_content = obj['Body'].read().decode('utf-8')

    # Handle JSON
    data = []
    for line in file_content.splitlines():
        if line.strip():  # skip empty lines
            data.append(json.loads(line))

    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    # Store in dictionary with key name
    data_frames[key] = df

    print(f"{key}: Loaded {len(df)} records")

tracks_df = data_frames["tracks.json"]
playlists_df = data_frames["playlists.json"]
artists_df = data_frames["artists.json"]
podcasts_df = data_frames["podcasts.json"]
new_releases_df = data_frames["new_releases.json"]
playlist_details_df = data_frames["playlist_details.json"]
podcast_details_df = data_frames["podcast_details.json"]

tracks_df.head()

tracks.json: Loaded 1000 records
playlists.json: Loaded 854 records
artists.json: Loaded 1028 records
podcasts.json: Loaded 3000 records
new_releases.json: Loaded 100 records
playlist_details.json: Loaded 854 records
podcast_details.json: Loaded 1169 records


Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,is_playable,name,popularity,preview_url,track_number,type,uri,query
0,"{'album_type': 'compilation', 'artists': [{'ex...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,268413,False,{'isrc': 'USWD10527739'},{'spotify': 'https://open.spotify.com/track/5a...,https://api.spotify.com/v1/tracks/5aNagthlHAud...,5aNagthlHAudbDZozQMjYP,False,True,Stick to the Status Quo,60,,5,track,spotify:track:5aNagthlHAudbDZozQMjYP,"[pop, rock, hip-hop]"
1,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,314280,True,{'isrc': 'USUM71502773'},{'spotify': 'https://open.spotify.com/track/06...,https://api.spotify.com/v1/tracks/069VGijrAsQV...,069VGijrAsQVSY9ihFv1Px,False,True,Best Friend,62,,8,track,spotify:track:069VGijrAsQVSY9ihFv1Px,"[pop, rock, hip-hop]"
2,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,323962,True,{'isrc': 'USRC11500797'},{'spotify': 'https://open.spotify.com/track/2g...,https://api.spotify.com/v1/tracks/2gAGWaK4wvt2...,2gAGWaK4wvt2xrFUlR4mK8,False,True,Jukebox Joints (feat. Joe Fox & Kanye West),75,,9,track,spotify:track:2gAGWaK4wvt2xrFUlR4mK8,"[pop, rock, hip-hop]"
3,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,145052,False,{'isrc': 'QZW9K2521015'},{'spotify': 'https://open.spotify.com/track/7J...,https://api.spotify.com/v1/tracks/7JQFjUucD7QP...,7JQFjUucD7QPhPnaPxTDBc,False,True,SATURACIÓN POP,58,,1,track,spotify:track:7JQFjUucD7QPhPnaPxTDBc,"[pop, rock, hip-hop]"
4,"{'album_type': 'compilation', 'artists': [{'ex...",[{'external_urls': {'spotify': 'https://open.s...,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",1,181013,False,{'isrc': 'USWD10935873'},{'spotify': 'https://open.spotify.com/track/6n...,https://api.spotify.com/v1/tracks/6ntkwU1MhehT...,6ntkwU1MhehTKgx4BWxX3f,False,True,Hoedown Throwdown,58,,6,track,spotify:track:6ntkwU1MhehTKgx4BWxX3f,"[pop, rock, hip-hop]"


In [None]:
# shape chk
for name, df in data_frames.items():
    print(f"{name}: {df.shape}")
    df.head(3)


tracks.json: (1000, 19)
playlists.json: (854, 15)
artists.json: (1028, 10)
podcasts.json: (3000, 18)
new_releases.json: (100, 13)
playlist_details.json: (854, 16)
podcast_details.json: (1169, 18)


In [10]:
# missing values
for name, df in data_frames.items():
    print(f"Missing values in {name}:")
    print(df.isna().sum())


Missing values in tracks.json:
album                   0
artists                 0
available_markets       0
disc_number             0
duration_ms             0
explicit                0
external_ids            0
external_urls           0
href                    0
id                      0
is_local                0
is_playable             0
name                    0
popularity              0
preview_url          1000
track_number            0
type                    0
uri                     0
query                   0
dtype: int64
Missing values in playlists.json:
collaborative      0
description        0
external_urls      0
href               0
id                 0
images             0
name               0
owner              0
primary_color    854
public             0
snapshot_id        0
tracks             0
type               0
uri                0
query              0
dtype: int64
Missing values in artists.json:
external_urls    0
followers        0
genres           0
href       

In [24]:
playlist_details_df.isna().sum()

collaborative      267
description        267
external_urls      267
followers          267
href               267
id                 267
images             267
name               267
owner              267
primary_color      854
public             267
snapshot_id        267
tracks             267
type               267
uri                267
error              587
followers_total      0
dtype: int64

In [22]:
playlist_details_df[playlist_details_df["collaborative"].isna()].head()

# drop these null rows, its because of api rate limit - > dropped via bigquery

Unnamed: 0,collaborative,description,external_urls,followers,href,id,images,name,owner,primary_color,public,snapshot_id,tracks,type,uri,error,followers_total
95,,,,,,,,,,,,,,,,"{'status': 429, 'message': 'API rate limit exc...",0
96,,,,,,,,,,,,,,,,"{'status': 429, 'message': 'API rate limit exc...",0
99,,,,,,,,,,,,,,,,"{'status': 429, 'message': 'API rate limit exc...",0
101,,,,,,,,,,,,,,,,"{'status': 429, 'message': 'API rate limit exc...",0
102,,,,,,,,,,,,,,,,"{'status': 429, 'message': 'API rate limit exc...",0


In [20]:
tracks_df[tracks_df["id"] == "52IuMfbQa9aqRPz2oYPAI8"].head()

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,is_local,is_playable,name,popularity,preview_url,track_number,type,uri,query
53,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AG, AL, AM, AO, AZ, BB, BF, BI, BJ, BN, BS, B...",1,265101,True,{'isrc': 'USPO10000879'},{'spotify': 'https://open.spotify.com/track/52...,https://api.spotify.com/v1/tracks/52IuMfbQa9aq...,52IuMfbQa9aqRPz2oYPAI8,False,True,Express Yourself,61,,8,track,spotify:track:52IuMfbQa9aqRPz2oYPAI8,"[pop, rock, hip-hop]"
105,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"[AG, AL, AM, AO, AZ, BB, BF, BI, BJ, BN, BS, B...",1,265101,True,{'isrc': 'USPO10000879'},{'spotify': 'https://open.spotify.com/track/52...,https://api.spotify.com/v1/tracks/52IuMfbQa9aq...,52IuMfbQa9aqRPz2oYPAI8,False,True,Express Yourself,61,,8,track,spotify:track:52IuMfbQa9aqRPz2oYPAI8,"[pop, rock, hip-hop]"


In [15]:
# duplicates

for name, df in data_frames.items():
    if 'id' in df.columns:
        print(f"Duplicates in {name}: {df.duplicated(subset=['id']).sum()}")
    else:
        print(f"{name} has no 'id' column, skipping duplicate check")



Duplicates in tracks.json: 109
Duplicates in playlists.json: 267
Duplicates in artists.json: 0
Duplicates in podcasts.json: 416
Duplicates in new_releases.json: 0
Duplicates in playlist_details.json: 411
Duplicates in podcast_details.json: 46


In [26]:
podcast_details_df.head()

Unnamed: 0,available_markets,copyrights,description,html_description,explicit,external_urls,href,id,images,is_externally_hosted,languages,media_type,name,publisher,type,uri,total_episodes,episodes
0,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Listen for a comprehensive audio report on tod...,<p>Listen for a comprehensive audio report on ...,False,{'spotify': 'https://open.spotify.com/show/5q8...,https://api.spotify.com/v1/shows/5q8wg5rFYbbeD...,5q8wg5rFYbbeDk0kk7t6Uc,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en-US],audio,Bloomberg News Now,Bloomberg,show,spotify:show:5q8wg5rFYbbeDk0kk7t6Uc,901,{'href': 'https://api.spotify.com/v1/shows/5q8...
1,"[AD, AE, AG, AL, AM, AR, AT, AU, BA, BB, BE, B...",[],Covering Geopolitical News from around the world.,Covering Geopolitical News from around the world.,True,{'spotify': 'https://open.spotify.com/show/1Tm...,https://api.spotify.com/v1/shows/1Tm6Snolm6lrP...,1Tm6Snolm6lrPhqJcbaYTt,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],mixed,Business Basics,Business Basics,show,spotify:show:1Tm6Snolm6lrPhqJcbaYTt,326,{'href': 'https://api.spotify.com/v1/shows/1Tm...
2,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Seeking Alpha's flagship Wall Street Breakfast...,Seeking Alpha&#39;s flagship Wall Street Break...,False,{'spotify': 'https://open.spotify.com/show/05u...,https://api.spotify.com/v1/shows/05uLjJxkVgQsR...,05uLjJxkVgQsRk8LWLCLpx,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],audio,Wall Street Breakfast,Seeking Alpha,show,spotify:show:05uLjJxkVgQsRk8LWLCLpx,1000,{'href': 'https://api.spotify.com/v1/shows/05u...
3,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Business of Home's host Dennis Scully intervie...,Business of Home&#39;s host Dennis Scully inte...,False,{'spotify': 'https://open.spotify.com/show/44R...,https://api.spotify.com/v1/shows/44REgaXjBseF4...,44REgaXjBseF4ZanktPyjL,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],audio,Business of Home Podcast,"Business of Home, Dennis Scully",show,spotify:show:44REgaXjBseF4ZanktPyjL,528,{'href': 'https://api.spotify.com/v1/shows/44R...
4,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Learn how companies work from the people who k...,<p>Learn how companies work from the people wh...,False,{'spotify': 'https://open.spotify.com/show/417...,https://api.spotify.com/v1/shows/417NPBWqtMbDU...,417NPBWqtMbDU0FlWZTRDC,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],mixed,Business Breakdowns,Colossus | Investing & Business Podcasts,show,spotify:show:417NPBWqtMbDU0FlWZTRDC,251,{'href': 'https://api.spotify.com/v1/shows/417...


In [None]:
# data types

for name, df in data_frames.items():
    print(f"Data types in {name}:")
    print(df.dtypes)


Data types in tracks.json:
album                object
artists              object
available_markets    object
disc_number           int64
duration_ms           int64
explicit               bool
external_ids         object
external_urls        object
href                 object
id                   object
is_local               bool
is_playable            bool
name                 object
popularity            int64
preview_url          object
track_number          int64
type                 object
uri                  object
query                object
dtype: object
Data types in playlists.json:
collaborative      bool
description      object
external_urls    object
href             object
id               object
images           object
name             object
owner            object
primary_color    object
public             bool
snapshot_id      object
tracks           object
type             object
uri              object
query            object
dtype: object
Data types in artists.j

In [16]:
# summary stats

for name, df in data_frames.items():
    print(f"Summary of numeric columns in {name}:")
    print(df.describe())


Summary of numeric columns in tracks.json:
       disc_number   duration_ms   popularity  track_number
count  1000.000000  1.000000e+03  1000.000000   1000.000000
mean      1.014000  2.165711e+05    30.776000      5.335000
std       0.140797  1.012247e+05    18.781229      5.624092
min       1.000000  3.078300e+04     0.000000      1.000000
25%       1.000000  1.645710e+05    12.000000      1.000000
50%       1.000000  2.050065e+05    34.500000      3.000000
75%       1.000000  2.503710e+05    44.000000      8.000000
max       4.000000  1.300000e+06    76.000000     56.000000
Summary of numeric columns in playlists.json:
       collaborative description  \
count            854         854   
unique             2         369   
top            False               
freq             852         307   

                                            external_urls  \
count                                                 854   
unique                                                587   
top    

In [26]:
podcasts_df[podcasts_df["total_episodes"] > 10000].head()

Unnamed: 0,available_markets,copyrights,description,html_description,explicit,external_urls,href,id,images,is_externally_hosted,languages,media_type,name,publisher,type,uri,total_episodes,query
2261,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Brooke & Jeffrey in the Morning is a nationall...,<p>Brooke &amp; Jeffrey in the Morning is a na...,False,{'spotify': 'https://open.spotify.com/show/6wm...,https://api.spotify.com/v1/shows/6wmqpNulHEqJN...,6wmqpNulHEqJNeR2iqmgyj,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en-US],audio,Brooke and Jeffrey,iHeartPodcasts,show,spotify:show:6wmqpNulHEqJNeR2iqmgyj,10958.0,comedy


iHeartPodcasts has more than 10k episodes

In [29]:
new_releases_df["release_date"].max()

'2024-04-19'

In [39]:
# Categorical analysis
artists_df["genres"].explode().value_counts().head(10)


genres
east coast hip hop     59
old school hip hop     41
underground hip hop    36
hip hop                31
boom bap               30
jazz rap               27
v-pop                  26
vietnamese hip hop     25
german hip hop         25
christian hip hop      23
Name: count, dtype: int64

In [38]:
# engagement metrics
# top 10 tracks by streams
tracks_df[["name", "artists", "popularity", "duration_ms"]].sort_values("popularity", ascending=False).head(10)


Unnamed: 0,name,artists,popularity,duration_ms
81,Juicy - 2005 Remaster,[{'external_urls': {'spotify': 'https://open.s...,76,302760
38,Thong Song,[{'external_urls': {'spotify': 'https://open.s...,76,253733
16,Monster,[{'external_urls': {'spotify': 'https://open.s...,75,378893
2,Jukebox Joints (feat. Joe Fox & Kanye West),[{'external_urls': {'spotify': 'https://open.s...,75,323962
10,Lonely Road (with Jelly Roll),[{'external_urls': {'spotify': 'https://open.s...,74,189356
350,Family Matters,[{'external_urls': {'spotify': 'https://open.s...,71,456933
251,Family Matters,[{'external_urls': {'spotify': 'https://open.s...,71,456933
164,Yeniden,[{'external_urls': {'spotify': 'https://open.s...,71,220207
216,Yeniden,[{'external_urls': {'spotify': 'https://open.s...,71,220207
100,Thong Song,[{'external_urls': {'spotify': 'https://open.s...,71,180966


In [14]:
podcast_details_df.iloc[148, 0:]

available_markets       [AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...
copyrights                                                             []
description             The Ramsey Show Highlights is a quick, daily d...
html_description        <p>The Ramsey Show Highlights is a quick, dail...
explicit                                                            False
external_urls           {'spotify': 'https://open.spotify.com/show/0O9...
href                    https://api.spotify.com/v1/shows/0O9DsvFgkkzp7...
id                                                 0O9DsvFgkkzp7Ze60R2JrV
images                  [{'height': 640, 'url': 'https://i.scdn.co/ima...
is_externally_hosted                                                False
languages                                                            [en]
media_type                                                          mixed
name                                           The Ramsey Show Highlights
publisher                             

In [5]:
podcast_details_df.head()

Unnamed: 0,available_markets,copyrights,description,html_description,explicit,external_urls,href,id,images,is_externally_hosted,languages,media_type,name,publisher,type,uri,total_episodes,episodes
0,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Listen for a comprehensive audio report on tod...,<p>Listen for a comprehensive audio report on ...,False,{'spotify': 'https://open.spotify.com/show/5q8...,https://api.spotify.com/v1/shows/5q8wg5rFYbbeD...,5q8wg5rFYbbeDk0kk7t6Uc,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en-US],audio,Bloomberg News Now,Bloomberg,show,spotify:show:5q8wg5rFYbbeDk0kk7t6Uc,901,{'href': 'https://api.spotify.com/v1/shows/5q8...
1,"[AD, AE, AG, AL, AM, AR, AT, AU, BA, BB, BE, B...",[],Covering Geopolitical News from around the world.,Covering Geopolitical News from around the world.,True,{'spotify': 'https://open.spotify.com/show/1Tm...,https://api.spotify.com/v1/shows/1Tm6Snolm6lrP...,1Tm6Snolm6lrPhqJcbaYTt,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],mixed,Business Basics,Business Basics,show,spotify:show:1Tm6Snolm6lrPhqJcbaYTt,326,{'href': 'https://api.spotify.com/v1/shows/1Tm...
2,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Seeking Alpha's flagship Wall Street Breakfast...,Seeking Alpha&#39;s flagship Wall Street Break...,False,{'spotify': 'https://open.spotify.com/show/05u...,https://api.spotify.com/v1/shows/05uLjJxkVgQsR...,05uLjJxkVgQsRk8LWLCLpx,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],audio,Wall Street Breakfast,Seeking Alpha,show,spotify:show:05uLjJxkVgQsRk8LWLCLpx,1000,{'href': 'https://api.spotify.com/v1/shows/05u...
3,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Business of Home's host Dennis Scully intervie...,Business of Home&#39;s host Dennis Scully inte...,False,{'spotify': 'https://open.spotify.com/show/44R...,https://api.spotify.com/v1/shows/44REgaXjBseF4...,44REgaXjBseF4ZanktPyjL,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],audio,Business of Home Podcast,"Business of Home, Dennis Scully",show,spotify:show:44REgaXjBseF4ZanktPyjL,528,{'href': 'https://api.spotify.com/v1/shows/44R...
4,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",[],Learn how companies work from the people who k...,<p>Learn how companies work from the people wh...,False,{'spotify': 'https://open.spotify.com/show/417...,https://api.spotify.com/v1/shows/417NPBWqtMbDU...,417NPBWqtMbDU0FlWZTRDC,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",False,[en],mixed,Business Breakdowns,Colossus | Investing & Business Podcasts,show,spotify:show:417NPBWqtMbDU0FlWZTRDC,251,{'href': 'https://api.spotify.com/v1/shows/417...


In [5]:
playlist_details_df.head(1)

Unnamed: 0,collaborative,description,external_urls,followers,href,id,images,name,owner,primary_color,public,snapshot_id,tracks,type,uri,error
0,False,,{'spotify': 'https://open.spotify.com/playlist...,"{'href': None, 'total': 214971}",https://api.spotify.com/v1/playlists/327k1ryJ7...,327k1ryJ7j1xD7gWKpdc6o,"[{'height': 640, 'url': 'https://mosaic.scdn.c...",Popular Christian Music,"{'display_name': 'Juliann Hale', 'external_url...",,True,AAABTkI5wd2y1BmM+Eu9hBpkTCkrhakz,{'href': 'https://api.spotify.com/v1/playlists...,playlist,spotify:playlist:327k1ryJ7j1xD7gWKpdc6o,


In [None]:
playlist_details_df['followers_total'] = playlist_details_df['followers'].apply(
    lambda x: x['total'] if isinstance(x, dict) and 'total' in x else 0
)

playlist_details_df['tracks_total'] = playlist_details_df['tracks'].apply(
    lambda x: x['total'] if isinstance(x, dict) and 'total' in x else 0
)


# Check
playlist_details_df[['followers_total']].head()


Unnamed: 0,followers_total
0,214971
1,73005
2,77141
3,73892
4,664175


In [13]:
# relationship correlation

playlist_details_df_copy = playlist_details_df.copy()
playlist_details_df_copy['followers_total'] = playlist_details_df_copy['followers'].apply(
    lambda x: x['total'] if isinstance(x, dict) and 'total' in x else 0
)

playlist_details_df_copy['tracks_total'] = playlist_details_df_copy['tracks'].apply(
    lambda x: x['total'] if isinstance(x, dict) and 'total' in x else 0
)

playlist_details_df_copy[['followers_total','tracks_total']].corr()


Unnamed: 0,followers_total,tracks_total
followers_total,1.0,0.046391
tracks_total,0.046391,1.0


longer playlists might attract slightly more engagement, which could impact ad revenue, but it’s not the dominant driver