# Spotify Data Visualization

## 1.1 Importing data and libraries

In [45]:
import dask
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from dask import dataframe as dd

In [46]:
# Loading this file as a pandas dataframe requires 13 GB+ of memory, which is too large,
# so it is opened as a dask dataframe instead. Every column is read as `object`
# (no dtype inference); narrower dtypes are assigned in the optimization stage below.
# More about dask dataframes: https://docs.dask.org/en/stable/dataframe.html

dask_dataframe = dd.read_csv('spotify_charts.csv', dtype=object)

## 1.2 Data exploration and optimization stage

In [47]:
# dask dataframe summary (column count, dtypes) with deep memory usage,
# taken while every column is still stored as `object`

dask_dataframe.info(memory_usage='deep')


<class 'dask.dataframe.core.DataFrame'>
Columns: 9 entries, title to streams
dtypes: object(9)
memory usage: 1.8 GB


In [48]:
# preview of the first rows of the dask dataframe
dask_dataframe.head()

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,top200,SAME_POSITION,253019
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,https://open.spotify.com/track/7DM4BPaS7uofFul...,Argentina,top200,MOVE_UP,223988
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,https://open.spotify.com/track/3AEZUABDXNtecAO...,Argentina,top200,MOVE_DOWN,210943
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",https://open.spotify.com/track/6rQSrBHf7HlZjtc...,Argentina,top200,SAME_POSITION,173865
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,https://open.spotify.com/track/58IL315gMSTD37D...,Argentina,top200,MOVE_UP,153956


In [49]:
# checking the number of unique values

# Build the lazy nunique() of every column first and evaluate them all in ONE
# dask.compute() call: the collections share the CSV-reading tasks, so the file is
# scanned once instead of once per column.
unique_counts = dask.compute(
    *(dask_dataframe[col].nunique() for col in dask_dataframe.columns)
)

for col, n_unique in zip(dask_dataframe.columns, unique_counts):
    print(f'unique values in {col}: {n_unique}')

# this is still slower than an in-memory pandas call, but working with a 13 GB pandas dataframe is avoided.

unique values in title: 164758
unique values in rank: 200
unique values in date: 1826
unique values in artist: 96115
unique values in url: 217644
unique values in region: 70
unique values in chart: 2
unique values in trend: 4
unique values in streams: 788013


In [50]:
# converting columns from object to category/int32; the category columns have relatively
# few unique values compared to the total number of entries

to_category_cols = ['title', 'url', 'date', 'artist', 'region', 'trend', 'chart']
to_int32_cols = ['rank']

# column -> target dtype mapping (categories first, then integers), applied in one loop
target_dtypes = {
    **dict.fromkeys(to_category_cols, 'category'),
    **dict.fromkeys(to_int32_cols, 'int32'),
}

for col, target_dtype in target_dtypes.items():
    dask_dataframe[col] = dask_dataframe[col].astype(target_dtype)


In [51]:
# rechecking the dask dataframe summary after the category/int32 conversion
# to compare the reported memory usage with the earlier all-object figure

dask_dataframe.info(memory_usage='deep')

<class 'dask.dataframe.core.DataFrame'>
Columns: 9 entries, title to streams
dtypes: category(7), object(1), int32(1)
memory usage: 643.2 MB


In [52]:
# pandas dataframe size info

# compute() evaluates the dask dataframe and returns the result as a pandas dataframe
dataframe = dask_dataframe.compute()

# deep memory usage of the resulting pandas dataframe
dataframe.info(memory_usage='deep')

# the data is now much smaller than the 13 GB+ needed for a plain pandas load, so after
# these first optimization steps it is possible to work on a pandas dataframe again

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26147953 entries, 0 to 498950
Data columns (total 9 columns):
 #   Column   Dtype   
---  ------   -----   
 0   title    category
 1   rank     int32   
 2   date     category
 3   artist   category
 4   url      category
 5   region   category
 6   chart    category
 7   trend    category
 8   streams  object  
dtypes: category(7), int32(1), object(1)
memory usage: 2.1 GB


In [53]:
# counting the NaN values in every column

nan_counts = dataframe.isna().sum()
for col, n_missing in nan_counts.items():
    print(f'NaN values in {col}: {n_missing}')
    

NaN values in title: 11
NaN values in rank: 0
NaN values in date: 0
NaN values in artist: 18
NaN values in url: 0
NaN values in region: 1
NaN values in chart: 1
NaN values in trend: 1
NaN values in streams: 5826049


In [54]:
# preparing the NaN filters: one boolean mask per column that contains missing values

(
    title_is_nan,
    artist_is_nan,
    region_is_nan,
    chart_is_nan,
    trend_is_nan,
    streams_is_nan,
) = (
    dataframe[col].isna()
    for col in ('title', 'artist', 'region', 'chart', 'trend', 'streams')
)


In [55]:
# NaN data exploration: first rows with a missing 'title'
dataframe.loc[title_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
72604,,120,2019-06-24,Nissy,https://open.spotify.com/track/4cP6KmNvTFkLHZo...,Japan,top200,NEW_ENTRY,11942
169157,,167,2019-06-25,Nissy,https://open.spotify.com/track/4cP6KmNvTFkLHZo...,Japan,top200,MOVE_DOWN,10310
246763,,128,2019-06-26,Nissy,https://open.spotify.com/track/4cP6KmNvTFkLHZo...,Japan,top200,MOVE_UP,11620


In [56]:
# NaN data exploration: first rows with a missing 'artist'
dataframe.loc[artist_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
291605,NO GOOD,10,2020-07-13,,https://open.spotify.com/track/4Qnz8tARYhUtDNe...,Japan,viral50,NEW_ENTRY,
311398,NO GOOD,10,2020-07-14,,https://open.spotify.com/track/4Qnz8tARYhUtDNe...,Japan,viral50,SAME_POSITION,
335035,NO GOOD,10,2020-07-15,,https://open.spotify.com/track/4Qnz8tARYhUtDNe...,Japan,viral50,SAME_POSITION,


In [57]:
# NaN data exploration: first rows with a missing 'region'
dataframe.loc[region_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
498950,Tengo La Personalidad,47,2021-07-24,Chikybombom La Pantera,https://open.spotify.com/trac,,,,


In [58]:
# NaN data exploration: first rows with a missing 'chart'
dataframe.loc[chart_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
498950,Tengo La Personalidad,47,2021-07-24,Chikybombom La Pantera,https://open.spotify.com/trac,,,,


In [59]:
# NaN data exploration: first rows with a missing 'trend'
dataframe.loc[trend_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
498950,Tengo La Personalidad,47,2021-07-24,Chikybombom La Pantera,https://open.spotify.com/trac,,,,


In [60]:
# NaN data exploration: random rows with a missing 'streams' value
# random_state is fixed so the displayed sample is the same on every run of the notebook
dataframe[streams_is_nan].sample(5, random_state=42)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
215201,Would You Ever,4,2017-08-13,"Skrillex, Poo Bear",https://open.spotify.com/track/57p8CBvPOxrvyCb...,Greece,viral50,MOVE_UP,
61978,More Than Friends,13,2018-09-09,ASHS,https://open.spotify.com/track/1B5D54trx9aVufP...,Canada,viral50,MOVE_UP,
432008,Corazón,2,2017-12-19,"Maluma, Nego do Borel",https://open.spotify.com/track/4lESS6vuruP6a79...,Argentina,viral50,SAME_POSITION,
25745,Bad boy,16,2021-01-12,Marwa Loud,https://open.spotify.com/track/0RE4crnT3jRms1x...,El Salvador,viral50,SAME_POSITION,
417706,Shotta Flow,3,2019-02-17,NLE Choppa,https://open.spotify.com/track/4dAMdQ6g4kGmnc1...,Canada,viral50,MOVE_UP,


In [61]:
# checking if 'streams' contains any zero values
# 'streams' is still an object column holding strings at this point, so comparing it
# directly with the integer 0 can never match. Parse it to numbers first; values that
# cannot be parsed (and NaN) become NaN and therefore do not count as zero.

streams_is_null = pd.to_numeric(dataframe['streams'], errors='coerce') == 0

# number of rows whose stream count is exactly zero
int(streams_is_null.sum())



0

In [62]:
# replacing NaN values with zeros so that no rows are lost
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: that selection can be a temporary object, in which case the in-place fill
# never reaches `dataframe` (pandas warns about this chained pattern).

dataframe['streams'] = dataframe['streams'].fillna(value=0)

In [63]:
# changing the data type of 'streams' from object to int32
# (the column was read as object and was never float64; its values are converted to integers here)

dataframe['streams'] = dataframe['streams'].astype('int32')

In [64]:
# dropping the remaining records that still contain a NaN in any column
dataframe.dropna(axis=0, how='any', inplace=True)


In [65]:
# rechecking the number of NaN values after the cleaning steps

print('\n'.join(
    f'NaN values in {col}: {dataframe[col].isna().sum()}'
    for col in dataframe.columns
))

NaN values in title: 0
NaN values in rank: 0
NaN values in date: 0
NaN values in artist: 0
NaN values in url: 0
NaN values in region: 0
NaN values in chart: 0
NaN values in trend: 0
NaN values in streams: 0


In [66]:
# checking whether the cleaning steps (int32 'streams', dropped NaN rows) reduced
# the deep memory usage of the pandas dataframe
dataframe.info(memory_usage='deep')




<class 'pandas.core.frame.DataFrame'>
Int64Index: 26147923 entries, 0 to 498949
Data columns (total 9 columns):
 #   Column   Dtype   
---  ------   -----   
 0   title    category
 1   rank     int32   
 2   date     category
 3   artist   category
 4   url      category
 5   region   category
 6   chart    category
 7   trend    category
 8   streams  int32   
dtypes: category(7), int32(2)
memory usage: 881.2 MB
