# Spotify Data Visualization

## 1.1 Importing data and libraries

In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the complete Spotify charts export into memory as the raw working frame.
df_raw = pd.read_csv(filepath_or_buffer='spotify_charts.csv')

## 1.2 Data exploration and optimization stage

In [4]:
# The raw frame is large, so measure its real in-memory footprint before
# optimizing anything. 'deep' makes pandas inspect the contents of the object
# columns instead of only counting their pointers.
df_raw.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26147953 entries, 0 to 26147952
Data columns (total 9 columns):
 #   Column   Dtype  
---  ------   -----  
 0   title    object 
 1   rank     int64  
 2   date     object 
 3   artist   object 
 4   url      object 
 5   region   object 
 6   chart    object 
 7   trend    object 
 8   streams  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 13.1 GB


In [5]:
# Distinct (non-NaN) values per column. Columns with few distinct values
# relative to the row count are candidates for the `category` dtype.
for column_name, distinct_count in df_raw.nunique().items():
    print(f'unique values in {column_name}: {distinct_count}')

unique values in title: 164758
unique values in rank: 200
unique values in date: 1826
unique values in artist: 96115
unique values in url: 217644
unique values in region: 70
unique values in chart: 2
unique values in trend: 4
unique values in streams: 788013


In [6]:
# Shrink the frame. The object columns hold few distinct values compared with
# the total number of rows, so they are stored as `category`; `rank` is
# narrowed from int64 to int32.
for text_column in ('title', 'date', 'artist', 'url', 'region', 'chart', 'trend'):
    df_raw[text_column] = df_raw[text_column].astype('category')

df_raw['rank'] = df_raw['rank'].astype('int32')

In [7]:
# Number of missing values in every column.
for column_name, missing_count in df_raw.isna().sum().items():
    print(f'NaN values in {column_name}: {missing_count}')
    

NaN values in title: 11
NaN values in rank: 0
NaN values in date: 0
NaN values in artist: 18
NaN values in url: 0
NaN values in region: 1
NaN values in chart: 1
NaN values in trend: 1
NaN values in streams: 5826049


In [9]:
# Boolean row masks, one per column that reported missing values above.
# They are reused by the cells below to inspect the affected rows.
(
    title_is_nan,
    artist_is_nan,
    region_is_nan,
    chart_is_nan,
    trend_is_nan,
    streams_is_nan,
) = (
    df_raw[column_name].isna()
    for column_name in ('title', 'artist', 'region', 'chart', 'trend', 'streams')
)


In [11]:
# NaN data exploration: first rows whose title is missing.
df_raw.loc[title_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
7305002,,120,2019-06-24,Nissy,https://open.spotify.com/track/4cP6KmNvTFkLHZo...,Japan,top200,NEW_ENTRY,11942.0
7401555,,167,2019-06-25,Nissy,https://open.spotify.com/track/4cP6KmNvTFkLHZo...,Japan,top200,MOVE_DOWN,10310.0
7479161,,128,2019-06-26,Nissy,https://open.spotify.com/track/4cP6KmNvTFkLHZo...,Japan,top200,MOVE_UP,11620.0


In [12]:
# NaN data exploration: first rows whose artist is missing.
df_raw.loc[artist_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
20596664,NO GOOD,10,2020-07-13,,https://open.spotify.com/track/4Qnz8tARYhUtDNe...,Japan,viral50,NEW_ENTRY,
20616457,NO GOOD,10,2020-07-14,,https://open.spotify.com/track/4Qnz8tARYhUtDNe...,Japan,viral50,SAME_POSITION,
20640094,NO GOOD,10,2020-07-15,,https://open.spotify.com/track/4Qnz8tARYhUtDNe...,Japan,viral50,SAME_POSITION,


In [13]:
# NaN data exploration: first rows whose region is missing.
df_raw.loc[region_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
26147952,Tengo La Personalidad,47,2021-07-24,Chikybombom La Pantera,https://open.spotify.com/trac,,,,


In [14]:
# NaN data exploration: first rows whose chart is missing.
df_raw.loc[chart_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
26147952,Tengo La Personalidad,47,2021-07-24,Chikybombom La Pantera,https://open.spotify.com/trac,,,,


In [15]:
# NaN data exploration: first rows whose trend is missing.
df_raw.loc[trend_is_nan].head(3)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
26147952,Tengo La Personalidad,47,2021-07-24,Chikybombom La Pantera,https://open.spotify.com/trac,,,,


In [16]:
# NaN data exploration: random spot-check of rows with no stream count.
# The fixed random_state makes the displayed sample identical on every
# Restart & Run All instead of changing with each execution.
df_raw.loc[streams_is_nan].sample(5, random_state=42)

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
190071,There You Are,23,2017-08-10,Colin Chase,https://open.spotify.com/track/21eUnho2KAUmokP...,New Zealand,viral50,MOVE_UP,
12815232,No Te Vas,9,2018-03-29,Nacho,https://open.spotify.com/track/6yuArSnBrhykjkC...,Colombia,viral50,MOVE_UP,
11974630,Oye Pablo,48,2019-11-30,Danna Paola,https://open.spotify.com/track/4wJFExEZZo62tmw...,Ecuador,viral50,NEW_ENTRY,
25642391,GoodMorning (Spanish Version),24,2021-03-12,CRZ,https://open.spotify.com/track/51J9linsa2SjzT9...,Mexico,viral50,MOVE_DOWN,
13616541,Tie Me Down (with Elley Duhé),42,2018-08-31,Gryffin,https://open.spotify.com/track/4QVS8YCpK71R4Fs...,El Salvador,viral50,MOVE_UP,


In [17]:
# Count the rows whose stream count is exactly 0.
# Note: despite its name, this mask tests for the value zero, not for NaN.
streams_is_null = df_raw['streams'].eq(0)

int(streams_is_null.sum())



0

In [18]:
# Replace missing stream counts with 0 so that no rows have to be dropped.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df_raw['streams']`: that form is chained assignment
# through an inplace method, which warns on recent pandas and leaves `df_raw`
# unchanged when Copy-on-Write is enabled.
df_raw['streams'] = df_raw['streams'].fillna(0)

In [19]:
# Narrow `streams` from float64 to int32. The column must be free of NaN at
# this point, because NaN cannot be cast to an integer dtype.
df_raw['streams'] = df_raw['streams'].astype(dtype='int32')

In [20]:
# Drop every remaining row that still has a missing value in any column.
# The frame is modified in place, so `df_raw` keeps referring to the same object.
df_raw.dropna(axis=0, how='any', inplace=True)


In [21]:
# Re-count the missing values per column after the cleanup steps.
for column_name, missing_count in df_raw.isna().sum().items():
    print(f'NaN values in {column_name}: {missing_count}')

NaN values in title: 0
NaN values in rank: 0
NaN values in date: 0
NaN values in artist: 0
NaN values in url: 0
NaN values in region: 0
NaN values in chart: 0
NaN values in trend: 0
NaN values in streams: 0


In [22]:
# Measure the in-memory footprint again to see what the dtype changes saved.
df_raw.info(memory_usage="deep")




<class 'pandas.core.frame.DataFrame'>
Int64Index: 26147923 entries, 0 to 26147951
Data columns (total 9 columns):
 #   Column   Dtype   
---  ------   -----   
 0   title    category
 1   rank     int32   
 2   date     category
 3   artist   category
 4   url      category
 5   region   category
 6   chart    category
 7   trend    category
 8   streams  int32   
dtypes: category(7), int32(2)
memory usage: 880.0 MB
