### Importing Modules

In [294]:
import pandas as pd
import numpy as np
import os 

### 1. Importing Data

In [296]:
# Project root folder. It can be overridden with the SPOTIFY_DATA_PATH
# environment variable; when that variable is not set, the original local
# path is used, so existing setups keep working unchanged.
data_path = os.environ.get(
    'SPOTIFY_DATA_PATH',
    r'C:\Users\gerar\CareerFoundry Python\Spotify charts streaming analysis',
)

# index_col=False -> do not use the first CSV column as the index.
# low_memory=False -> read the file in one pass instead of in internal chunks.
df_spotify_raw = pd.read_csv(
    os.path.join(data_path, '02 Data', 'Original Data', 'final.csv'),
    index_col=False,
    low_memory=False,
)

In [297]:
# Number of (rows, columns) in the raw import.
df_spotify_raw.shape

(1787999, 36)

In [298]:
# Display settings: cap how many columns and rows pandas renders per output.
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)

In [299]:
# Preview the first five rows, transposed so that each column is shown as a row.
df_spotify_raw.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
uri,spotify:track:2gpQi3hbcUAcEG8m2dlgfB,spotify:track:2x8oBuYaObjqHqgGuIUZ0b,spotify:track:2SJZdZ5DLtlRosJ2xHJJJa,spotify:track:1O2pcBJGej0pmH2Y9XZMs6,spotify:track:1TpZKxGnHp37ohJRszTSiq
rank,1,2,3,5,6
artist_names,Paulo Londra,WOS,Paulo Londra,Cris Mj,Emilia
artists_num,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...
duration,178203.0,183547.0,204003.0,153750.0,133895.0
country,Argentina,Argentina,Argentina,Argentina,Argentina
region,South America,South America,South America,South America,South America
language,Spanish,Spanish,Spanish,Spanish,Spanish


In [300]:
# Inspect the data type pandas assigned to each column of the raw import.
df_spotify_raw.dtypes

Unnamed: 0       int64
uri             object
rank            object
artist_names    object
artists_num     object
                 ...  
duration        object
country         object
region          object
language        object
pivot           object
Length: 36, dtype: object

### 2. Cleaning Data

In [302]:
# Some rows are invalid: instead of real values they only contain the column
# names. A valid 'rank' consists of digits only, so every row whose 'rank' is
# not made up purely of digits is flagged here.
rank_is_digit = df_spotify_raw['rank'].map(lambda value: str(value).isdigit())
non_numeric_values = df_spotify_raw[~rank_is_digit]
print(non_numeric_values.head(5))


print('\nThis is the shape of wrong/blank entries:')
non_numeric_values.shape


        Unnamed: 0  uri  rank  artist_names  artists_num  artist_individual  \
29476        29476  uri  rank  artist_names  artists_num  artist_individual   
51224        51224  uri  rank  artist_names  artists_num  artist_individual   
75474        75474  uri  rank  artist_names  artists_num  artist_individual   
81333        81333  uri  rank  artist_names  artists_num  artist_individual   
104818      104818  uri  rank  artist_names  artists_num  artist_individual   

        artist_id  artist_genre  artist_img  collab  track_name  release_date  \
29476   artist_id  artist_genre  artist_img  collab  track_name  release_date   
51224   artist_id  artist_genre  artist_img  collab  track_name  release_date   
75474   artist_id  artist_genre  artist_img  collab  track_name  release_date   
81333   artist_id  artist_genre  artist_img  collab  track_name  release_date   
104818  artist_id  artist_genre  artist_img  collab  track_name  release_date   

        album_num_tracks  album_cover 

(73, 36)

In [303]:
# Remove the invalid rows, matched by their index labels
df_spotify_clean_1 = df_spotify_raw.drop(index=non_numeric_values.index)

print('Shape comparison:')
print(df_spotify_raw.shape, df_spotify_clean_1.shape, sep='\n')

Shape comparison:
(1787999, 36)
(1787926, 36)


In [304]:
# Remove columns that are not used in this analysis
columns_to_drop = ['artist_img', 'album_cover', 'source', 'Unnamed: 0', 'release_date']
df_spotify_clean_2 = df_spotify_clean_1.drop(columns=columns_to_drop)


In [305]:
# Find and remove NaN values

# Show the number of NaN values per column
print(df_spotify_clean_2.isna().sum())

# Drop every row that contains at least one NaN value.
# dropna already returns a new DataFrame, so no prior .copy() is needed.
df_spotify_clean_3 = df_spotify_clean_2.dropna(how='any')

# Compare the shape before and after dropping
print(' ')
print('Shape comparison')
print(df_spotify_clean_2.shape)
print(df_spotify_clean_3.shape)

uri                    0
rank                   0
artist_names           0
artists_num            0
artist_individual      0
                    ... 
duration             395
country                0
region                 0
language               0
pivot                  0
Length: 31, dtype: int64
 
Shape comparison
(1787926, 31)
(1787531, 31)


### 2.1 Correcting Data Types

In [307]:
# Preview the first five rows before the data types are corrected.
df_spotify_clean_3.head(5)

Unnamed: 0,uri,rank,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,previous_rank,weeks_on_chart,streams,week,...,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language,pivot
0,spotify:track:2gpQi3hbcUAcEG8m2dlgfB,1,Paulo Londra,1.0,Paulo Londra,spotify:artist:3vQ0GE3mI0dAaxIMYe5g7z,argentine hip hop,0,Plan A,1.0,1,1,4,3003411,2022-04-14,...,0.8340000000000001,0.0,1.0,-4.875,0.0444,0.0495,0.0,0.0658,0.557,173.935,178203.0,Argentina,South America,Spanish,0
1,spotify:track:2x8oBuYaObjqHqgGuIUZ0b,2,WOS,1.0,WOS,spotify:artist:5YCc6xS5Gpj3EkaYGdjyNK,argentine indie,0,ARRANCARMELO,1.0,2,129,2,2512175,2022-04-14,...,0.354,5.0,1.0,-7.358,0.0738,0.7240000000000001,0.0,0.134,0.262,81.956,183547.0,Argentina,South America,Spanish,0
2,spotify:track:2SJZdZ5DLtlRosJ2xHJJJa,3,Paulo Londra,1.0,Paulo Londra,spotify:artist:3vQ0GE3mI0dAaxIMYe5g7z,argentine hip hop,0,Chance,2.0,3,59,2,2408983,2022-04-14,...,0.463,1.0,0.0,-9.483,0.0646,0.241,0.0,0.0929,0.216,137.915,204003.0,Argentina,South America,Spanish,0
3,spotify:track:1O2pcBJGej0pmH2Y9XZMs6,5,Cris Mj,1.0,Cris Mj,spotify:artist:1Yj5Xey7kTwvZla8sqdsdE,urbano chileno,0,Una Noche en Medellín,1.0,5,5,8,2080139,2022-04-14,...,0.5479999999999999,10.0,0.0,-5.252999999999999,0.077,0.0924,4.6e-05,0.0534,0.8320000000000001,96.018,153750.0,Argentina,South America,Spanish,0
4,spotify:track:1TpZKxGnHp37ohJRszTSiq,6,Emilia,1.0,Emilia,spotify:artist:0AqlFI0tz2DsEoJlKSIiT9,pop argentino,0,cuatro veinte,1.0,6,9,3,1923270,2022-04-14,...,0.696,7.0,0.0,-3.817,0.0505,0.0811,6.25e-05,0.101,0.501,95.066,133895.0,Argentina,South America,Spanish,0


In [308]:
# Assign correct data types.
# Every column except 'week' is cast through one mapping and a single astype
# call; the target dtype of each column is the same as before.
column_dtypes = {
    'uri': 'str',
    'rank': 'int',
    'artist_names': 'str',
    'artists_num': 'float',
    'artist_individual': 'str',
    'artist_id': 'str',
    'artist_genre': 'str',
    'collab': 'int',
    'track_name': 'str',
    'album_num_tracks': 'float',
    'peak_rank': 'int',
    'previous_rank': 'int',
    'weeks_on_chart': 'int',
    'streams': 'int',
    'danceability': 'float',
    'energy': 'float',
    'key': 'float',
    'mode': 'float',
    'loudness': 'float',
    'speechiness': 'float',
    'acousticness': 'float',
    'instrumentalness': 'float',
    'liveness': 'float',
    'valence': 'float',
    'tempo': 'float',
    'duration': 'float',
    'country': 'str',
    'region': 'str',
    'language': 'str',
    'pivot': 'int',
}
df_spotify_clean_3 = df_spotify_clean_3.astype(column_dtypes)

# 'week' holds date values and is parsed with pd.to_datetime, as before.
df_spotify_clean_3['week'] = pd.to_datetime(df_spotify_clean_3['week'])


In [309]:
# Verify the data types after the conversion.
df_spotify_clean_3.dtypes

uri                   object
rank                   int32
artist_names          object
artists_num          float64
artist_individual     object
                      ...   
duration             float64
country               object
region                object
language              object
pivot                  int32
Length: 31, dtype: object

In [310]:
# Inspect a random sample of the entries dated before 2017.
# Finding from the analysis: the 2016 data is faulty. Specifically,
# 'weeks_on_chart' is always 1, which is impossible.
# Therefore the year 2016 cannot be used.
df_spotify_clean_3[df_spotify_clean_3['week']< '2017-01-01'].sample(n=40)

Unnamed: 0,uri,rank,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,previous_rank,weeks_on_chart,streams,week,...,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language,pivot
612724,spotify:track:3zJw3rugfpVrmBeDDnUYzy,87,"John Lennon, Yoko Ono",4.0,The Harlem Community Choir,spotify:artist:0vneleczrRRNPF1vj0x0vy,0,1,Happy Xmas (War Is Over) - Remastered 2010,15.0,87,-1,1,4885251,2016-12-29,...,0.607,2.0,1.0,-11.076,0.0315,0.3180,0.000000,0.7650,0.395,146.537,213880.0,Global,Global,Global,1
612774,spotify:track:2gbCG4Rt9984UZ7Tc0dIFd,142,"Michael Bublé, Shania Twain",2.0,Shania Twain,spotify:artist:5e4Dhzv426EvQe3aDb64jL,country dawn,1,White Christmas (with Shania Twain),16.0,142,-1,1,3512996,2016-12-29,...,0.484,10.0,1.0,-7.588,0.0319,0.4690,0.000012,0.1260,0.642,63.476,216627.0,Global,Global,Global,1
612708,spotify:track:4dASQiO1Eoo3RJvt74FtXB,78,"Lil Wayne, Wiz Khalifa, Imagine Dragons, X Amb...",6.0,Imagine Dragons,spotify:artist:53XhwfbYqKCa1cC15pYq2q,modern rock,1,"Sucker for Pain (with Wiz Khalifa, Imagine Dra...",1.0,78,-1,1,5183750,2016-12-29,...,0.786,9.0,0.0,-4.378,0.3170,0.2550,0.000000,0.6500,0.739,169.021,243491.0,Global,Global,Global,1
612807,spotify:track:2EEeOnHehOozLq4aS0n6SL,186,"KYLE, Lil Yachty",2.0,KYLE,spotify:artist:4qBgvVog0wzW75IQ48mU7v,pop rap,1,iSpy (feat. Lil Yachty),1.0,186,-1,1,2979074,2016-12-29,...,0.653,7.0,1.0,-6.745,0.2890,0.3780,0.000000,0.2290,0.672,75.016,253107.0,Global,Global,Global,0
612746,spotify:track:3VTNVsTTu05dmTsVFrmGpK,110,"The Pogues, Kirsty MacColl",2.0,The Pogues,spotify:artist:2wzMOQwNT6ZvVB4amvhFAH,celtic,1,Fairytale of New York (feat. Kirsty MacColl),19.0,110,-1,1,4245077,2016-12-29,...,0.551,2.0,1.0,-7.480,0.0457,0.5820,0.000042,0.2560,0.428,78.111,272467.0,Global,Global,Global,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612721,spotify:track:5uDASfU19gDxSjW8cnCaBp,86,"Rob $tone, J. Davi$, Spooks",3.0,J. Davi$,spotify:artist:1rs3y69kDwkIdGJcOYngQt,0,1,Chill Bill (feat. J. Davi$ & Spooks),1.0,86,-1,1,4888031,2016-12-29,...,0.427,6.0,1.0,-10.028,0.1450,0.0312,0.000990,0.0906,0.230,108.034,177184.0,Global,Global,Global,1
612796,spotify:track:0v9Wz8o0BT8DU38R4ddjeH,168,"Chance the Rapper, Lil Wayne, 2 Chainz",3.0,Chance the Rapper,spotify:artist:1anyVhU62p31KFi8MEzkbf,rap,1,No Problem (feat. Lil Wayne & 2 Chainz),14.0,168,-1,1,3153421,2016-12-29,...,0.795,11.0,0.0,-5.192,0.1740,0.1560,0.000000,0.1230,0.788,135.018,304607.0,Global,Global,Global,0
612702,spotify:track:4tCtwWceOPWzenK2HAIJSb,69,"Fifth Harmony, Ty Dolla $ign",2.0,Fifth Harmony,spotify:artist:1l8Fu6IkuTP0U5QetQJ5Xt,girl group,1,Work from Home (feat. Ty Dolla $ign),12.0,69,-1,1,5426035,2016-12-29,...,0.585,8.0,1.0,-5.861,0.0432,0.1030,0.000004,0.0644,0.593,105.017,214480.0,Global,Global,Global,0
577491,spotify:track:02WacdrRpm4zlP8H7X6bnQ,148,Calum Scott,1.0,Calum Scott,spotify:artist:6ydoSd3N2mwgwBHtF6K7eX,pop,0,Dancing On My Own,1.0,148,-1,1,3454666,2016-12-29,...,0.174,1.0,1.0,-8.796,0.0332,0.8510,0.000019,0.0855,0.193,112.175,260004.0,Global,Global,Global,0


**Remove year 2016**

In [312]:
# Keep only the entries from 2017 onwards.
# The explicit .copy() makes the result an independent DataFrame, so the later
# modifications of df_spotify_clean_3 (.loc assignments, column overwrites) do
# not trigger pandas' SettingWithCopyWarning.
df_spotify_clean_3 = df_spotify_clean_3[df_spotify_clean_3['week'] >= '2017-01-01'].copy()

**Check genre 0**

In [314]:
# Rows with genre '0' that stayed on the Global chart for more than 40 weeks
df_spotify_clean_3.loc[
    df_spotify_clean_3['artist_genre'].eq('0')
    & df_spotify_clean_3['weeks_on_chart'].gt(40)
    & df_spotify_clean_3['region'].eq('Global')
]

Unnamed: 0,uri,rank,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,previous_rank,weeks_on_chart,streams,week,...,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language,pivot
596625,spotify:track:1lXQskY1Op1KzFG4GVcVpe,177,"Natti Natasha, Ozuna",1.0,Trío Los Josefinos,spotify:artist:67mFhErdTWmJ9slGMC6vOf,0,0,Criminal,16.0,21,182,46,3856488,2018-08-02,...,0.794,2.0,0.0,-3.821,0.0689,0.0416,0.000060,0.1390,0.837,79.983,232699.0,Global,Global,Global,0
606240,spotify:track:02VBYrHfVwfEWXk5DXyf0T,188,"Bruno Mars, Anderson .Paak, Silk Sonic",3.0,Silk Sonic,spotify:artist:6PvvGcCY2XtUcSRld1Wilr,0,1,Leave The Door Open,10.0,5,143,55,5487002,2022-03-24,...,0.616,5.0,1.0,-7.964,0.0324,0.1820,0.000000,0.0927,0.719,148.088,242096.0,Global,Global,Global,1
608757,spotify:track:1LPSkqVhWVRUkKE03YUkpB,100,"Bruno Mars, Anderson .Paak, Silk Sonic",3.0,Silk Sonic,spotify:artist:6PvvGcCY2XtUcSRld1Wilr,0,1,Leave The Door Open,9.0,5,99,46,6940029,2022-01-20,...,0.616,5.0,1.0,-7.964,0.0324,0.1820,0.000000,0.0927,0.719,148.088,242096.0,Global,Global,Global,1
608798,spotify:track:4cG7HUWYHBV6R6tHn1gxrl,154,"Riton, Nightcrawlers, Mufasa & Hypeman, Dopamine",4.0,Dopamine,spotify:artist:3Edve4VIATi0OZngclQlkN,0,1,Friday (feat. Mufasa & Hypeman) - Dopamine Re-...,1.0,15,156,48,5712313,2022-01-20,...,0.862,2.0,1.0,-3.424,0.1260,0.0076,0.000132,0.3030,0.801,122.980,169153.0,Global,Global,Global,1
611603,spotify:track:02VBYrHfVwfEWXk5DXyf0T,197,"Bruno Mars, Anderson .Paak, Silk Sonic",3.0,Silk Sonic,spotify:artist:6PvvGcCY2XtUcSRld1Wilr,0,1,Leave The Door Open,10.0,5,188,56,5391575,2022-03-31,...,0.616,5.0,1.0,-7.964,0.0324,0.1820,0.000000,0.0927,0.719,148.088,242096.0,Global,Global,Global,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663844,spotify:track:4cG7HUWYHBV6R6tHn1gxrl,68,"Riton, Nightcrawlers, Mufasa & Hypeman, Dopamine",4.0,Dopamine,spotify:artist:3Edve4VIATi0OZngclQlkN,0,1,Friday (feat. Mufasa & Hypeman) - Dopamine Re-...,1.0,15,194,46,8046772,2022-01-06,...,0.862,2.0,1.0,-3.424,0.1260,0.0076,0.000132,0.3030,0.801,122.980,169153.0,Global,Global,Global,1
663847,spotify:track:1LPSkqVhWVRUkKE03YUkpB,69,"Bruno Mars, Anderson .Paak, Silk Sonic",3.0,Silk Sonic,spotify:artist:6PvvGcCY2XtUcSRld1Wilr,0,1,Leave The Door Open,9.0,5,110,44,7987911,2022-01-06,...,0.616,5.0,1.0,-7.964,0.0324,0.1820,0.000000,0.0927,0.719,148.088,242096.0,Global,Global,Global,1
664063,spotify:track:1LPSkqVhWVRUkKE03YUkpB,99,"Bruno Mars, Anderson .Paak, Silk Sonic",3.0,Silk Sonic,spotify:artist:6PvvGcCY2XtUcSRld1Wilr,0,1,Leave The Door Open,9.0,5,69,45,7181617,2022-01-13,...,0.616,5.0,1.0,-7.964,0.0324,0.1820,0.000000,0.0927,0.719,148.088,242096.0,Global,Global,Global,1
664109,spotify:track:4cG7HUWYHBV6R6tHn1gxrl,156,"Riton, Nightcrawlers, Mufasa & Hypeman, Dopamine",4.0,Dopamine,spotify:artist:3Edve4VIATi0OZngclQlkN,0,1,Friday (feat. Mufasa & Hypeman) - Dopamine Re-...,1.0,15,68,47,5741857,2022-01-13,...,0.862,2.0,1.0,-3.424,0.1260,0.0076,0.000132,0.3030,0.801,122.980,169153.0,Global,Global,Global,1


In [315]:
# Clean genres: overwrite 'artist_genre' with manually chosen values for
# specific artists and tracks.

# Corrections keyed by the exact 'artist_names' value.
genre_by_artist_names = {
    'Ke Personajes': 'cumbia',
    'Vang, Cloud 5': 'chill pop',
    'Lu, Willistic, datfitzx': 'r&b',
    'RYH': 'cumbia',
    'Orinn, Huy Vạc': 'sad lo-fi',
    'La T y La M': 'cumbia',
    'BoyWithUke': 'bedroom pop',
    'Kai Đinh, SIVAN': 'r&b',
    'Macklemore & Ryan Lewis, Macklemore, Ryan Lewis, Ray Dalton': 'hip hop',
    'HIEUTHUHAI, Harmonie': 'rap',
}

# Corrections keyed by the track 'uri'.
genre_by_uri = {
    'spotify:track:5stPVcRqb4qixbafP9e8lt': 'reggaeton',
    'spotify:track:5ZzNGwBBGaewMGG1LucGfp': 'trap latino',
    'spotify:track:57Ba26w9eoXx8BBUNgRNLV': 'reggaeton',
    'spotify:track:2UypFzxfaYgMUnQX2k4qtj': 'latin',
    'spotify:track:13Rf6RaZT13TW4xLQR3MQr': 'rap',
    'spotify:track:2q9udNV9NK0BL3q9p6TLxf': 'tropical',
    'spotify:track:2rzZ0IGxgrGUAPwrWn80z7': 'pop',
    'spotify:track:02VBYrHfVwfEWXk5DXyf0T': 'soul',
    'spotify:track:1LPSkqVhWVRUkKE03YUkpB': 'soul',
    'spotify:track:4cG7HUWYHBV6R6tHn1gxrl': 'dance pop',
}

# Artist-name corrections are applied first and URI corrections second, each in
# the listed order, so a row matching several rules ends up with the same genre
# as with the original one-statement-per-rule version.
for names_value, new_genre in genre_by_artist_names.items():
    df_spotify_clean_3.loc[df_spotify_clean_3['artist_names'] == names_value, 'artist_genre'] = new_genre

for track_uri, new_genre in genre_by_uri.items():
    df_spotify_clean_3.loc[df_spotify_clean_3['uri'] == track_uri, 'artist_genre'] = new_genre

In [316]:
# List the rows whose genre is still '0' after the manual corrections.
df_spotify_clean_3[df_spotify_clean_3['artist_genre'] == '0']

Unnamed: 0,uri,rank,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,previous_rank,weeks_on_chart,streams,week,...,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language,pivot
518,spotify:track:7GcN9q4AAH4uzQKvyuvByT,179,LA YOUNG,1.0,LA YOUNG,spotify:artist:2Xi0zO1N71Ff9KymMs93YX,0,0,Como Estrellas,1.0,179,180,2,189716,2021-04-22,...,0.495,6.0,1.0,-6.058,0.1100,0.764,0.000000,0.2610,0.752,170.044,219000.0,Argentina,South America,Spanish,0
1906,spotify:track:7CAdT0HdiQNlt1C7xk2hep,145,Jung Kook,1.0,Jung Kook,spotify:artist:6HaGTQPmzraVmaVxvz6EUc,0,0,Stay Alive (Prod. SUGA of BTS),1.0,145,-1,1,279533,2022-02-17,...,0.760,2.0,0.0,-5.889,0.0682,0.299,0.000000,0.1030,0.495,130.096,210928.0,Argentina,South America,Spanish,0
2313,spotify:track:4jl44FUrq3Mrexi05J7K7h,116,FUTURE BEATS,1.0,FUTURE BEATS,spotify:artist:6Y6LEiyvXx2tTKggz1jF5q,0,0,CHOCOPOP RKT - TURREO EDIT,1.0,116,-1,1,321900,2022-05-19,...,0.754,2.0,1.0,-2.040,0.2920,0.191,0.000000,0.1640,0.622,180.082,136046.0,Argentina,South America,Spanish,0
2863,spotify:track:4jl44FUrq3Mrexi05J7K7h,89,FUTURE BEATS,1.0,FUTURE BEATS,spotify:artist:6Y6LEiyvXx2tTKggz1jF5q,0,0,CHOCOPOP RKT - TURREO EDIT,1.0,89,116,2,444167,2022-05-26,...,0.754,2.0,1.0,-2.040,0.2920,0.191,0.000000,0.1640,0.622,180.082,136046.0,Argentina,South America,Spanish,0
5281,spotify:track:7GcN9q4AAH4uzQKvyuvByT,180,LA YOUNG,1.0,LA YOUNG,spotify:artist:2Xi0zO1N71Ff9KymMs93YX,0,0,Como Estrellas,1.0,180,-1,1,180772,2021-04-15,...,0.495,6.0,1.0,-6.058,0.1100,0.764,0.000000,0.2610,0.752,170.044,219000.0,Argentina,South America,Spanish,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787959,spotify:track:2MHGIiWus5xHykiZGuDtNe,158,"Sean, Lửa",2.0,Sean,spotify:artist:1xNqmjTeWon7iX8kbPKpZz,0,1,Em Thích,2.0,65,152,20,44740,2022-05-05,...,0.330,9.0,1.0,-12.065,0.2260,0.688,0.000000,0.1110,0.156,175.423,176151.0,Vietnam,Asia,Vietnamese,0
1787960,spotify:track:2MHGIiWus5xHykiZGuDtNe,158,"Sean, Lửa",2.0,Lửa,spotify:artist:4HOm37irsRwVEddNx6d0lq,0,1,Em Thích,2.0,65,152,20,44740,2022-05-05,...,0.330,9.0,1.0,-12.065,0.2260,0.688,0.000000,0.1110,0.156,175.423,176151.0,Vietnam,Asia,Vietnamese,1
1787967,spotify:track:1Uj89dqT3gdnnKZnaOMWI8,166,"TIA, fueled by boba",2.0,fueled by boba,spotify:artist:4NKeJPVrlhASY3FXkQsLGn,0,1,in the dark,2.0,156,172,5,43084,2022-05-05,...,0.382,9.0,0.0,-9.915,0.0505,0.422,0.000254,0.1180,0.209,179.799,194578.0,Vietnam,Asia,Vietnamese,1
1787983,spotify:track:1c0LbgYt51JeBUMAzofjkq,183,"TGSN, tlinh, RZ Mas",3.0,RZ Mas,spotify:artist:39UJlviUwzZGZrA8DiQy31,0,1,Siren,1.0,31,187,17,39591,2022-05-05,...,0.732,11.0,0.0,-6.092,0.1150,0.548,0.000001,0.2580,0.777,130.001,157067.0,Vietnam,Asia,Vietnamese,1


### 2. Basic Descriptive Analysis

In [318]:
# Summary statistics, transposed so that each column is shown as a row.
df_spotify_clean_3.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
rank,1787213.0,99.072611,1.0,49.0,98.0,149.0,200.0,57.740651
artists_num,1787213.0,2.244509,1.0,1.0,2.0,3.0,20.0,1.610716
collab,1787213.0,0.627659,0.0,0.0,1.0,1.0,1.0,0.483429
album_num_tracks,1787213.0,8.465175,1.0,1.0,7.0,14.0,199.0,8.550258
peak_rank,1787213.0,40.064027,1.0,5.0,21.0,61.0,200.0,45.608224
...,...,...,...,...,...,...,...,...
liveness,1787213.0,0.169375,0.0134,0.0922,0.118,0.209,0.99,0.126491
valence,1787213.0,0.553048,0.00001,0.391,0.558,0.729,0.992,0.222453
tempo,1787213.0,121.992354,31.262,96.699,119.932,141.026,232.018,30.165851
duration,1787213.0,211536.174058,30133.0,173948.0,202735.0,234000.0,1787030.0,58058.277548


### Final Adjustments Before Export

**Delete extra columns**

In [321]:
# Drop the extra columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: running it a second time is a no-op rather than a KeyError.
df_spotify_clean_3 = df_spotify_clean_3.drop(
    columns=['pivot', 'speechiness', 'rank', 'previous_rank'],
    errors='ignore',
)

**Convert milliseconds to seconds (rounded)**

In [323]:
# Convert 'duration' from milliseconds to whole seconds.
# NOTE: the column is overwritten, so running this cell twice divides twice.
df_spotify_clean_3['duration'] = df_spotify_clean_3['duration'].div(1000).round()


**Separate global and non-global entries**

In [325]:
# Split on 'country': rows labelled 'Global' versus all other rows.
df_spotify_clean_global = df_spotify_clean_3.loc[df_spotify_clean_3['country'].eq('Global')]

df_spotify_clean_no_global = df_spotify_clean_3.loc[df_spotify_clean_3['country'].ne('Global')]

# Random preview of the non-global subset
df_spotify_clean_no_global.sample(7)

Unnamed: 0,uri,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,weeks_on_chart,streams,week,danceability,energy,key,mode,loudness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language
1333096,spotify:track:7umEhzfURpGTSRbcX84qLy,Young Leosia,1.0,Young Leosia,spotify:artist:0iBTVnJ1Sff92zCDujfvyJ,polish pop,0,Szklanki,1.0,1,6,783987,2021-03-11,0.953,0.595,0.0,1.0,-8.01,0.235,0.0,0.0679,0.675,124.965,163.0,Poland,Europe,Polish
1221364,spotify:track:5mNOfozRJBZGnq4gO4rwM2,"Gur Sidhu, Jasmine Sandlas, Kaptaan",3.0,Kaptaan,spotify:artist:0F4kRjMBP6NrjpEBEoL0Xb,0,1,Bamb Agya,1.0,116,5,18151,2022-07-07,0.823,0.709,1.0,1.0,-3.937,0.0235,0.0,0.0834,0.499,93.012,200.0,Pakistan,Asia,Urdu
771273,spotify:track:3Kdlue3eVr9jf0Ns5wNPMj,"Jer 柳應廷, 香港兒童合唱團",2.0,香港兒童合唱團,spotify:artist:20GdoyuHyDOzJe7T2EcFqI,0,1,神奇之路 (電影《媽媽的神奇小子》主題曲),1.0,42,1,69180,2021-08-12,0.414,0.372,4.0,1.0,-9.048,0.841,0.0,0.147,0.188,79.866,333.0,Hong Kong,Asia,Cantonese
285669,spotify:track:2K1CQMoG2Dy5nmeOYnhoi8,"J Balvin, KAROL G, Nicky Jam, Crissin, Totoy E...",6.0,Natan & Shander,spotify:artist:5OBK3iQwjNQqElPmn4TgAE,reggaeton colombiano,1,Poblado - Remix,24.0,11,41,28598,2022-03-31,0.813,0.81,3.0,0.0,-5.382,0.112,2e-06,0.377,0.632,93.005,393.0,Costa Rica,Central America,Spanish
1249376,spotify:track:6b2HYgqcK9mvktt4GxAu72,"Justin Quiles, Chimbala, Zion & Lennox",3.0,Justin Quiles,spotify:artist:14zUHaJZo1mnYtn6IBRaRP,reggaeton,1,Loco,14.0,37,34,16285,2022-01-27,0.771,0.885,8.0,1.0,-3.66,0.222,0.0,0.0918,0.826,128.031,232.0,Panama,Central America,Spanish
1125552,spotify:track:4LaGu95Ui2s4vprSQYWUAZ,"Pop Smoke, 21 Savage, 42 Dugg",3.0,21 Savage,spotify:artist:1URnnhqYAYcrqrcwql10ft,hip hop,1,Bout A Million (feat. 42 Dugg & 21 Savage),20.0,64,1,314505,2021-07-22,0.716,0.67,0.0,0.0,-6.995,0.526,0.0,0.13,0.411,145.043,204.0,Netherlands,Europe,Dutch
1640954,spotify:track:2jyjhRf6DVbMPU5zxagN2h,Passenger,1.0,Passenger,spotify:artist:0gadJ2b9A4SKsB1RFkBb66,neo mellow,0,Let Her Go,12.0,117,32,8370,2021-02-18,0.509,0.538,7.0,1.0,-7.335,0.385,0.0,0.104,0.244,75.089,253.0,United Arab Emirates,Middle East,Arabic


In [326]:
# Random preview of ten rows from the full cleaned frame.
df_spotify_clean_3.sample(n=10)

Unnamed: 0,uri,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,weeks_on_chart,streams,week,danceability,energy,key,mode,loudness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language
456585,spotify:track:6K5BsR04ijf3FHNzjbaagD,"Myke Towers, Farruko, Arcangel, Sech, Zion",5.0,Farruko,spotify:artist:329e4yvIujISKGKz1BZZbO,reggaeton,1,Si Se Da - Remix,1.0,18,112,17639,2021-09-23,0.817,0.672,11.0,0.0,-5.612,0.441,0.0,0.13,0.774,93.999,332.0,El Salvador,Central America,Spanish
565431,spotify:track:3iw6V4LH7yPj1ESORX9RIN,"Nathan Evans, 220 KID, Billen Ted",3.0,Billen Ted,spotify:artist:5PoZtBo8xZKqPWlZrIDq82,0,1,Wellerman - Sea Shanty / 220 KID x Billen Ted ...,1.0,2,54,403775,2022-02-24,0.722,0.893,0.0,0.0,-3.255,0.0441,0.000937,0.0673,0.439,119.932,117.0,Germany,Europe,German
1295092,spotify:track:4gvrJnKCKIPiacNsWVQwEU,"CKay, Dj Yo!, AX'EL",3.0,CKay,spotify:artist:048LktY5zMnakWq7PTtFrz,afro r&b,1,love nwantiti (feat. Dj Yo! & AX'EL) - Remix,1.0,12,23,137359,2022-02-24,0.579,0.596,8.0,1.0,-4.823,0.436,0.0938,0.0549,0.443,119.9,188.0,Peru,South America,Spanish
638935,spotify:track:7E1jVNoWuemqUryI4FxsVD,"Ariana Grande, The Weeknd",2.0,Ariana Grande,spotify:artist:66CXWjxzNUsdJxJ2JdwvnR,pop,1,off the table (with The Weeknd),14.0,20,3,5308945,2020-11-19,0.411,0.522,0.0,1.0,-7.295,0.519,0.0,0.398,0.382,88.332,240.0,Global,Global,Global
1450689,spotify:track:52xJxFP6TqMuO4Yt0eOkMz,"Carolina Gaitán - La Gaita, Mauro Castillo, Ad...",7.0,Rhenzy Feliz,spotify:artist:2as15AH2BTrPk8v4gyElmr,0,1,We Don't Talk About Bruno,44.0,12,13,78223,2022-03-24,0.577,0.45,0.0,0.0,-8.516,0.357,0.0,0.111,0.83,205.863,216.0,Singapore,Asia,English
1170896,spotify:track:1s59X35jDULAyOGmBuTAnd,"Lenin Ramírez, Grupo Firme",2.0,Lenin Ramírez,spotify:artist:3hTffafUYLLgO4yuPAxb5U,nueva musica mexicana,1,Yo Ya No Vuelvo Contigo - En Vivo,1.0,140,10,6846,2021-05-13,0.643,0.35,4.0,0.0,-5.924,0.784,0.0,0.102,0.541,118.199,240.0,Nicaragua,Central America,Spanish
1694252,spotify:track:53bbEINuw0ez3ntLcqYCu2,SLAVA MARLOW,1.0,SLAVA MARLOW,spotify:artist:55jryyk7RhvMbrvoF0ndBh,russian hip hop,0,По Глазам,5.0,1,18,51248,2021-02-25,0.551,0.632,8.0,0.0,-5.873,0.15,0.0113,0.559,0.333,75.905,120.0,Ukraine,Europe,Ukrainian
1782591,spotify:track:4iDyFW4a7iCJh2VqSpwGQz,"TIA, Le Thien Hieu",2.0,TIA,spotify:artist:6LrBtADUmDoLlcTrg448Qt,v-pop,1,Ai Đưa Em Về,1.0,69,81,55245,2021-04-29,0.809,0.49,10.0,0.0,-7.08,0.839,0.0,0.117,0.353,99.967,233.0,Vietnam,Asia,Vietnamese
1517360,spotify:track:13XxxfnZmFEkU90QN8bOjz,"C.R.O, Duki, FMK",3.0,C.R.O,spotify:artist:4puAp107dCehraE47QXVQX,trap triste,1,INTERESTELAR,1.0,73,4,294636,2022-02-24,0.831,0.628,4.0,0.0,-4.353,0.0775,0.0,0.0905,0.467,90.006,173.0,Spain,Europe,Spanish
321704,spotify:track:7JYa8V4dGXSDcZnO2CiRaP,"Leaderbrain, Beyond",2.0,Beyond,spotify:artist:7KcIok6StqYSedgtjmtsqP,0,1,LOCA,1.0,82,5,4794,2022-01-20,0.722,0.719,6.0,1.0,-8.891,0.577,0.00179,0.173,0.419,104.982,164.0,Cyprus,Europe,Greek


In [327]:
# Random preview of ten rows from the global subset.
df_spotify_clean_global.sample(n=10)

Unnamed: 0,uri,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,weeks_on_chart,streams,week,danceability,energy,key,mode,loudness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language
646885,spotify:track:07MDkzWARZaLEdKxo6yArG,"THE ANXIETY, WILLOW, Tyler Cole",3.0,WILLOW,spotify:artist:3rWZHrfrsPBxVy692yAIxF,dance pop,1,Meet Me At Our Spot,10.0,24,8,14407480,2021-10-28,0.773,0.47,2.0,1.0,-7.93,0.0153,0.000193,0.0851,0.399,94.995,163.0,Global,Global,Global
664058,spotify:track:0G2zPzWqVjR68iNPmx2TBe,"Mora, Bad Bunny, Sech",3.0,Sech,spotify:artist:77ziqFxp5gaInVrF2lj4ht,panamanian pop,1,Volando - Remix,1.0,19,27,7329506,2022-01-13,0.659,0.688,6.0,1.0,-4.649,0.207,0.0,0.113,0.632,153.979,273.0,Global,Global,Global
618662,spotify:track:6mICuAdrwEjh6Y6lroV2Kg,"Shakira, Maluma",2.0,Shakira,spotify:artist:0EmeFodog0BfCgMzAIvKQp,latin,1,Chantaje (feat. Maluma),13.0,13,34,4454522,2017-08-17,0.852,0.773,8.0,0.0,-2.921,0.187,3e-05,0.159,0.907,102.034,196.0,Global,Global,Global
582575,spotify:track:2NMjggapJcXXM7WccGEBUO,Bad Bunny,1.0,Bad Bunny,spotify:artist:4q3ewBCX7sLwd24euuV69X,latin,0,Enséñame a Bailar,23.0,27,3,9872149,2022-05-26,0.81,0.789,8.0,1.0,-5.08,0.191,1.1e-05,0.467,0.77,105.009,176.0,Global,Global,Global
630613,spotify:track:4cG7HUWYHBV6R6tHn1gxrl,"Riton, Nightcrawlers, Mufasa & Hypeman, Dopamine",4.0,Mufasa & Hypeman,spotify:artist:4L2dV3zY7RmkeiNO035Fi0,dance pop,1,Friday (feat. Mufasa & Hypeman) - Dopamine Re-...,1.0,15,21,15479076,2021-07-15,0.824,0.862,2.0,1.0,-3.424,0.0076,0.000132,0.303,0.801,122.98,169.0,Global,Global,Global
615698,spotify:track:17Fd6Yb7mSbinKG8LoWfFl,"Mike Posner, Seeb",2.0,Mike Posner,spotify:artist:2KsP6tYLJlTBvSUxnwlVWa,edm,1,I Took A Pill In Ibiza - Seeb Remix,18.0,40,5,5548546,2017-01-26,0.663,0.713,7.0,0.0,-6.647,0.0353,8e-06,0.0843,0.69,101.965,198.0,Global,Global,Global
660716,spotify:track:6vN77lE9LK6HP2DewaN6HZ,"Lil Baby, Drake",2.0,Drake,spotify:artist:3TVXtAsR1Inumwj472S9r4,hip hop,1,Yes Indeed,17.0,12,20,7549538,2018-09-27,0.963,0.346,5.0,0.0,-9.309,0.0355,0.0,0.108,0.562,119.957,142.0,Global,Global,Global
615969,spotify:track:7DM4BPaS7uofFul3ywMe46,"Ricky Martin, Maluma",2.0,Ricky Martin,spotify:artist:7slfeZO9LsJbWgpkIoXBUJ,latin,1,Vente Pa' Ca (feat. Maluma),1.0,49,13,4908644,2017-03-23,0.663,0.92,11.0,0.0,-4.07,0.00431,1.7e-05,0.101,0.533,99.935,259.0,Global,Global,Global
641064,spotify:track:7DcvwMAiqKJQD1rrdfxSDx,"Young Thug, J. Cole, Travis Scott",3.0,Travis Scott,spotify:artist:0Y5tJX1MQlPlqiwlOH1tJY,hip hop,1,The London (feat. J. Cole & Travis Scott),1.0,6,1,22075360,2019-05-30,0.796,0.586,4.0,0.0,-6.946,0.0247,0.0,0.132,0.179,97.981,200.0,Global,Global,Global
659754,spotify:track:5R9KPXQVeyqsHZoPLMgfNy,"Matheus Fernandes, Dilsinho",2.0,Dilsinho,spotify:artist:4NUePmzDvCYqilXBFa91Hg,pagode novo,1,Baby Me Atende,1.0,172,1,5380533,2021-06-17,0.65,0.792,6.0,0.0,-3.825,0.607,2.4e-05,0.92,0.872,154.027,226.0,Global,Global,Global


### 3. Exporting Data

In [329]:
# Save both subsets as pickle files in the 'Prepared Data' folder.
prepared_dir = os.path.join(data_path, '02 Data', 'Prepared Data')

df_spotify_clean_no_global.to_pickle(os.path.join(prepared_dir, 'spotify_clean_no_global.pkl'))
df_spotify_clean_global.to_pickle(os.path.join(prepared_dir, 'spotify_clean_global.pkl'))


In [330]:
# Save both subsets as CSV files in the 'Prepared Data' folder.
# to_csv writes the DataFrame index as the first column by default.
prepared_dir = os.path.join(data_path, '02 Data', 'Prepared Data')

df_spotify_clean_no_global.to_csv(os.path.join(prepared_dir, 'spotify_clean_no_global.csv'))
df_spotify_clean_global.to_csv(os.path.join(prepared_dir, 'spotify_clean_global.csv'))


In [331]:
# Final look at the first rows of the exported global subset.
df_spotify_clean_global.head()

Unnamed: 0,uri,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,weeks_on_chart,streams,week,danceability,energy,key,mode,loudness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language
573718,spotify:track:6MWtB6iiXyIwun0YzU6DFP,Post Malone,1.0,Post Malone,spotify:artist:246dkjvS1zLTtiykXe5h60,rap,0,Wow.,1.0,2,4,29944242,2019-01-17,0.833,0.539,11.0,0.0,-7.399,0.163,2e-06,0.101,0.385,99.947,150.0,Global,Global,Global
573719,spotify:track:2rPE9A1vEgShuZxxzR2tZH,Ariana Grande,1.0,Ariana Grande,spotify:artist:66CXWjxzNUsdJxJ2JdwvnR,pop,0,"thank u, next",1.0,1,11,27807232,2019-01-17,0.724,0.647,1.0,1.0,-5.642,0.28,0.0,0.102,0.435,106.96,207.0,Global,Global,Global
573720,spotify:track:5p7ujcrUXASCNwRaWNHR1C,Halsey,1.0,Halsey,spotify:artist:26VFTg2z8YR0cCuwLzESi2,electropop,0,Without Me,1.0,4,15,27138512,2019-01-17,0.752,0.488,6.0,1.0,-7.05,0.297,9e-06,0.0936,0.533,136.041,202.0,Global,Global,Global
573721,spotify:track:25sgk305KZfyuqVBQIahim,Ava Max,1.0,Ava Max,spotify:artist:4npEfmQ6YuiwW1GpUmaq3F,pop,0,Sweet but Psycho,1.0,6,16,22400542,2019-01-17,0.719,0.704,1.0,1.0,-4.724,0.0691,0.0,0.166,0.628,133.002,187.0,Global,Global,Global
573722,spotify:track:1rqqCSm0Qe4I9rUvWncaom,Panic! At The Disco,1.0,Panic! At The Disco,spotify:artist:20JZFwl6HVl6yg8a4H3ZqK,modern rock,0,High Hopes,11.0,9,30,19643546,2019-01-17,0.579,0.904,5.0,1.0,-2.729,0.193,0.0,0.064,0.681,82.014,191.0,Global,Global,Global


In [332]:
# NOTE(review): this cell repeats the previous preview of the global subset.
df_spotify_clean_global.head()

Unnamed: 0,uri,artist_names,artists_num,artist_individual,artist_id,artist_genre,collab,track_name,album_num_tracks,peak_rank,weeks_on_chart,streams,week,danceability,energy,key,mode,loudness,acousticness,instrumentalness,liveness,valence,tempo,duration,country,region,language
573718,spotify:track:6MWtB6iiXyIwun0YzU6DFP,Post Malone,1.0,Post Malone,spotify:artist:246dkjvS1zLTtiykXe5h60,rap,0,Wow.,1.0,2,4,29944242,2019-01-17,0.833,0.539,11.0,0.0,-7.399,0.163,2e-06,0.101,0.385,99.947,150.0,Global,Global,Global
573719,spotify:track:2rPE9A1vEgShuZxxzR2tZH,Ariana Grande,1.0,Ariana Grande,spotify:artist:66CXWjxzNUsdJxJ2JdwvnR,pop,0,"thank u, next",1.0,1,11,27807232,2019-01-17,0.724,0.647,1.0,1.0,-5.642,0.28,0.0,0.102,0.435,106.96,207.0,Global,Global,Global
573720,spotify:track:5p7ujcrUXASCNwRaWNHR1C,Halsey,1.0,Halsey,spotify:artist:26VFTg2z8YR0cCuwLzESi2,electropop,0,Without Me,1.0,4,15,27138512,2019-01-17,0.752,0.488,6.0,1.0,-7.05,0.297,9e-06,0.0936,0.533,136.041,202.0,Global,Global,Global
573721,spotify:track:25sgk305KZfyuqVBQIahim,Ava Max,1.0,Ava Max,spotify:artist:4npEfmQ6YuiwW1GpUmaq3F,pop,0,Sweet but Psycho,1.0,6,16,22400542,2019-01-17,0.719,0.704,1.0,1.0,-4.724,0.0691,0.0,0.166,0.628,133.002,187.0,Global,Global,Global
573722,spotify:track:1rqqCSm0Qe4I9rUvWncaom,Panic! At The Disco,1.0,Panic! At The Disco,spotify:artist:20JZFwl6HVl6yg8a4H3ZqK,modern rock,0,High Hopes,11.0,9,30,19643546,2019-01-17,0.579,0.904,5.0,1.0,-2.729,0.193,0.0,0.064,0.681,82.014,191.0,Global,Global,Global
