In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

There are two files we need to wrangle: the song data from Spotify and the World Happiness data.

We will start by wrangling the Spotify data.

## Data Wrangling: Spotify data

In [2]:
# Load song data
songs_data = pd.read_csv('./universal_top_spotify_songs.csv')

In [3]:
# Check data summary
songs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715953 entries, 0 to 715952
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   spotify_id          715953 non-null  object 
 1   name                715926 non-null  object 
 2   artists             715926 non-null  object 
 3   daily_rank          715953 non-null  int64  
 4   daily_movement      715953 non-null  int64  
 5   weekly_movement     715953 non-null  int64  
 6   country             706296 non-null  object 
 7   snapshot_date       715953 non-null  object 
 8   popularity          715953 non-null  int64  
 9   is_explicit         715953 non-null  bool   
 10  duration_ms         715953 non-null  int64  
 11  album_name          715700 non-null  object 
 12  album_release_date  715700 non-null  object 
 13  danceability        715953 non-null  float64
 14  energy              715953 non-null  float64
 15  key                 715953 non-nul

`time_signature` and `key` are categorical variables, so let's change their types to `category`.

In [4]:
# Use the .astype() method to change time_signature and key types to category.
songs_data["time_signature"] = songs_data["time_signature"].astype("category")
songs_data["key"] = songs_data["key"].astype("category")

# Check info again
songs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715953 entries, 0 to 715952
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   spotify_id          715953 non-null  object  
 1   name                715926 non-null  object  
 2   artists             715926 non-null  object  
 3   daily_rank          715953 non-null  int64   
 4   daily_movement      715953 non-null  int64   
 5   weekly_movement     715953 non-null  int64   
 6   country             706296 non-null  object  
 7   snapshot_date       715953 non-null  object  
 8   popularity          715953 non-null  int64   
 9   is_explicit         715953 non-null  bool    
 10  duration_ms         715953 non-null  int64   
 11  album_name          715700 non-null  object  
 12  album_release_date  715700 non-null  object  
 13  danceability        715953 non-null  float64 
 14  energy              715953 non-null  float64 
 15  key              

### Checking for missing data

In [5]:
# Check for missing data
missing = songs_data.isnull().sum()
missing

spotify_id               0
name                    27
artists                 27
daily_rank               0
daily_movement           0
weekly_movement          0
country               9657
snapshot_date            0
popularity               0
is_explicit              0
duration_ms              0
album_name             253
album_release_date     253
danceability             0
energy                   0
key                      0
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness         0
liveness                 0
valence                  0
tempo                    0
time_signature           0
dtype: int64

In [6]:
# Remove entries where country is null

# Check percentage of entries where country is non-null vs. entries where country is null
missing_countries = songs_data[['country']].isnull().sum(axis=1)
missing_countries.value_counts()/len(missing_countries) * 100

0    98.651168
1     1.348832
Name: count, dtype: float64

In [7]:
# Drop entries in songs_data where country is null
songs_data = songs_data.dropna(subset=['country'])

In [8]:
# Check percentage counts again
missing_countries = songs_data[['country']].isnull().sum(axis=1)
missing_countries.value_counts()/len(missing_countries) * 100

0    100.0
Name: count, dtype: float64

### Adding country names using a dictionary

Let's check which countries are in the data set:

In [9]:
# Check which countries are in the data set
countries_array = songs_data['country'].unique()
countries_array

array(['ZA', 'VN', 'VE', 'UY', 'US', 'UA', 'TW', 'TR', 'TH', 'SV', 'SK',
       'SG', 'SE', 'SA', 'RO', 'PY', 'PT', 'PL', 'PK', 'PH', 'PE', 'PA',
       'NZ', 'NO', 'NL', 'NI', 'NG', 'MY', 'MX', 'MA', 'LV', 'LU', 'LT',
       'KZ', 'KR', 'JP', 'IT', 'IS', 'IN', 'IL', 'IE', 'ID', 'HU', 'HN',
       'HK', 'GT', 'GR', 'GB', 'FR', 'FI', 'ES', 'EG', 'EE', 'EC', 'DO',
       'DK', 'DE', 'CZ', 'CR', 'CO', 'CL', 'CH', 'CA', 'BY', 'BR', 'BO',
       'BG', 'BE', 'AU', 'AT', 'AR', 'AE'], dtype=object)

Since the countries are listed only as country codes, we will add the full names of the countries as well for clarity.

Then, we will copy the `country_code` column and use `country_dict` to map the country codes to their respective country names.

In [10]:
# Create a dictionary with the full names of countries.
country_dict = {'AF': 'Afghanistan', 'AX': 'Åland Islands', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa', 'AD': 'Andorra', 'AO': 'Angola', 'AI': 'Anguilla', 'AQ': 'Antarctica', 'AG': 'Antigua and Barbuda', 'AR': 'Argentina', 'AM': 'Armenia', 'AW': 'Aruba', 'AU': 'Australia', 'AT': 'Austria', 'AZ': 'Azerbaijan', 'BS': 'Bahamas (the)', 'BH': 'Bahrain', 'BD': 'Bangladesh', 'BB': 'Barbados', 'BY': 'Belarus', 'BE': 'Belgium', 'BZ': 'Belize', 'BJ': 'Benin', 'BM': 'Bermuda', 'BT': 'Bhutan', 'BO': 'Bolivia (Plurinational State of)', 'BQ': 'Bonaire, Sint Eustatius and Saba', 'BA': 'Bosnia and Herzegovina', 'BW': 'Botswana', 'BV': 'Bouvet Island', 'BR': 'Brazil', 'IO': 'British Indian Ocean Territory (the)', 'BN': 'Brunei Darussalam', 'BG': 'Bulgaria', 'BF': 'Burkina Faso', 'BI': 'Burundi', 'CV': 'Cabo Verde', 'KH': 'Cambodia', 'CM': 'Cameroon', 'CA': 'Canada', 'KY': 'Cayman Islands (the)', 'CF': 'Central African Republic (the)', 'TD': 'Chad', 'CL': 'Chile', 'CN': 'China', 'CX': 'Christmas Island', 'CC': 'Cocos (Keeling) Islands (the)', 'CO': 'Colombia', 'KM': 'Comoros (the)', 'CD': 'Congo (the Democratic Republic of the)', 'CG': 'Congo (the)', 'CK': 'Cook Islands (the)', 'CR': 'Costa Rica', 'CI': "Côte d'Ivoire", 'HR': 'Croatia', 'CU': 'Cuba', 'CW': 'Curaçao', 'CY': 'Cyprus', 'CZ': 'Czechia', 'DK': 'Denmark', 'DJ': 'Djibouti', 'DM': 'Dominica', 'DO': 'Dominican Republic (the)', 'EC': 'Ecuador', 'EG': 'Egypt', 'SV': 'El Salvador', 'GQ': 'Equatorial Guinea', 'ER': 'Eritrea', 'EE': 'Estonia', 'SZ': 'Eswatini', 'ET': 'Ethiopia', 'FK': 'Falkland Islands (the) [Malvinas]', 'FO': 'Faroe Islands (the)', 'FJ': 'Fiji', 'FI': 'Finland', 'FR': 'France', 'GF': 'French Guiana', 'PF': 'French Polynesia', 'TF': 'French Southern Territories (the)', 'GA': 'Gabon', 'GM': 'Gambia (the)', 'GE': 'Georgia', 'DE': 'Germany', 'GH': 'Ghana', 'GI': 'Gibraltar', 'GR': 'Greece', 'GL': 'Greenland', 'GD': 'Grenada', 'GP': 'Guadeloupe', 'GU': 'Guam', 'GT': 'Guatemala', 'GG': 'Guernsey', 'GN': 'Guinea', 'GW': 'Guinea-Bissau', 'GY': 'Guyana', 'HT': 'Haiti', 'HM': 'Heard Island and McDonald Islands', 'VA': 'Holy See (the)', 'HN': 'Honduras', 'HK': 'Hong Kong', 'HU': 'Hungary', 'IS': 'Iceland', 'IN': 'India', 'ID': 'Indonesia', 'IR': 'Iran (Islamic Republic of)', 'IQ': 'Iraq', 'IE': 'Ireland', 'IM': 'Isle of Man', 'IL': 'Israel', 'IT': 'Italy', 'JM': 'Jamaica', 'JP': 'Japan', 'JE': 'Jersey', 'JO': 'Jordan', 'KZ': 'Kazakhstan', 'KE': 'Kenya', 'KI': 'Kiribati', 'KP': "Korea (the Democratic People's Republic of)", 'KR': 'Korea (the Republic of)', 'KW': 'Kuwait', 'KG': 'Kyrgyzstan', 'LA': "Lao People's Democratic Republic (the)", 'LV': 'Latvia', 'LB': 'Lebanon', 'LS': 'Lesotho', 'LR': 'Liberia', 'LY': 'Libya', 'LI': 'Liechtenstein', 'LT': 'Lithuania', 'LU': 'Luxembourg', 'MO': 'Macao', 'MK': 'Republic of North Macedonia', 'MG': 'Madagascar', 'MW': 'Malawi', 'MY': 'Malaysia', 'MV': 'Maldives', 'ML': 'Mali', 'MT': 'Malta', 'MH': 'Marshall Islands (the)', 'MQ': 'Martinique', 'MR': 'Mauritania', 'MU': 'Mauritius', 'YT': 'Mayotte', 'MX': 'Mexico', 'FM': 'Micronesia (Federated States of)', 'MD': 'Moldova (the Republic of)', 'MC': 'Monaco', 'MN': 'Mongolia', 'ME': 'Montenegro', 'MS': 'Montserrat', 'MA': 'Morocco', 'MZ': 'Mozambique', 'MM': 'Myanmar', 'NA': 'Namibia', 'NR': 'Nauru', 'NP': 'Nepal', 'NL': 'Netherlands (the)', 'NC': 'New Caledonia', 'NZ': 'New Zealand', 'NI': 'Nicaragua', 'NE': 'Niger (the)', 'NG': 'Nigeria', 'NU': 'Niue', 'NF': 'Norfolk Island', 'MP': 'Northern Mariana Islands (the)', 'NO': 'Norway', 'OM': 'Oman', 'PK': 'Pakistan', 'PW': 'Palau', 'PS': 'Palestine, State of', 'PA': 'Panama', 'PG': 'Papua New Guinea', 'PY': 'Paraguay', 'PE': 'Peru', 'PH': 'Philippines (the)', 'PN': 'Pitcairn', 'PL': 'Poland', 'PT': 'Portugal', 'PR': 'Puerto Rico', 'QA': 'Qatar', 'RE': 'Réunion', 'RO': 'Romania', 'RU': 'Russian Federation (the)', 'RW': 'Rwanda', 'BL': 'Saint Barthélemy', 'SH': 'Saint Helena, Ascension and Tristan da Cunha', 'KN': 'Saint Kitts and Nevis', 'LC': 'Saint Lucia', 'MF': 'Saint Martin (French part)', 'PM': 'Saint Pierre and Miquelon', 'VC': 'Saint Vincent and the Grenadines', 'WS': 'Samoa', 'SM': 'San Marino', 'ST': 'Sao Tome and Principe', 'SA': 'Saudi Arabia', 'SN': 'Senegal', 'RS': 'Serbia', 'SC': 'Seychelles', 'SL': 'Sierra Leone', 'SG': 'Singapore', 'SX': 'Sint Maarten (Dutch part)', 'SK': 'Slovakia', 'SI': 'Slovenia', 'SB': 'Solomon Islands', 'SO': 'Somalia', 'ZA': 'South Africa', 'GS': 'South Georgia and the South Sandwich Islands', 'SS': 'South Sudan', 'ES': 'Spain', 'LK': 'Sri Lanka', 'SD': 'Sudan (the)', 'SR': 'Suriname', 'SJ': 'Svalbard and Jan Mayen', 'SE': 'Sweden', 'CH': 'Switzerland', 'SY': 'Syrian Arab Republic', 'TW': 'Taiwan (Province of China)', 'TJ': 'Tajikistan', 'TZ': 'Tanzania, United Republic of', 'TH': 'Thailand', 'TL': 'Timor-Leste', 'TG': 'Togo', 'TK': 'Tokelau', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia', 'TR': 'Turkey', 'TM': 'Turkmenistan', 'TC': 'Turks and Caicos Islands (the)', 'TV': 'Tuvalu', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates (the)', 'GB': 'United Kingdom of Great Britain and Northern Ireland (the)', 'UM': 'United States Minor Outlying Islands (the)', 'US': 'United States of America (the)', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu', 'VE': 'Venezuela (Bolivarian Republic of)', 'VN': 'Viet Nam', 'VG': 'Virgin Islands (British)', 'VI': 'Virgin Islands (U.S.)', 'WF': 'Wallis and Futuna', 'EH': 'Western Sahara', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'}

In [11]:
# Rename country column to country_code
songs_data_copy = songs_data
songs_data_copy.rename(columns={"country": "country_code"}, inplace = True)

# Confirm name change
songs_data_copy

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country_code,snapshot_date,popularity,is_explicit,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
50,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",1,1,0,ZA,2024-05-04,64,False,...,1,-12.937,0,0.0847,0.08790,0.016300,0.1240,0.786,113.056,4
51,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,2,-1,48,ZA,2024-05-04,85,True,...,1,-5.002,1,0.1100,0.04600,0.000000,0.0840,0.142,139.948,4
52,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",3,0,-1,ZA,2024-05-04,77,False,...,4,-11.427,0,0.0566,0.01030,0.178000,0.0187,0.505,112.014,4
53,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",4,0,-1,ZA,2024-05-04,68,False,...,6,-9.686,0,0.1120,0.17900,0.001260,0.1820,0.795,113.001,4
54,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",5,0,0,ZA,2024-05-04,64,False,...,1,-7.627,1,0.0477,0.00141,0.059500,0.0661,0.400,111.978,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715948,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",46,4,0,AE,2023-10-18,84,True,...,5,-9.243,0,0.0502,0.50800,0.000000,0.2590,0.105,88.880,3
715949,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",47,3,0,AE,2023-10-18,80,True,...,10,-5.060,1,0.0452,0.05850,0.000000,0.1320,0.476,121.879,4
715950,26b3oVLrRUaaybJulow9kz,People,Libianca,48,2,0,AE,2023-10-18,88,False,...,10,-7.621,0,0.0678,0.55100,0.000013,0.1020,0.693,124.357,5
715951,5ydjxBSUIDn26MFzU3asP4,Rainy Days,V,49,1,0,AE,2023-10-18,88,False,...,9,-8.016,0,0.0875,0.73900,0.000000,0.1480,0.282,74.828,4


In [12]:
# Duplicate country_code column
songs_data_copy['country'] = songs_data_copy['country_code']

# Map country codes to country names
songs_data_copy = songs_data_copy.replace({"country": country_dict})

# Confirm successful mapping
songs_data_copy

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country_code,snapshot_date,popularity,is_explicit,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country
50,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",1,1,0,ZA,2024-05-04,64,False,...,-12.937,0,0.0847,0.08790,0.016300,0.1240,0.786,113.056,4,South Africa
51,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,2,-1,48,ZA,2024-05-04,85,True,...,-5.002,1,0.1100,0.04600,0.000000,0.0840,0.142,139.948,4,South Africa
52,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",3,0,-1,ZA,2024-05-04,77,False,...,-11.427,0,0.0566,0.01030,0.178000,0.0187,0.505,112.014,4,South Africa
53,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",4,0,-1,ZA,2024-05-04,68,False,...,-9.686,0,0.1120,0.17900,0.001260,0.1820,0.795,113.001,4,South Africa
54,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",5,0,0,ZA,2024-05-04,64,False,...,-7.627,1,0.0477,0.00141,0.059500,0.0661,0.400,111.978,4,South Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715948,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",46,4,0,AE,2023-10-18,84,True,...,-9.243,0,0.0502,0.50800,0.000000,0.2590,0.105,88.880,3,United Arab Emirates (the)
715949,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",47,3,0,AE,2023-10-18,80,True,...,-5.060,1,0.0452,0.05850,0.000000,0.1320,0.476,121.879,4,United Arab Emirates (the)
715950,26b3oVLrRUaaybJulow9kz,People,Libianca,48,2,0,AE,2023-10-18,88,False,...,-7.621,0,0.0678,0.55100,0.000013,0.1020,0.693,124.357,5,United Arab Emirates (the)
715951,5ydjxBSUIDn26MFzU3asP4,Rainy Days,V,49,1,0,AE,2023-10-18,88,False,...,-8.016,0,0.0875,0.73900,0.000000,0.1480,0.282,74.828,4,United Arab Emirates (the)


### Changing spelling of country names

To ensure that our Spotify data is compatible with our World Health data, let's change some country names.

In [13]:
songs_data_copy['country'].unique()

array(['South Africa', 'Viet Nam', 'Venezuela (Bolivarian Republic of)',
       'Uruguay', 'United States of America (the)', 'Ukraine',
       'Taiwan (Province of China)', 'Turkey', 'Thailand', 'El Salvador',
       'Slovakia', 'Singapore', 'Sweden', 'Saudi Arabia', 'Romania',
       'Paraguay', 'Portugal', 'Poland', 'Pakistan', 'Philippines (the)',
       'Peru', 'Panama', 'New Zealand', 'Norway', 'Netherlands (the)',
       'Nicaragua', 'Nigeria', 'Malaysia', 'Mexico', 'Morocco', 'Latvia',
       'Luxembourg', 'Lithuania', 'Kazakhstan', 'Korea (the Republic of)',
       'Japan', 'Italy', 'Iceland', 'India', 'Israel', 'Ireland',
       'Indonesia', 'Hungary', 'Honduras', 'Hong Kong', 'Guatemala',
       'Greece',
       'United Kingdom of Great Britain and Northern Ireland (the)',
       'France', 'Finland', 'Spain', 'Egypt', 'Estonia', 'Ecuador',
       'Dominican Republic (the)', 'Denmark', 'Germany', 'Czechia',
       'Costa Rica', 'Colombia', 'Chile', 'Switzerland', 'Canada',
   

In [14]:
name_changes = {'Bolivia (Plurinational State of)':'Bolivia',
 'Dominican Republic (the)':'Dominican Republic',
 'Korea (the Republic of)':'South Korea',
 'Netherlands (the)':'Netherlands',
 'Philippines (the)':'Philippines',
 'Taiwan (Province of China)':'Taiwan',
 'United Arab Emirates (the)':'United Arab Emirates',
 'United Kingdom of Great Britain and Northern Ireland (the)':'United Kingdom',
 'United States of America (the)':'United States',
 'Venezuela (Bolivarian Republic of)':'Venezuela',
 'Viet Nam':'Vietnam'}

In [15]:
# Change country names
songs_data_copy = songs_data_copy.replace({"country": name_changes})

In [16]:
# Confirm data manipulatoin successful
sorted(songs_data_copy['country'].unique())

['Argentina',
 'Australia',
 'Austria',
 'Belarus',
 'Belgium',
 'Bolivia',
 'Brazil',
 'Bulgaria',
 'Canada',
 'Chile',
 'Colombia',
 'Costa Rica',
 'Czechia',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Finland',
 'France',
 'Germany',
 'Greece',
 'Guatemala',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Ireland',
 'Israel',
 'Italy',
 'Japan',
 'Kazakhstan',
 'Latvia',
 'Lithuania',
 'Luxembourg',
 'Malaysia',
 'Mexico',
 'Morocco',
 'Netherlands',
 'New Zealand',
 'Nicaragua',
 'Nigeria',
 'Norway',
 'Pakistan',
 'Panama',
 'Paraguay',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Romania',
 'Saudi Arabia',
 'Singapore',
 'Slovakia',
 'South Africa',
 'South Korea',
 'Spain',
 'Sweden',
 'Switzerland',
 'Taiwan',
 'Thailand',
 'Turkey',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'Uruguay',
 'Venezuela',
 'Vietnam']

In [17]:
# Data manipulation successful, so save back to songs_data.
songs_data = songs_data_copy

# Confirm
songs_data

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country_code,snapshot_date,popularity,is_explicit,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country
50,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",1,1,0,ZA,2024-05-04,64,False,...,-12.937,0,0.0847,0.08790,0.016300,0.1240,0.786,113.056,4,South Africa
51,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,2,-1,48,ZA,2024-05-04,85,True,...,-5.002,1,0.1100,0.04600,0.000000,0.0840,0.142,139.948,4,South Africa
52,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",3,0,-1,ZA,2024-05-04,77,False,...,-11.427,0,0.0566,0.01030,0.178000,0.0187,0.505,112.014,4,South Africa
53,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",4,0,-1,ZA,2024-05-04,68,False,...,-9.686,0,0.1120,0.17900,0.001260,0.1820,0.795,113.001,4,South Africa
54,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",5,0,0,ZA,2024-05-04,64,False,...,-7.627,1,0.0477,0.00141,0.059500,0.0661,0.400,111.978,4,South Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715948,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",46,4,0,AE,2023-10-18,84,True,...,-9.243,0,0.0502,0.50800,0.000000,0.2590,0.105,88.880,3,United Arab Emirates
715949,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",47,3,0,AE,2023-10-18,80,True,...,-5.060,1,0.0452,0.05850,0.000000,0.1320,0.476,121.879,4,United Arab Emirates
715950,26b3oVLrRUaaybJulow9kz,People,Libianca,48,2,0,AE,2023-10-18,88,False,...,-7.621,0,0.0678,0.55100,0.000013,0.1020,0.693,124.357,5,United Arab Emirates
715951,5ydjxBSUIDn26MFzU3asP4,Rainy Days,V,49,1,0,AE,2023-10-18,88,False,...,-8.016,0,0.0875,0.73900,0.000000,0.1480,0.282,74.828,4,United Arab Emirates


### World Regions

It might be helpful for our analysis to classify our countries into world regions.

We will use the Maddison Project Database definitions of eight world regions.

In [18]:
# Load Maddison Project data
mpd = pd.read_csv('./mpd2023_web.csv')

In [19]:
# Again, let's change the names of some countries to ensure that they match between the dataframes.

# Copy the mpd dataframe
mpd_copy = mpd

# Create a dictionary of country name corrections
country_corrections = {
    "Bolivia (Plurinational State of)": "Bolivia",
    "China, Hong Kong SAR": "Hong Kong",
    "Czech Republic": "Czechia",
    "D.P.R. of Korea": "North Korea",
    "Lao People's DR": "Laos",
    "Republic of Korea": "South Korea",
    "Russian Federation": "Russia",
    "State of Palestine": "Palestine",
    "Syrian Arab Republic": "Syria",
    "Taiwan, Province of China": "Taiwan",
    "TFYR of Macedonia": "North Macedonia",
    "United Kingdom": "United Kingdom",
    "United Republic of Tanzania": "Tanzania",
    "United States of America": "United States",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Viet Nam": "Vietnam"
}

# Change country names
mpd_copy = mpd_copy.replace({"country": country_corrections})

In [20]:
# Confirm changes
mpd_copy

Unnamed: 0,country,region
0,Afghanistan,South and South East Asia
1,Angola,Sub Saharan Africa
2,Albania,Eastern Europe
3,United Arab Emirates,Middle East and North Africa
4,Argentina,Latin America
...,...,...
164,Yemen,Middle East and North Africa
165,Former Yugoslavia,Eastern Europe
166,South Africa,Sub Saharan Africa
167,Zambia,Sub Saharan Africa


In [21]:
# Copy back to original dataframe
mpd = mpd_copy

In [22]:
# Check if we are missing any country classifications by creating a list comprehension
songs_data_countries = sorted(songs_data["country"].unique())
mpd_countries = sorted(mpd["country"])

common_countries = [x for x in mpd_countries if x in songs_data_countries]
missing_countries = [x for x in songs_data_countries if x not in common_countries]
missing_countries

[]

Confirmed that all countries from our songs_data dataframe are in our mpd dataframe. We are now ready to create a dictionary from this dataframe.

In [23]:
# Create dictionary from mpd using a dictionary comprehension
mpd_dict = {k:v for (k,v) in zip(mpd["country"], mpd["region"])}

# Check
print(mpd_dict)

{'Afghanistan': 'South and South East Asia', 'Angola': 'Sub Saharan Africa', 'Albania': 'Eastern Europe', 'United Arab Emirates': 'Middle East and North Africa', 'Argentina': 'Latin America', 'Armenia': 'Eastern Europe', 'Australia': 'Western Offshoots', 'Austria': 'Western Europe', 'Azerbaijan': 'Eastern Europe', 'Burundi': 'Sub Saharan Africa', 'Belgium': 'Western Europe', 'Benin': 'Sub Saharan Africa', 'Burkina Faso': 'Sub Saharan Africa', 'Bangladesh': 'South and South East Asia', 'Bulgaria': 'Eastern Europe', 'Bahrain': 'Middle East and North Africa', 'Bosnia and Herzegovina': 'Eastern Europe', 'Belarus': 'Eastern Europe', 'Bolivia': 'Latin America', 'Brazil': 'Latin America', 'Barbados': 'Latin America', 'Botswana': 'Sub Saharan Africa', 'Central African Republic': 'Sub Saharan Africa', 'Canada': 'Western Offshoots', 'Switzerland': 'Western Europe', 'Chile': 'Latin America', 'China': 'East Asia', "Côte d'Ivoire": 'Sub Saharan Africa', 'Cameroon': 'Sub Saharan Africa', 'D.R. of th

Let's add this dictionary to our dataframe so we can perform region-based analysis.

In [24]:
# Create a copy of the dataframe to minimize error.
songs_data_copy = songs_data

# Create our region column by first duplicating our country column
songs_data_copy["region"] = songs_data_copy["country"]

# Confirm
songs_data_copy

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country_code,snapshot_date,popularity,is_explicit,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country,region
50,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",1,1,0,ZA,2024-05-04,64,False,...,0,0.0847,0.08790,0.016300,0.1240,0.786,113.056,4,South Africa,South Africa
51,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,2,-1,48,ZA,2024-05-04,85,True,...,1,0.1100,0.04600,0.000000,0.0840,0.142,139.948,4,South Africa,South Africa
52,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",3,0,-1,ZA,2024-05-04,77,False,...,0,0.0566,0.01030,0.178000,0.0187,0.505,112.014,4,South Africa,South Africa
53,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",4,0,-1,ZA,2024-05-04,68,False,...,0,0.1120,0.17900,0.001260,0.1820,0.795,113.001,4,South Africa,South Africa
54,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",5,0,0,ZA,2024-05-04,64,False,...,1,0.0477,0.00141,0.059500,0.0661,0.400,111.978,4,South Africa,South Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715948,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",46,4,0,AE,2023-10-18,84,True,...,0,0.0502,0.50800,0.000000,0.2590,0.105,88.880,3,United Arab Emirates,United Arab Emirates
715949,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",47,3,0,AE,2023-10-18,80,True,...,1,0.0452,0.05850,0.000000,0.1320,0.476,121.879,4,United Arab Emirates,United Arab Emirates
715950,26b3oVLrRUaaybJulow9kz,People,Libianca,48,2,0,AE,2023-10-18,88,False,...,0,0.0678,0.55100,0.000013,0.1020,0.693,124.357,5,United Arab Emirates,United Arab Emirates
715951,5ydjxBSUIDn26MFzU3asP4,Rainy Days,V,49,1,0,AE,2023-10-18,88,False,...,0,0.0875,0.73900,0.000000,0.1480,0.282,74.828,4,United Arab Emirates,United Arab Emirates


In [25]:
# Map regions to country names
songs_data_copy = songs_data_copy.replace({"region": mpd_dict})

# Confirm
songs_data_copy.head()

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country_code,snapshot_date,popularity,is_explicit,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country,region
50,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",1,1,0,ZA,2024-05-04,64,False,...,0,0.0847,0.0879,0.0163,0.124,0.786,113.056,4,South Africa,Sub Saharan Africa
51,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,2,-1,48,ZA,2024-05-04,85,True,...,1,0.11,0.046,0.0,0.084,0.142,139.948,4,South Africa,Sub Saharan Africa
52,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",3,0,-1,ZA,2024-05-04,77,False,...,0,0.0566,0.0103,0.178,0.0187,0.505,112.014,4,South Africa,Sub Saharan Africa
53,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",4,0,-1,ZA,2024-05-04,68,False,...,0,0.112,0.179,0.00126,0.182,0.795,113.001,4,South Africa,Sub Saharan Africa
54,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",5,0,0,ZA,2024-05-04,64,False,...,1,0.0477,0.00141,0.0595,0.0661,0.4,111.978,4,South Africa,Sub Saharan Africa


In [26]:
# Mapping successful; copy back to songs_data
songs_data = songs_data_copy

### Dropping columns

Since we aren't interested in daily_rank, daily_movement, weekly_movement, or snapshot_date, let's drop these columns:

In [27]:
drop_cols = ["daily_rank", "daily_movement", "weekly_movement", "snapshot_date"]
songs_data_df = songs_data.drop(drop_cols, axis=1)

# Confirm
songs_data_df.sort_values(by=["country", "name"])

Unnamed: 0,spotify_id,name,artists,country_code,popularity,is_explicit,duration_ms,album_name,album_release_date,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country,region
610233,2wjzgkfw4MqYOtPAnREHoL,24/7 6.5,"YSY A, Jere Klein, ONIRIA",AR,63,False,137559,EL AFTER DEL AFTER,2023-11-11,0.925,...,1,0.2450,0.1870,0.000113,0.0928,0.639,129.985,4,Argentina,Latin America
613873,2wjzgkfw4MqYOtPAnREHoL,24/7 6.5,"YSY A, Jere Klein, ONIRIA",AR,60,False,137559,EL AFTER DEL AFTER,2023-11-11,0.925,...,1,0.2450,0.1870,0.000113,0.0928,0.639,129.985,4,Argentina,Latin America
3554,0GVPemmAwkXhFlYimhdDr3,30 GRADOS,"El Turko, Mandale Flow",AR,80,True,151441,30 Grados,2024-01-26,0.676,...,1,0.3380,0.0185,0.000000,0.0761,0.369,98.325,4,Argentina,Latin America
7204,0GVPemmAwkXhFlYimhdDr3,30 GRADOS,"El Turko, Mandale Flow",AR,80,True,151441,30 Grados,2024-01-26,0.676,...,1,0.3380,0.0185,0.000000,0.0761,0.369,98.325,4,Argentina,Latin America
10852,0GVPemmAwkXhFlYimhdDr3,30 GRADOS,"El Turko, Mandale Flow",AR,80,True,151441,30 Grados,2024-01-26,0.676,...,1,0.3380,0.0185,0.000000,0.0761,0.369,98.325,4,Argentina,Latin America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697872,08ULi904W2Po6pVj8nN7KC,đưa em về nhàa,"GREY D, Chillies",VN,67,False,240000,đưa em về nhàa,2023-04-24,0.664,...,1,0.0391,0.6740,0.000015,0.3530,0.644,90.960,4,Vietnam,South and South East Asia
701526,08ULi904W2Po6pVj8nN7KC,đưa em về nhàa,"GREY D, Chillies",VN,66,False,240000,đưa em về nhàa,2023-04-24,0.664,...,1,0.0391,0.6740,0.000015,0.3530,0.644,90.960,4,Vietnam,South and South East Asia
705172,08ULi904W2Po6pVj8nN7KC,đưa em về nhàa,"GREY D, Chillies",VN,66,False,240000,đưa em về nhàa,2023-04-24,0.664,...,1,0.0391,0.6740,0.000015,0.3530,0.644,90.960,4,Vietnam,South and South East Asia
708806,08ULi904W2Po6pVj8nN7KC,đưa em về nhàa,"GREY D, Chillies",VN,66,False,240000,đưa em về nhàa,2023-04-24,0.664,...,1,0.0391,0.6740,0.000015,0.3530,0.644,90.960,4,Vietnam,South and South East Asia


### Remove duplicate songs

Note that the countries in our Spotify data may have some duplicate song entries. These duplicates still contribute valuable data, but their popularity may vary from one snapshot to the next, so let's average their popularity before removing duplicates. Then, we will add them back in.

In [28]:
songs_data_df.columns

Index(['spotify_id', 'name', 'artists', 'country_code', 'popularity',
       'is_explicit', 'duration_ms', 'album_name', 'album_release_date',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'country', 'region'],
      dtype='object')

In [29]:
# Create a copy of the dataframe before manipulating
songs_data_df_copy = songs_data_df

In [30]:
# Drop duplicates in copy and count the rows
songs_data_df_copy = songs_data_df_copy.drop_duplicates(subset=['country', 'spotify_id'])
songs_data_df_copy

Unnamed: 0,spotify_id,name,artists,country_code,popularity,is_explicit,duration_ms,album_name,album_release_date,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country,region
50,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",ZA,64,False,377920,Funk Series,2024-04-05,0.900,...,0,0.0847,0.08790,0.01630,0.1240,0.786,113.056,4,South Africa,Sub Saharan Africa
51,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,ZA,85,True,383639,euphoria,2024-04-30,0.831,...,1,0.1100,0.04600,0.00000,0.0840,0.142,139.948,4,South Africa,Sub Saharan Africa
52,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",ZA,77,False,390000,"Tshwala Bam (feat. S.N.E, EeQue)",2024-02-23,0.857,...,0,0.0566,0.01030,0.17800,0.0187,0.505,112.014,4,South Africa,Sub Saharan Africa
53,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",ZA,68,False,351200,Isimo,2023-10-27,0.806,...,0,0.1120,0.17900,0.00126,0.1820,0.795,113.001,4,South Africa,Sub Saharan Africa
54,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",ZA,64,False,285034,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)",2024-04-17,0.845,...,1,0.0477,0.00141,0.05950,0.0661,0.400,111.978,4,South Africa,Sub Saharan Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
712160,5W4kiM2cUYBJXKRudNyxjW,You Proof,Morgan Wallen,AU,85,False,157477,One Thing At A Time,2023-03-03,0.732,...,1,0.0345,0.26500,0.00000,0.6020,0.629,119.724,4,Australia,Western Offshoots
712260,4qSEvFGCpde73gqIuq3sho,HIBIKI,"Bad Bunny, Mora",AR,89,True,208000,nadie sabe lo que va a pasar mañana,2023-10-13,0.801,...,0,0.0706,0.60400,0.00000,0.1180,0.528,119.935,4,Argentina,Latin America
712307,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",AE,84,True,310490,For All The Dogs,2023-10-06,0.483,...,0,0.0502,0.50800,0.00000,0.2590,0.105,88.880,3,United Arab Emirates,Middle East and North Africa
712308,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",AE,80,True,173253,SET IT OFF,2023-10-13,0.773,...,1,0.0452,0.05850,0.00000,0.1320,0.476,121.879,4,United Arab Emirates,Middle East and North Africa


In [31]:
gk = songs_data_df.groupby(['country', 'spotify_id'])['popularity'].mean()
gk = gk.reset_index()

In [32]:
gk

Unnamed: 0,country,spotify_id,popularity
0,Argentina,00THZGfbizNvMUkJsQs74K,73.447761
1,Argentina,045ZeOHPIzhxxsm8bq5kyE,0.000000
2,Argentina,04sktg3deiYUweHfbFUZTM,59.755682
3,Argentina,05msZuGKP3OCUGQnvLBOf4,0.000000
4,Argentina,08809bzGs69SfjOFdeyssu,72.175439
...,...,...,...
22869,Vietnam,7pgbDdy7ax962o9d2xJceV,87.000000
22870,Vietnam,7tFwBnuaGXqiiONukPRaCo,66.558376
22871,Vietnam,7td8DTWoGC9u9db37mGHX6,64.000000
22872,Vietnam,7uoFMmxln0GPXQ0AcCBXRq,94.000000


Let's remove the `popularity` column from the songs_data_df_copy dataframe.

In [33]:
songs_data_df_copy = songs_data_df_copy.drop(['popularity'], axis=1)

In [34]:
# Confirm drop
songs_data_df_copy

Unnamed: 0,spotify_id,name,artists,country_code,is_explicit,duration_ms,album_name,album_release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country,region
50,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",ZA,False,377920,Funk Series,2024-04-05,0.900,0.610,...,0,0.0847,0.08790,0.01630,0.1240,0.786,113.056,4,South Africa,Sub Saharan Africa
51,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,ZA,True,383639,euphoria,2024-04-30,0.831,0.643,...,1,0.1100,0.04600,0.00000,0.0840,0.142,139.948,4,South Africa,Sub Saharan Africa
52,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",ZA,False,390000,"Tshwala Bam (feat. S.N.E, EeQue)",2024-02-23,0.857,0.564,...,0,0.0566,0.01030,0.17800,0.0187,0.505,112.014,4,South Africa,Sub Saharan Africa
53,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",ZA,False,351200,Isimo,2023-10-27,0.806,0.767,...,0,0.1120,0.17900,0.00126,0.1820,0.795,113.001,4,South Africa,Sub Saharan Africa
54,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",ZA,False,285034,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)",2024-04-17,0.845,0.584,...,1,0.0477,0.00141,0.05950,0.0661,0.400,111.978,4,South Africa,Sub Saharan Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
712160,5W4kiM2cUYBJXKRudNyxjW,You Proof,Morgan Wallen,AU,False,157477,One Thing At A Time,2023-03-03,0.732,0.839,...,1,0.0345,0.26500,0.00000,0.6020,0.629,119.724,4,Australia,Western Offshoots
712260,4qSEvFGCpde73gqIuq3sho,HIBIKI,"Bad Bunny, Mora",AR,True,208000,nadie sabe lo que va a pasar mañana,2023-10-13,0.801,0.645,...,0,0.0706,0.60400,0.00000,0.1180,0.528,119.935,4,Argentina,Latin America
712307,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",AE,True,310490,For All The Dogs,2023-10-06,0.483,0.408,...,0,0.0502,0.50800,0.00000,0.2590,0.105,88.880,3,United Arab Emirates,Middle East and North Africa
712308,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",AE,True,173253,SET IT OFF,2023-10-13,0.773,0.635,...,1,0.0452,0.05850,0.00000,0.1320,0.476,121.879,4,United Arab Emirates,Middle East and North Africa


In [35]:
# Join gk to songs_data_df_copy on spotify_id
new_df = pd.merge(songs_data_df_copy, gk,  how='left', left_on=['country','spotify_id'], right_on = ['country','spotify_id'])

In [36]:
# Rearrange the columns for better readability
new_df = new_df[['country', 'region', 'spotify_id', 'name', 'artists', 'country_code', 'popularity', 'is_explicit',
       'duration_ms', 'album_name', 'album_release_date', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']]
new_df

Unnamed: 0,country,region,spotify_id,name,artists,country_code,popularity,is_explicit,duration_ms,album_name,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,South Africa,Sub Saharan Africa,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",ZA,57.200000,False,377920,Funk Series,...,1,-12.937,0,0.0847,0.08790,0.01630,0.1240,0.786,113.056,4
1,South Africa,Sub Saharan Africa,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,ZA,78.666667,True,383639,euphoria,...,1,-5.002,1,0.1100,0.04600,0.00000,0.0840,0.142,139.948,4
2,South Africa,Sub Saharan Africa,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",ZA,70.390625,False,390000,"Tshwala Bam (feat. S.N.E, EeQue)",...,4,-11.427,0,0.0566,0.01030,0.17800,0.0187,0.505,112.014,4
3,South Africa,Sub Saharan Africa,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",ZA,68.565934,False,351200,Isimo,...,6,-9.686,0,0.1120,0.17900,0.00126,0.1820,0.795,113.001,4
4,South Africa,Sub Saharan Africa,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",ZA,57.153846,False,285034,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)",...,1,-7.627,1,0.0477,0.00141,0.05950,0.0661,0.400,111.978,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22869,Australia,Western Offshoots,5W4kiM2cUYBJXKRudNyxjW,You Proof,Morgan Wallen,AU,85.000000,False,157477,One Thing At A Time,...,9,-5.007,1,0.0345,0.26500,0.00000,0.6020,0.629,119.724,4
22870,Argentina,Latin America,4qSEvFGCpde73gqIuq3sho,HIBIKI,"Bad Bunny, Mora",AR,88.500000,True,208000,nadie sabe lo que va a pasar mañana,...,6,-5.605,0,0.0706,0.60400,0.00000,0.1180,0.528,119.935,4
22871,United Arab Emirates,Middle East and North Africa,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",AE,84.000000,True,310490,For All The Dogs,...,5,-9.243,0,0.0502,0.50800,0.00000,0.2590,0.105,88.880,3
22872,United Arab Emirates,Middle East and North Africa,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",AE,80.000000,True,173253,SET IT OFF,...,10,-5.060,1,0.0452,0.05850,0.00000,0.1320,0.476,121.879,4


In [37]:
# Reassign new_df back to songs_data_df
songs_data_df = new_df

In [38]:
# Confirm
songs_data_df

Unnamed: 0,country,region,spotify_id,name,artists,country_code,popularity,is_explicit,duration_ms,album_name,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,South Africa,Sub Saharan Africa,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",ZA,57.200000,False,377920,Funk Series,...,1,-12.937,0,0.0847,0.08790,0.01630,0.1240,0.786,113.056,4
1,South Africa,Sub Saharan Africa,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,ZA,78.666667,True,383639,euphoria,...,1,-5.002,1,0.1100,0.04600,0.00000,0.0840,0.142,139.948,4
2,South Africa,Sub Saharan Africa,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",ZA,70.390625,False,390000,"Tshwala Bam (feat. S.N.E, EeQue)",...,4,-11.427,0,0.0566,0.01030,0.17800,0.0187,0.505,112.014,4
3,South Africa,Sub Saharan Africa,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",ZA,68.565934,False,351200,Isimo,...,6,-9.686,0,0.1120,0.17900,0.00126,0.1820,0.795,113.001,4
4,South Africa,Sub Saharan Africa,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",ZA,57.153846,False,285034,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)",...,1,-7.627,1,0.0477,0.00141,0.05950,0.0661,0.400,111.978,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22869,Australia,Western Offshoots,5W4kiM2cUYBJXKRudNyxjW,You Proof,Morgan Wallen,AU,85.000000,False,157477,One Thing At A Time,...,9,-5.007,1,0.0345,0.26500,0.00000,0.6020,0.629,119.724,4
22870,Argentina,Latin America,4qSEvFGCpde73gqIuq3sho,HIBIKI,"Bad Bunny, Mora",AR,88.500000,True,208000,nadie sabe lo que va a pasar mañana,...,6,-5.605,0,0.0706,0.60400,0.00000,0.1180,0.528,119.935,4
22871,United Arab Emirates,Middle East and North Africa,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",AE,84.000000,True,310490,For All The Dogs,...,5,-9.243,0,0.0502,0.50800,0.00000,0.2590,0.105,88.880,3
22872,United Arab Emirates,Middle East and North Africa,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",AE,80.000000,True,173253,SET IT OFF,...,10,-5.060,1,0.0452,0.05850,0.00000,0.1320,0.476,121.879,4


---

## Data Wrangling: World Happiness Data

In [39]:
# Load world happiness data
wh_data = pd.read_csv('./wh_data.csv')

In [40]:
# Check data summary
wh_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 19 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                137 non-null    object 
 1   Ladder score                                137 non-null    float64
 2   Standard error of ladder score              137 non-null    float64
 3   upperwhisker                                137 non-null    float64
 4   lowerwhisker                                137 non-null    float64
 5   Logged GDP per capita                       137 non-null    float64
 6   Social support                              137 non-null    float64
 7   Healthy life expectancy                     136 non-null    float64
 8   Freedom to make life choices                137 non-null    float64
 9   Generosity                                  137 non-null    float64
 10  Perceptions of

In [41]:
# Check for missing data
missing = pd.concat([wh_data.isnull().sum(), 100 * wh_data.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count')

Unnamed: 0,count,%
Country name,0,0.0
Explained by: Generosity,0,0.0
Explained by: Freedom to make life choices,0,0.0
Explained by: Social support,0,0.0
Explained by: Log GDP per capita,0,0.0
Ladder score in Dystopia,0,0.0
Perceptions of corruption,0,0.0
Explained by: Perceptions of corruption,0,0.0
Generosity,0,0.0
Social support,0,0.0


In [42]:
# Check which countries are in the data set
countries_array = wh_data['Country name'].unique()
sorted(countries_array)

['Afghanistan',
 'Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Bahrain',
 'Bangladesh',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hong Kong S.A.R. of China',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Ivory Coast',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kosovo',
 'Kyrgyzstan',
 'Laos',
 'Latvia',
 'Lebanon',
 'Liberia',
 'Lithuania',
 'Luxembourg',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Mali',
 'Mal

In [43]:
# Copy wh_data for manipulation
wh_data_copy = wh_data

# Rename 'Country name' column to 'country'
wh_data_copy.rename(columns={"Country name": "country"}, inplace = True)

# Confirm name change
wh_data_copy.head()

Unnamed: 0,country,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,7.804,0.036,7.875,7.733,10.792,0.969,71.15,0.961,-0.019,0.182,1.778,1.888,1.585,0.535,0.772,0.126,0.535,2.363
1,Denmark,7.586,0.041,7.667,7.506,10.962,0.954,71.25,0.934,0.134,0.196,1.778,1.949,1.548,0.537,0.734,0.208,0.525,2.084
2,Iceland,7.53,0.049,7.625,7.434,10.896,0.983,72.05,0.936,0.211,0.668,1.778,1.926,1.62,0.559,0.738,0.25,0.187,2.25
3,Israel,7.473,0.032,7.535,7.411,10.639,0.943,72.697,0.809,-0.023,0.708,1.778,1.833,1.521,0.577,0.569,0.124,0.158,2.691
4,Netherlands,7.403,0.029,7.46,7.346,10.942,0.93,71.55,0.887,0.213,0.379,1.778,1.942,1.488,0.545,0.672,0.251,0.394,2.11


### Change country names

There are a few countries we identified whose names do not match those that are in the Spotify dataset, so let's change them.

In [44]:
# Change some country names
# Create a dictionary with desired name changes
name_changes = {'Hong Kong S.A.R. of China':'Hong Kong', 'Taiwan Province of China':'Taiwan', 'Turkiye':'Turkey'}

# Use replace method to change names
wh_data_copy = wh_data_copy.replace({"country": name_changes})

# Confirm name changes
sorted(wh_data_copy["country"].unique())

['Afghanistan',
 'Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Bahrain',
 'Bangladesh',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Ivory Coast',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kosovo',
 'Kyrgyzstan',
 'Laos',
 'Latvia',
 'Lebanon',
 'Liberia',
 'Lithuania',
 'Luxembourg',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Mali',
 'Malta',
 'Mauritani

In [45]:
# Save copy back to original
wh_data = wh_data_copy

# Confirm
wh_data

Unnamed: 0,country,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,7.804,0.036,7.875,7.733,10.792,0.969,71.150,0.961,-0.019,0.182,1.778,1.888,1.585,0.535,0.772,0.126,0.535,2.363
1,Denmark,7.586,0.041,7.667,7.506,10.962,0.954,71.250,0.934,0.134,0.196,1.778,1.949,1.548,0.537,0.734,0.208,0.525,2.084
2,Iceland,7.530,0.049,7.625,7.434,10.896,0.983,72.050,0.936,0.211,0.668,1.778,1.926,1.620,0.559,0.738,0.250,0.187,2.250
3,Israel,7.473,0.032,7.535,7.411,10.639,0.943,72.697,0.809,-0.023,0.708,1.778,1.833,1.521,0.577,0.569,0.124,0.158,2.691
4,Netherlands,7.403,0.029,7.460,7.346,10.942,0.930,71.550,0.887,0.213,0.379,1.778,1.942,1.488,0.545,0.672,0.251,0.394,2.110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,Congo (Kinshasa),3.207,0.095,3.394,3.020,7.007,0.652,55.375,0.664,0.086,0.834,1.778,0.531,0.784,0.105,0.375,0.183,0.068,1.162
133,Zimbabwe,3.204,0.061,3.323,3.084,7.641,0.690,54.050,0.654,-0.046,0.766,1.778,0.758,0.881,0.069,0.363,0.112,0.117,0.905
134,Sierra Leone,3.138,0.082,3.299,2.976,7.394,0.555,54.900,0.660,0.105,0.858,1.778,0.670,0.540,0.092,0.371,0.193,0.051,1.221
135,Lebanon,2.392,0.044,2.479,2.305,9.478,0.530,66.149,0.474,-0.141,0.891,1.778,1.417,0.476,0.398,0.123,0.061,0.027,-0.110


### Filter Countries in World Happiness Dataset

Let's filter `wh_data` to only include the countries that are in `songs_data_df`.

In [46]:
# Check length of wh_data vs. songs_data_df.
print(f"World Happiness country count: {len(wh_data['country'].unique())}\nSpotify data country count: {len(songs_data_df['country'].unique())}")

World Happiness country count: 137
Spotify data country count: 72


In [47]:
# Create a list from country_stats_spotify and use that for boolean filtration

# Create the list using a list comprehension
spotify_countries = sorted([x for x in songs_data_df["country"].unique()])

# Filter using boolean filtration
filtered_wh_data = wh_data[wh_data["country"].isin(spotify_countries)]

# Check head of new dataframe
filtered_wh_data.head()

Unnamed: 0,country,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,7.804,0.036,7.875,7.733,10.792,0.969,71.15,0.961,-0.019,0.182,1.778,1.888,1.585,0.535,0.772,0.126,0.535,2.363
1,Denmark,7.586,0.041,7.667,7.506,10.962,0.954,71.25,0.934,0.134,0.196,1.778,1.949,1.548,0.537,0.734,0.208,0.525,2.084
2,Iceland,7.53,0.049,7.625,7.434,10.896,0.983,72.05,0.936,0.211,0.668,1.778,1.926,1.62,0.559,0.738,0.25,0.187,2.25
3,Israel,7.473,0.032,7.535,7.411,10.639,0.943,72.697,0.809,-0.023,0.708,1.778,1.833,1.521,0.577,0.569,0.124,0.158,2.691
4,Netherlands,7.403,0.029,7.46,7.346,10.942,0.93,71.55,0.887,0.213,0.379,1.778,1.942,1.488,0.545,0.672,0.251,0.394,2.11


In [48]:
len(spotify_countries)

72

We expect 72 countries, so let's check the size of our dataframe to see if we have 72 rows.

In [49]:
len(filtered_wh_data)

71

We are missing only one country. Let's check to see which one it is:

In [50]:
missing_country = [x for x in spotify_countries if x not in filtered_wh_data['country'].unique()]
missing_country

['Belarus']

Let's drop Belarus from the original Spotify data.

In [51]:
# Copy the dataframe for manipulation
songs_data_df_copy = songs_data_df

# Create a new dataframe without Belarus using the copy
songs_data_df_copy = songs_data_df_copy[songs_data_df_copy['country']!='Belarus']

In [52]:
songs_data_df_copy

Unnamed: 0,country,region,spotify_id,name,artists,country_code,popularity,is_explicit,duration_ms,album_name,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,South Africa,Sub Saharan Africa,6hqel7QjWBsjk1LJG6J0pD,Funk 99,"Shakes & Les, LeeMcKrazy",ZA,57.200000,False,377920,Funk Series,...,1,-12.937,0,0.0847,0.08790,0.01630,0.1240,0.786,113.056,4
1,South Africa,Sub Saharan Africa,77DRzu7ERs0TX3roZcre7Q,euphoria,Kendrick Lamar,ZA,78.666667,True,383639,euphoria,...,1,-5.002,1,0.1100,0.04600,0.00000,0.0840,0.142,139.948,4
2,South Africa,Sub Saharan Africa,54seQV7MCcppBrznALCdlT,"Tshwala Bam (feat. S.N.E, EeQue)","TitoM, Yuppe, EeQue, S.N.E",ZA,70.390625,False,390000,"Tshwala Bam (feat. S.N.E, EeQue)",...,4,-11.427,0,0.0566,0.01030,0.17800,0.0187,0.505,112.014,4
3,South Africa,Sub Saharan Africa,6Kijtp0DB6VwcoJIw7PJ9W,"Imithandazo (feat. Young Stunna, DJ Maphorisa,...","Kabza De Small, Mthunzi, DJ Maphorisa, Young S...",ZA,68.565934,False,351200,Isimo,...,6,-9.686,0,0.1120,0.17900,0.00126,0.1820,0.795,113.001,4
4,South Africa,Sub Saharan Africa,5DqA8IUhk1DQSGmf6XOVrI,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)","Uncle Waffles, Royal MusiQ, Ohp Sage, Pcee, Dj...",ZA,57.153846,False,285034,"Wadibusa (feat. OHP Sage, Pcee, & Djy Biza)",...,1,-7.627,1,0.0477,0.00141,0.05950,0.0661,0.400,111.978,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22869,Australia,Western Offshoots,5W4kiM2cUYBJXKRudNyxjW,You Proof,Morgan Wallen,AU,85.000000,False,157477,One Thing At A Time,...,9,-5.007,1,0.0345,0.26500,0.00000,0.6020,0.629,119.724,4
22870,Argentina,Latin America,4qSEvFGCpde73gqIuq3sho,HIBIKI,"Bad Bunny, Mora",AR,88.500000,True,208000,nadie sabe lo que va a pasar mañana,...,6,-5.605,0,0.0706,0.60400,0.00000,0.1180,0.528,119.935,4
22871,United Arab Emirates,Middle East and North Africa,0AYt6NMyyLd0rLuvr0UkMH,Slime You Out (feat. SZA),"Drake, SZA",AE,84.000000,True,310490,For All The Dogs,...,5,-9.243,0,0.0502,0.50800,0.00000,0.2590,0.105,88.880,3
22872,United Arab Emirates,Middle East and North Africa,2Gk6fi0dqt91NKvlzGsmm7,SAY MY GRACE (feat. Travis Scott),"Offset, Travis Scott",AE,80.000000,True,173253,SET IT OFF,...,10,-5.060,1,0.0452,0.05850,0.00000,0.1320,0.476,121.879,4


In [53]:
# Update the original dataframe
songs_data_df = songs_data_df_copy

Let's copy the filtered_wh_data back to the original dataframe.

In [54]:
wh_data = filtered_wh_data

### Stats for songs data

Let's create dataframes to capture summary statistics by country and by region.

In [55]:
songs_data.columns

Index(['spotify_id', 'name', 'artists', 'daily_rank', 'daily_movement',
       'weekly_movement', 'country_code', 'snapshot_date', 'popularity',
       'is_explicit', 'duration_ms', 'album_name', 'album_release_date',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'country', 'region'],
      dtype='object')

In [56]:
# Drop columns
drop_cols = ['spotify_id', 'name', 'artists', 'daily_rank', 'daily_movement',
       'weekly_movement', 'country_code', 'snapshot_date', 'album_name', 'album_release_date']
songs_data_stats = songs_data.drop(drop_cols, axis='columns')
songs_data_stats.head()

Unnamed: 0,popularity,is_explicit,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,country,region
50,64,False,377920,0.9,0.61,1,-12.937,0,0.0847,0.0879,0.0163,0.124,0.786,113.056,4,South Africa,Sub Saharan Africa
51,85,True,383639,0.831,0.643,1,-5.002,1,0.11,0.046,0.0,0.084,0.142,139.948,4,South Africa,Sub Saharan Africa
52,77,False,390000,0.857,0.564,4,-11.427,0,0.0566,0.0103,0.178,0.0187,0.505,112.014,4,South Africa,Sub Saharan Africa
53,68,False,351200,0.806,0.767,6,-9.686,0,0.112,0.179,0.00126,0.182,0.795,113.001,4,South Africa,Sub Saharan Africa
54,64,False,285034,0.845,0.584,1,-7.627,1,0.0477,0.00141,0.0595,0.0661,0.4,111.978,4,South Africa,Sub Saharan Africa


In [57]:
songs_data_stats.columns

Index(['popularity', 'is_explicit', 'duration_ms', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'country', 'region'],
      dtype='object')

In [58]:
# Create a copy of the dataframe before manipulating
songs_data_stats_copy = songs_data_stats

In [59]:
songs_data_stats_country = songs_data_stats_copy.groupby('country').agg({
    'popularity': 'mean',
    'is_explicit': lambda x: x.mode()[0],  # Mode
    'duration_ms': 'mean',
    'danceability': 'mean',
    'energy': 'mean',
    'key': lambda x: x.mode()[0],  # Mode
    'loudness': 'mean',
    'mode': 'mean',
    'speechiness': 'mean',
    'acousticness': 'mean',
    'instrumentalness': 'mean',
    'liveness': 'mean',
    'valence': 'mean',
    'tempo': 'mean',
    'time_signature': lambda x: x.mode()[0]  # Mode
}).reset_index()

# Group by region
songs_data_stats_region = songs_data_stats_copy.groupby('region').agg({
    'popularity': 'mean',
    'is_explicit': lambda x: x.mode()[0],  # Mode
    'duration_ms': 'mean',
    'danceability': 'mean',
    'energy': 'mean',
    'key': lambda x: x.mode()[0],  # Mode
    'loudness': 'mean',
    'mode': 'mean',
    'speechiness': 'mean',
    'acousticness': 'mean',
    'instrumentalness': 'mean',
    'liveness': 'mean',
    'valence': 'mean',
    'tempo': 'mean',
    'time_signature': lambda x: x.mode()[0]  # Mode
}).reset_index()

In [60]:
songs_data_stats_country

Unnamed: 0,country,popularity,is_explicit,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Argentina,79.683897,False,183280.889854,0.731863,0.686005,11,-5.248106,0.488339,0.088648,0.238702,0.004518,0.189616,0.644262,114.017433,4
1,Australia,88.366650,False,201002.420998,0.637842,0.622122,2,-6.857000,0.702213,0.071042,0.256684,0.008286,0.172767,0.497093,125.523188,4
2,Austria,81.920905,False,180443.483680,0.655136,0.653844,2,-6.832973,0.654763,0.096732,0.258236,0.019095,0.172822,0.516225,128.266714,4
3,Belarus,63.649356,True,159353.026372,0.685833,0.641679,8,-6.880253,0.471650,0.117817,0.215263,0.064024,0.212752,0.501150,124.893331,4
4,Belgium,84.965859,False,191042.988990,0.633213,0.648327,2,-6.599179,0.562828,0.078712,0.303627,0.027270,0.167556,0.521314,125.448159,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,United Kingdom,86.801944,False,200671.130624,0.617387,0.632082,2,-6.831948,0.696349,0.074673,0.271059,0.010238,0.190402,0.489599,123.053164,4
68,United States,87.117087,True,195470.215660,0.638999,0.597085,9,-7.249818,0.665598,0.091752,0.283541,0.011034,0.182546,0.466002,127.582416,4
69,Uruguay,78.910955,False,174879.883953,0.731381,0.681460,11,-5.242455,0.566938,0.088566,0.228491,0.003888,0.181781,0.666968,112.965075,4
70,Venezuela,83.984270,True,194145.708993,0.758731,0.691575,5,-5.379293,0.419228,0.121987,0.273128,0.008224,0.168883,0.558796,119.559745,4


In [61]:
songs_data_stats_region

Unnamed: 0,region,popularity,is_explicit,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,East Asia,73.269807,False,211872.655014,0.616526,0.668824,6,-5.79932,0.717798,0.064831,0.273806,0.005713,0.170168,0.525029,126.66251,4
1,Eastern Europe,70.734392,False,179413.939433,0.692017,0.660413,1,-6.835288,0.490453,0.112168,0.235592,0.037003,0.179354,0.508742,125.264438,4
2,Latin America,83.156807,False,187505.581224,0.741082,0.699342,5,-5.229809,0.472737,0.111188,0.249596,0.006656,0.184409,0.612523,120.292065,4
3,Middle East and North Africa,71.864588,False,199541.592963,0.646434,0.626528,1,-7.252346,0.363584,0.096091,0.335898,0.019985,0.17911,0.51807,120.986484,4
4,South and South East Asia,78.465032,False,213369.322893,0.63194,0.569721,2,-7.495493,0.699334,0.066398,0.391158,0.012032,0.159034,0.484019,119.129508,4
5,Sub Saharan Africa,67.563982,False,236400.99215,0.750923,0.640596,1,-8.272585,0.522781,0.106088,0.199012,0.051084,0.141974,0.559932,117.657878,4
6,Western Europe,76.685321,False,186674.767744,0.663697,0.638706,2,-6.782139,0.533366,0.100084,0.285786,0.012491,0.172341,0.526373,123.830491,4
7,Western Offshoots,87.370128,False,199262.260364,0.640418,0.607724,2,-7.072111,0.687579,0.080843,0.268993,0.009537,0.176377,0.487689,126.416881,4


### World Regions for World Happiness Dataset

Let's add regions to the World Happiness Dataset as well.

In [62]:
# Create a copy of the dataframe to minimize error.
wh_data_copy = wh_data

# Create our region column by first duplicating our country column
wh_data_copy["region"] = wh_data_copy["country"]

# Confirm
wh_data_copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wh_data_copy["region"] = wh_data_copy["country"]


Unnamed: 0,country,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual,region
0,Finland,7.804,0.036,7.875,7.733,10.792,0.969,71.150,0.961,-0.019,0.182,1.778,1.888,1.585,0.535,0.772,0.126,0.535,2.363,Finland
1,Denmark,7.586,0.041,7.667,7.506,10.962,0.954,71.250,0.934,0.134,0.196,1.778,1.949,1.548,0.537,0.734,0.208,0.525,2.084,Denmark
2,Iceland,7.530,0.049,7.625,7.434,10.896,0.983,72.050,0.936,0.211,0.668,1.778,1.926,1.620,0.559,0.738,0.250,0.187,2.250,Iceland
3,Israel,7.473,0.032,7.535,7.411,10.639,0.943,72.697,0.809,-0.023,0.708,1.778,1.833,1.521,0.577,0.569,0.124,0.158,2.691,Israel
4,Netherlands,7.403,0.029,7.460,7.346,10.942,0.930,71.550,0.887,0.213,0.379,1.778,1.942,1.488,0.545,0.672,0.251,0.394,2.110,Netherlands
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,Morocco,4.903,0.061,5.023,4.783,8.973,0.553,63.901,0.787,-0.231,0.811,1.778,1.236,0.535,0.337,0.540,0.013,0.085,2.158,Morocco
105,Turkey,4.614,0.083,4.777,4.450,10.307,0.796,68.663,0.475,-0.077,0.795,1.778,1.714,1.148,0.467,0.125,0.095,0.096,0.969,Turkey
107,Pakistan,4.555,0.077,4.707,4.404,8.540,0.601,57.313,0.766,0.008,0.787,1.778,1.081,0.657,0.158,0.511,0.141,0.102,1.907,Pakistan
120,Egypt,4.170,0.059,4.287,4.054,9.367,0.726,63.503,0.732,-0.183,0.580,1.778,1.377,0.972,0.326,0.467,0.038,0.250,0.740,Egypt


In [63]:
# Map regions to country names
wh_data_copy = wh_data_copy.replace({"region": mpd_dict})

# Confirm
wh_data_copy.head()

Unnamed: 0,country,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual,region
0,Finland,7.804,0.036,7.875,7.733,10.792,0.969,71.15,0.961,-0.019,0.182,1.778,1.888,1.585,0.535,0.772,0.126,0.535,2.363,Western Europe
1,Denmark,7.586,0.041,7.667,7.506,10.962,0.954,71.25,0.934,0.134,0.196,1.778,1.949,1.548,0.537,0.734,0.208,0.525,2.084,Western Europe
2,Iceland,7.53,0.049,7.625,7.434,10.896,0.983,72.05,0.936,0.211,0.668,1.778,1.926,1.62,0.559,0.738,0.25,0.187,2.25,Western Europe
3,Israel,7.473,0.032,7.535,7.411,10.639,0.943,72.697,0.809,-0.023,0.708,1.778,1.833,1.521,0.577,0.569,0.124,0.158,2.691,Middle East and North Africa
4,Netherlands,7.403,0.029,7.46,7.346,10.942,0.93,71.55,0.887,0.213,0.379,1.778,1.942,1.488,0.545,0.672,0.251,0.394,2.11,Western Europe


In [64]:
# Mapping successful; copy back to original dataframe
wh_data = wh_data_copy

In [65]:
# Save variables

%store wh_data
%store songs_data_df
%store songs_data_stats_country
%store songs_data_stats_region

Stored 'wh_data' (DataFrame)
Stored 'songs_data_df' (DataFrame)
Stored 'songs_data_stats_country' (DataFrame)
Stored 'songs_data_stats_region' (DataFrame)
