In [6]:
import numpy as np
import pandas as pd
import pycountry as pc
import pycountry_convert as pyc
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

# Carga de Datos

In [9]:
# Load the raw Spotify chart export; the file is expected in the notebook's working directory.
spotify_data = pd.read_csv('universal_top_spotify_songs.csv', sep=',')
# Preview the first rows to confirm the file parsed as expected.
spotify_data.head()

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country,snapshot_date,popularity,is_explicit,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6iycYUk3oB0NPMdaDUrN1w,EVIL J0RDAN,Playboi Carti,1,49,49,,2025-03-15,80,True,...,1,-7.029,1,0.0564,0.00665,6.1e-05,0.119,0.0599,154.009,4
1,3VdooJLOy4tLxKpnn46SMP,CRUSH (with Travis Scott),"Playboi Carti, Travis Scott",2,48,48,,2025-03-15,79,True,...,7,-7.392,1,0.0486,0.202,5.1e-05,0.173,0.255,139.936,4
2,68qeaZhtMZ6abrJCYt6nQn,RATHER LIE (with The Weeknd),"Playboi Carti, The Weeknd",3,47,47,,2025-03-15,78,True,...,8,-4.431,1,0.0325,0.0583,0.0,0.213,0.279,132.991,4
3,7so0lgd0zP2Sbgs2d7a1SZ,Die With A Smile,"Lady Gaga, Bruno Mars",4,-3,-3,,2025-03-15,80,False,...,6,-7.727,0,0.0317,0.289,0.0,0.126,0.498,157.964,3
4,3j3SfV4hAcR4XjCvW393Gr,POP OUT,Playboi Carti,5,45,45,,2025-03-15,78,True,...,5,-2.094,1,0.0344,0.000117,0.0,0.166,0.156,144.035,4


# Limpieza de Datos

In [12]:
spotify_data.shape

(1822175, 25)

In [14]:
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1822175 entries, 0 to 1822174
Data columns (total 25 columns):
 #   Column              Dtype  
---  ------              -----  
 0   spotify_id          object 
 1   name                object 
 2   artists             object 
 3   daily_rank          int64  
 4   daily_movement      int64  
 5   weekly_movement     int64  
 6   country             object 
 7   snapshot_date       object 
 8   popularity          int64  
 9   is_explicit         bool   
 10  duration_ms         int64  
 11  album_name          object 
 12  album_release_date  object 
 13  danceability        float64
 14  energy              float64
 15  key                 int64  
 16  loudness            float64
 17  mode                int64  
 18  speechiness         float64
 19  acousticness        float64
 20  instrumentalness    float64
 21  liveness            float64
 22  valence             float64
 23  tempo               float64
 24  time_signature      int6

Descripción de las columnas:

1. spotify_id: Identificador único de cada canción en Spotify.
2. name: Nombre de la canción.
3. artists: Nombre del artista o artistas.
4. daily_rank: Posición de la canción en el ranking diario.
5. daily_movement: Movimiento diario en el ranking (probablemente un valor numérico que indica cuántas posiciones subió o bajó).
6. weekly_movement: Movimiento semanal en el ranking.
7. country: País asociado al ranking.
8. snapshot_date: Fecha en la que se tomó la métrica.
9. popularity: Popularidad de la canción (un valor entre 0 y 100, siendo 100 la más popular).
10. is_explicit: Indica si la canción tiene contenido explícito (True/False).
11. duration_ms: Duración de la canción en milisegundos.
12. album_name: Nombre del álbum al que pertenece la canción.
13. album_release_date: Fecha de lanzamiento del álbum.
14. danceability: Medida de qué tan bailable es la canción (0 a 1). A value of 0.0 is least danceable and 1.0 is most danceable.
15. energy: Medida de la intensidad o energía de la canción (0 a 1). Energy is a measure from 0.0 to 1.0
16. key: Tonalidad musical de la canción (valores numéricos que representan notas musicales). Integers map to pitches using standard Pitch Class notation
17. loudness: Volumen promedio de la canción (en decibelios). Values typically range between -60 and 0 dB.
18. mode: Modalidad de la canción (0 = menor, 1 = mayor). Mode indicates the type of scale (major or minor) from which the track's melodic content is derived.
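19. speechiness: Medida de la presencia de palabras habladas en la canción (0 a 1). Values close to 1.0 indicate recordings that are almost entirely speech (e.g. talk show, audio book, poetry); values between 0.33 and 0.66 describe tracks that may contain both music and speech; values below 0.33 most likely represent music and other non-speech-like tracks.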
20. acousticness: Medida de acústica en la canción (0 a 1). Range: 0 - 1 Example: 0.00242 1.0 represents high confidence the track is acoustic
21. instrumentalness: Medida de instrumentalidad en la canción (0 a 1). The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.
22. liveness: Medida de la presencia de una audiencia en vivo (0 a 1). A value above 0.8 provides strong likelihood that the track is live.
23. valence: Medida de la positividad o felicidad de la canción (0 a 1). Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).
24. tempo: Velocidad o ritmo de la canción (en BPM, beats por minuto). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration.
25. time_signature: Compás de la canción (por ejemplo, 4/4, 3/4). The time signature ranges from 3 to 7 indicating time signatures of "3/4", to "7/4".

In [17]:
spotify_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
daily_rank,1822175.0,25.492401,14.428954,1.0,13.0,25.0,38.0,50.0
daily_movement,1822175.0,0.921288,7.007889,-49.0,-1.0,0.0,2.0,49.0
weekly_movement,1822175.0,2.767274,12.144854,-49.0,-3.0,0.0,5.0,49.0
popularity,1822175.0,75.87906,15.841506,0.0,65.0,79.0,88.0,100.0
duration_ms,1822175.0,193764.242375,49728.960952,0.0,161948.0,186000.0,218423.0,939666.0
danceability,1822175.0,0.678376,0.142035,0.0,0.583,0.7,0.782,0.988
energy,1822175.0,0.648574,0.167595,2e-05,0.551,0.668,0.765,0.998
key,1822175.0,5.542668,3.582249,0.0,2.0,6.0,9.0,11.0
loudness,1822175.0,-6.675436,3.532226,-54.341,-7.81,-6.04,-4.715,3.233
mode,1822175.0,0.537682,0.498578,0.0,0.0,1.0,1.0,1.0


In [18]:
spotify_data.describe(include=[object]).T

Unnamed: 0,count,unique,top,freq
spotify_id,1822175,21570,6dOtVTDdiauQNBQEDOtlAB,13742
name,1822145,18949,BIRDS OF A FEATHER,13742
artists,1822146,12140,Billie Eilish,28341
country,1797268,72,DO,25175
snapshot_date,1822175,503,2024-03-16,3942
album_name,1821354,14348,HIT ME HARD AND SOFT,26568
album_release_date,1821517,2707,2024-05-17,30420


In [20]:
# Missing values: share of null entries per column, as a percentage rounded to 3 decimals.
null_counts = spotify_data.isnull().sum()
total_rows = spotify_data.shape[0]
missing_pct = ((null_counts / total_rows) * 100).round(3)
print(f"% de Valores faltantes por columna: \n {missing_pct}")

% de Valores faltantes por columna: 
 spotify_id            0.000
name                  0.002
artists               0.002
daily_rank            0.000
daily_movement        0.000
weekly_movement       0.000
country               1.367
snapshot_date         0.000
popularity            0.000
is_explicit           0.000
duration_ms           0.000
album_name            0.045
album_release_date    0.036
danceability          0.000
energy                0.000
key                   0.000
loudness              0.000
mode                  0.000
speechiness           0.000
acousticness          0.000
instrumentalness      0.000
liveness              0.000
valence               0.000
tempo                 0.000
time_signature        0.000
dtype: float64


In [21]:
# Some rows carry no song information and, judging by their IDs, look like bad records.
# Drop every row that is null in 'artists', 'album_name' or 'album_release_date'
# (a row is removed as soon as any one of the three is missing).
required_song_columns = ['artists', 'album_name', 'album_release_date']
spotify_data.dropna(subset=required_song_columns, inplace=True)

In [24]:
# Per the dataset documentation, a null 'country' marks the worldwide chart,
# so those entries are labelled 'Global'.
country_filled = spotify_data['country'].fillna('Global')
spotify_data['country'] = country_filled

In [27]:
spotify_data.isnull().sum()

spotify_id            0
name                  0
artists               0
daily_rank            0
daily_movement        0
weekly_movement       0
country               0
snapshot_date         0
popularity            0
is_explicit           0
duration_ms           0
album_name            0
album_release_date    0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
time_signature        0
dtype: int64

In [29]:
# Check for fully duplicated rows (every column identical to an earlier row).
duplicate_row_count = spotify_data.duplicated().sum()
print("\nNúmero de filas duplicadas:", duplicate_row_count)



Número de filas duplicadas: 0


In [30]:
# Look for anomalies in numeric columns by printing summary statistics next to the
# range each column is expected to respect.
print("\nRevisión de anomalías:")

# (heading printed before the statistics, column to summarise)
numeric_range_checks = [
    ("\nPopularity (debe estar entre 0 y 100):\n", 'popularity'),
    ("\nDanceability (debe estar entre 0 y 1):\n", 'danceability'),
    ("\nDuration_ms (debe ser positivo y razonable):\n", 'duration_ms'),
    ("\nTempo (debe ser positivo y razonable):\n", 'tempo'),
    ("\nLoudness (debe ser negativo y razonable):\n", 'loudness'),
]

for heading, column in numeric_range_checks:
    print(heading, spotify_data[column].describe().round(2))


Revisión de anomalías:

Popularity (debe estar entre 0 y 100):
 count    1821354.00
mean          75.88
std           15.84
min            0.00
25%           65.00
50%           79.00
75%           88.00
max          100.00
Name: popularity, dtype: float64

Danceability (debe estar entre 0 y 1):
 count    1821354.00
mean           0.68
std            0.14
min            0.00
25%            0.58
50%            0.70
75%            0.78
max            0.99
Name: danceability, dtype: float64

Duration_ms (debe ser positivo y razonable):
 count    1821354.00
mean      193779.41
std        49726.98
min        16320.00
25%       161948.00
50%       186000.00
75%       218423.00
max       939666.00
Name: duration_ms, dtype: float64

Tempo (debe ser positivo y razonable):
 count    1821354.00
mean         122.15
std           27.99
min            0.00
25%          100.02
50%          119.96
75%          140.06
max          236.09
Name: tempo, dtype: float64

Loudness (debe ser negativo y razon

In [33]:
# Outlier filter for duration_ms: anything longer than one hour is discarded.
# One hour in milliseconds = 60 min * 60 s * 1000 ms = 3,600,000 ms.
umbral_ms = 60 * 60 * 1000

# Keep only the rows at or below the one-hour threshold.
within_one_hour = spotify_data['duration_ms'] <= umbral_ms
spotify_data = spotify_data[within_one_hour]

# Show the summary statistics after filtering.
print("\nDuration_ms (debe ser positivo y razonable):\n", spotify_data['duration_ms'].describe().round(2))


Duration_ms (debe ser positivo y razonable):
 count    1821354.00
mean      193779.41
std        49726.98
min        16320.00
25%       161948.00
50%       186000.00
75%       218423.00
max       939666.00
Name: duration_ms, dtype: float64


In [35]:
# Look for format inconsistencies: date columns should hold dates, is_explicit booleans.
print("\nRevisión de formatos:")

# Show the first three raw values of each date column.
date_previews = [
    ("\nSnapshot_date (debe ser fecha):\n", 'snapshot_date'),
    ("\nAlbum_release_date (debe ser fecha):\n", 'album_release_date'),
]
for heading, column in date_previews:
    print(heading, spotify_data[column].head(3))

# List the distinct values found in the explicit-content flag.
explicit_values = spotify_data['is_explicit'].unique()
print("\nIs_explicit (debe ser booleano):\n", explicit_values)


Revisión de formatos:

Snapshot_date (debe ser fecha):
 0    2025-03-15
1    2025-03-15
2    2025-03-15
Name: snapshot_date, dtype: object

Album_release_date (debe ser fecha):
 0    2025-03-14
1    2025-03-14
2    2025-03-14
Name: album_release_date, dtype: object

Is_explicit (debe ser booleano):
 [ True False]


In [37]:
# Convert both date columns from 'YYYY-MM-DD' strings to datetime values.
for date_column in ('snapshot_date', 'album_release_date'):
    spotify_data[date_column] = pd.to_datetime(spotify_data[date_column], format='%Y-%m-%d')

In [39]:
# Re-print the first three values of each date column to confirm the new dtype.
converted_date_previews = [
    ("\nSnapshot_date (debe ser fecha):\n", 'snapshot_date'),
    ("\nAlbum_release_date (debe ser fecha):\n", 'album_release_date'),
]
for heading, column in converted_date_previews:
    print(heading, spotify_data[column].head(3))


Snapshot_date (debe ser fecha):
 0   2025-03-15
1   2025-03-15
2   2025-03-15
Name: snapshot_date, dtype: datetime64[ns]

Album_release_date (debe ser fecha):
 0   2025-03-14
1   2025-03-14
2   2025-03-14
Name: album_release_date, dtype: datetime64[ns]


In [41]:
# Encode 'is_explicit' as integers (True -> 1, False -> 0). After this cast the column
# no longer has a bool dtype; it holds 0/1 integers.
spotify_data['is_explicit'] = spotify_data['is_explicit'].astype(int)
# Preview the frame to confirm the new encoding.
spotify_data.head()

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country,snapshot_date,popularity,is_explicit,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6iycYUk3oB0NPMdaDUrN1w,EVIL J0RDAN,Playboi Carti,1,49,49,Global,2025-03-15,80,1,...,1,-7.029,1,0.0564,0.00665,6.1e-05,0.119,0.0599,154.009,4
1,3VdooJLOy4tLxKpnn46SMP,CRUSH (with Travis Scott),"Playboi Carti, Travis Scott",2,48,48,Global,2025-03-15,79,1,...,7,-7.392,1,0.0486,0.202,5.1e-05,0.173,0.255,139.936,4
2,68qeaZhtMZ6abrJCYt6nQn,RATHER LIE (with The Weeknd),"Playboi Carti, The Weeknd",3,47,47,Global,2025-03-15,78,1,...,8,-4.431,1,0.0325,0.0583,0.0,0.213,0.279,132.991,4
3,7so0lgd0zP2Sbgs2d7a1SZ,Die With A Smile,"Lady Gaga, Bruno Mars",4,-3,-3,Global,2025-03-15,80,0,...,6,-7.727,0,0.0317,0.289,0.0,0.126,0.498,157.964,3
4,3j3SfV4hAcR4XjCvW393Gr,POP OUT,Playboi Carti,5,45,45,Global,2025-03-15,78,1,...,5,-2.094,1,0.0344,0.000117,0.0,0.166,0.156,144.035,4


In [43]:
# Check the distinct values present in the categorical columns against what is expected.
# (heading printed before the values, column to inspect)
categorical_checks = [
    ("Key (debe estar entre 0 y 11):\n", 'key'),
    ("\nMode (debe ser 0 o 1):\n", 'mode'),
    ("\nIs_Explicit (debe ser 0 o 1):\n", 'is_explicit'),
    ("\nTime_signature (debe ser un valor común como 3, 4, o 5):\n", 'time_signature'),
    ("\nValores unicos de paises:\n", 'country'),
]

for heading, column in categorical_checks:
    print(heading, spotify_data[column].unique())

Key (debe estar entre 0 y 11):
 [ 1  7  8  6  5  0  2  4 11  9 10  3]

Mode (debe ser 0 o 1):
 [1 0]

Is_Explicit (debe ser 0 o 1):
 [1 0]

Time_signature (debe ser un valor común como 3, 4, o 5):
 [4 3 5 1 0]

Valores unicos de paises:
 ['Global' 'ZA' 'VN' 'VE' 'UY' 'US' 'UA' 'TW' 'TR' 'TH' 'SV' 'SK' 'SG' 'SE'
 'SA' 'RO' 'PY' 'PT' 'PL' 'PK' 'PH' 'PE' 'PA' 'NZ' 'NO' 'NL' 'NI' 'NG'
 'MY' 'MX' 'MA' 'LV' 'LU' 'LT' 'KZ' 'KR' 'JP' 'IT' 'IS' 'IN' 'IL' 'IE'
 'ID' 'HU' 'HN' 'HK' 'GT' 'GR' 'GB' 'FR' 'FI' 'ES' 'EG' 'EE' 'EC' 'DO'
 'DK' 'DE' 'CZ' 'CR' 'CO' 'CL' 'CH' 'CA' 'BY' 'BR' 'BO' 'BG' 'BE' 'AU'
 'AT' 'AR' 'AE']


In [45]:
# Split the 'artists' string on ", " into one column per credited artist so that the
# individual participants of a song can be told apart.
# NOTE(review): an artist whose own name contains ", " would be split across two
# columns — confirm whether such names occur in the data.
dfartist = spotify_data['artists'].str.split(', ', expand=True)
# Number of distinct values in each positional artist column (column 0 = first listed artist).
dfartist.nunique()

0     6978
1     4331
2     1870
3      734
4      331
5      173
6       79
7       35
8       26
9       15
10      11
11       6
12       3
13       2
14       2
15       2
16       2
17       2
18       2
19       2
20       1
21       1
22       1
23       1
24       1
25       1
dtype: int64

In [46]:
# Some songs credit a very large number of artists. For simplicity only the first four
# positions are kept: the main artist plus up to three collaborators
# ('feat_1', 'feat_2', 'feat_3'). The split columns are placed in front of the
# original frame and the first four get descriptive names.
artist_column_names = {
    0: 'main_artist',
    1: 'feat_1',
    2: 'feat_2',
    3: 'feat_3',
}
spotify_data = pd.concat([dfartist, spotify_data], axis=1).rename(columns=artist_column_names)
spotify_data.sample(5)

Unnamed: 0,main_artist,feat_1,feat_2,feat_3,4,5,6,7,8,9,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
1613781,Joey Moe,,,,,,,,,,...,1,-6.309,0,0.0501,0.039,0.0,0.112,0.552,177.917,4
1361793,Magdy El Zahar,Eslam Kabonga,,,,,,,,,...,0,-1.202,0,0.121,0.47,0.0,0.102,0.797,109.975,4
598481,Sabrina Carpenter,,,,,,,,,,...,11,-5.968,0,0.0426,0.0678,0.0,0.104,0.676,94.99,4
1366382,Niklas Dee,Old Jim,Enny-Mae,,,,,,,,...,5,-5.305,1,0.045,0.0294,7.1e-05,0.25,0.523,144.989,4
1030879,Luis R Conriquez,Neton Vega,,,,,,,,,...,4,-6.372,0,0.0512,0.445,2e-06,0.0947,0.611,100.123,1


In [47]:
# Drop the leftover split-artist columns beyond the four that were kept
# (main_artist, feat_1, feat_2, feat_3).
# The labels come from dfartist itself rather than a hard-coded positional range
# (previously iloc[:, 4:26]), so the cell stays correct if the maximum number of credited
# artists changes. errors='ignore' makes a re-run a no-op instead of deleting real
# data columns that have shifted into those positions.
spotify_data.drop(columns=dfartist.columns[4:], inplace=True, errors='ignore')
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1821354 entries, 0 to 1822174
Data columns (total 29 columns):
 #   Column              Dtype         
---  ------              -----         
 0   main_artist         object        
 1   feat_1              object        
 2   feat_2              object        
 3   feat_3              object        
 4   spotify_id          object        
 5   name                object        
 6   artists             object        
 7   daily_rank          int64         
 8   daily_movement      int64         
 9   weekly_movement     int64         
 10  country             object        
 11  snapshot_date       datetime64[ns]
 12  popularity          int64         
 13  is_explicit         int32         
 14  duration_ms         int64         
 15  album_name          object        
 16  album_release_date  datetime64[ns]
 17  danceability        float64       
 18  energy              float64       
 19  key                 int64         
 20  loudnes

In [48]:
# The combined 'artists' column is redundant now that it has been split into the
# main artist and featured artists columns, so remove it.
spotify_data.drop(columns='artists', inplace=True)

In [50]:
# Add a 'release_year' column holding the year component of the album release date.
spotify_data['release_year'] = spotify_data['album_release_date'].dt.year

In [52]:
# Add the track duration in minutes, rounded to two decimals.
ms_per_minute = 1000 * 60
spotify_data['duration_min'] = round(spotify_data['duration_ms'] / ms_per_minute, 2)

In [57]:
# Add a "key_name" column derived from the numeric "key" column.
# Note names listed by pitch-class number, 0 through 11:
# C (do), C# (do sharp), D (re), Eb (mi flat), E (mi), F (fa), F# (fa sharp),
# G (sol), G# (sol sharp), A (la), Bb (si flat), B (si).
pitch_names = ['C', 'C#', 'D', 'Eb', 'E', 'F', 'F#', 'G', 'G#', 'A', 'Bb', 'B']
key_mapping = dict(enumerate(pitch_names))

spotify_data["key_name"] = spotify_data["key"].map(key_mapping)

# Print the first rows of both columns to verify the mapping.
print(spotify_data[["key", "key_name"]].head())

   key key_name
0    1       C#
1    7        G
2    8       G#
3    6       F#
4    5        F


In [59]:
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1821354 entries, 0 to 1822174
Data columns (total 31 columns):
 #   Column              Dtype         
---  ------              -----         
 0   main_artist         object        
 1   feat_1              object        
 2   feat_2              object        
 3   feat_3              object        
 4   spotify_id          object        
 5   name                object        
 6   daily_rank          int64         
 7   daily_movement      int64         
 8   weekly_movement     int64         
 9   country             object        
 10  snapshot_date       datetime64[ns]
 11  popularity          int64         
 12  is_explicit         int32         
 13  duration_ms         int64         
 14  album_name          object        
 15  album_release_date  datetime64[ns]
 16  danceability        float64       
 17  energy              float64       
 18  key                 int64         
 19  loudness            float64       
 20  mode   

In [61]:
spotify_data.isna().sum()

main_artist                 0
feat_1                1082828
feat_2                1603639
feat_3                1754027
spotify_id                  0
name                        0
daily_rank                  0
daily_movement              0
weekly_movement             0
country                     0
snapshot_date               0
popularity                  0
is_explicit                 0
duration_ms                 0
album_name                  0
album_release_date          0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
time_signature              0
release_year                0
duration_min                0
key_name                    0
dtype: int64

In [63]:
spotify_data['country'].unique()

array(['Global', 'ZA', 'VN', 'VE', 'UY', 'US', 'UA', 'TW', 'TR', 'TH',
       'SV', 'SK', 'SG', 'SE', 'SA', 'RO', 'PY', 'PT', 'PL', 'PK', 'PH',
       'PE', 'PA', 'NZ', 'NO', 'NL', 'NI', 'NG', 'MY', 'MX', 'MA', 'LV',
       'LU', 'LT', 'KZ', 'KR', 'JP', 'IT', 'IS', 'IN', 'IL', 'IE', 'ID',
       'HU', 'HN', 'HK', 'GT', 'GR', 'GB', 'FR', 'FI', 'ES', 'EG', 'EE',
       'EC', 'DO', 'DK', 'DE', 'CZ', 'CR', 'CO', 'CL', 'CH', 'CA', 'BY',
       'BR', 'BO', 'BG', 'BE', 'AU', 'AT', 'AR', 'AE'], dtype=object)

In [65]:
# Resolve a country code to its full country name using pycountry.
def get_country_name(country_code):
    """Return the country name for a two-letter country code.

    Parameters
    ----------
    country_code : Any
        Expected to be a two-letter code string; it is upper-cased before the lookup.

    Returns
    -------
    str
        The ``name`` of the matching pycountry entry, or ``'Global'`` when the input
        is not a string or the lookup yields no match.
    """
    # Non-string input (e.g. a missing value) cannot be looked up.
    if not isinstance(country_code, str):
        return 'Global'

    match = pc.countries.get(alpha_2=country_code.upper())
    return match.name if match else 'Global'

In [67]:
get_country_name('MX')

'Mexico'

In [69]:
# Add the 'country_name' column to the Spotify frame.
# Each distinct country code is resolved once and the result is broadcast through a
# dict lookup, instead of calling get_country_name for every row of the frame.
country_name_by_code = {
    code: get_country_name(code) for code in spotify_data['country'].unique()
}
spotify_data['country_name'] = spotify_data['country'].map(country_name_by_code)

In [70]:
spotify_data['country_name'].unique()

array(['Global', 'South Africa', 'Viet Nam',
       'Venezuela, Bolivarian Republic of', 'Uruguay', 'United States',
       'Ukraine', 'Taiwan, Province of China', 'Türkiye', 'Thailand',
       'El Salvador', 'Slovakia', 'Singapore', 'Sweden', 'Saudi Arabia',
       'Romania', 'Paraguay', 'Portugal', 'Poland', 'Pakistan',
       'Philippines', 'Peru', 'Panama', 'New Zealand', 'Norway',
       'Netherlands', 'Nicaragua', 'Nigeria', 'Malaysia', 'Mexico',
       'Morocco', 'Latvia', 'Luxembourg', 'Lithuania', 'Kazakhstan',
       'Korea, Republic of', 'Japan', 'Italy', 'Iceland', 'India',
       'Israel', 'Ireland', 'Indonesia', 'Hungary', 'Honduras',
       'Hong Kong', 'Guatemala', 'Greece', 'United Kingdom', 'France',
       'Finland', 'Spain', 'Egypt', 'Estonia', 'Ecuador',
       'Dominican Republic', 'Denmark', 'Germany', 'Czechia',
       'Costa Rica', 'Colombia', 'Chile', 'Switzerland', 'Canada',
       'Belarus', 'Brazil', 'Bolivia, Plurinational State of', 'Bulgaria',
       'Be

In [71]:
# Continent code (as returned by pycountry_convert) -> Spanish continent name.
continent_mapping = {
    "AF": "África",
    "NA": "América del Norte",
    "SA": "América del Sur",
    "AS": "Asia",
    "EU": "Europa",
    "OC": "Oceanía",
    "AN": "Antártida"
}


# Resolve a country code to its continent using pycountry_convert.
def get_continent(country_code):
    """Return the Spanish continent name for a two-letter country code.

    Parameters
    ----------
    country_code : Any
        Expected to be a two-letter code string; it is upper-cased before the lookup.

    Returns
    -------
    str
        The name from ``continent_mapping`` for the resolved continent code,
        ``"Desconocido"`` if the resolved code is missing from that mapping, or
        ``"Global"`` when the country code cannot be resolved at all (non-string
        input, or a code the lookup rejects such as the 'Global' placeholder).
    """
    try:
        continent_code = pyc.country_alpha2_to_continent_code(country_code.upper())
    except Exception:
        # Any ordinary lookup failure falls back to "Global". Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return "Global"
    # Uses the single module-level mapping instead of a duplicated local copy.
    return continent_mapping.get(continent_code, "Desconocido")

In [75]:
get_continent('MX')

'América del Norte'

In [77]:
# Add the 'continent' column to the Spotify frame.
# Each distinct country code is resolved once and the result is broadcast through a
# dict lookup, instead of calling get_continent for every row of the frame.
continent_by_code = {
    code: get_continent(code) for code in spotify_data['country'].unique()
}
spotify_data['continent'] = spotify_data['country'].map(continent_by_code)

In [78]:
spotify_data['continent'].unique()

array(['Global', 'África', 'Asia', 'América del Sur', 'América del Norte',
       'Europa', 'Oceanía'], dtype=object)

In [81]:
spotify_data['continent'].value_counts()

continent
Europa               721036
Asia                 425791
América del Norte    250828
América del Sur      249057
África               100054
Oceanía               49688
Global                24900
Name: count, dtype: int64

In [83]:
# Add the previous week's ranking, reconstructed as current rank plus weekly movement.
current_rank = spotify_data['daily_rank']
weekly_change = spotify_data['weekly_movement']
spotify_data['past_week_rank'] = current_rank + weekly_change

In [85]:
# Revision final
spotify_data.sample(5)

Unnamed: 0,main_artist,feat_1,feat_2,feat_3,spotify_id,name,daily_rank,daily_movement,weekly_movement,country,...,liveness,valence,tempo,time_signature,release_year,duration_min,key_name,country_name,continent,past_week_rank
776355,Rauw Alejandro,,,,1FlL8ycld6uVDDyhV7P5FA,Cúrame,34,-10,-3,HN,...,0.112,0.262,102.101,4,2021,2.75,A,Honduras,América del Norte,31
1494131,Alan Gomez,Luck Ra,,,6CIMoDfTsvFVGhFi3v9Izn,Luck Ra | Mission 15,32,2,-8,AR,...,0.171,0.814,160.02,4,2023,2.22,E,Argentina,América del Sur,24
526883,FloyyMenor,,,,42wdz3j7gstgUuhoFLCXqd,PELIGROSA,8,0,5,PY,...,0.0816,0.804,100.025,4,2024,2.25,F#,Paraguay,América del Sur,13
192487,Shoday,Ayo Maff,,,6ovxhOjCU6SzPLtfNnzVQk,Casablanca (feat. Ayo Maff),11,-4,-3,NG,...,0.155,0.578,120.303,4,2024,2.73,Bb,Nigeria,África,8
753785,Chappell Roan,,,,0WbMK4wrZ1wFSty9F7FCgu,"Good Luck, Babe!",14,1,1,LV,...,0.0881,0.785,116.712,4,2024,3.64,B,Latvia,Europa,15


# Filtramos datos de prueba

In [88]:
    # Most recent snapshot date in the data; it is the end of the sample window.
    ultimo_dia = spotify_data['snapshot_date'].max()

    # Start of the window: 30 days before the latest snapshot (change the day count as needed).
    fecha_inicio = ultimo_dia - pd.Timedelta(days=30)

    # Keep the rows whose snapshot date lies inside the window; both ends are inclusive.
    desde_inicio = spotify_data['snapshot_date'] >= fecha_inicio
    hasta_ultimo = spotify_data['snapshot_date'] <= ultimo_dia
    data_clean = spotify_data[desde_inicio & hasta_ultimo]

## Filtramos Datos de solo algunos paises

In [90]:
# Country names (as stored in 'country_name') to keep in the sample.
paises = [
    'United States', 'Portugal', 'Netherlands', 'Mexico', 'Japan', 'Italy',
    'India', 'United Kingdom', 'Spain', 'Egypt', 'Germany', 'Brazil',
    'Australia', 'Argentina', 'Korea, Republic of', 'Greece', 'France', 'Global',
]

# Keep only the rows belonging to one of those countries.
en_paises = data_clean["country_name"].isin(paises)
data_clean = data_clean[en_paises]

In [92]:
data_clean.head()

Unnamed: 0,main_artist,feat_1,feat_2,feat_3,spotify_id,name,daily_rank,daily_movement,weekly_movement,country,...,liveness,valence,tempo,time_signature,release_year,duration_min,key_name,country_name,continent,past_week_rank
0,Playboi Carti,,,,6iycYUk3oB0NPMdaDUrN1w,EVIL J0RDAN,1,49,49,Global,...,0.119,0.0599,154.009,4,2025,3.07,C#,Global,Global,50
1,Playboi Carti,Travis Scott,,,3VdooJLOy4tLxKpnn46SMP,CRUSH (with Travis Scott),2,48,48,Global,...,0.173,0.255,139.936,4,2025,2.89,G,Global,Global,50
2,Playboi Carti,The Weeknd,,,68qeaZhtMZ6abrJCYt6nQn,RATHER LIE (with The Weeknd),3,47,47,Global,...,0.213,0.279,132.991,4,2025,3.49,G#,Global,Global,50
3,Lady Gaga,Bruno Mars,,,7so0lgd0zP2Sbgs2d7a1SZ,Die With A Smile,4,-3,-3,Global,...,0.126,0.498,157.964,3,2025,4.19,F#,Global,Global,1
4,Playboi Carti,,,,3j3SfV4hAcR4XjCvW393Gr,POP OUT,5,45,45,Global,...,0.166,0.156,144.035,4,2025,2.7,F,Global,Global,50


## Exportamos el archivo con datos limpios

In [95]:
# Save the cleaned, filtered dataset as CSV in the working directory;
# index=False leaves the DataFrame index out of the file.
data_clean.to_csv('spotify_clean.csv', index=False)