# Spotify Project

## Top 50 Songs per Spanish Speaking Country

### Lets gather the data!

In [3]:
# Installing the spotipy wrapper
# !pip install spotipy

In [2]:
CLIENT_ID = "2c66af9ffb9b4030921c65338e486980"
CLIENT_SECRET = "c364b5b2741b45c6a497abc0c7e7d9a0"

In [4]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=CLIENT_ID,
                                                           client_secret=CLIENT_SECRET))

In [2]:
import time
import warnings
warnings.filterwarnings("ignore") # Ignore the deprecated warning
import pandas as pd
import sqlite3

In [23]:
# Spanish speaking countries Top 50 playlist_ids (except Cuba)

top_50_playlists = [
    '37i9dQZEVXbNLrliB10ZnX',  # Top 50 - Venezuela
    '37i9dQZEVXbJfdy5b0KP7W',  # Top 50 - Peru
    '37i9dQZEVXbJp9wcIM9Eo5',  # Top 50 - Honduras
    '37i9dQZEVXbISk8kxnzfCq',  # Top 50 - Nicaragua
    '37i9dQZEVXbNOUPGj7tW6T',   # Top 50 - Paraguay
    '37i9dQZEVXbJlM6nvL1nD1',  # Top 50 - Ecuador
    '37i9dQZEVXbOa2lmxNORXQ',  # Top 50 - Colombia
    '37i9dQZEVXbMMy2roB9myp',  # Top 50 - Argentina
    '37i9dQZEVXbO3qyFxbkOE1',  # Top 50 - Mexico
    '37i9dQZEVXbL0GavIqMTeb',   # Top 50 - Chile
    '37i9dQZEVXbMJJi3wgRbAy',  # Top 50 - Uruguay
    '37i9dQZEVXbJqfMFK4d691',  # Top 50 - Bolivia
    '37i9dQZEVXbMZAjGMynsQX',  # Top 50 - Costa Rica
    '37i9dQZEVXbLy5tBFyQvd4',  # Top 50 - Guatemala
    '37i9dQZEVXbKypXHVwk1f0',   # Top 50 - Panama
    '37i9dQZEVXbLxoIml4MYkT',  # Top 50 - El Salvador
    '37i9dQZEVXbKAbrMR8uuf7',  # Top 50 - Dominican Republic
    '37i9dQZEVXbNFJfN1Vw8d9'   # Top 50 - Spain
]

In [8]:
# Function to get tracks data

def collect_data_from_playlists(playlist_ids):
    all_track_data = []
    for playlist_id in playlist_ids:
        tracks = get_playlist_tracks(playlist_id)
        for item in tracks:
            track = item['track']
            track_info = {
                'track_name': track['name'],
                'track_id': track['id'],
                'artist_name': track['artists'][0]['name'],
                'artist_id': track['artists'][0]['id'],
                'album_name': track['album']['name'],
                'album_id': track['album']['id'],
                'release_date': track['album']['release_date'],
                'popularity': track['popularity'],
                'duration_ms': track['duration_ms'],
                'explicit': track['explicit'],
                'playlist_name': playlist_id  # Placeholder, can replace with actual playlist name
            }
            all_track_data.append(track_info)
    return all_track_data

In [36]:
# Function to get artists data

def get_artist_data(artist_ids):
    all_artist_data = []
    for artist_id in artist_ids: 
        try:
            artist = sp.artist(artist_id)
            artist_info = {
                'artist_name': artist['name'],
                'artist_id': artist_id,
                'genres': ','.join(artist['genres']),
                'popularity': artist['popularity'],
                'followers': artist['followers']['total']
            }
            all_artist_data.append(artist_info)
            time.sleep(1)  # Pause to avoid hitting rate limits
        except spotipy.exceptions.SpotifyException as e:
            print(f"Error fetching artist {artist_id}: {e}")
    return all_artist_data

In [25]:
# Collect track data
all_top50_tracks = collect_data_from_playlists(top_50_playlists)

# Create DataFrame for tracks
track_df = pd.DataFrame(all_top50_tracks)

In [20]:
track_df.head()

Unnamed: 0,track_name,track_id,artist_name,artist_id,album_name,album_id,release_date,popularity,duration_ms,explicit,playlist_name
0,Si Antes Te Hubiera Conocido,6WatFBLVB0x077xWeoVc2k,KAROL G,790FomKkXshlbRYZFtlgla,Si Antes Te Hubiera Conocido,5ylbxH7EqpsmHZCRuiYewS,2024-06-21,94,195824,False,37i9dQZEVXbNLrliB10ZnX
1,LUNA,7bywjHOc0wSjGGbj04XbVi,Feid,2LRoIwlKmHjgvigdNGBHNo,FERXXOCALIPSIS,0lgs2Sa82lyX89nBUWyUy6,2023-12-01,91,196800,False,37i9dQZEVXbNLrliB10ZnX
2,Mirame,2btNsI4OvcVl7SAHQQDHFB,Blessd,1TA5sGRlKUJXBN4ZyJuDIX,Mirame,5oCAS6VzHoESewtXUGMqbz,2024-04-17,87,157453,True,37i9dQZEVXbNLrliB10ZnX
3,Orion,5pVJ3IlnpIorU44oXizzG5,Boza,2NfSBtmWe7oPw1EmetJVso,Orion,2FWRFIH8Agbw8JGG4hbdG6,2024-05-29,82,240964,False,37i9dQZEVXbNLrliB10ZnX
4,Ohnana,1fTjqf10accJCDYstPwwZ6,Kapo,3UTF2no3muGdiFXVujl94i,Ohnana,3IdVqIfyLPqxPZusYgifLo,2024-06-06,82,156577,False,37i9dQZEVXbNLrliB10ZnX


In [26]:
track_df.shape

(900, 11)

In [27]:
# Group by 'playlist_name' and count the number of rows per playlist
playlist_counts = track_df.groupby('playlist_name').size().reset_index(name='counts')

# Display the counts
print(playlist_counts)

             playlist_name  counts
0   37i9dQZEVXbISk8kxnzfCq      50
1   37i9dQZEVXbJfdy5b0KP7W      50
2   37i9dQZEVXbJlM6nvL1nD1      50
3   37i9dQZEVXbJp9wcIM9Eo5      50
4   37i9dQZEVXbJqfMFK4d691      50
5   37i9dQZEVXbKAbrMR8uuf7      50
6   37i9dQZEVXbKypXHVwk1f0      50
7   37i9dQZEVXbL0GavIqMTeb      50
8   37i9dQZEVXbLxoIml4MYkT      50
9   37i9dQZEVXbLy5tBFyQvd4      50
10  37i9dQZEVXbMJJi3wgRbAy      50
11  37i9dQZEVXbMMy2roB9myp      50
12  37i9dQZEVXbMZAjGMynsQX      50
13  37i9dQZEVXbNFJfN1Vw8d9      50
14  37i9dQZEVXbNLrliB10ZnX      50
15  37i9dQZEVXbNOUPGj7tW6T      50
16  37i9dQZEVXbO3qyFxbkOE1      50
17  37i9dQZEVXbOa2lmxNORXQ      50


In [28]:
# correcting 'playlist_id' column name
track_df.rename(columns={'playlist_name': 'playlist_id'}, inplace=True)

In [31]:
# Adding a column by mapping the playlist_id to the country
playlist_country_mapping = {
    '37i9dQZEVXbNLrliB10ZnX': 'Venezuela',
    '37i9dQZEVXbJfdy5b0KP7W': 'Peru',
    '37i9dQZEVXbJp9wcIM9Eo5': 'Honduras',
    '37i9dQZEVXbISk8kxnzfCq': 'Nicaragua',
    '37i9dQZEVXbNOUPGj7tW6T': 'Paraguay',
    '37i9dQZEVXbJlM6nvL1nD1': 'Ecuador',
    '37i9dQZEVXbOa2lmxNORXQ': 'Colombia',
    '37i9dQZEVXbMMy2roB9myp': 'Argentina',
    '37i9dQZEVXbO3qyFxbkOE1': 'Mexico',
    '37i9dQZEVXbL0GavIqMTeb': 'Chile',
    '37i9dQZEVXbMJJi3wgRbAy': 'Uruguay',
    '37i9dQZEVXbJqfMFK4d691': 'Bolivia',
    '37i9dQZEVXbMZAjGMynsQX': 'Costa Rica',
    '37i9dQZEVXbLy5tBFyQvd4': 'Guatemala',
    '37i9dQZEVXbKypXHVwk1f0': 'Panama',
    '37i9dQZEVXbLxoIml4MYkT': 'El Salvador',
    '37i9dQZEVXbKAbrMR8uuf7': 'Dominican Republic',
    '37i9dQZEVXbNFJfN1Vw8d9': 'Spain'
}


track_df['country'] = track_df['playlist_id'].map(playlist_country_mapping)

In [89]:
# Formating the release_date as date

track_df['release_date'] = pd.to_datetime(track_df['release_date'])

ValueError: time data "2006" doesn't match format "%Y-%m-%d", at position 60. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [91]:
# Lets see where the error is coming from and fix the date by including the month and year as well

track_df[track_df['release_date'] == '2006']

Unnamed: 0,track_name,track_id,artist_name,artist_id,album_name,album_id,release_date,popularity,duration_ms,explicit,playlist_id,country
138,Todo Cambió,4OyzSXRSadNQt5EMwASdap,Camila,2gRP1Ezbtj3qrERnd0XasU,Todo Cambio,7dTSfhp5okEjaYXp38LwVf,2006,80,193826,False,37i9dQZEVXbJp9wcIM9Eo5,Honduras
173,Todo Cambió,4OyzSXRSadNQt5EMwASdap,Camila,2gRP1Ezbtj3qrERnd0XasU,Todo Cambio,7dTSfhp5okEjaYXp38LwVf,2006,80,193826,False,37i9dQZEVXbISk8kxnzfCq,Nicaragua
274,Todo Cambió,4OyzSXRSadNQt5EMwASdap,Camila,2gRP1Ezbtj3qrERnd0XasU,Todo Cambio,7dTSfhp5okEjaYXp38LwVf,2006,80,193826,False,37i9dQZEVXbJlM6nvL1nD1,Ecuador
638,Todo Cambió,4OyzSXRSadNQt5EMwASdap,Camila,2gRP1Ezbtj3qrERnd0XasU,Todo Cambio,7dTSfhp5okEjaYXp38LwVf,2006,80,193826,False,37i9dQZEVXbMZAjGMynsQX,Costa Rica


In [92]:
# I researched the release date for this track

track_df['release_date'] = track_df['release_date'].replace('2006', '2006-05-09')

In [93]:
track_df['release_date'] = pd.to_datetime(track_df['release_date'])

In [94]:
track_df.head()

Unnamed: 0,track_name,track_id,artist_name,artist_id,album_name,album_id,release_date,popularity,duration_ms,explicit,playlist_id,country
0,Si Antes Te Hubiera Conocido,6WatFBLVB0x077xWeoVc2k,KAROL G,790FomKkXshlbRYZFtlgla,Si Antes Te Hubiera Conocido,5ylbxH7EqpsmHZCRuiYewS,2024-06-21,94,195824,False,37i9dQZEVXbNLrliB10ZnX,Venezuela
1,LUNA,7bywjHOc0wSjGGbj04XbVi,Feid,2LRoIwlKmHjgvigdNGBHNo,FERXXOCALIPSIS,0lgs2Sa82lyX89nBUWyUy6,2023-12-01,91,196800,False,37i9dQZEVXbNLrliB10ZnX,Venezuela
2,Mirame,2btNsI4OvcVl7SAHQQDHFB,Blessd,1TA5sGRlKUJXBN4ZyJuDIX,Mirame,5oCAS6VzHoESewtXUGMqbz,2024-04-17,87,157453,True,37i9dQZEVXbNLrliB10ZnX,Venezuela
3,Orion,5pVJ3IlnpIorU44oXizzG5,Boza,2NfSBtmWe7oPw1EmetJVso,Orion,2FWRFIH8Agbw8JGG4hbdG6,2024-05-29,82,240964,False,37i9dQZEVXbNLrliB10ZnX,Venezuela
4,Ohnana,1fTjqf10accJCDYstPwwZ6,Kapo,3UTF2no3muGdiFXVujl94i,Ohnana,3IdVqIfyLPqxPZusYgifLo,2024-06-06,82,156577,False,37i9dQZEVXbNLrliB10ZnX,Venezuela


In [63]:
# Collecting the artists data
artist_ids = track_df['artist_id'].unique()

# Applying the function
all_artist_data = get_artist_data(artist_ids)

# Create DataFrame for artists
artist_df = pd.DataFrame(all_artist_data)


In [39]:
artist_df.head()

Unnamed: 0,artist_name,artist_id,genres,popularity,followers
0,KAROL G,790FomKkXshlbRYZFtlgla,"reggaeton,reggaeton colombiano,trap latino,urb...",90,48499948
1,Feid,2LRoIwlKmHjgvigdNGBHNo,"colombian pop,pop reggaeton,reggaeton,reggaeto...",91,12359978
2,Blessd,1TA5sGRlKUJXBN4ZyJuDIX,"reggaeton,urbano latino",79,4328445
3,Boza,2NfSBtmWe7oPw1EmetJVso,"panamanian pop,reggaeton,urbano latino",69,536852
4,Kapo,3UTF2no3muGdiFXVujl94i,,66,37508


In [64]:
artist_df.shape

(176, 5)

In [43]:
artist_df.genres.unique()

array(['reggaeton,reggaeton colombiano,trap latino,urbano latino',
       'colombian pop,pop reggaeton,reggaeton,reggaeton colombiano,trap latino,urbano latino',
       'reggaeton,urbano latino',
       'panamanian pop,reggaeton,urbano latino', '', 'k-pop',
       'reggaeton,reggaeton flow,trap latino,urbano latino',
       'argentine hip hop,bases de freestyle,trap argentino,trap latino,urbano latino',
       'reggaeton,trap latino,urbano latino',
       'pop venezolano,reggaeton,trap latino,urbano latino',
       'trap latino,urbano latino',
       'colombian pop,latin pop,reggaeton,reggaeton colombiano,trap latino,urbano latino',
       'reggaeton chileno', 'venezuelan hip hop,venezuelan rock',
       'trap boricua,trap latino,urbano latino', 'dembow dominicano',
       'trap venezolano',
       'puerto rican pop,reggaeton,trap latino,urbano latino',
       'trap boricua,urbano latino',
       'rap canario,trap latino,urbano latino', 'r&b en espanol',
       'colombian hip hop,mexic

In [65]:
# Lets create a new df with one genre per row

# Split the 'genres' column into separate rows
genres_split = artist_df['genres'].str.split(',', expand=True)

In [68]:
# Stack the split columns into a single column and drop missing values
stacked_genres = genres_split.stack().reset_index(level=1, drop=True)

In [69]:
stacked_genres

0                 reggaeton
0      reggaeton colombiano
0               trap latino
0             urbano latino
1             colombian pop
               ...         
174               reggaeton
174    reggaeton colombiano
174           urbano latino
175             rap canario
175          urbano espanol
Length: 417, dtype: object

In [70]:
# Create a new DataFrame with the artist_id and genres
artist_genre_df = pd.DataFrame({
    'artist_name': artist_df['artist_name'].repeat(genres_split.notna().sum(axis=1)),
    'artist_id': artist_df['artist_id'].repeat(genres_split.notna().sum(axis=1)),
    'genre': stacked_genres
}).reset_index(drop=True)

In [74]:
artist_genre_df.head()

Unnamed: 0,artist_name,artist_id,genre
0,KAROL G,790FomKkXshlbRYZFtlgla,reggaeton
1,KAROL G,790FomKkXshlbRYZFtlgla,reggaeton colombiano
2,KAROL G,790FomKkXshlbRYZFtlgla,trap latino
3,KAROL G,790FomKkXshlbRYZFtlgla,urbano latino
4,Feid,2LRoIwlKmHjgvigdNGBHNo,colombian pop


In [72]:
artist_genre_df['genre'].value_counts()

genre
urbano latino                56
trap latino                  43
reggaeton                    30
                             22
latin pop                    15
                             ..
rap underground argentino     1
alternative r&b               1
pop boliviano                 1
drill chileno                 1
persian electronic            1
Name: count, Length: 101, dtype: int64

In [110]:
# Lets save the dfs as csv files

# Save to CSV
# artist_df.to_csv('artists_data.csv', index=False)
# print("Artist data saved to 'artists_data.csv'")

In [111]:
# Save to CSV
# track_df.to_csv('track_data.csv', index=False)
# print("Track data saved to 'track_data.csv'")

In [112]:
# Save to CSV
# artist_genre_df.to_csv('artist_genre_df.csv', index=False)
# print("Track data saved to 'artist_genre_df.csv'")

Lets connect to MySQL!

In [75]:
import pymysql
from sqlalchemy import create_engine, text
import getpass  # To get the password without showing the input
password = getpass.getpass()


In [80]:
# Define the server connection string (without a specific database)
server_connection_string = f'mysql+pymysql://root:{password}@localhost/'

In [81]:
# Create an engine and connect to the MySQL server
server_engine = create_engine(server_connection_string)

Lets create the DB and the tables!

In [82]:
# Create a new database
database_name = 'spotify_project'

with server_engine.connect() as connection:
    connection.execute(text(f"CREATE DATABASE IF NOT EXISTS {database_name};"))
    print(f"Database '{database_name}' created successfully.")

Database 'spotify_project' created successfully.


In [83]:
new_db_connection_string = f'mysql+pymysql://root:{password}@localhost/{database_name}'
new_db_engine = create_engine(new_db_connection_string)

In [86]:
create_tracks_table = '''
CREATE TABLE IF NOT EXISTS tracks (
    id INT AUTO_INCREMENT PRIMARY KEY,
    track_name VARCHAR(255) NOT NULL,
    track_id VARCHAR(255) NOT NULL,
    artist_name VARCHAR(255) NOT NULL,
    artist_id VARCHAR(255) NOT NULL,
    album_name VARCHAR(255) NOT NULL,
    album_id VARCHAR(255) NOT NULL,
    release_date DATE NOT NULL,
    popularity INT NOT NULL,
    duration_ms INT NOT NULL,
    explicit BOOLEAN NOT NULL,
    playlist_id VARCHAR(255) NOT NULL,
    country VARCHAR(255) NOT NULL
);
'''

# Execute the SQL statement 
with new_db_engine.connect() as connection:
    connection.execute(text(create_tracks_table))
    print("Table 'tracks' created successfully.")

Table 'tracks' created successfully.


In [96]:
# Insert the DataFrame into the SQL table
# track_df.to_sql('tracks', new_db_engine, if_exists='append', index=False)

900

In [98]:
create_artists_table = '''
CREATE TABLE IF NOT EXISTS artists (
    artist_id VARCHAR(255) PRIMARY KEY,
    artist_name VARCHAR(255) NOT NULL,
    genres TEXT NOT NULL,
    popularity INT NOT NULL,
    followers INT NOT NULL
);
'''

In [99]:
with new_db_engine.connect() as connection:
    connection.execute(text(create_artists_table))
    print("Table 'artists' created successfully.")

Table 'artists' created successfully.


In [101]:
# artist_df.to_sql('artists', new_db_engine, if_exists='append', index=False)

176

In [108]:
# Lets drop the artist_name from the df 

artist_genre_df1 = artist_genre_df.drop(columns=['artist_name'])

In [105]:
create_artist_genres_table = '''
CREATE TABLE IF NOT EXISTS artist_genres (
    id INT AUTO_INCREMENT PRIMARY KEY,
    artist_id VARCHAR(255) NOT NULL,
    genre VARCHAR(255) NOT NULL,
    FOREIGN KEY (artist_id) REFERENCES artists(artist_id)
);
'''

In [106]:
with new_db_engine.connect() as connection:
    connection.execute(text(create_artist_genres_table))
    print("Table 'artist_genres' created successfully.")

Table 'artist_genres' created successfully.


In [109]:
artist_genre_df1.to_sql('artist_genres', new_db_engine, if_exists='append', index=False)

print("Data inserted successfully.")

Data inserted successfully.
