Importación de datos

In [1]:
!pip install scipy scikit-learn numpy pandas




[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests
import re
import string
from sklearn.feature_extraction.text import CountVectorizer

def remove_numbers(text):
    text = re.sub(r"[0-9]", " ", text)
    return " ".join(text.split())

def remove_unprintable_(text):
    printable = set(string.printable + "ñáéíóúü" + "ÑÁÉÍÓÚÜ")
    text = "".join(filter(lambda x: x in printable, text))
    return text

def remove_punctuation(text):
    pattern = re.compile(r"[^\w\sáéíóúüñÁÉÍÓÚÜÑ]")
    t = pattern.sub(r" ", text)
    return re.sub(" +", " ", t)

def reduce_spam(text):
    text = re.sub(r"(\w+)(\s+\1){2,}", r"\1", text)
    text = re.sub(r"(\w+\s+\w+)(\s+\1){2,}", r"\1", text)
    return text

def remove_vowels_accents(text):
    return (
        text.replace("á", "a")
        .replace("é", "e")
        .replace("í", "i")
        .replace("ó", "o")
        .replace("ú", "u")
        .replace("ü", "u")
    )

def remove_stopwords(text, stopwords_list):
    return " ".join(
        [word for word in str(text).split() if word not in stopwords_list]
    )

url = "https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt"
r = requests.get(url)

stopwords_list = r.text.splitlines()

def preprocesar(text):
    text = text.lower()
    text = remove_numbers(text)
    text = remove_unprintable_(text)
    text = remove_punctuation(text)
    text = reduce_spam(text)
    text = remove_stopwords(text, stopwords_list)
    text = remove_vowels_accents(text)
    return text.strip()

In [None]:
import pandas as pd
df_spotify = pd.read_csv("Spotify.csv")

Redimensionando los parámetros

In [None]:
#Ajustamos columnas para contraarrestar error de formato en archivo xlsx
df_spotify['duration_ms'] = df_spotify['duration_ms']/10
df_spotify['popularity'] = df_spotify['popularity']/10
df_spotify['streams'] = df_spotify['streams']/10
df_spotify['af_danceability'] = df_spotify['af_danceability']/1000
df_spotify['af_energy'] = df_spotify['af_energy']/1000
df_spotify['af_key'] = df_spotify['af_key']/10
df_spotify['af_loudness'] = df_spotify['af_loudness']/1000
df_spotify['af_speechiness'] = df_spotify['af_speechiness']/1000
df_spotify['af_acousticness'] = df_spotify['af_acousticness']/1000
df_spotify['af_instrumentalness'] = df_spotify['af_instrumentalness']/1000
df_spotify['af_liveness'] = df_spotify['af_liveness']/1000
df_spotify['af_valence'] = df_spotify['af_valence']/1000
df_spotify['af_tempo'] = df_spotify['af_tempo']/1000
df_spotify['af_time_signature'] = df_spotify['af_time_signature']/10

La idea del experimento asociado a la pregunta 3 es predecir la popularidad de la canción considerando su composición en términos numéricos, esto es considerando: valencia, tempo, acordes, ritmo, duración, etc. Se considera que hay información que no es relevante para la predicción como “url”, “track”, “album”, etc. Esta información no está incluida en el análisis, y debido a que el nombre del artista es una variable que influye directamente en la popularidad esta se excluye también. La única variable no numérica es “region”. 
Antes de realizar el preprocesamiento y posterior entrenamiento se determinan rangos de popularidad según los cuartiles de la popularidad en el dataset, con la intención de clasificar los datos en rangos de popularidad.
Se realiza un preprocesamiento para vectorizar el texto y se estandarizan los valores numéricos, se utiliza como modelo un random forest y todo esto se implementa en un pipeline como se detalla a continuación. 


Quitamos las columnas de con información no relevante

In [None]:
df=df_spotify.drop(columns=['date','url','chart', 'track_id','available_markets','Año','Día','Column1','release_date','explicit','title','artist','trend','album','af_mode','rank','af_time_signature'])

Calculamos una medida para clasificar la popularidad por rangos

In [6]:
cuartiles = df['popularity'].quantile([0.25, 0.5, 0.75])

Seleccionamos una muestra del dataset

In [7]:
df_2000 = df.head(2000)

Definimos otra forma de escribir la popularidad

In [8]:
new_pop = df_2000['popularity']

Clasificamos la popularidad en rangos

In [9]:
for index, row in df_2000.iterrows():
    if row['popularity']>=cuartiles[0.75]:
        new_pop.iat[index] = 'High'
    elif row['popularity']>=cuartiles[0.50]:
        new_pop.iat[index] = 'Normal'
    elif row['popularity']>=cuartiles[0.25]:
        new_pop.iat[index] = 'Normal-Low'
    else:
        new_pop.iat[index] = 'Low'

  new_pop.iat[index] = 'Normal-Low'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'Normal-Low'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'Low'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'Normal'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'High'


Reemplazamos la columna de popularidad

In [10]:
df_2000['popularity'].astype(str)
df_2000['popularity'] = new_pop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2000['popularity'] = new_pop


Cambiamos la nueva clasificación a un número

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_2000['popularity'] = le.fit_transform(df_2000['popularity'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2000['popularity'] = le.fit_transform(df_2000['popularity'])


In [12]:
from sklearn.preprocessing import StandardScaler
features = ['region','streams','duration_ms', 'af_danceability', 'af_energy', 'af_key', 'af_loudness', 'af_speechiness', 
            'af_acousticness', 'af_instrumentalness', 'af_liveness', 'af_valence', 'af_tempo']

Preparamos el dataset para entrenar

In [13]:
X = df_2000[features]
y = df_2000['popularity']

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=0, stratify=y_val_and_test)

Generamos un pipeline para:
- Transformar texto
- Estandarizar los datos
- Aplicar un clasificador (random forest)

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
import numpy as np

# Define tus columnas
text_feature = 'region'
numeric_features = ['streams', 'duration_ms', 'af_danceability', 'af_energy', 'af_key', 
                    'af_loudness', 'af_speechiness', 'af_acousticness', 'af_instrumentalness', 
                    'af_liveness', 'af_valence', 'af_tempo']

# Crear la pipeline completa
clf_pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('text', Pipeline([
                ('vectorizer', CountVectorizer(min_df=1, preprocessor = preprocesar)),
                ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
            ]), text_feature),
            ('num', StandardScaler(), numeric_features)
        ])),
    ('classifier', RandomForestClassifier(random_state=42))
])

Entrenamos

In [16]:
clf_pipeline.fit(X_train, y_train)

predecimos 

In [17]:
y_pred = clf_pipeline.predict(X_test)

evaluamos la predicción

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
target_names = ['High', 'Low', 'Medium-High', 'Medium-Low']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        High       0.93      1.00      0.97       115
         Low       0.77      0.74      0.75        54
      Normal       0.94      0.90      0.92        67
  Normal-Low       0.75      0.72      0.74        64

    accuracy                           0.87       300
   macro avg       0.85      0.84      0.84       300
weighted avg       0.87      0.87      0.87       300



Probaremos reducir la cantidad de parámetros a utilizar y probar distintos modelos de clasificación, considerando todo el dataset y no solo una parte de él. 

Replanteamiento de la pregunta

¿Cuál será la popularidad de una canción en una región debido a sus atributos?
