# Experimentos:

### Pregunta 1:
- ¿Es posible predecir la felicidad (positividad) de una canción en función de la popularidad (u otros parametros)?

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df_spotify = pd.read_excel('Spotify.xlsx')

#Ajustamos columnas para contraarrestar error de formato en archivo xlsx
df_spotify['duration_ms'] = df_spotify['duration_ms']/10
df_spotify['popularity'] = df_spotify['popularity']/10
df_spotify['streams'] = df_spotify['streams']/10
df_spotify['af_danceability'] = df_spotify['af_danceability']/1000
df_spotify['af_energy'] = df_spotify['af_energy']/1000
df_spotify['af_key'] = df_spotify['af_key']/10
df_spotify['af_loudness'] = df_spotify['af_loudness']/1000
df_spotify['af_speechiness'] = df_spotify['af_speechiness']/1000
df_spotify['af_acousticness'] = df_spotify['af_acousticness']/1000
df_spotify['af_instrumentalness'] = df_spotify['af_instrumentalness']/1000
df_spotify['af_liveness'] = df_spotify['af_liveness']/1000
df_spotify['af_valence'] = df_spotify['af_valence']/1000
df_spotify['af_tempo'] = df_spotify['af_tempo']/1000
df_spotify['af_time_signature'] = df_spotify['af_time_signature']/10

## Prediciendo con solo popularidad

En una primera instancia, experimentaremos sólo usando los atributos "streams" y "popularity" para intentar predecir "af_valence" (El cual representa la positividad o felicidad). Esto es porque de todos lo atributos, son estos dos los que se asocian con la "popularidad" de una canción dada.

In [5]:
df_util_1 = df_spotify[["streams", "popularity", "af_valence"]].copy()

In [6]:
df_util_1

Unnamed: 0,streams,popularity,af_valence
0,28838.0,44.0,0.251
1,22249.0,1.0,0.393
2,218751.0,64.0,0.822
3,193855.0,74.0,0.453
4,179042.0,72.0,0.055
...,...,...,...
1009045,11984.0,22.0,0.855
1009046,11904.0,53.0,0.025
1009047,11894.0,45.0,0.227
1009048,11751.0,0.0,0.669


Definimos una función que nos permitirá convertir los valores numéricos reales del atributo "af_valence" en etiquetas de texto mediante intervalos.

In [7]:
def apply_etiqueta(elemento):
    if (elemento <= 0.25):
        return "Low"
    elif (elemento > 0.25) & (elemento <= 0.5):
        return "Medium-Low"
    elif (elemento > 0.5) & (elemento <= 0.75):
        return "Medium-High"
    else:
        return "High"

Procedemos a separar las etiquetas de los datos y a aplicar un scaler.

In [8]:
from sklearn.preprocessing import StandardScaler

df_etiquetado_1 = df_util_1.copy()

df_etiquetado_1["af_valence"] = df_etiquetado_1["af_valence"].apply(apply_etiqueta)
X_1 = df_etiquetado_1.drop(columns = 'af_valence')
y_1 = df_etiquetado_1["af_valence"]

X_scaled_1 = pd.DataFrame(StandardScaler().fit_transform(X_1), columns = X_1.columns)

In [9]:
X_scaled_1

Unnamed: 0,streams,popularity
0,-0.209665,-0.391136
1,-0.228652,-1.953110
2,0.337597,0.335363
3,0.265856,0.698613
4,0.223170,0.625963
...,...,...
1009045,-0.258232,-1.190286
1009046,-0.258463,-0.064212
1009047,-0.258492,-0.354812
1009048,-0.258904,-1.989435


Hacemos la separación del conjunto de datos de entrenamiento y el de validación:

In [10]:
from sklearn.model_selection import train_test_split

X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_scaled_1, y_1, test_size=0.3, random_state=0, stratify=y_1)

### Dummy Classifier

Para poder comparar los resultados obtenidos con otros clasificadores con un baseline conocido, ejecutamos una predicción con un dummy classifier.

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy = 'stratified')

dummy_clf.fit(X_train_1, y_train_1)

y_pred_1 = dummy_clf.predict(X_val_1)

kn_acc = accuracy_score(y_val_1, y_pred_1)
print(classification_report(y_val_1, y_pred_1))


              precision    recall  f1-score   support

        High       0.19      0.19      0.19     58190
         Low       0.18      0.18      0.18     52533
 Medium-High       0.36      0.36      0.36    109647
  Medium-Low       0.27      0.27      0.27     82345

    accuracy                           0.27    302715
   macro avg       0.25      0.25      0.25    302715
weighted avg       0.27      0.27      0.27    302715



### K-Nearest Neighbors

Partimos haciendo una búsqueda para encontrar los parámetros mas idóneos con GridSearchCV

In [None]:
from sklearn.neighbors import KNeighborsClassifier

np.random.seed(42)

tuned_parameters = {'n_neighbors': list(range(1, 16, 1)), 'weights': ['uniform', 'distance']}

score = 'f1_macro'

cls = KNeighborsClassifier(n_jobs = -1)

clf = GridSearchCV(cls, param_grid = tuned_parameters, scoring = score, cv = 5)

clf.fit(X_train_1, y_train_1)

print("Mejor combinación de parámetros:")
print(clf.best_params_)

In [None]:
kn_clf_15 = KNeighborsClassifier(n_neighbors=15, weights = 'uniform', n_jobs = -1)

kn_clf_15.fit(X_train_1, y_train_1)

y_pred_1 = kn_clf_15.predict(X_val_1)

kn_acc = accuracy_score(y_val_1, y_pred_1)
print(classification_report(y_val_1, y_pred_1))

              precision    recall  f1-score   support

        High       0.42      0.40      0.41     58190
         Low       0.39      0.30      0.34     52533
 Medium-High       0.50      0.60      0.55    109647
  Medium-Low       0.49      0.44      0.46     82345

    accuracy                           0.47    302715
   macro avg       0.45      0.44      0.44    302715
weighted avg       0.46      0.47      0.46    302715



Los resultados no son terribles, considerando que son bastante mejores que dummy, pero tampoco son buenos. En ningun caso logramos scores mayores a 0.5

### Decision Tree

Nuevamente optimizamos los parametros de entrada con GridSearchCV

In [None]:
from sklearn.tree import DecisionTreeClassifier

np.random.seed(42)

tuned_parameters = {'max_depth': list(range(1, 16, 1)), 'criterion': ['gini', 'entropy']}
score = 'f1_macro'
cls = DecisionTreeClassifier()

clf = GridSearchCV(cls, param_grid = tuned_parameters, scoring = score, cv = 5)

clf.fit(X_train_1, y_train_1)

print("Mejor combinación de parámetros:")
print(clf.best_params_)

Mejor combinación de parámetros:
{'criterion': 'gini', 'max_depth': 15}


In [None]:
dtree_clf = DecisionTreeClassifier(max_depth = 15, criterion = 'gini')

dtree_clf.fit(X_train_1, y_train_1)

y_pred_1 = dtree_clf.predict(X_val_1)

kn_acc = accuracy_score(y_val_1, y_pred_1)
print(classification_report(y_val_1, y_pred_1))

              precision    recall  f1-score   support

        High       0.47      0.37      0.41     58190
         Low       0.49      0.20      0.29     52533
 Medium-High       0.49      0.65      0.56    109647
  Medium-Low       0.47      0.50      0.49     82345

    accuracy                           0.48    302715
   macro avg       0.48      0.43      0.44    302715
weighted avg       0.48      0.48      0.46    302715



Obtenemos resultados similares a los conseguidos con K-Nearest Neighbors, por lo que tampoco son excelentes.

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

nb_clf = GaussianNB()

nb_clf.fit(X_train_1, y_train_1)

y_pred_1 = nb_clf.predict(X_val_1)

kn_acc = accuracy_score(y_val_1, y_pred_1)
print(classification_report(y_val_1, y_pred_1))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        High       0.22      0.02      0.03     58190
         Low       0.00      0.00      0.00     52533
 Medium-High       0.37      0.85      0.52    109647
  Medium-Low       0.29      0.17      0.21     82345

    accuracy                           0.36    302715
   macro avg       0.22      0.26      0.19    302715
weighted avg       0.26      0.36      0.25    302715



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Aqui los resultados son terribles, incluso peores que el dummy classifier. Hay etiquetas que incluso no recibieron ninguna prediccion. Este clasificador se descarta para el conjunto de datos usado.

### Support Vector Machines

In [None]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC()

svm_clf.fit(X_train_1, y_train_1)

y_pred_1 = svm_clf.predict(X_val_1)

kn_acc = accuracy_score(y_val_1, y_pred_1)
print(classification_report(y_val_1, y_pred_1))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        High       0.29      0.00      0.00     58190
         Low       0.00      0.00      0.00     52533
 Medium-High       0.36      1.00      0.53    109647
  Medium-Low       0.00      0.00      0.00     82345

    accuracy                           0.36    302715
   macro avg       0.16      0.25      0.13    302715
weighted avg       0.19      0.36      0.19    302715



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Los resultados aqui iguales o peores que los obtenidos con Naive Bayes. Este clasificador no sirve para este conjunto de datos.

### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(n_jobs = -1)

sgd_clf.fit(X_train_1, y_train_1)

y_pred_1 = sgd_clf.predict(X_val_1)

kn_acc = accuracy_score(y_val_1, y_pred_1)
print(classification_report(y_val_1, y_pred_1))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        High       0.00      0.00      0.00     58190
         Low       0.00      0.00      0.00     52533
 Medium-High       0.36      1.00      0.53    109647
  Medium-Low       0.00      0.00      0.00     82345

    accuracy                           0.36    302715
   macro avg       0.09      0.25      0.13    302715
weighted avg       0.13      0.36      0.19    302715



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


En un intento de encontrar un clasificador que funcione, probamos el uso de SGD (Stocastic Gradient Descent). Obtenemos resultados terribles que no nos sirven para la situacion actual.

Dado el conjunto limitado de atributos, el clasificador con el mejor desempeño fue Decision Tree. Sin embargo, los resultados dejan mucho que desear. Podemos concluir a priori que basados solo en los atributos asociados a la "Popularidad" no es posible hacer una prediccion de la positividad de una canción, o al menos no una que sea aceptable.

## Ampliando el uso de atributos

Dado el mal desempeño anterior, procederemos a ampliar el campo de atributos a utilizar.

Esta vez ademas de "streams" y "popularity", utilizaremos "af_danceability", "af_energy", "af_key", "af_loudness", "af_mode", "af_acousticness"

In [None]:
df_util_2 = df_spotify[["streams", "popularity", "af_danceability", "af_energy", "af_key", "af_loudness", "af_mode", "af_acousticness", "af_valence"]].copy()

In [None]:
df_util_2

Unnamed: 0,streams,popularity,af_danceability,af_energy,af_key,af_loudness,af_mode,af_acousticness,af_valence
0,28838.0,44.0,0.068,0.411,11.0,-10.319,0,0.043,0.251
1,22249.0,1.0,0.611,0.688,1.0,-5.688,10,0.264,0.393
2,218751.0,64.0,0.606,0.853,9.0,-2.975,10,0.237,0.822
3,193855.0,74.0,0.086,0.758,11.0,-0.516,10,0.021,0.453
4,179042.0,72.0,0.795,0.542,6.0,-8.106,0,0.903,0.055
...,...,...,...,...,...,...,...,...,...
1009045,11984.0,22.0,0.824,0.823,11.0,-2.718,0,0.197,0.855
1009046,11904.0,53.0,0.051,0.375,9.0,-9.185,10,0.813,0.025
1009047,11894.0,45.0,0.534,0.499,9.0,-10.601,0,0.416,0.227
1009048,11751.0,0.0,0.735,0.824,2.0,-3.483,0,0.706,0.669


Hacemos el mismo proceso de etiquetado y escalado de la información que en la primera iteración

In [None]:
df_etiquetado_2 = df_util_2.copy()

df_etiquetado_2["af_valence"] = df_etiquetado_2["af_valence"].apply(apply_etiqueta)
X_2 = df_etiquetado_2.drop(columns = 'af_valence')
y_2 = df_etiquetado_2["af_valence"]

X_scaled_2 = pd.DataFrame(StandardScaler().fit_transform(X_2), columns = X_2.columns)

In [None]:
X_scaled_2

Unnamed: 0,streams,popularity,af_danceability,af_energy,af_key,af_loudness,af_mode,af_acousticness
0,-0.209665,-0.391136,-2.345716,-0.740317,1.487257,-1.676977,-1.086527,-1.226130
1,-0.228652,-1.953110,-0.088171,0.483934,-1.217273,-0.031738,0.920363,-0.374582
2,0.337597,0.335363,-0.108959,1.213180,0.946351,0.932100,0.920363,-0.478617
3,0.265856,0.698613,-2.270880,0.793311,1.487257,1.805701,0.920363,-1.310900
4,0.223170,0.625963,0.676817,-0.161339,0.134992,-0.890772,-1.086527,2.087586
...,...,...,...,...,...,...,...,...
1009045,-0.258232,-1.190286,0.797385,1.080590,1.487257,1.023404,-1.086527,-0.632744
1009046,-0.258463,-0.064212,-2.416394,-0.899425,0.946351,-1.274105,0.920363,1.740801
1009047,-0.258492,-0.354812,-0.408302,-0.351385,0.946351,-1.777162,-1.086527,0.211098
1009048,-0.258904,-1.989435,0.427364,1.085010,-0.946820,0.751625,-1.086527,1.328514


Separamos los conjuntos de entrenamiento y validación

In [None]:
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(X_scaled_2, y_2, test_size=0.3, random_state=0, stratify=y_2)

### K-nearest neighbors

Comenzamos con una busqueda y optimizacion de parametros con GridSearchCV

In [None]:
np.random.seed(42)

tuned_parameters = {'n_neighbors': list(range(1, 16, 1)), 'weights': ['uniform', 'distance']}

score = 'f1_macro'

cls = KNeighborsClassifier(n_jobs = -1)

clf = GridSearchCV(cls, param_grid = tuned_parameters, scoring = score, cv = 5)

clf.fit(X_train_2, y_train_2)

print("Mejor combinación de parámetros:")
print(clf.best_params_)

Mejor combinación de parámetros:
{'n_neighbors': 1, 'weights': 'uniform'}


In [None]:
kn_clf_1 = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', n_jobs = -1)

kn_clf_1.fit(X_train_2, y_train_2)

y_pred_2 = kn_clf_1.predict(X_val_2)

kn_acc = accuracy_score(y_val_2, y_pred_2)
print(classification_report(y_val_2, y_pred_2))

              precision    recall  f1-score   support

        High       1.00      1.00      1.00     58190
         Low       1.00      1.00      1.00     52533
 Medium-High       1.00      1.00      1.00    109647
  Medium-Low       1.00      1.00      1.00     82345

    accuracy                           1.00    302715
   macro avg       1.00      1.00      1.00    302715
weighted avg       1.00      1.00      1.00    302715



Tras correr este experimento, es claro que o la informacion provista tiene correlaciones directas con la informacion buscada, o que este modelo esta demasiado over-fitted a la muestra de datos entregada.

Dado lo anterior, vamos a reducir el scope de los atributos utilizados:

Esta vez sólo utilizaremos: "streams", "popularity", "af_loudness" y "af_tempo"

In [None]:
df_util_3 = df_spotify[["streams", "popularity", "af_loudness", "af_tempo", "af_valence"]].copy()

In [None]:
df_util_3

Unnamed: 0,streams,popularity,af_loudness,af_tempo,af_valence
0,28838.0,44.0,-10.319,115.024,0.251
1,22249.0,1.0,-5.688,178.462,0.393
2,218751.0,64.0,-2.975,178.043,0.822
3,193855.0,74.0,-0.516,97.014,0.453
4,179042.0,72.0,-8.106,167.823,0.055
...,...,...,...,...,...
1009045,11984.0,22.0,-2.718,140.014,0.855
1009046,11904.0,53.0,-9.185,132.552,0.025
1009047,11894.0,45.0,-10.601,91.954,0.227
1009048,11751.0,0.0,-3.483,95.972,0.669


Repetimos el proceso de etiquetado y escalado de la data

In [None]:
df_etiquetado_3 = df_util_3.copy()

df_etiquetado_3["af_valence"] = df_etiquetado_3["af_valence"].apply(apply_etiqueta)
X_3 = df_etiquetado_3.drop(columns = 'af_valence')
y_3 = df_etiquetado_3["af_valence"]

X_scaled_3 = pd.DataFrame(StandardScaler().fit_transform(X_3), columns = X_3.columns)

In [None]:
X_scaled_3

Unnamed: 0,streams,popularity,af_loudness,af_tempo
0,-0.209665,-0.391136,-1.676977,0.027485
1,-0.228652,-1.953110,-0.031738,1.497668
2,0.337597,0.335363,0.932100,1.487958
3,0.265856,0.698613,1.805701,-0.389899
4,0.223170,0.625963,-0.890772,1.251108
...,...,...,...,...
1009045,-0.258232,-1.190286,1.023404,0.606631
1009046,-0.258463,-0.064212,-1.274105,0.433699
1009047,-0.258492,-0.354812,-1.777162,-0.507165
1009048,-0.258904,-1.989435,0.751625,-0.414047


In [None]:
X_train_3, X_val_3, y_train_3, y_val_3 = train_test_split(X_scaled_3, y_3, test_size=0.3, random_state=0, stratify=y_3)

Establecemos un baseline para evaluar los clasificadores:

### Dummy Classifier

In [None]:
dummy_clf = DummyClassifier(strategy = 'stratified')

dummy_clf.fit(X_train_3, y_train_3)

y_pred_3 = dummy_clf.predict(X_val_3)

kn_acc = accuracy_score(y_val_3, y_pred_3)
print(classification_report(y_val_3, y_pred_3))

              precision    recall  f1-score   support

        High       0.19      0.19      0.19     58190
         Low       0.17      0.17      0.17     52533
 Medium-High       0.36      0.36      0.36    109647
  Medium-Low       0.27      0.27      0.27     82345

    accuracy                           0.27    302715
   macro avg       0.25      0.25      0.25    302715
weighted avg       0.27      0.27      0.27    302715



### K-nearest neighbors

In [None]:
np.random.seed(42)

tuned_parameters = {'n_neighbors': list(range(1, 16, 1)), 'weights': ['uniform', 'distance']}

score = 'f1_macro'

cls = KNeighborsClassifier(n_jobs = -1)

clf = GridSearchCV(cls, param_grid = tuned_parameters, scoring = score, cv = 5)

clf.fit(X_train_3, y_train_3)

print("Mejor combinación de parámetros:")
print(clf.best_params_)

Mejor combinación de parámetros:
{'n_neighbors': 1, 'weights': 'uniform'}


In [None]:
kn_clf_1 = KNeighborsClassifier(n_neighbors=1, weights = 'uniform', n_jobs = -1)

kn_clf_1.fit(X_train_3, y_train_3)

y_pred_3 = kn_clf_1.predict(X_val_3)

kn_acc = accuracy_score(y_val_3, y_pred_3)
print(classification_report(y_val_3, y_pred_3))

              precision    recall  f1-score   support

        High       0.99      0.99      0.99     58190
         Low       0.99      0.99      0.99     52533
 Medium-High       0.99      0.99      0.99    109647
  Medium-Low       0.99      0.99      0.99     82345

    accuracy                           0.99    302715
   macro avg       0.99      0.99      0.99    302715
weighted avg       0.99      0.99      0.99    302715



Aun habiendo reducido el scope de atributos utilizados, el resultado esta demasiado ajustado a los datos.

En un ultimo intento por mejorar la situación, vamos a remover el atributo "af_loudness" y probar como sigue la situación:

In [None]:
df_util_4 = df_spotify[["streams", "popularity", "af_tempo", "af_valence"]].copy()

In [None]:
df_util_4

Unnamed: 0,streams,popularity,af_tempo,af_valence
0,28838.0,44.0,115.024,0.251
1,22249.0,1.0,178.462,0.393
2,218751.0,64.0,178.043,0.822
3,193855.0,74.0,97.014,0.453
4,179042.0,72.0,167.823,0.055
...,...,...,...,...
1009045,11984.0,22.0,140.014,0.855
1009046,11904.0,53.0,132.552,0.025
1009047,11894.0,45.0,91.954,0.227
1009048,11751.0,0.0,95.972,0.669


Nuevamente etiquetamos y escalamos los datos

In [None]:
df_etiquetado_4 = df_util_4.copy()

df_etiquetado_4["af_valence"] = df_etiquetado_4["af_valence"].apply(apply_etiqueta)
X_4 = df_etiquetado_4.drop(columns = 'af_valence')
y_4 = df_etiquetado_4["af_valence"]

X_scaled_4 = pd.DataFrame(StandardScaler().fit_transform(X_4), columns = X_4.columns)

In [None]:
X_scaled_4

Unnamed: 0,streams,popularity,af_tempo
0,-0.209665,-0.391136,0.027485
1,-0.228652,-1.953110,1.497668
2,0.337597,0.335363,1.487958
3,0.265856,0.698613,-0.389899
4,0.223170,0.625963,1.251108
...,...,...,...
1009045,-0.258232,-1.190286,0.606631
1009046,-0.258463,-0.064212,0.433699
1009047,-0.258492,-0.354812,-0.507165
1009048,-0.258904,-1.989435,-0.414047


In [None]:
X_train_4, X_val_4, y_train_4, y_val_4 = train_test_split(X_scaled_4, y_4, test_size=0.3, random_state=0, stratify=y_4)

Volvemos a establecer baseline:

### Dummy Classifier

In [None]:
dummy_clf = DummyClassifier(strategy = 'stratified')

dummy_clf.fit(X_train_4, y_train_4)

y_pred_4 = dummy_clf.predict(X_val_4)

kn_acc = accuracy_score(y_val_4, y_pred_4)
print(classification_report(y_val_4, y_pred_4))

              precision    recall  f1-score   support

        High       0.19      0.19      0.19     58190
         Low       0.17      0.17      0.17     52533
 Medium-High       0.36      0.36      0.36    109647
  Medium-Low       0.27      0.27      0.27     82345

    accuracy                           0.27    302715
   macro avg       0.25      0.25      0.25    302715
weighted avg       0.27      0.27      0.27    302715



### K-nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

np.random.seed(42)

tuned_parameters = {'n_neighbors': list(range(1, 16, 1)), 'weights': ['uniform', 'distance']}

score = 'f1_macro'

cls = KNeighborsClassifier(n_jobs = -1)

clf = GridSearchCV(cls, param_grid = tuned_parameters, scoring = score, cv = 5)

clf.fit(X_train_4, y_train_4)

print("Mejor combinación de parámetros:")
print(clf.best_params_)

Mejor combinación de parámetros:
{'n_neighbors': 2, 'weights': 'distance'}


In [None]:
kn_clf_2 = KNeighborsClassifier(n_neighbors=2, weights = 'distance', n_jobs = -1)

kn_clf_2.fit(X_train_4, y_train_4)

y_pred_4 = kn_clf_2.predict(X_val_4)

kn_acc = accuracy_score(y_val_4, y_pred_4)
print(classification_report(y_val_4, y_pred_4))

              precision    recall  f1-score   support

        High       0.96      0.96      0.96     58190
         Low       0.95      0.95      0.95     52533
 Medium-High       0.96      0.96      0.96    109647
  Medium-Low       0.96      0.95      0.95     82345

    accuracy                           0.96    302715
   macro avg       0.96      0.96      0.96    302715
weighted avg       0.96      0.96      0.96    302715



Los resultados esta vez son buenos, pero sin llegar al nivel de overfit que se presentaba en las situaciones anteriores.
Creemos que este es un buen modelo para responder a la pregunta buscada.

Dados estos resultados, la pregunta se puede reformular:

- ¿Es posible predecir la positividad de una cancion basandose en su popularidad y tempo?

A lo cual podemos responder con un alto grado de certeza que si, en efecto es posible, y con muy buenos resultados.

## Pregunta 2:
- ¿Es posible asociar el valor de energía u otros parámetros de una canción, con la región geográfica en donde es más popular?

In [None]:
#juntar paises en base al continente
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
df_P2 = df_spotify.copy()

Primero modificamos la información de la columna regiones para reducir las clases al agrupar los países por regiones geográficas más amplia.

In [None]:
d1 = [
    (['Morocco','South Africa'],'Africa'),
    (['Argentina','Bolivia','Brazil','Chile','Colombia','Costa Rica','Dominican Republic','Ecuador','El Salvador','Guatemala','Honduras','Mexico','Nicaragua','Panama','Paraguay','Peru','Uruguay'],'America_latina'),
    (['Canada','United States'],'America_norte'),
    (['Hong Kong','India','Indonesia','Japan','Malaysia','Philippines','Singapore','Taiwan','Thailand','Vietnam'],'Asia'),
    (['Andorra','Austria','Belgium','Bulgaria','Czech Republic','Denmark','Estonia','Finland','France','Germany','Greece','Hungary','Iceland','Ireland','Italy','Latvia','Lithuania','Luxembourg',
     'Netherlands','Norway','Poland','Portugal','Romania','Russia','Slovakia','Spain','Sweden','Switzerland','Ukraine','United Kingdom'],'Europa'),
    (['Australia','New Zealand'],'Oceania'),
    (['Egypt','Israel','Saudi Arabia','Turkey','United Arab Emirates'],'Oriente_medio')
]

def apply_etiqueta(elemento):
    for i in d1:
        if elemento in i[0]:
            return i[1]
        
new_reg = df_P2["region"]


for index, row in df_P2.iterrows():
    new_data = apply_etiqueta(row["region"])
    new_reg.iat[index] = new_data

#se reemplaza la columa de region
df_P2["region"] = new_reg


Para encontrar si esta relación existe vamos a usar un Decision Tree Classifier.

Primero vamos atributo por atributo probando para ver cual tiene mejores resultados.

In [None]:
for i in ['af_danceability','af_energy', 'af_key', 'af_loudness', 'af_mode', 'af_speechiness','af_acousticness', 'af_liveness', 'af_valence','af_tempo', 'af_time_signature']:

    df_P2 = df_P2.dropna(subset=["region", i])  # Drop rows where target or feature is NaN
    X = df_P2[i].copy().values.reshape(-1, 1)
    y = df_P2["region"]

    #primero separamos los datos de entrenamiento y validacion/test
    X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)

    # Luego separamos los datos de validación y pruebas                                       0.5 x 0.3 = 0.15
    X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=0, stratify=y_val_and_test)

    #estandarizacion de datos, transformar los datos de manera que tengan media 0 y desviacion estandar 1.
    std_scaler = StandardScaler()
    X_train_std_scaled = std_scaler.fit_transform(X_train)

    X_val_std_scaled = std_scaler.transform(X_val)
    X_test_std_scaled = std_scaler.transform(X_test)

    #definimos nuestro modelo
    clf = DecisionTreeClassifier(criterion="gini",random_state=0)

    #entrenamos al modelo con fit
    clf.fit(X_train_std_scaled, y_train)   ## Entrenar usando X (features), y (clase)

    #realizamos predicciones de nuestros datos con los datos de validación
    y_val_pred = clf.predict(X_val_std_scaled)


    #metricas de las predicciones
    print(i)
    print(classification_report(y_val, y_val_pred))

af_danceability
                precision    recall  f1-score   support

        Africa       0.42      0.02      0.03      1785
America_latina       0.53      0.50      0.52     15151
 America_norte       0.88      0.00      0.01      1785
          Asia       0.52      0.26      0.35      8921
        Europa       0.51      0.80      0.62     25448
       Oceania       0.53      0.01      0.03      1785
 Oriente_medio       0.46      0.04      0.07      4462

      accuracy                           0.51     59337
     macro avg       0.55      0.23      0.23     59337
  weighted avg       0.52      0.51      0.46     59337

af_energy
                precision    recall  f1-score   support

        Africa       0.56      0.02      0.03      1785
America_latina       0.55      0.53      0.54     15151
 America_norte       0.31      0.01      0.02      1785
          Asia       0.51      0.28      0.36      8921
        Europa       0.52      0.80      0.63     25448
       Oceania    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

        Africa       0.00      0.00      0.00      1785
America_latina       0.00      0.00      0.00     15151
 America_norte       0.00      0.00      0.00      1785
          Asia       0.00      0.00      0.00      8921
        Europa       0.43      1.00      0.60     25448
       Oceania       0.00      0.00      0.00      1785
 Oriente_medio       0.00      0.00      0.00      4462

      accuracy                           0.43     59337
     macro avg       0.06      0.14      0.09     59337
  weighted avg       0.18      0.43      0.26     59337

af_loudness
                precision    recall  f1-score   support

        Africa       0.79      0.39      0.52      1785
America_latina       0.75      0.71      0.73     15151
 America_norte       0.55      0.12      0.20      1785
          Asia       0.75      0.57      0.65      8921
        Europa       0.64      0.88      0.74     25448
       Oceania       0.67      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

        Africa       0.00      0.00      0.00      1785
America_latina       0.00      0.00      0.00     15151
 America_norte       0.00      0.00      0.00      1785
          Asia       0.00      0.00      0.00      8921
        Europa       0.43      1.00      0.60     25448
       Oceania       0.00      0.00      0.00      1785
 Oriente_medio       0.00      0.00      0.00      4462

      accuracy                           0.43     59337
     macro avg       0.06      0.14      0.09     59337
  weighted avg       0.18      0.43      0.26     59337

af_speechiness
                precision    recall  f1-score   support

        Africa       0.50      0.07      0.13      1785
America_latina       0.56      0.51      0.54     15151
 America_norte       0.24      0.00      0.00      1785
          Asia       0.49      0.25      0.33      8921
        Europa       0.52      0.82      0.64     25448
       Oceania       0.63    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

        Africa       0.00      0.00      0.00      1785
America_latina       0.00      0.00      0.00     15151
 America_norte       0.00      0.00      0.00      1785
          Asia       0.00      0.00      0.00      8921
        Europa       0.43      1.00      0.60     25448
       Oceania       0.00      0.00      0.00      1785
 Oriente_medio       0.00      0.00      0.00      4462

      accuracy                           0.43     59337
     macro avg       0.06      0.14      0.09     59337
  weighted avg       0.18      0.43      0.26     59337



De lo anterior se pude observar que las dos características que presentan mayor relación son loudness y tempo.

Ahora probamos con todas las características a la vez para ver si se encuentra un mejor resultado.

In [None]:
df_P2 = df_P2.dropna(subset=["region", i])  # Drop rows where target or feature is NaN
X = df_P2[['af_danceability','af_energy', 'af_key', 'af_loudness', 'af_mode', 'af_speechiness','af_acousticness', 'af_liveness', 'af_valence','af_tempo', 'af_time_signature']].copy()
y = df_P2["region"]

#primero separamos los datos de entrenamiento y validacion/test
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)

# Luego separamos los datos de validación y pruebas                                       0.5 x 0.3 = 0.15
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=0, stratify=y_val_and_test)

#estandarizacion de datos, transformar los datos de manera que tengan media 0 y desviacion estandar 1.
std_scaler = StandardScaler()
X_train_std_scaled = std_scaler.fit_transform(X_train)

X_val_std_scaled = std_scaler.transform(X_val)
X_test_std_scaled = std_scaler.transform(X_test)

#definimos nuestro modelo
clf = DecisionTreeClassifier(criterion="gini",random_state=0)

#entrenamos al modelo con fit
clf.fit(X_train_std_scaled, y_train)   ## Entrenar usando X (features), y (clase)

#realizamos predicciones de nuestros datos con los datos de validación
y_val_pred = clf.predict(X_val_std_scaled)


#metricas de las predicciones
print("todas")
print(classification_report(y_val, y_val_pred))

El resultado es mejor que cualquiera de los intentos individuales

Ahora se hará con los dos mejores resultados anteriores a la vez por si al agregar todos los dados se está perdiendo algo 

In [None]:
df_P2 = df_P2.dropna(subset=["region", i])  # Drop rows where target or feature is NaN
X = df_P2[["af_loudness","af_tempo"]].copy()
y = df_P2["region"]

#primero separamos los datos de entrenamiento y validacion/test
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)

# Luego separamos los datos de validación y pruebas                                       0.5 x 0.3 = 0.15
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=0, stratify=y_val_and_test)

#estandarizacion de datos, transformar los datos de manera que tengan media 0 y desviacion estandar 1.
std_scaler = StandardScaler()
X_train_std_scaled = std_scaler.fit_transform(X_train)

X_val_std_scaled = std_scaler.transform(X_val)
X_test_std_scaled = std_scaler.transform(X_test)

#definimos nuestro modelo
clf = DecisionTreeClassifier(criterion="gini",random_state=0)

#entrenamos al modelo con fit
clf.fit(X_train_std_scaled, y_train)   ## Entrenar usando X (features), y (clase)

#realizamos predicciones de nuestros datos con los datos de validación
y_val_pred = clf.predict(X_val_std_scaled)


#metricas de las predicciones
print("af_loudness y af_tempo")
print(classification_report(y_val, y_val_pred))

af_loudness y af_tempo
                precision    recall  f1-score   support

        Africa       0.94      0.48      0.64      1785
America_latina       0.82      0.75      0.78     15151
 America_norte       0.64      0.19      0.29      1785
          Asia       0.87      0.68      0.76      8921
        Europa       0.68      0.92      0.78     25448
       Oceania       0.83      0.25      0.39      1785
 Oriente_medio       0.86      0.38      0.52      4462

      accuracy                           0.75     59337
     macro avg       0.81      0.52      0.59     59337
  weighted avg       0.77      0.75      0.73     59337



La con mejor resultado sigue siendo loudness y key, al parecer al agregar el resto de los datos no suma ni resta al resultado por lo tanto tomares el de los dos mejores para el test.

In [None]:
#realizamos predicciones de nuestros datos con los datos de validación
y_test_pred = clf.predict(X_test_std_scaled)


#metricas de las predicciones
print("todas")
print(classification_report(y_test, y_test_pred))


todas
                precision    recall  f1-score   support

        Africa       0.94      0.46      0.62      1785
America_latina       0.82      0.75      0.78     15151
 America_norte       0.64      0.17      0.27      1785
          Asia       0.88      0.67      0.76      8922
        Europa       0.68      0.92      0.78     25448
       Oceania       0.83      0.22      0.35      1785
 Oriente_medio       0.87      0.38      0.53      4461

      accuracy                           0.74     59337
     macro avg       0.81      0.51      0.59     59337
  weighted avg       0.77      0.74      0.73     59337



Se podría seguir experimentando al buscar alguna otra combinación de características que den un mejor resultado, pero tomar demasiado tiempo al ser demasiado combinaciones

## Pregunta 3:
- ¿Qué parámetros son más adecuados para predecir la popularidad de una canción?

In [4]:
import requests
import re
import string
from sklearn.feature_extraction.text import CountVectorizer

def remove_numbers(text):
    text = re.sub(r"[0-9]", " ", text)
    return " ".join(text.split())

def remove_unprintable_(text):
    printable = set(string.printable + "ñáéíóúü" + "ÑÁÉÍÓÚÜ")
    text = "".join(filter(lambda x: x in printable, text))
    return text

def remove_punctuation(text):
    pattern = re.compile(r"[^\w\sáéíóúüñÁÉÍÓÚÜÑ]")
    t = pattern.sub(r" ", text)
    return re.sub(" +", " ", t)

def reduce_spam(text):
    text = re.sub(r"(\w+)(\s+\1){2,}", r"\1", text)
    text = re.sub(r"(\w+\s+\w+)(\s+\1){2,}", r"\1", text)
    return text

def remove_vowels_accents(text):
    return (
        text.replace("á", "a")
        .replace("é", "e")
        .replace("í", "i")
        .replace("ó", "o")
        .replace("ú", "u")
        .replace("ü", "u")
    )

def remove_stopwords(text, stopwords_list):
    return " ".join(
        [word for word in str(text).split() if word not in stopwords_list]
    )

url = "https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt"
r = requests.get(url)

stopwords_list = r.text.splitlines()

def preprocesar(text):
    text = text.lower()
    text = remove_numbers(text)
    text = remove_unprintable_(text)
    text = remove_punctuation(text)
    text = reduce_spam(text)
    text = remove_stopwords(text, stopwords_list)
    text = remove_vowels_accents(text)
    return text.strip()

In [5]:
df_P3 = df_spotify.copy()

La idea del experimento asociado a la pregunta 3 es predecir la popularidad de la canción considerando su composición en términos numéricos, esto es considerando: valencia, tempo, acordes, ritmo, duración, etc. Se considera que hay información que no es relevante para la predicción como “url”, “track”, “album”, etc. Esta información no está incluida en el análisis, y debido a que el nombre del artista es una variable que influye directamente en la popularidad esta se excluye también. La única variable no numérica es “region”. 
Antes de realizar el preprocesamiento y posterior entrenamiento se determinan rangos de popularidad según los cuartiles de la popularidad en el dataset, con la intención de clasificar los datos en rangos de popularidad.
Se realiza un preprocesamiento para vectorizar el texto y se estandarizan los valores numéricos, se utiliza como modelo un random forest y todo esto se implementa en un pipeline como se detalla a continuación. 


Quitamos las columnas con información no relevante

In [6]:
df_P3=df_P3.drop(columns=['date','url','chart', 'track_id','available_markets','Año','Día','Column1','release_date','explicit','title','artist','trend','album','af_mode','rank','af_time_signature'])

Calculamos una medida para clasificar la popularidad por rangos

In [7]:
cuartiles = df_P3['popularity'].quantile([0.25, 0.5, 0.75])

Seleccionamos una muestra del dataset

In [8]:
df_2000 = df_P3.head(2000)

Definimos otra forma de escribir la popularidad

In [9]:
new_pop = df_2000['popularity']

Clasificamos la popularidad en rangos

In [10]:
for index, row in df_2000.iterrows():
    if row['popularity']>=cuartiles[0.75]:
        new_pop.iat[index] = 'High'
    elif row['popularity']>=cuartiles[0.50]:
        new_pop.iat[index] = 'Medium'
    elif row['popularity']>=cuartiles[0.25]:
        new_pop.iat[index] = 'Medium-Low'
    else:
        new_pop.iat[index] = 'Low'

  new_pop.iat[index] = 'Medium-Low'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'Medium-Low'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'Low'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'Medium'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pop.iat[index] = 'High'


Reemplazamos la columna de popularidad

In [11]:
df_2000['popularity'].astype(str)
df_2000['popularity'] = new_pop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2000['popularity'] = new_pop


Cambiamos la nueva clasificación a un número

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_2000['popularity'] = le.fit_transform(df_2000['popularity'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2000['popularity'] = le.fit_transform(df_2000['popularity'])


In [13]:
from sklearn.preprocessing import StandardScaler
features = ['region','streams','duration_ms', 'af_danceability', 'af_energy', 'af_key', 'af_loudness', 'af_speechiness', 
            'af_acousticness', 'af_instrumentalness', 'af_liveness', 'af_valence', 'af_tempo']

Preparamos el dataset para entrenar

In [14]:
X = df_2000[features]
y = df_2000['popularity']

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.5, random_state=0, stratify=y_val_and_test)

Generamos un pipeline para:
- Transformar texto
- Estandarizar los datos
- Aplicar un clasificador (random forest)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
import numpy as np

# Define tus columnas
text_feature = 'region'
numeric_features = ['streams', 'duration_ms', 'af_danceability', 'af_energy', 'af_key', 
                    'af_loudness', 'af_speechiness', 'af_acousticness', 'af_instrumentalness', 
                    'af_liveness', 'af_valence', 'af_tempo']

# Crear la pipeline completa
clf_pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('text', Pipeline([
                ('vectorizer', CountVectorizer(min_df=1, preprocessor = preprocesar)),
                ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True))
            ]), text_feature),
            ('num', StandardScaler(), numeric_features)
        ])),
    ('classifier', RandomForestClassifier(random_state=42))
])

Entrenamos

In [17]:
clf_pipeline.fit(X_train, y_train)

predecimos 

In [18]:
y_pred = clf_pipeline.predict(X_test)

evaluamos la predicción

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
target_names = ['High', 'Low', 'Medium-High', 'Medium-Low']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

        High       0.93      1.00      0.97       115
         Low       0.77      0.74      0.75        54
 Medium-High       0.94      0.90      0.92        67
  Medium-Low       0.75      0.72      0.74        64

    accuracy                           0.87       300
   macro avg       0.85      0.84      0.84       300
weighted avg       0.87      0.87      0.87       300



### Resultados: 
Podemos observar que se tiene buenos métricas para predecir que una canción tendrá una alta popularidad pero no tan buenas métricas para baja popularidad.

#### Camino a seguir
Probaremos reducir la cantidad de parámetros a utilizar y probar distintos modelos de clasificación, considerando todo el dataset y no solo una parte de él. 

### Replanteamiento de la pregunta

¿Cuál será la popularidad de una canción en una región debido a sus atributos?
