# Experimento:

Buscamos responder la siguiente pregunta:
- ¿Es posible predecir la felicidad (positividad) de una canción en función de su popularidad (u otros parámetros)?

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataframe = pd.read_excel('../Spotify.xlsx')
df_spotify = pd.DataFrame(dataframe)

# Per the original author's note, the xlsx file has a formatting error:
# each column listed here is divided by the given factor to compensate.
divisores_por_columna = {
    'duration_ms': 10,
    'popularity': 10,
    'streams': 10,
    'af_danceability': 1000,
    'af_energy': 1000,
    'af_key': 10,
    'af_loudness': 1000,
    'af_speechiness': 1000,
    'af_acousticness': 1000,
    'af_instrumentalness': 1000,
    'af_liveness': 1000,
    'af_valence': 1000,
    'af_tempo': 1000,
    'af_time_signature': 10,
}

# Same order and same arithmetic as one explicit assignment per column.
for columna, divisor in divisores_por_columna.items():
    df_spotify[columna] = df_spotify[columna] / divisor

## Prediciendo con popularidad y streams

In [3]:
# Keep only the columns used in this section: `streams` and `popularity`
# (used later as features) and `af_valence` (used later as the target).
df_util = df_spotify[["streams", "popularity", "af_valence"]]

In [4]:
def apply_etiqueta(elemento):
    """Map a numeric value to one of four ordered bucket labels.

    Buckets (upper bounds are inclusive):
        elemento <= 0.25          -> "Low"
        0.25 < elemento <= 0.5    -> "Medium-Low"
        0.5  < elemento <= 0.75   -> "Medium-High"
        anything else             -> "High"

    A value for which every comparison is False (e.g. NaN) also falls
    through to "High".

    Args:
        elemento: scalar value to classify.

    Returns:
        str: the bucket label.
    """
    # Ascending (inclusive upper bound, label) pairs; the first match wins,
    # so no explicit lower-bound check is needed.
    limites_superiores = (
        (0.25, "Low"),
        (0.5, "Medium-Low"),
        (0.75, "Medium-High"),
    )
    for limite, etiqueta in limites_superiores:
        if elemento <= limite:
            return etiqueta
    return "High"

In [12]:
from sklearn.model_selection import train_test_split

# Build a labelled copy of df_util: the numeric af_valence column is replaced
# by the categorical bucket returned by apply_etiqueta.
df_etiquetado = df_util.assign(
    af_valence=df_util["af_valence"].map(apply_etiqueta)
)

# Target: valence bucket. Features: streams and popularity.
y = df_etiquetado["af_valence"]
X = df_etiquetado[["streams", "popularity"]]

# 70/30 train/validation split, stratified on the label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)


## Dummy Classifier

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.dummy import DummyClassifier

# Baseline classifier: with strategy='stratified' it draws predictions at
# random according to the class distribution seen during fit.
# random_state is fixed so the baseline report is reproducible; without it
# every run of this cell produces different predictions.
dummy_clf = DummyClassifier(strategy = 'stratified', random_state = 0)

dummy_clf.fit(X_train, y_train)

y_pred = dummy_clf.predict(X_val)

# NOTE(review): despite its name, `kn_acc` holds the dummy baseline's accuracy
# here. The name is kept unchanged because other cells may reference it.
kn_acc = accuracy_score(y_val, y_pred)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

        High       0.19      0.19      0.19     58190
         Low       0.17      0.17      0.17     52533
 Medium-High       0.36      0.36      0.36    109647
  Medium-Low       0.27      0.27      0.27     82345

    accuracy                           0.27    302715
   macro avg       0.25      0.25      0.25    302715
weighted avg       0.27      0.27      0.27    302715



## K-Nearest Neighbors

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Seed NumPy's global RNG (kept from the original cell).
np.random.seed(42)

# Model-selection setup: macro-averaged F1 as the metric, k in 1..15 as the grid.
score = 'f1_macro'
cls = KNeighborsClassifier()
tuned_parameters = {'n_neighbors': list(range(1, 16))}

# NOTE(review): the features are passed to KNN unscaled; if `streams` and
# `popularity` have very different ranges the distance will be dominated by
# one of them. Confirm whether this is intended.
# 5-fold cross-validated search over the grid; fit() returns the fitted search.
clf = GridSearchCV(
    estimator = cls,
    param_grid = tuned_parameters,
    scoring = score,
    cv = 5,
).fit(X_train, y_train)

print("Mejor combinación de parámetros:", clf.best_params_, sep = "\n")

Mejor combinación de parámetros:
{'n_neighbors': 1}


In [15]:
# Fit KNN with n_neighbors=1 (the value reported by the grid search above)
# on the training split and evaluate it on the validation split.
kn_clf_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred = kn_clf_1.predict(X_val)

# Per-class precision/recall/F1 on validation, plus overall accuracy.
print(classification_report(y_val, y_pred))
kn_acc = accuracy_score(y_val, y_pred)

              precision    recall  f1-score   support

        High       0.25      0.25      0.25     58190
         Low       0.22      0.22      0.22     52533
 Medium-High       0.40      0.40      0.40    109647
  Medium-Low       0.33      0.33      0.33     82345

    accuracy                           0.32    302715
   macro avg       0.30      0.30      0.30    302715
weighted avg       0.32      0.32      0.32    302715



## Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Seed NumPy's global RNG (kept from the original cell).
np.random.seed(42)

# Model-selection setup: macro-averaged F1, depths 1..15, both split criteria.
score = 'f1_macro'
cls = DecisionTreeClassifier()
tuned_parameters = {
    'max_depth': list(range(1, 16)),
    'criterion': ['gini', 'entropy'],
}

# NOTE(review): the recorded output of this cell selects max_depth=15, the
# upper edge of the searched range, so the true optimum may lie beyond the
# grid. Consider widening it.
# 5-fold cross-validated search over the grid; fit() returns the fitted search.
clf = GridSearchCV(
    estimator = cls,
    param_grid = tuned_parameters,
    scoring = score,
    cv = 5,
).fit(X_train, y_train)

print("Mejor combinación de parámetros:", clf.best_params_, sep = "\n")

Mejor combinación de parámetros:
{'criterion': 'gini', 'max_depth': 15}


In [17]:
# Decision tree with the hyper-parameters reported by the grid search
# (criterion='gini', max_depth=15).
# random_state is fixed because scikit-learn randomly permutes features at
# each split; without it the fitted tree depends on whatever state the global
# NumPy RNG was left in, so re-running this cell alone could change the result.
dtree_clf = DecisionTreeClassifier(max_depth = 15, criterion = 'gini', random_state = 42)

dtree_clf.fit(X_train, y_train)

y_pred = dtree_clf.predict(X_val)

# NOTE(review): despite its name, `kn_acc` holds the decision tree's accuracy
# here. The name is kept unchanged because other cells may reference it.
kn_acc = accuracy_score(y_val, y_pred)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

        High       0.47      0.37      0.41     58190
         Low       0.49      0.20      0.29     52533
 Medium-High       0.49      0.65      0.56    109647
  Medium-Low       0.47      0.50      0.49     82345

    accuracy                           0.48    302715
   macro avg       0.48      0.43      0.44    302715
weighted avg       0.48      0.48      0.46    302715



## Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the raw features, evaluated on the validation split.
nb_clf = GaussianNB()

nb_clf.fit(X_train, y_train)

y_pred = nb_clf.predict(X_val)

# NOTE(review): despite its name, `kn_acc` holds the Naive Bayes accuracy
# here. The name is kept unchanged because other cells may reference it.
kn_acc = accuracy_score(y_val, y_pred)

# The recorded output shows this model never predicts the "Low" class, which
# makes its precision undefined and triggers repeated undefined-metric
# warnings. zero_division=0 reports those metrics as 0.0 (the same value the
# warning path uses) without emitting the warnings.
print(classification_report(y_val, y_pred, zero_division = 0))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

        High       0.21      0.02      0.03     58190
         Low       0.00      0.00      0.00     52533
 Medium-High       0.37      0.86      0.52    109647
  Medium-Low       0.29      0.16      0.21     82345

    accuracy                           0.36    302715
   macro avg       0.22      0.26      0.19    302715
weighted avg       0.26      0.36      0.25    302715



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Support Vector Machines

In [None]:
from sklearn.svm import SVC 

# Kernel SVC fit time grows at least quadratically with the number of samples
# (see the scikit-learn SVC documentation). The training split here has roughly
# 700k rows (the 30% validation split has 302,715), and this cell has no
# recorded execution. To make it runnable, the SVC is fitted on a stratified
# subsample of the training data; evaluation still uses the full validation set.
SVM_MAX_TRAIN_SAMPLES = 20_000  # upper bound on rows used to fit the SVC

if len(X_train) > SVM_MAX_TRAIN_SAMPLES:
    # Stratified draw so the subsample keeps the class proportions of y_train.
    X_train_svm, _X_resto_svm, y_train_svm, _y_resto_svm = train_test_split(
        X_train,
        y_train,
        train_size=SVM_MAX_TRAIN_SAMPLES,
        random_state=0,
        stratify=y_train,
    )
else:
    # Small enough already: use the whole training split.
    X_train_svm, y_train_svm = X_train, y_train

svm_clf = SVC()

svm_clf.fit(X_train_svm, y_train_svm)

y_pred = svm_clf.predict(X_val)

# NOTE(review): despite its name, `kn_acc` holds the SVM accuracy here.
# The name is kept unchanged because other cells may reference it.
kn_acc = accuracy_score(y_val, y_pred)
print(classification_report(y_val, y_pred))