- SCHERRER Arthur
- CHIRON Yoann
- BOQUAIN Mathis

# Classifieurs - Spotify Best Songs from 2000 to 2023
-> Prédiction du genre des musiques populaires de 2000 à 2023

### Import des différentes librairies

In [142]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Chargement des données

In [143]:
data = pd.read_csv("spotify_songs.csv", sep=';')

### Affichage

In [144]:
df = pd.DataFrame(data)
df.head()


Unnamed: 0,title,artist,top genre,year,bpm,energy,danceability,valence,duration,acousticness,speechiness
0,Flowers,Miley Cyrus,pop,2023,118,68,71,65,200,6,7
1,Cupid - Twin Ver.,FIFTY FIFTY,k-pop girl group,2023,120,59,78,73,174,44,3
2,BESO,ROSALÍA,pop,2023,95,64,77,53,195,74,14
3,Boy's a liar Pt. 2,PinkPantheress,bronx drill,2023,133,81,70,86,131,25,5
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,rap,2022,98,62,72,17,222,42,5


### Discrétisation des données

Certaines features ont été supprimées où ne sont pas utilisées car peu de pertinences face au sujet étudié :
- dB
- liveness
- popularity : tous les morceaux sont populaires

#### Feature à discrétiser

- genre
- bpm
- danceability
- energy
- valence
- acousticness
- speechiness
- year 
- duration

### Discrétisation du genre

In [145]:
df_sorted = df.sort_values(by="top genre")

df['top genre'] = df['top genre'].str.replace('.*pop.*', 'pop', regex=True)
df['top genre'] = df['top genre'].str.replace('.*hip hop.*', 'rap', regex=True)
df['top genre'] = df['top genre'].str.replace('.*hip-hop.*', 'rap', regex=True)
df['top genre'] = df['top genre'].str.replace('.*rap.*', 'rap', regex=True)
df['top genre'] = df['top genre'].str.replace('.*metal.*', 'metal', regex=True)
df['top genre'] = df['top genre'].str.replace('.*r&b.*', 'r&b', regex=True)
df['top genre'] = df['top genre'].str.replace('.*edm.*', 'techno', regex=True)
df['top genre'] = df['top genre'].str.replace('.*electro.*', 'techno', regex=True)
df['top genre'] = df['top genre'].str.replace('.*electro dance music.*', 'techno', regex=True)
df['top genre'] = df['top genre'].str.replace('.*indie.*', 'indian', regex=True)
df['top genre'] = df['top genre'].str.replace('.*rock.*', 'rock', regex=True)
df['top genre'] = df['top genre'].str.replace('.*house.*', 'techno', regex=True)
df['top genre'] = df['top genre'].str.replace('.*jazz.*', 'jazz', regex=True)
df['top genre'] = df['top genre'].str.replace('.*mellow.*', 'rock', regex=True)
df['top genre'] = df['top genre'].str.replace('.*soul.*', 'soul', regex=True)

genres_to_keep = ['pop', 'rap', 'metal', 'r&b', 'techno', 'rock']
mask = df['top genre'].isin(genres_to_keep)
df = df[mask]

df_count = df['top genre'].value_counts()
df.head()

Unnamed: 0,title,artist,top genre,year,bpm,energy,danceability,valence,duration,acousticness,speechiness
0,Flowers,Miley Cyrus,pop,2023,118,68,71,65,200,6,7
1,Cupid - Twin Ver.,FIFTY FIFTY,pop,2023,120,59,78,73,174,44,3
2,BESO,ROSALÍA,pop,2023,95,64,77,53,195,74,14
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,rap,2022,98,62,72,17,222,42,5
8,Anti-Hero,Taylor Swift,pop,2022,97,64,64,53,201,13,5


### Méthode de discrétisation

In [146]:
def discretize_column(df, column_name, num_intervals, labels=None):
    """
    Discrétise une colonne de données dans un DataFrame en fonction d'un nombre d'intervalles spécifié.

    Args:
    - df : DataFrame contenant les données
    - column_name : Nom de la colonne à discrétiser
    - num_intervals : Nombre d'intervalles souhaité
    - labels : Liste des labels pour les catégories discrétisées

    Returns:
    - Un DataFrame avec la colonne initiale remplacée par les données discrétisées
    """

    # Calcul des quantiles en fonction du nombre d'intervalles
    quantiles = [i / num_intervals for i in range(num_intervals)]

    # Calcul des valeurs des quantiles
    quantiles_values = df[column_name].quantile(quantiles)

    # Définition des intervalles
    intervals = [df[column_name].min()] + list(quantiles_values.unique()) + [df[column_name].max()]

    # Définition des labels si spécifiés, sinon utiliser les intervalles comme labels
    if labels is None:
        labels = [f"Interval {i+1}" for i in range(len(quantiles) + 1)]

    # Discrétisation de la colonne en supprimant les bords d'intervalles en double
    df[column_name] = pd.cut(df[column_name], bins=intervals, labels=labels, include_lowest=True, duplicates='drop')

    return df

df = discretize_column(df, 'bpm', num_intervals=4, labels=['Low bpm', 'Medium bpm', 'High bpm', 'Very High bpm'])
df = discretize_column(df, 'danceability', num_intervals=4, labels=['Low Danceability', 'Moderate Danceability', 'High Danceability', 'Very High Danceability'])
df = discretize_column(df, 'energy', num_intervals=4, labels=['Low Energy', 'Moderate Energy', 'High Energy', 'Very High Energy'])
df = discretize_column(df, 'valence', num_intervals=3, labels=['negative', 'neutral', 'positive'])
df = discretize_column(df, 'acousticness', num_intervals=4, labels= ['Low Acousticness', 'Moderate Acousticness', 'High Acousticness', 'Very High Acousticness'])
df = discretize_column(df, 'speechiness', num_intervals=4, labels=['Low Speechiness', 'Moderate Speechiness', 'High Speechiness', 'Very High Speechiness'])
df = discretize_column(df, 'year', num_intervals=3, labels=["2000's","2010's", "2020's"])
df = discretize_column(df, 'duration', num_intervals=3, labels=['short', 'medium', 'long'])

df.head()

Unnamed: 0,title,artist,top genre,year,bpm,energy,danceability,valence,duration,acousticness,speechiness
0,Flowers,Miley Cyrus,pop,2020's,Medium bpm,Moderate Energy,High Danceability,neutral,short,Moderate Acousticness,High Speechiness
1,Cupid - Twin Ver.,FIFTY FIFTY,pop,2020's,Medium bpm,Low Energy,Very High Danceability,positive,short,Very High Acousticness,Low Speechiness
2,BESO,ROSALÍA,pop,2020's,Low bpm,Moderate Energy,High Danceability,neutral,short,Very High Acousticness,Very High Speechiness
4,Creepin' (with The Weeknd & 21 Savage),Metro Boomin,rap,2020's,Low bpm,Moderate Energy,High Danceability,negative,medium,Very High Acousticness,Moderate Speechiness
8,Anti-Hero,Taylor Swift,pop,2020's,Low bpm,Moderate Energy,Moderate Danceability,neutral,short,High Acousticness,Moderate Speechiness


### Séparation du dataset 

In [147]:
x = df[['bpm','danceability','energy','valence','acousticness','speechiness','year','duration']]
y = df['top genre']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=101)

means_train = np.mean(x_train, axis=0)
stds_train = np.std(x_train, axis=0)

# Étape 2 : Application de la standardisation
train_std = (x_train - means_train) / stds_train

means_test = np.mean(x_test, axis=0)
stds_test = np.std(x_test, axis=0)

# Étape 2 : Application de la standardisation
test_std = (x_test - means_test) / stds_test


TypeError: 'Categorical' with dtype category does not support reduction 'mean'

# Supervisé

### ZeroR

In [141]:
def learn_zeror(x_train, y_train):
    unique_labels, counts = np.unique(y_train, return_counts=True)

    # Trouvez l'indice de l'étiquette avec le nombre le plus élevé
    index_max_count = np.argmax(counts)
    
    # L'étiquette avec le nombre le plus élevé est à l'indice index_max_count
    most_common_label = unique_labels[index_max_count]
    
    return most_common_label 

def class_zeror(model, x_test):
    return [model]* len(x_test)

('accuracy =  54.947916666666664%',
 'précision =  9.15798611111111%',
 'rappel =  16.666666666666664%')

### OneR Mathis

In [136]:
def learn_oner(x_train_dis, y_train_dis):
    best_feature = None
    best_error_rate = 1.0
    best_rules = {}

    for feature in x_train_dis.columns:
        unique_classes = y_train_dis.unique()
        best_rule = {}
        
        for value in x_train_dis[feature].unique():
            value_counts = {}
            for class_ in unique_classes:
                value_counts[class_] = len(y_train_dis[(x_train_dis[feature] == value) & (y_train_dis == class_)])

            best_class = max(value_counts, key=value_counts.get)
            best_rule[value] = best_class

        # Calculer le taux d'erreur pour cette caractéristique
        error_rate = 0
        for value in x_train_dis[feature].unique():
            error_rate += len(y_train_dis[(x_train_dis[feature] == value) & (y_train_dis != best_rule[value])])

        error_rate /= len(x_train_dis)

        if error_rate < best_error_rate:
            best_error_rate = error_rate
            best_feature = feature
            best_rules = best_rule

    return {best_feature: best_rules}

def class_oner(model, x_test_dis):
    predictions = []
    feature = list(model.keys())[0]

    for index, row in x_test_dis.iterrows():
        feature_value = row[feature]
        if feature_value in model[feature]:
            predicted_class = model[feature][feature_value]
            predictions.append(predicted_class)
    return pd.Series(predictions)

evaluate(y_test,class_oner(learn_oner(x_train, y_train),x_test))

('accuracy =  58.59375%',
 'précision =  18.470211565260453%',
 'rappel =  23.08232403019133%')

('accuracy =  58.59375%',
 'précision =  18.470211565260453%',
 'rappel =  23.08232403019133%')

### Naïve Bayes

In [135]:
def calculate_likelihood(mean, var, x):
    eps = 1e-4  
    coef = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
    exponent = np.exp(-((x - mean) ** 2) / (2 * (var + eps)))
    return coef * exponent

def learn_naive_bayes(X, y):
    
    classes = np.unique(y)  
    mean_var_by_class = {} 

    for c in classes:
        X_class = X[y == c]
        mean = X_class.mean(axis=0)
        var = X_class.var(axis=0)
        mean_var_by_class[c] = (mean, var)

    return mean_var_by_class

def class_naive_bayes(mean_var_by_class, X):
    predictions = []
    for x in X:
        class_probs = {}
        for c, (mean, var) in mean_var_by_class.items():
            likelihood = calculate_likelihood(mean, var, x)
            class_probs[c] = np.prod(likelihood)
        predicted_class = max(class_probs, key=class_probs.get)
        predictions.append(predicted_class)
    return predictions

mean = learn_naive_bayes(train_std, y_train)

naive_baisenaive_classification = class_naive_bayes(mean, np.array(test_std))

print(mean)

evaluate(y_test, naive_baisenaive_classification)

('accuracy =  34.11458333333333%',
 'précision =  33.65118265965724%',
 'rappel =  49.11172244314914%')


{'metal': (bpm             0.162689
danceability   -1.032868
energy          0.810671
valence        -0.243530
acousticness   -0.599639
speechiness    -0.573655
year           -1.117547
duration        0.152217
dtype: float64, bpm             1.010292
danceability    0.541361
energy          0.660004
valence         0.968002
acousticness    0.156728
speechiness     0.091603
year            0.433374
duration        0.911803
dtype: float64), 'pop': (bpm            -0.001309
danceability    0.035184
energy         -0.036292
valence         0.079128
acousticness    0.134779
speechiness    -0.163618
year            0.097914
duration       -0.049201
dtype: float64, bpm             0.923790
danceability    0.922496
energy          1.014045
valence         1.062304
acousticness    1.255534
speechiness     0.703047
year            0.951361
duration        0.879905
dtype: float64), 'r&b': (bpm            -0.289288
danceability   -0.091960
energy         -0.326143
valence        -0.165616
acousti

('accuracy =  34.11458333333333%',
 'précision =  33.65118265965724%',
 'rappel =  49.11172244314914%')

### KPPV

In [123]:
y_train.nunique()

6

In [140]:
def euclidean_distance(array1, array2):
    squared_diff = np.square(array1 - array2)
    sum_squared_diff = np.sum(squared_diff)
    return np.sqrt(sum_squared_diff)

from random import choice

def k_nearest_neighbors(X_train, Y_train, X_test, k=3):
    y_pred = []
    for x in X_test.values:
        distances = [euclidean_distance(x, x_train) for x_train in X_train.values]
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = [Y_train.values[i] for i in k_indices]

        label_counts = dict()
        for label in k_nearest_labels:
            label_counts[label] = label_counts.get(label, 0) + 1

        most_common_labels = [label for label, count in label_counts.items() if count == max(label_counts.values())]
        predicted_label = choice(most_common_labels)

        y_pred.append(predicted_label)
    return np.array(y_pred)

pred = k_nearest_neighbors(train_std, y_train, test_std)

evaluate(y_test, pred)

('accuracy =  53.90625%',
 'précision =  33.829944801811806%',
 'rappel =  34.21549756980383%')

('accuracy =  52.864583333333336%',
 'précision =  36.47991081260207%',
 'rappel =  33.03351582502587%')

# Non Supervisé

### K-means Mathis Boquain

In [None]:
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

def initialize_centroids(X, k):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    return centroids

# Fonction pour attribuer chaque point au centroïde le plus proche
def assign_to_nearest(X, centroids):
    clusters = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
        distances = np.linalg.norm(X[i] - centroids, axis=1)
        clusters[i] = np.argmin(distances)
    return clusters

# Fonction pour mettre à jour les centroides
def update_centroids(X, clusters, k):
    centroids = np.zeros((k, X.shape[1]))
    for i in range(k):
        cluster_points = X[clusters == i]
        centroids[i] = np.mean(cluster_points, axis=0)
    return centroids

# Fonction d'algorithme de k-moyennes
def k_means(X, k, max_iterations=100):
    centroids = initialize_centroids(X, k)
    for _ in range(max_iterations):
        prev_centroids = centroids.copy()
        clusters = assign_to_nearest(X, centroids)
        centroids = update_centroids(X, clusters, k)
        if np.all(prev_centroids == centroids):
            break
    return clusters, centroids

# Application de l'algorithme de k-moyennes à l'ensemble de données Iris
k_values = range(1, 20)
wcss_values = []

new_df.reset_index(drop=True, inplace=True)  # Réinitialiser complètement l'index

# Réinitialiser les données après l'opération précédente
X_music = new_df.values

# Utiliser les données réinitialisées pour l'algorithme de K-moyennes
k_values = range(1, 20)
wcss_values = []

for k in k_values:
    clusters, centroids = k_means(X_music, k)
    wcss = np.sum((X_music - centroids[clusters.astype(int)]) ** 2)
    wcss_values.append(wcss)
    
# Affichage du graphique pour la méthode du coude
plt.figure(figsize=(8, 6))
plt.plot(k_values, wcss_values, marker='o', linestyle='-')
plt.xlabel('Nombre de clusters')
plt.ylabel('Somme des carrés des distances intra-cluster')
plt.title('Méthode du coude pour déterminer le nombre optimal de clusters')
plt.show()

NameError: name 'new_df' is not defined

### Méthodes d'évaluation

In [10]:
def evaluate(Class_true, Class_pred):
    acc = accuracy(Class_true, Class_pred)
    prec = precision(Class_true, Class_pred)
    rap = rappel(Class_true, Class_pred)
    return acc, prec, rap

def accuracy(Class_true, Class_pred):
    correct = sum(1 for true, pred in zip(Class_true, Class_pred) if true == pred)
    total = len(Class_pred)
    return "accuracy =  " + str(correct / total * 100) + "%"

def precision(Class_true, Class_pred):
    unique_values = np.unique(Class_true)
    n = 0
    for value in unique_values:
        correct = sum(1 for true, pred in zip(Class_true, Class_pred) if true == pred == value)
        total = sum(1 for pred in Class_pred if pred == value)
        if total != 0 :
            n += correct / total
    return "précision =  " + str(n / len(unique_values) * 100) + "%"

def rappel(Class_true, Class_pred):
    unique_values = np.unique(Class_true)
    n = 0
    for value in unique_values:
        correct = sum(1 for true, pred in zip(Class_true, Class_pred) if true == pred == value)
        total = sum(1 for true in Class_true if true == value)
        if total != 0 :
            n += correct / total
    return "rappel =  " + str(n / len(unique_values) * 100) + "%"

In [10]:
def stats(test):
     
     print(metrics.classification_report(test.target, 
                                         test.hyp, 
                                         target_names=test.target_name.unique(), 
                                         zero_division=1))

In [127]:
learn_zeror(x_train, y_train)

pred2 = class_zeror(learn_zeror(x_train, y_train), x_test)

evaluate(y_test, pred2)

('accuracy =  54.947916666666664%',
 'précision =  9.15798611111111%',
 'rappel =  16.666666666666664%')

2