### **PRESENTACIÓN IAA**

In [48]:
import random
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# Datasets: read the train split, the test split and the submission file.
rutas_csv = ('/content/train.csv', '/content/test.csv', '/content/submission.csv')
df1, df2, target_exp = (pd.read_csv(ruta) for ruta in rutas_csv)

# Stack the train and test rows into a single frame with a fresh 0..n-1 index.
df = pd.concat((df1, df2), ignore_index=True)

#df

## **LIMPIEZA**

In [49]:
# Diagnostics computed before any row is removed: number of duplicated rows
# and number of NaN values per column.
filas_duplicadas = df.duplicated().sum()
nan_counts = df.isna().sum()

# Drop exact duplicate rows, then every row with a missing label ('Class')
# or a missing 'Popularity' / 'instrumentalness' value.
df = (
    df.drop_duplicates()
      .dropna(subset=['Class'])
      .dropna(subset=['Popularity'])
      .dropna(subset=['instrumentalness'])
)

# Genre names; the position in the list is the numeric 'Class' code.
clases = ['Folk Acústico', 'Música Alternativa', 'Blues', 'Bollywood', 'Country', 'Hip-Hop', 'Indie Alternativo', 'Instrumental', 'Metal', 'Pop', 'Rock']

# Boolean flag: True for rows whose class code is 10 (the 'Rock' entry of `clases`).
df['Rock'] = df['Class'].eq(10)

# Human-readable genre name for every row.
df['ClassName'] = [clases[int(codigo)] for codigo in df['Class']]

# Number of rows per genre name.
conteo_clases = df['ClassName'].value_counts()

## **EXPLORACIÓN DE LOS DATOS**

**Boxplot por clase**

In [None]:
# Dataset exploration: distribution of "liveness" inside each class.

grouped = df.groupby("Class")

# Walk the groupby once and keep the (key, frame) pairs; `dataframes` holds
# one frame per class and is reused by the scatter-plot cell below.
pares = list(grouped)
dataframes = [grupo for _, grupo in pares]
nombres_clase = [clave for clave, _ in pares]

plt.figure(figsize=(10, 6))
plt.boxplot([grupo["liveness"] for grupo in dataframes], labels=nombres_clase)
plt.title("Boxplot por Clase")
plt.xlabel("Class")
plt.ylabel("liveness")
plt.show()

**Gráficos por atributo**

In [None]:
# Plots of the relation between attributes: one scatter plot per
# (attribute pair, class).
#
# Goal: see whether some genres look alike in their attributes and could be merged.

colores = ['red', 'blue', 'green', 'yellow', 'orange', 'purple', 'pink', 'brown', 'cyan', 'magenta', 'black']

atributos = ['Popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence']

# Every unordered pair of attributes (28 pairs), generated instead of typed by hand.
combinaciones = list(combinations(atributos, 2))

for comb in combinaciones:
    print(comb)
    # The loop variable must NOT be named `df`: a top-level `for` rebinds the
    # global name, which would leave `df` pointing at the last class only and
    # silently corrupt every later cell.
    for grupo in dataframes:
        # All rows of a group share the same class code; read it from the first row.
        codigo_clase = grupo['Class'].iloc[0]
        clase_str = str(codigo_clase)

        x_clase = grupo[comb[0]].astype(float)
        y_clase = grupo[comb[1]].astype(float)
        color = colores[int(codigo_clase)]

        # Title and axes name both the class and the attribute pair so each
        # figure can be read on its own.
        plt.title(f'{clase_str}: {comb[0]} vs {comb[1]}')
        plt.xlabel(comb[0])
        plt.ylabel(comb[1])
        plt.scatter(x_clase, y_clase, c=color, label='Clase ' + clase_str)
        plt.legend()
        plt.show()

**Matriz de correlación entre atributos por clase**

In [None]:
# Another way to see how the variables relate to each other: one correlation
# heatmap per class.

classes = df['Class'].unique()

plt.figure(figsize=(70, 55))

for i, class_label in enumerate(classes):
    class_df = df[df['Class'] == class_label]
    # Correlate only numeric/boolean columns. The frame also holds text
    # columns (e.g. 'ClassName'), and DataFrame.corr() raises on those in
    # pandas >= 2.0 instead of silently skipping them.
    correlation_matrix = class_df.select_dtypes(include=['number', 'bool']).corr()
    # 3 x 4 grid: room for up to 12 classes.
    plt.subplot(3, 4, i+1, aspect='equal', adjustable='box')
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation Matrix - Class {class_label}')

plt.tight_layout()
plt.show()

**Gráfico de barras por clase**

In [None]:
# Number of rows in each class; used to visualise the class imbalance.

unique_classes, counts = np.unique(df[['ClassName']], return_counts=True)

# Order the classes from most to least frequent.
sorted_indices = np.argsort(counts)[::-1]
sorted_classes, sorted_counts = unique_classes[sorted_indices], counts[sorted_indices]

plt.figure(figsize=(10, 8))
plt.bar(sorted_classes, sorted_counts, align='center', alpha=0.7, color='#006400')

plt.title('Cantidad de Entradas por Clase')
plt.xlabel('Clase')
plt.ylabel('Cantidad de Entradas')
plt.xticks(rotation=60)

plt.show()

**ATRIBUTOS A USAR**

In [50]:
# Features: the nine track attributes used by every model below.
atributos_modelo = ['Popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']
X = df[atributos_modelo]

# Target: the genre name, kept as a one-column DataFrame.
y = df[['ClassName']]

## **RANDOM FOREST**

**TRAIN**

In [None]:
# Random forest classifier with cross-validation (no SMOTE).

# Stratified 70/30 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=2
)

# Fit on the whole training split; this fitted model is the one evaluated in
# the TRAIN / TEST cells below.
RFC = RandomForestClassifier(n_jobs=-1, random_state=42)
RFC.fit(X_train, y_train)

# Cross-validate on the training split, keeping the estimator fitted on each fold.
cv_results = cross_validate(
    RFC, X_train, y_train, scoring="accuracy", return_estimator=True, n_jobs=-1
)

# The per-fold accuracies are shown by the next cell.

In [None]:
# Show the cross-validation results of the model fitted above.
cv_results

In [None]:
# Evaluation on the TRAINING split.

# Predict once and reuse the result for every metric below.
y_train_pred = RFC.predict(X_train)

# Confusion matrix.
# confusion_matrix orders rows/columns by sorted label value (alphabetical
# genre name), whereas `clases` is ordered by numeric class code. Use the
# model's own class order for both the matrix and the tick labels so that
# every row/column carries the right name.
etiquetas = list(RFC.classes_)
conf_matrix = confusion_matrix(y_train, y_train_pred, labels=etiquetas)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=etiquetas, yticklabels=etiquetas)

plt.xticks(rotation = 45)
plt.xlabel("Predicciones")
plt.ylabel("Clases reales")

plt.title("Matriz de Confusión")
plt.show()

# Cross-validation: mean accuracy over the folds.
scores = cross_val_score(RFC, X_train, y_train, scoring = "accuracy")
print('Score:', scores.mean())

# Accuracy on the data the model was fitted on.
accuracy = accuracy_score(y_train, y_train_pred)
print("Accuracy:", accuracy)

# Classification report.
print(classification_report(y_train, y_train_pred))

**TEST**

In [None]:
# Evaluation on the held-out TEST split.

# Predict once and reuse the result for every metric below.
y_test_pred = RFC.predict(X_test)

# Confusion matrix.
# confusion_matrix orders rows/columns by sorted label value (alphabetical
# genre name), whereas `clases` is ordered by numeric class code. Use the
# model's own class order for both the matrix and the tick labels so that
# every row/column carries the right name.
etiquetas = list(RFC.classes_)
conf_matrix = confusion_matrix(y_test, y_test_pred, labels=etiquetas)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=etiquetas, yticklabels=etiquetas)

plt.xticks(rotation = 45)
plt.xlabel("Predicciones")
plt.ylabel("Clases reales")

plt.title("Matriz de Confusión")
plt.show()

# Accuracy.
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", accuracy)

# Classification report.
print(classification_report(y_test, y_test_pred))

## **CLASS WEIGHT**

In [None]:
 # Class Weight

from sklearn.model_selection import train_test_split
import random

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2, stratify=y)

# Seeds Python's `random` module only; scikit-learn does not read it, so the
# forest below gets its own explicit random_state.
random.seed(42)

# Hand-picked weights per genre name.
class_weights = {'Blues': 6, 'Bollywood': 2, 'Country': 2, 'Folk Acústico':6, 'Hip-Hop':1, 'Indie Alternativo':4, 'Instrumental':1, 'Metal':4, 'Música Alternativa':2, 'Pop':2, 'Rock': 0.01}

# random_state makes the run reproducible, matching the other forests in
# this notebook (random_state=42, n_jobs=-1).
RFC = RandomForestClassifier(class_weight = class_weights, random_state = 42, n_jobs=-1)

# Cross-validated accuracy on the training split.
score = cross_val_score(RFC, X_train, y_train, scoring = "accuracy")

RFC.fit(X_train, y_train)

# Predict the test split once and reuse it for both metrics.
y_test_pred = RFC.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)

print('Train: ', score.mean())
print('Test: ', accuracy)
print(classification_report(y_test, y_test_pred))

## **RANDOM FOREST CON SMOTE**

**TRAIN**

In [None]:
# Random forest trained on a SMOTE-balanced training split.
random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2, stratify=y)

# Oversample the TRAINING split only; the test split stays untouched.
smote = SMOTE(sampling_strategy='auto', random_state=42)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Final model: fitted on the balanced data and evaluated in the cells below.
RFC = RandomForestClassifier(random_state = 42, n_jobs=-1)
RFC.fit(X_resampled, y_resampled)

# Cross-validation.
# SMOTE sits inside the pipeline so it is re-fitted on the training part of
# each fold and the validation part contains only real samples. Validating on
# already-resampled data puts synthetic neighbours of the same original points
# on both sides of the split and inflates the score.
modelo_cv = ImbPipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('rfc', RandomForestClassifier(random_state = 42, n_jobs=-1)),
])
cv_results = cross_validate(modelo_cv, X_train, y_train, scoring = "accuracy", n_jobs = -1, return_estimator = True)

# Mean of the per-fold accuracies (taken from the single CV run above instead
# of running the same cross-validation a second time).
scores = cv_results['test_score']
print('Score:', scores.mean())

In [None]:
# Show the cross-validation results of the SMOTE model computed above.
cv_results

In [None]:
# Accuracy, recall and f1-score of the SMOTE random forest on the resampled
# TRAINING data.

# Predict once and reuse the result for every metric below.
y_resampled_pred = RFC.predict(X_resampled)

# confusion_matrix orders rows/columns by sorted label value (alphabetical
# genre name), whereas `clases` is ordered by numeric class code. Use the
# model's own class order for both the matrix and the tick labels so that
# every row/column carries the right name.
etiquetas = list(RFC.classes_)
conf_matrix = confusion_matrix(y_resampled, y_resampled_pred, labels=etiquetas)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=etiquetas, yticklabels=etiquetas)

# Axis labels.
plt.xticks(rotation = 45)
plt.xlabel("Predicciones")
plt.ylabel("Clases reales")

# Title, then render.
plt.title("Matriz de Confusión")
plt.show()


# Accuracy on the resampled training data.
accuracy = accuracy_score(y_resampled, y_resampled_pred)
print("Accuracy en Train:", accuracy)

# Classification report.
print(classification_report(y_resampled, y_resampled_pred))



**TEST**

In [None]:
# Confusion matrix and metrics of the SMOTE-trained model on the held-out
# TEST split (real samples only, never resampled).

# Predict once and reuse the result for every metric below.
y_test_pred = RFC.predict(X_test)

# confusion_matrix orders rows/columns by sorted label value (alphabetical
# genre name), whereas `clases` is ordered by numeric class code. Use the
# model's own class order for both the matrix and the tick labels so that
# every row/column carries the right name.
etiquetas = list(RFC.classes_)
conf_matrix = confusion_matrix(y_test, y_test_pred, labels=etiquetas)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=etiquetas, yticklabels=etiquetas)

plt.xticks(rotation = 45)
plt.xlabel("Predicciones")
plt.ylabel("Clases reales")

plt.title("Matriz de Confusión")
plt.show()

# Accuracy on the test split.
accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy en Test:", accuracy)

# Classification report.
print(classification_report(y_test, y_test_pred))

## **SEPARANDO ROCK**

In [None]:
random.seed(42)

# Binary problem: Rock vs. every other genre.
X = df[['Popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']]
y = df[['Rock']]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2, stratify=y)

# Rock-or-not random forest.
RFC = RandomForestClassifier(random_state = 42, n_jobs=-1)

# Cross-validation of the Rock-or-not model. Run once: the per-fold accuracies
# in cv_results['test_score'] are the same numbers a second cross_val_score
# call with the same estimator, data and default folds would recompute.
cv_results = cross_validate(RFC, X_train, y_train, scoring = "accuracy", n_jobs=-1)
score = cv_results['test_score']

# Fit on the whole training split.
RFC.fit(X_train, y_train)

# Predict the test split once and reuse it below.
y_test_pred = RFC.predict(X_test)

# Confusion matrix, with rows/columns explicitly tied to the model's classes
# so the heatmap axes show which row/column is Rock (True) and which is not.
etiquetas = list(RFC.classes_)
conf_matrix = confusion_matrix(y_test, y_test_pred, labels=etiquetas)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=etiquetas, yticklabels=etiquetas)

# Axis labels.
plt.xlabel("Predicciones")
plt.ylabel("Clases reales")

# Title, then render.
plt.title("Matriz de Confusión")
plt.show()

print(cv_results)
print(score.mean())

# Classification report.
print(classification_report(y_test, y_test_pred))

# Preview of the working frame (first rows only instead of the whole table).
df.head()

**DATOS SIN ROCK**

In [None]:
# Dataset WITHOUT the Rock class.

random.seed(42)

# Keep only the rows that are not Rock.
# NOTE(review): this reuses the name `df2`, which earlier held the test CSV.
df2 = df.loc[~df['Rock']]

# Genre names without Rock, ordered by numeric class code.
clases1 = ['Folk Acústico', 'Música Alternativa', 'Blues', 'Bollywood', 'Country', 'Hip-Hop', 'Indie Alternativo', 'Instrumental', 'Metal', 'Pop']

X = df2[['Popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']]
y = df2[['ClassName']]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2, stratify=y)

# Random forest for the Rock-free problem.
RFC = RandomForestClassifier(random_state = 42, n_jobs=-1)

# Cross-validation, run once; `scores` is the per-fold accuracy array of that
# same run rather than the output of a second, identical cross-validation.
cv_results = cross_validate(RFC, X_train, y_train, scoring = "accuracy", n_jobs=-1)
scores = cv_results['test_score']
print(cv_results)
print('Score:', scores.mean())

# Fit on the whole training split.
RFC.fit(X_train, y_train)

# Predict the training split once and reuse it below.
y_train_pred = RFC.predict(X_train)

# Confusion matrix.
# confusion_matrix orders rows/columns by sorted label value (alphabetical
# genre name), whereas `clases1` is ordered by numeric class code. Use the
# model's own class order for both the matrix and the tick labels so that
# every row/column carries the right name.
etiquetas = list(RFC.classes_)
conf_matrix = confusion_matrix(y_train, y_train_pred, labels=etiquetas)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=etiquetas, yticklabels=etiquetas)

# Axis labels.
plt.xticks(rotation = 45)
plt.xlabel("Predicciones")
plt.ylabel("Clases reales")

# Title, then render.
plt.title("Matriz de Confusión")
plt.show()

# Classification report.
print(classification_report(y_train, y_train_pred))

**SIN ROCK CON SMOTE**

In [None]:
# SMOTE applied to the Rock-free dataset (X and y come from the previous cell).

# SMOTE sampler for the training split.
smote = SMOTE(sampling_strategy='auto', random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2, stratify=y)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Final model: fitted on the balanced training data.
RFC = RandomForestClassifier(random_state = 42, n_jobs=-1)
RFC.fit(X_resampled, y_resampled)

# Cross-validation.
# SMOTE sits inside the pipeline so it is re-fitted on the training part of
# each fold and the validation part contains only real samples; validating on
# already-resampled data leaks synthetic neighbours across folds.
# The result is stored in cv_results so that the print below reports THIS
# run and not a stale value left over from an earlier cell.
modelo_cv = ImbPipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('rfc', RandomForestClassifier(random_state = 42, n_jobs=-1)),
])
cv_results = cross_validate(modelo_cv, X_train, y_train, scoring = "accuracy", n_jobs = -1, return_estimator = True)
scores = cv_results['test_score']

print('Score:', scores.mean())
print(cv_results)

# Predict the resampled training data once and reuse it below.
y_resampled_pred = RFC.predict(X_resampled)

# Confusion matrix.
# confusion_matrix orders rows/columns by sorted label value (alphabetical
# genre name), whereas `clases1` is ordered by numeric class code. Use the
# model's own class order for both the matrix and the tick labels so that
# every row/column carries the right name.
etiquetas = list(RFC.classes_)
conf_matrix = confusion_matrix(y_resampled, y_resampled_pred, labels=etiquetas)


plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=etiquetas, yticklabels=etiquetas)

# Axis labels.
plt.xticks(rotation = 45)
plt.xlabel("Predicciones")
plt.ylabel("Clases reales")

# Title, then render.
plt.title("Matriz de Confusión")
plt.show()




# Classification report.
print(classification_report(y_resampled, y_resampled_pred))

# Authors' note from the original run: mean accuracy drops by 0.07 and recall
# rises by 0.07 with respect to the model without SMOTE, so SMOTE does not end
# up being significant.
# NOTE(review): re-check these figures after re-running; they were recorded
# before the cross-validation was moved onto non-resampled folds.

## **CURVA ROC EN EL MODELO RANDOM FOREST**

In [None]:
## CURVA ROC ##

from sklearn.metrics import roc_curve, auc
from itertools import cycle
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

X = df[['Popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']]
y = df[['Class']]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=2, stratify=y)

# One-vs-rest wrapper: one binary random forest per class.
RFC = OneVsRestClassifier(RandomForestClassifier(random_state = 42, n_jobs=-1))
RFC.fit(X_train, y_train)

# Per-class scores; column i corresponds to RFC.classes_[i].
y_pred = RFC.predict_proba(X_test)

# Binarize the true labels against the classes the model actually learned, so
# column i of y_test_bin always lines up with column i of y_pred (instead of
# assuming the codes are exactly 0..10).
clases_modelo = RFC.classes_
y_test_bin = label_binarize(y_test, classes=clases_modelo)

fpr = dict()
tpr = dict()
roc_auc = dict()

# One ROC curve and AUC per class column.
for i in range(len(clases_modelo)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

colors = cycle(['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan', 'black'])

plt.figure(figsize=(8, 6))

# Draw each curve with its own colour from the palette; the legend name is
# looked up in `clases` through the numeric class code of that column.
for i, color in zip(range(len(fpr)), colors):
    nombre_clase = clases[int(clases_modelo[i])]
    plt.plot(fpr[i], tpr[i], color=color, label=f'{nombre_clase} (AUC = {roc_auc[i]:.2f})')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.title('Curva ROC Multiclase')
plt.legend(loc='lower right')
plt.show()