In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw Spotify songs table from a Parquet file that is expected to
# sit in the notebook's working directory.
df = pd.read_parquet(path='spotify_songs.parquet')

FileNotFoundError: ignored

In [None]:
# `df` is expected to already hold the Spotify dataset loaded earlier.

# Discard, in place, every row that contains at least one missing value.
df.dropna(axis='index', how='any', inplace=True)

# Bucket the 'track_popularity' score into four ordinal class labels
def classify_popularity(popularity):
    """Return the ordinal popularity class (0-3) for a popularity score.

    Scores up to and including 20 map to 0, up to 40 to 1 and up to 60
    to 2. Anything that is not <= 60 (larger scores, but also values such
    as NaN that compare false against every bound) maps to 3.
    """
    # The position of the first upper bound that the score does not exceed
    # is the class label.
    for label, upper_bound in enumerate((20, 40, 60)):
        if popularity <= upper_bound:
            return label
    return 3

# Derive the classification target by bucketing each track's popularity score.
df['popularity_class'] = df['track_popularity'].map(classify_popularity)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many tracks fall into each popularity class.
sns.countplot(data=df, x='popularity_class').set_title('Distribution of Popularity Classes')
plt.show()



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop identifier and free-text columns; they are not used as model features.
df.drop(
    columns=[
        'track_name', 'track_artist', 'track_album_id', 'track_album_name',
        'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
    ],
    inplace=True,
)

# Standardise the audio features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on every row before the train/test
# split that happens later in the notebook, so test-set statistics leak
# into the features. Consider fitting on the training rows only.
numerical_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Parse the release date so the release year can be extracted.
# NOTE(review): if the column mixes date formats (e.g. year-only values),
# recent pandas versions may need an explicit `format=` here - confirm.
df['track_album_release_date'] = pd.to_datetime(df['track_album_release_date'])
df['release_year'] = df['track_album_release_date'].dt.year

# Keep only tracks released in MIN_RELEASE_YEAR or later, then remove the
# date helper columns and 'track_popularity'. The latter must go because
# 'popularity_class' (the target) is derived directly from it.
# Building the result with a non-inplace drop yields an independent frame,
# avoiding the SettingWithCopyWarning that an in-place drop on a
# boolean-mask slice of `df` triggers.
MIN_RELEASE_YEAR = 2019
filtered_df = (
    df.loc[df['release_year'] >= MIN_RELEASE_YEAR]
    .drop(columns=['track_album_release_date', 'release_year', 'track_popularity'])
)

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# Restricting the frame first keeps this cell working if a non-numeric
# column is still present: DataFrame.corr() raises on such columns in
# pandas >= 2.0 instead of silently skipping them. When every column is
# already numeric the result is unchanged.
corr_matrix = filtered_df.select_dtypes(include=['number', 'bool']).corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Print a summary of the filtered frame (columns, non-null counts, dtypes)
# as a sanity check before modelling.
filtered_df.info()

In [None]:
# Class balance after restricting the data to the filtered subset.
sns.countplot(data=filtered_df, x='popularity_class')
plt.gca().set_title('Distribution of Popularity Classes')
plt.show()



In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
# Separate the feature matrix from the target labels.
X = filtered_df.drop(columns=['popularity_class'])
y = filtered_df['popularity_class']

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps
# the class proportions the same in the train and test partitions, so a
# rare popularity class cannot end up missing or over-represented in the
# test set purely by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Original dataset shape", Counter(y_train))

# Randomly undersample the training partition only; the test partition is
# left untouched so evaluation reflects the original class distribution.
sm = RandomUnderSampler(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

print("Dataset shape undersampling", Counter(y_res))

In [None]:
# Persist the preprocessed, filtered frame as CSV (row index omitted).
filtered_df.to_csv(path_or_buf='filtered_df.csv', index=False)

In [None]:
# SVM - TODO: try more hyperparameters (other kernels, C, gamma, ...)
# Fit a linear-kernel SVM on the undersampled training data.
svm_model = SVC(kernel='linear').fit(X_res, y_res)

# Predict on the held-out test split
predictions = svm_model.predict(X_test)

# Evaluate: per-class precision/recall/F1, then overall accuracy
print(classification_report(y_test, predictions))
print("Accuracy:", accuracy_score(y_test, predictions))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

# Pin the label order so the matrix is always 4x4 and lines up with the
# class names passed to the plot, even if one of the classes happens to be
# absent from both y_test and the predictions.
class_labels = [0, 1, 2, 3]
cm = confusion_matrix(y_test, predictions, labels=class_labels)
plot_confusion_matrix(conf_mat=cm, show_normed=True, class_names=class_labels)

In [None]:
# SVM - TODO: try more hyperparameters (other kernels, C, gamma, ...)
# Same linear-kernel SVM, but fitted on the original training split,
# i.e. without the undersampling step, for comparison.
svm_model_sem_under = SVC(kernel='linear').fit(X_train, y_train)

# Predict on the held-out test split
predictions_sem_under = svm_model_sem_under.predict(X_test)

# Evaluate: per-class precision/recall/F1, then overall accuracy
print(classification_report(y_test, predictions_sem_under))
print("Accuracy:", accuracy_score(y_test, predictions_sem_under))

In [None]:
# Pin the label order so the matrix is always 4x4 and lines up with the
# class names passed to the plot, even if one of the classes happens to be
# absent from both y_test and the predictions.
class_labels = [0, 1, 2, 3]
cm_sem_under = confusion_matrix(y_test, predictions_sem_under, labels=class_labels)
plot_confusion_matrix(conf_mat=cm_sem_under, show_normed=True, class_names=class_labels)