# COGS 118B - Data Wrangling

# Names


- Ana Maria Baboescu
- Bradley Grace
- Fatima Enriquez
- Ngoc (Lucy) Giang
- Stephanie Frianeza

# Imported Libraries
Feel free to add any necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

# FMA - Bradley
The code below displays the updated dataset with the selected variables.

In [None]:
# FMA dataset: https://github.com/mdeff/fma

# Read the four FMA CSV files with pandas' default options (single header row).
# NOTE(review): later cells select columns by names such as 'artist.12' and
# 'echonest.1', which look like pandas' de-duplicated header names -- confirm
# this flat read is intended before changing any read options here.
fma_tracks, fma_genres, fma_features1, fma_features2 = map(pd.read_csv, [
    'fma/tracks.csv',    # per track metadata such as ID, title, artist, genres, tags and play counts, for all 106,574 tracks.
    'fma/genres.csv',    # all 163 genres with name and parent
    'fma/features.csv',  # common features extracted with librosa
    'fma/echonest.csv',  # audio features provided by Echonest (now Spotify)
])

# (rows, columns) of each frame, in load order
tuple(frame.shape for frame in (fma_tracks, fma_genres, fma_features1, fma_features2))

In [None]:
# List every column label of the tracks frame as an array
fma_tracks.columns.to_numpy()

In [None]:
# Artist names are read from the column labelled 'artist.12'
fma_artist_names = fma_tracks.loc[:, 'artist.12']
fma_artist_names

# TODO: truncate 1-3 rows.

In [None]:

# 'track.19' is taken to hold the track titles (per the variable name)
# -- TODO confirm against fma_tracks.columns
fma_track_titles = fma_tracks.loc[:, 'track.19']
fma_track_titles

In [None]:

# 'album.10' is taken to hold the album titles (per the variable name)
# -- TODO confirm against fma_tracks.columns
fma_album_titles = fma_tracks.loc[:, 'album.10']
fma_album_titles

In [None]:
# 'album.12' is taken to hold the album types (per the variable name)
# -- TODO confirm against fma_tracks.columns
fma_album_types = fma_tracks.loc[:, 'album.12']
fma_album_types

In [None]:

# Preview of the Echonest frame. Because this is not the last expression in
# the cell, its value is not rendered in the notebook output.
fma_features2.head()

# One Series per audio feature, listed in 'echonest*' column order.
# The feature-to-column mapping is the one chosen by the variable names below.
fma_acousticness = fma_features2.loc[:, 'echonest']
fma_danceability = fma_features2.loc[:, 'echonest.1']
fma_energy = fma_features2.loc[:, 'echonest.2']
fma_instrumentalness = fma_features2.loc[:, 'echonest.3']
fma_liveness = fma_features2.loc[:, 'echonest.4']
fma_speechiness = fma_features2.loc[:, 'echonest.5']
# Loudness has no source column selected yet:
#fma_loudness = fma_features2['echonest.']

# Spotify - Lucy
The code below displays the updated dataset with the selected variables.

In [None]:
# Column names assigned to cleaned_dataset.csv, in file order
headers = [
    "Artist", "Track", "Album", "Album_type",
    "Danceability", "Energy", "Loudness", "Speechiness",
    "Acousticness", "Instrumentalness", "Liveness", "Valence",
    "Tempo", "Duration_min", "Title", "Channel",
    "Views", "Likes", "Comments", "Licensed",
    "official_video", "Stream", "EnergyLiveness", "most_playedon",
]

# NOTE(review): header=None makes pandas treat the first line of the file as
# data. If cleaned_dataset.csv starts with its own header row, that row ends up
# as the first record -- confirm against the file.
music_data = pd.read_csv('cleaned_dataset.csv', names=headers, header=None)
music_data.head()  # not the last expression of the cell, so not rendered

# Subset of columns to keep
selected_columns = [
    "Artist", "Track", "Album", "Album_type",
    "Danceability", "Loudness", "Speechiness",
    "Acousticness", "Instrumentalness",
]
music_data = music_data[selected_columns]

# preview (also not rendered; see above)
music_data.head()

# data type of every remaining column
print(music_data.dtypes)

# rows that contain at least one null value
has_null = music_data.isna().any(axis=1)
music_data[has_null]

# export file
# music_data.to_csv('music_data2.csv', index=False)

# 500 Greatest Songs - Fatima
The code below displays the updated dataset with the selected variables.


In [None]:
# Load the "Top 500 Songs" list and keep only title, description and artist.
df_top_500 = pd.read_csv('Top 500 Songs.csv', encoding='Latin 1')
df_top_500 = df_top_500[["title", "description", "artist"]]

# Total count of missing cells. The value is not rendered because this is not
# the last expression in the cell; the authors reported no missing values.
df_top_500.isnull().sum().sum()

# Capitalise the column names used by the rest of the notebook.
df_top_500 = df_top_500.rename(
    columns={"artist": "Artist", "description": "Description", "title": "Title"}
)

# Combined lookup key "<Title>*\*<Artist>". The separator is written as a raw
# string: "\*" is not a valid escape sequence and triggers an invalid-escape
# warning in a normal string literal. The resulting text is unchanged.
df_top_500["Title,Artist"] = df_top_500["Title"] + r"*\*" + df_top_500["Artist"]
df_top_500

# Prediction of Music - Ana & Stephanie
The code below displays the updated dataset with the selected variables.


In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA



# Load the music-genre dataset
pred_Music = pd.read_csv('music_genre.csv')

#subset with wanted variables
edited_pred_Music = pred_Music[["artist_name", "track_name", "acousticness", "danceability",
                                "instrumentalness", "liveness", "loudness", "mode", "speechiness",
                                "tempo", "valence", "music_genre"]]

#drop rows whose track_name was already seen (the first occurrence is kept);
#.copy() gives an independent frame that later cells can add columns to
# NOTE(review): duplicates are detected on track_name alone, so two different
# artists with an identically named track collapse into one row -- confirm this
# is intended, or de-duplicate on ['artist_name', 'track_name'] instead.
edited_pred_Music = edited_pred_Music.drop_duplicates('track_name').copy()


#rename artist_name/track_name to Artist/Title and build the combined key column
new_name_edited_pred_Music = edited_pred_Music.rename(columns={"artist_name" : "Artist", "track_name" : "Title"})

# Combined lookup key "<Title>*\*<Artist>". The separator is written as a raw
# string: "\*" is not a valid escape sequence and triggers an invalid-escape
# warning in a normal string literal. The resulting text is unchanged.
new_name_edited_pred_Music["Title,Artist"] = (
    new_name_edited_pred_Music["Title"] + r"*\*" + new_name_edited_pred_Music["Artist"]
)
new_name_edited_pred_Music


In [None]:
#Show the Clustering

# Seed shared by the stochastic estimators below so the cell is reproducible
RANDOM_STATE = 42

# Drop identifier / target columns so only model inputs remain
X = edited_pred_Music.drop(columns=['artist_name', 'track_name', 'music_genre'])
# This cell writes 'cluster_label' back into edited_pred_Music further down;
# remove it when present so that re-running the cell starts from the same inputs.
X = X.drop(columns=['cluster_label'], errors='ignore')

#Define categorical and numerical features
# NOTE(review): only float64 columns are treated as numeric. Any column that is
# neither float64 nor listed in categorical_features is not passed to a
# transformer and is dropped by the ColumnTransformer default -- confirm that
# no intended feature (e.g. one stored as text or integers) is lost this way.
numeric_features = X.select_dtypes(include=['float64']).columns.tolist()
categorical_features = ['mode']

#Define transformers for categorical & numerical features
# numeric: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# categorical: fill gaps with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#apply transformers to appropriate columns via ColumnTransformer
# sparse_threshold=0 always yields a dense array (the one-hot step produces
# sparse output by default), which is what GaussianMixture needs.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0,
)

#preprocess the data
X_processed = preprocessor.fit_transform(X)


#Fit GMM
n_components = 9
gmm = GaussianMixture(n_components=n_components, random_state=RANDOM_STATE)
gmm.fit(X_processed)

#get cluster labels
cluster_labels = gmm.predict(X_processed)

#add cluster labels to orig. dataframe
edited_pred_Music['cluster_label'] = cluster_labels

print(edited_pred_Music['cluster_label'].value_counts())

#reduce dimensionality for purposes of visualizing data
# random_state pins the result when PCA selects its randomized solver
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X_reduced = pca.fit_transform(X_processed)

#plot data with cluster labels
fig, ax = plt.subplots(figsize=(10, 6))
for cluster_label in range(n_components):
    in_cluster = cluster_labels == cluster_label
    ax.scatter(X_reduced[in_cluster, 0],
               X_reduced[in_cluster, 1],
               label=f'Cluster {cluster_label}', alpha=0.7)

#plot centroids: project the GMM component means into the same 2-D PCA space
centroids = pca.transform(gmm.means_)
ax.scatter(centroids[:, 0], centroids[:, 1], marker='x', color='black', s=100, label='Centroids')
ax.set_title('GMM Clustering')
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
#access principal components
principal_components = pca.components_

# The PCA was fitted on the preprocessor's output, so its loading columns follow
# that output's column order (transformer by transformer, with one column per
# one-hot category), not the column order of the raw frame X. Take the labels
# from the fitted preprocessor so every loading is paired with the right name.
# Requires a scikit-learn version in which every pipeline step implements
# get_feature_names_out.
feature_names = preprocessor.get_feature_names_out()

if len(feature_names) != principal_components.shape[1]:
    raise ValueError(
        f"{len(feature_names)} feature names for "
        f"{principal_components.shape[1]} PCA input columns"
    )

#display principal components
for i, pc in enumerate(principal_components):
    print(f"Principal Component {i+1}:")
    for feature, loading in zip(feature_names, pc):
        print(f"{feature}: {loading}")
    print()

#This code shows the contribution of each preprocessed
#feature towards each of the two principal components

# Compiled Data - Everyone

Here we compile all the datasets together for the Monday meeting.


# EDA - TBD




# Program - TBD

# Footnotes
1. <a name="cite_note-1"></a> [^](#cite_ref-1) Chillar, Snigdha et al. (May 2019) Music Genre Classification using Machine Learning Algorithms: A comparison. *International Research Journal of Engineering and Technology (IRJET)*. https://d1wqtxts1xzle7.cloudfront.net/59934287/IRJET-V6I517420190704-120568-1u4iafr-libre.pdf?1562308085=&response-content-disposition=inline%3B+filename%3DIRJET_Music_Genre_Classification_using_M.pdf&Expires=1708407619&Signature=WRJ6JnCTvv8fyWmo~A-SzVQ2DRT77pSZFX8tmlz7YCCB7J5ZMkrJpugkiwlJT7DoaCR-d2jI6IfGjgYXI9-EsJlVANQ~gJY04gUz9H4zkZG-HiyimSeXcAkK58Rqp06qgvlu-yx5zcM1wNxrnhgASBQEvVBkhyQMxETbkgtLYzw40gKVZFeioo0Qjj7aqC-YDzGwlzlnXhNss4xlBsBj7PdyTFgGu2cM8ky8g3XsqA1yIIuiO4cr0SEcCJU8orRATVTpB388Nud0GvNiGOz6DAhpoCBLEqrsYxFHk2jOut7x1TPs91ECXgG1SwjPE03vlYoeAMEkdV5pasBm1yZDeQ__&Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA

2. <a name="cite_note-2"></a> [^](#cite_ref-2) Bahuleyan, Hareesh (3 April 2018) Music Genre Classification using Machine Learning Techniques. Cornell University. https://doi.org/10.48550/arXiv.1804.01149

3. <a name="cite_note-3"></a> [^](#cite_ref-3) Wohlwend, Brandon (23 July 2023) Decision Tree, Random Forest, and XGBoost: An Exploration into the Heart of Machine Learning. Medium. https://medium.com/@brandon93.w/decision-tree-random-forest-and-xgboost-an-exploration-into-the-heart-of-machine-learning-90dc212f4948#:~:text=Random%20Forests%20build%20on%20this,that%20often%20outperforms%20many%20others.
