# CSE4224 Project 2
#### Grant Butler | gbutler2020@my.fit.edu

#### Data Retrieval/Cleaning:
Here, I use the Spotify API to fetch the audio features of the tracks in my library and build a cleaned pandas DataFrame for use with PCA and t-SNE afterward.

In [None]:
from dotenv import dotenv_values
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# load the Spotify API credentials (and the playlist settings read later) from the local .env file
secrets = dotenv_values(".env")

# client-credentials auth manager that spotipy uses to communicate with spotify
auth_manager = SpotifyClientCredentials(
    client_id=secrets["SPOTIPY_CLIENT_ID"],
    client_secret=secrets["SPOTIPY_CLIENT_SECRET"],
)

# the spotipy client object used for every API call below
sp = spotipy.Spotify(auth_manager=auth_manager)
sp.trace = False  # no debugging needed

# collect the track ids from one page of playlist results into the given list
def show_tracks(results, uriArray):
    """Append the id of every usable track in ``results['items']`` to ``uriArray``.

    Args:
        results: one page of playlist items; a mapping with an ``'items'``
            list whose entries each carry a ``'track'`` value.
        uriArray: list that is extended in place with the track ids.

    Entries whose ``'track'`` is ``None`` or whose track has no id are skipped,
    so the output list only ever contains real ids that can be passed on to
    later API calls.
    """
    for item in results['items']:
        track = item['track']
        # a missing track would crash on subscripting, and a missing id would
        # later be stringified to "None" and sent to the API as a bogus id
        if track is None or track.get('id') is None:
            continue
        uriArray.append(track['id'])

# walk every page of the playlist and return the ids of all of its tracks
def get_playlist_track_ids(username, playlist_id):
    """Return the track ids of every track in a playlist.

    Args:
        username: Spotify user name passed to ``sp.user_playlist``.
        playlist_id: id of the playlist to read.

    Returns:
        A list of track ids gathered from all pages of the playlist, in the
        order the pages are returned.
    """
    track_ids = []
    playlist = sp.user_playlist(username, playlist_id)

    tracks = playlist['tracks']
    # the playlist response already contains the first page of items, so
    # harvest it before following the 'next' links; otherwise those tracks
    # are lost and a single-page playlist yields no ids at all
    show_tracks(tracks, track_ids)

    while tracks['next']:
        tracks = sp.next(tracks)
        show_tracks(tracks, track_ids)
    return track_ids


# read the user name and playlist id from the .env values and pull the playlist's track ids
track_ids = get_playlist_track_ids(
    secrets['SPOTIFY_USERNAME'],
    secrets['PLAYLIST_ID'],
)

# quick look at what came back
print(track_ids)

In [None]:
import json

# helper to break the big list of track ids into pieces (spotify api limits to 100 tracks)
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` that are at most ``n`` items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), n)
    for start in starts:
        stop = start + n
        yield lst[start:stop]

# request audio features one batch of ids at a time and return them as a single list
def get_audio_features(track_ids):
    """Return the audio features for ``track_ids`` as one flat list.

    The ids are sent to ``sp.audio_features`` in batches of 100, each id
    converted to ``str`` first, and the per-batch results are concatenated
    in order.
    """
    collected = []
    for batch in chunks(track_ids, 100):
        batch_ids = list(map(str, batch))
        collected += sp.audio_features(batch_ids)
    return collected

# fetch the features for every collected track id and pretty-print them for inspection
audio_features = get_audio_features(track_ids)
features_json = json.dumps(audio_features, indent=2)
print(features_json)

In [None]:
import pandas as pd

# fields that have no bearing on the analysis
fields_to_remove = ["analysis_url", "track_href", "type", "uri"]

# keep only the entries that actually carry data
cleaned_features = list(filter(lambda item: item is not None, audio_features))

# build the dataframe from the list of dicts, then drop the unneeded fields
df = pd.DataFrame(cleaned_features)
df = df.drop(columns=fields_to_remove)

print(df)

#### PCA Dimensional Reduction:

Using PCA, I hope to reduce the number of dimensions that t-SNE needs to work with.

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# everything except the track id is fed to the scaler
df_numeric = df.drop(columns=['id'])

# standardise the features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_numeric)

# project onto 2 principal components (only 2 dimensions for visualization)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# scatter of the two principal components; X_pca.T unpacks into the x and y columns
plt.figure(figsize=(8, 6))
plt.scatter(*X_pca.T, alpha=0.8)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Visualization')
plt.grid(True)
plt.show()

In [None]:
# inspect how each input feature contributes to each principal component:
# one row per component, one column per feature
loadings = pca.components_
loadings_df = pd.DataFrame(data=loadings, columns=df_numeric.columns)

print(loadings_df)

In [None]:
import numpy as np

# rank the features by the absolute difference between their loadings on
# consecutive components, largest first
# NOTE(review): this scores a feature by how differently it loads on the
# components, not by the size of its loadings; a feature that loads equally
# on both components scores 0. Confirm this is the intended "impact" metric.
loadings_diff = np.abs(np.diff(loadings, axis=0))
loadings_diff_sum = np.sum(loadings_diff, axis=0)

loadings_diff_df = pd.DataFrame(
    {'Sum of Loadings Diff': loadings_diff_sum},
    index=df_numeric.columns,
)
loadings_diff_sorted = loadings_diff_df.sort_values(
    by='Sum of Loadings Diff',
    ascending=False,
)

print(loadings_diff_sorted)

In [None]:
# Get the explained variance ratio of each fitted principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Print the explained variance ratio for each component (numbered from 1)
for i, ratio in enumerate(explained_variance_ratio, start=1):
    print(f"Principal Component {i}: Explained Variance Ratio = {ratio:.4f}")

# Plot the cumulative explained variance ratio.
# The x values are given explicitly as 1..k: without them the points would be
# placed at 0..k-1, which does not match the "Number of Principal Components"
# axis label (the first point would read as "0 components").
cumulative_ratio = np.cumsum(explained_variance_ratio)
component_counts = np.arange(1, len(cumulative_ratio) + 1)

plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_ratio, marker='o', linestyle='-')
plt.xticks(component_counts)  # whole-number ticks only; fractional component counts are meaningless
plt.title('Cumulative Explained Variance Ratio')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True)
plt.show()

In [None]:
from sklearn.manifold import TSNE

# embed the PCA output with t-SNE, again using 2 components for visualization;
# random_state is fixed so reruns use the same seed
# NOTE(review): X_pca already has only 2 columns, so t-SNE sees just the first
# two principal components - confirm that much reduction is intended
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_pca)

# scatter of the t-SNE results; X_tsne.T unpacks into the x and y columns
plt.figure(figsize=(8, 6))
plt.scatter(*X_tsne.T, alpha=0.8)
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE Visualization')
plt.grid(True)
plt.show()

In [None]:
# using k-means to group the clusters and color them based on that

from sklearn.cluster import KMeans

# group the t-SNE points into 10 clusters with k-means (fixed seed)
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(X_tsne)

# draw each cluster with its own scatter call so the groups are coloured separately
plt.figure(figsize=(8, 6))
n_labels = len(np.unique(clusters))
for cluster_id in range(n_labels):
    # row positions of the points assigned to this cluster
    cluster_indices = np.flatnonzero(clusters == cluster_id)
    plt.scatter(
        X_tsne[cluster_indices, 0],
        X_tsne[cluster_indices, 1],
        label=f'Cluster {cluster_id}',
        alpha=0.8,
    )
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE Visualization (Clustered)')
# plt.legend() # legends for whether you see which cluster is which
plt.grid(True)
plt.show()

In [None]:
import json

# Group the track ids by their k-means cluster label.
# `clusters` holds one label per row of `df`, and `df` only contains tracks
# whose audio features were not None. Pairing the labels with df['id'] keeps
# every label next to the track it was computed for; pairing with the raw
# `track_ids` list would shift every id after the first dropped track.
cluster_track_ids = {}
for cluster_label, track_id in zip(clusters, df['id']):
    cluster_track_ids.setdefault(cluster_label, []).append(track_id)

# print the members of each cluster; the loop variable is deliberately not
# called `track_ids`, so the full playlist id list is not overwritten
for cluster_label, member_ids in cluster_track_ids.items():
    print(f'Cluster {cluster_label}:')
    for track_id in member_ids:
        print(f'\tTrack ID: {track_id}')