In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sharespotify/spotify_data.csv


In [1]:
# Import other needed packages

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [4]:
# Load the Spotify CSV from the Kaggle input directory into a DataFrame.
spotifydf = pd.read_csv('/kaggle/input/sharespotify/spotify_data.csv')

# Data Cleaning and Processing

In [5]:
# Show every column's dtype and non-null count to spot missing data.
spotifydf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17774 entries, 0 to 17773
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   artist_name                   17774 non-null  object 
 1   artist_id                     17774 non-null  object 
 2   album_id                      17774 non-null  object 
 3   album_type                    17774 non-null  object 
 4   album_images                  0 non-null      float64
 5   album_release_date            17774 non-null  object 
 6   album_release_year            17289 non-null  float64
 7   album_release_date_precision  17774 non-null  object 
 8   danceability                  17774 non-null  float64
 9   energy                        17774 non-null  float64
 10  key                           17774 non-null  int64  
 11  loudness                      17774 non-null  float64
 12  mode                          17774 non-null  int64  
 13  s

In [6]:
# Drop the columns identified as entirely null (e.g. album_images has 0 non-null
# values in the info() output).
# errors='ignore' keeps the cell idempotent: re-running it once the columns are
# already gone is a no-op instead of raising KeyError.
spotifydf = spotifydf.drop(columns=['album_images', 'artists', 'available_markets'],
                           errors='ignore')

In [None]:
# Substitute a placeholder wherever a track has no preview URL
spotifydf['track_preview_url'] = spotifydf['track_preview_url'].fillna('notSpecified')

In [None]:
# Fill the nulls in album_release_year with the year taken from album_release_date.

# Parse album_release_date into datetimes (format='ISO8601' needs pandas >= 2.0)
spotifydf['album_release_date'] = pd.to_datetime(spotifydf['album_release_date'], format='ISO8601')

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: the in-place form operates on an intermediate object,
# emits a FutureWarning in pandas 2.x and leaves the DataFrame untouched once
# Copy-on-Write is enabled. Using the .dt.year Series directly also removes the
# need for a temporary 'extracted_year' column.
spotifydf['album_release_year'] = spotifydf['album_release_year'].fillna(
    spotifydf['album_release_date'].dt.year
)

In [None]:
# Check null values again: re-run info() to confirm the non-null counts after cleaning
spotifydf.info()

In [7]:
# Count the number of duplicate rows
# (duplicated() flags rows that repeat an earlier row across all columns; summing the flags counts them)
spotifydf.duplicated().sum()

0

# Descriptive Analysis

In [None]:
# Most popular artists, ranked by the number of rows each artist has in the dataset.
# Rows are counted with size() rather than by summing the helper 'count' column:
# that column is only created in a later cell, so the original lookup raised
# KeyError on a fresh Restart & Run All. The Series keeps the name 'count'.
popartists = spotifydf.groupby('artist_name').size().rename('count')
popartists.sort_values(ascending=False)

In [None]:
# Number of distinct artist ids (dropna=False so a null would still count as one value)
spotifydf['artist_id'].nunique(dropna=False)

In [None]:
# Number of distinct track ids (dropna=False so a null would still count as one value)
spotifydf['track_id'].nunique(dropna=False)

In [None]:
# Number of distinct album ids (dropna=False so a null would still count as one value)
spotifydf['album_id'].nunique(dropna=False)

# Histogram - Songs by Album Year

In [None]:
# Adding column for count: a constant 1 per row, so summing 'count' within a
# group gives the number of rows in that group
spotifydf['count'] = 1

In [None]:
# Histogram of songs per album release year (15 bins), saved as 'songs_year'
sns.set_style('ticks')
sns.set_context('notebook')
sns.displot(data=spotifydf, x='album_release_year', kde=False, bins=15,
            color='#1DB954', edgecolor='black')
title_font = {'fontsize': 20, 'color': '#1DB954'}
plt.title('Songs from Albums by Year', fontdict=title_font, pad=20)
plt.xlabel('Year Released', color='#1DB954')
plt.ylabel('No. of Songs', color='#1DB954')
plt.savefig('songs_year')
plt.show()

# Boxplot - Audio Feature Variable (AFV) distribution

In [None]:
# Get descriptive stats for the AFVs
# (describe() summarises every numeric column, so the AFVs appear alongside the other numeric fields)
spotifydf.describe()

## Means:
* danceability: 0.50
* energy: 0.55
* loudness: -10.17
* speechiness: 0.09
* acousticness: 0.40
* instrumentalness: 0.18
* liveness: 0.28 
* valence: 0.42

In [None]:
# Audio feature variables (AFVs) to gather into a single 'feature' column
features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness']

# Identifier columns repeated on every melted row
id_columns = ['track_id', 'artist_name', 'album_name', 'album_release_year', 'explicit', 'track_name']

# Long format: one row per (track, feature) pair with the score in 'value'
spotifymelt = pd.melt(spotifydf, id_vars=id_columns, value_vars=features,
                      var_name='feature', value_name='value')
spotifymelt

In [None]:
# Box plot of the value distribution of each audio feature, saved as 'boxplotafv'
plt.figure(figsize=(15, 5))
sns.set_style('whitegrid')
sns.set_context('notebook')
sns.boxplot(data=spotifymelt, x='feature', y='value', color='#1DB954')

title_font = {'fontsize': 20, 'color': '#1DB954'}
plt.title('Prevalence of Audio Features', fontdict=title_font, pad=20)
plt.xlabel('Feature', color='#1DB954')
plt.ylabel('Rating', color='#1DB954')

# Capitalised tick labels, listed in the same order as the melted features
customlabels = ['Acousticness', 'Danceability', 'Energy', 'Instrumentalness', 'Liveness', 'Speechiness']
tick_positions = range(len(customlabels))
plt.xticks(ticks=tick_positions, labels=customlabels)

plt.savefig('boxplotafv')
plt.show()

# Clustermap - Correlations based on AFV similarity

In [None]:
# Pairwise correlation matrix of the audio feature columns
spotifymx = spotifydf.loc[:, features].corr()

In [8]:
# Custom colourmap interpolating through three greens, from pale to '#1DB954'
from matplotlib.colors import LinearSegmentedColormap
spotify_greens = ['#edfcf2', '#98efb6', '#1DB954']
spotifycmap = LinearSegmentedColormap.from_list('custom', spotify_greens)

In [None]:
sns.set_context('notebook')

# spotifymx already holds the feature correlation matrix, so it is plotted
# directly; calling .corr() on it again would cluster and annotate the
# correlations *of the correlations* instead of the feature correlations.
# Tick labels are derived from the matrix's own columns so they always match its
# order and do not depend on a list defined in another cell.
afv_labels = [name.capitalize() for name in spotifymx.columns]
afv_grid = sns.clustermap(spotifymx, annot=True, cmap=spotifycmap, method='average',
                          xticklabels=afv_labels,
                          yticklabels=afv_labels)

# plt.title() acts on the current axes, which after clustermap is not guaranteed
# to be the heatmap; a figure-level title is unambiguous.
afv_grid.ax_heatmap.figure.suptitle('Correlation & Clustering of Audio Features',
                                    color='#1DB954', y=1.02)
# The grid's own savefig uses a tight bounding box, so the raised title is kept.
afv_grid.savefig('clusterafv')
plt.show()

# Line Chart - Time Series analysis of AFVs

In [None]:
# Plot the yearly mean of each audio feature in its own panel of a 3x2 grid.

# Per-panel settings: (column, line colour, panel title). A dedicated name is
# used so that the `features` list the melt/correlation cells rely on is not
# overwritten with a different ordering by this cell.
trend_panels = [
    ('acousticness', '#232723', 'Acousticness'),
    ('energy', '#62d089', 'Energy'),
    ('liveness', '#e1ece3', 'Liveness'),
    ('instrumentalness', '#05fc47', 'Instrumentalness'),
    ('danceability', '#a8b2a8', 'Danceability'),
    ('speechiness', '#0cf5b7', 'Speechiness'),
]

# Compute the per-year means once instead of re-grouping for every panel.
trend_columns = [column for column, _, _ in trend_panels]
yearly_means = spotifydf.groupby('album_release_year')[trend_columns].mean()

# Scope the larger 'talk' sizes to this figure only, and enter the context
# before the axes are created so every text element picks them up.
with sns.plotting_context('talk'):
    # Create a 3x2 grid of subplots sharing the year axis
    fig, axs = plt.subplots(3, 2, figsize=(15, 12), sharex=True)

    for ax, (column, color, title) in zip(axs.flat, trend_panels):
        # Plot the feature
        yearly_means[column].plot(kind='line', ax=ax, color=color)
        # Set the title and labels
        ax.set_title(title, color='#1DB954')
        ax.set_xlabel('Year', color='#1DB954')
        ax.set_ylabel('Value', color='#1DB954')
        # Same 0-1 y-axis on every panel so the panels are directly comparable
        ax.set_ylim(0, 1)

    # Adjust the layout to prevent overlap
    plt.tight_layout()

    plt.savefig('afv_years')

    plt.show()

# Clustering using K-means and PCA

In [None]:
# Columns fed to the clustering model
cluster_columns = ['danceability', 'energy', 'loudness', 'instrumentalness',
                   'acousticness', 'speechiness', 'liveness', 'tempo']
features2 = spotifydf[cluster_columns]

# Standardise the selected columns with StandardScaler before clustering
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features2)

In [None]:
# Elbow method: record the within-cluster sum of squares (inertia) for k = 1..19
k_values = range(1, 20)
wcss = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(scaled_features)
    wcss.append(kmeans.inertia_)

# Draw WCSS against k so the bend in the curve can be read off
plt.plot(k_values, wcss)
sns.set_context('notebook')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Cluster the scaled features with the chosen k (3) and store each row's label
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42).fit(scaled_features)
spotifydf['Cluster'] = kmeans.labels_

In [None]:
# Reduce dimensions with PCA: project the scaled features onto two components
# so the clusters can be drawn on a 2-D scatter plot
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_features)

In [None]:
# Scatter the songs on the two PCA components, coloured by cluster label
custom_palette = sns.color_palette(['#1DB954', '#0cf5b7', '#232723'])
first_component = pca_components[:, 0]
second_component = pca_components[:, 1]

plt.figure(figsize=(10, 6))
sns.scatterplot(x=first_component, y=second_component,
                hue=spotifydf['Cluster'], palette=custom_palette)
plt.title('Clusters of Songs')
plt.savefig('clusteredsongs')
plt.show()

In [None]:
# Per-cluster mean of every clustering feature; danceability additionally
# carries its non-null count, which serves as the cluster size.
mean_only_columns = ['energy', 'loudness', 'instrumentalness', 'acousticness',
                     'speechiness', 'liveness', 'tempo']
agg_spec = {'danceability': ['mean', 'count']}
agg_spec.update({column: ['mean'] for column in mean_only_columns})

cluster_summary = spotifydf.groupby('Cluster').agg(agg_spec)


print(cluster_summary)

## Cluster descriptions:

* 0 - highest danceability / lowest acousticness and liveness
> Name: Electronic Dance Songs
* 1 - highest instrumentalness and acousticness / lowest energy, speechiness, tempo and loudness
> Name: Chilled Focus Songs
* 2 - highest energy, speechiness, liveness, tempo and loudness / lowest danceability and instrumentalness
> Name: Upbeat Singalong Classics

In [None]:
# Mean silhouette coefficient of the cluster labels over the scaled features
silhouette_avg = silhouette_score(X=scaled_features, labels=spotifydf['Cluster'])
print(f'Silhouette Score: {silhouette_avg}')