## Data Gathering
author: Liang-Yun Cheng & Federico Cimini

### Section I: Data Extraction from Spotify
We recommend not re-running this section, as the API calls can take some time to loop through 2000+ songs.

In [None]:
!pip install spotipy
!pip install wordcloud

In [None]:
#import required libraries
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Set up the API connection with a client id & secret.
# The credentials are read from environment variables instead of being hardcoded,
# so they never end up in the notebook or in version control. Set
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel;
# a missing variable fails fast with a KeyError naming it.
# NOTE(review): the secret that used to be written in this cell has been exposed
# and should be rotated in the Spotify developer dashboard.
import os

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=os.environ["SPOTIPY_CLIENT_ID"],
    client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
))

The Spotify API returns JSON data, so we need to find the right keys inside the nested dictionaries to extract the fields we want.

In [None]:
# Ids of the playlists to harvest tracks from (the last id is "playlist 2000").
top_playlist = [
    "37i9dQZF1DXcBWIGoYBM5M",
    '37i9dQZEVXbMDoHDwVN2tF',
    '37i9dQZF1DX0XUsuxWHRQd',
    '37i9dQZF1DX10zKzsJ2jva',
    '37i9dQZF1DWXRqgorJj26U',
    '37i9dQZF1DWTmvXBN4DgpA',
]

# Accumulator for every extracted track: one inner list of fields per track.
track_list = list()

def extract_track_info(playlist_info, track_list):
    '''Append one row of metadata to track_list for every usable track on a page.

    playlist_info: one page of playlist items (a dict); the entries are stored
        under the "items" key, each wrapping its track under the "track" key
    track_list: a list of lists, extended in place with
        [track_id, track_name, artist_name, artist_id, track_popularity, release_date]

    Entries whose "track" is missing/None, or whose track has no id, are skipped:
    indexing into a None track raises a TypeError, and a row without an id cannot
    be matched by id afterwards. Only the first listed artist is recorded.
    Returns None; the result is the mutation of track_list.
    '''
    for item in playlist_info['items']:
        track = item.get('track')
        if not track or not track.get('id'):
            continue
        first_artist = track['artists'][0]
        track_list.append([
            track['id'],
            track['name'],
            first_artist['name'],
            first_artist['id'],
            track['popularity'],
            track['album']['release_date'],
        ])

# Loop through the playlists, page by page, to extract their tracks.
for playlist in top_playlist:
    # the first request returns the first page (limit 100 songs)
    playlist_info = sp.playlist_items(playlist)
    while True:
        # extract each track on the page currently held in playlist_info
        extract_track_info(playlist_info, track_list)
        # stop once the current page reports no following page
        if not playlist_info['next']:
            break
        # otherwise replace playlist_info with the next page's info
        playlist_info = sp.next(playlist_info)


In [None]:
# Turn the list of per-track lists into a DataFrame with named columns.
track_columns = [
    'track_id',
    'track_name',
    'artist_name',
    'artist_id',
    'track_popularity',
    'release_date',
]
tracks_df = pd.DataFrame(track_list, columns=track_columns)
tracks_df.shape

In [None]:
# Remove duplicated songs (rows that are identical in every column).
tracks_df = tracks_df.drop_duplicates()
tracks_df.shape

In [None]:
# get artist genres
# Artists are fetched in batches through sp.artists (the multi-artist endpoint
# accepts up to 50 ids per request) instead of one request per artist, and the
# loop variable no longer shadows the built-in id().
ARTIST_BATCH_SIZE = 50

artist_list_df = tracks_df[['artist_id']].drop_duplicates()
# ids that are missing cannot be looked up, so they are left out of the requests
artist_ids = artist_list_df['artist_id'].dropna().tolist()

artist_list = []
for start in range(0, len(artist_ids), ARTIST_BATCH_SIZE):
    batch_ids = artist_ids[start:start + ARTIST_BATCH_SIZE]
    batch_info = sp.artists(batch_ids)['artists']
    # results are paired with the requested ids by position;
    # an entry may be None when the API has no record for an id
    for artist_id, artist_info in zip(batch_ids, batch_info):
        genres = ", ".join(artist_info['genres']) if artist_info else ""
        artist_list.append([artist_id, genres])

# convert list of list to df
artist_df = pd.DataFrame(artist_list, columns=['artist_id', 'genres'])


In [None]:
# Sanity check: print the (rows, columns) shape of the artist table, then preview it.
print(artist_df.shape)
artist_df.head()

In [None]:
# get audio features
# Track ids are sent in batches (sp.audio_features accepts up to 100 ids per
# request) and the rows are collected in a list that is converted to a DataFrame
# once, instead of one request and one pd.concat per track.
AUDIO_FEATURE_BATCH_SIZE = 100

track_ids = tracks_df['track_id'].dropna().tolist()
feature_rows = []
for start in range(0, len(track_ids), AUDIO_FEATURE_BATCH_SIZE):
    batch = sp.audio_features(track_ids[start:start + AUDIO_FEATURE_BATCH_SIZE])
    # tracks without audio features come back as None; skip them so they do not
    # turn into junk rows/columns (a later left merge leaves their features NaN)
    feature_rows.extend(row for row in (batch or []) if row is not None)

feature_df = pd.DataFrame(feature_rows)

In [None]:
# Sanity check: print the (rows, columns) shape of the audio-feature table, then preview it.
print(feature_df.shape)
feature_df.head()

In [None]:
# Combine the track table with the audio features (matched on track id) and with
# the artist genres (matched on artist id), then discard the columns we do not use.
tracks_feature_df = (
    tracks_df
    .merge(feature_df, how="left", left_on="track_id", right_on="id")
    .merge(artist_df, how="left", left_on='artist_id', right_on="artist_id")
    .drop(['type', 'id', 'track_href', 'analysis_url', 'time_signature'], axis=1)
)


In [None]:
# Derive the release decade: take the 4-digit year at the start of release_date
# and round it down to a multiple of 10 (e.g. 1997 -> 1990).
release_year = tracks_feature_df['release_date'].str.slice(0, 4).astype('int32')
tracks_feature_df['release_decade'] = (release_year // 10 * 10).astype('int32')
                                                                      

In [None]:
# Preview the merged track/feature/genre table.
tracks_feature_df.head()

In [None]:
# Write tracks_feature_df to a csv file inside the Downloads folder,
# creating the folder first if it does not exist yet.
import os

export_dir = 'Downloads'
os.makedirs(export_dir, exist_ok=True)
tracks_feature_df.to_csv('Downloads/tracks_feature_df_phase1.csv')

## Section II:  EDA
The section below can be run by importing the "tracks_feature_df_phase1" data.
In the following section, we will ... 
- remove highly correlated columns
- identify commonly used [genre] terms
- perform k-means clustering (and conduct standard scaler and PCA before running k-means)

In [None]:
# Load the phase-1 export. Section I writes the file into Downloads/, while this
# cell only looked in the working directory, so running the notebook top to bottom
# could not find it. The working directory is still tried first; Downloads/ is
# the fallback.
import os

phase1_csv = 'tracks_feature_df_phase1.csv'
if not os.path.exists(phase1_csv):
    phase1_csv = 'Downloads/tracks_feature_df_phase1.csv'

tracks_feature_df = pd.read_csv(phase1_csv)
# the export kept its index, which is read back as an "Unnamed: 0" column;
# errors='ignore' keeps this working for a file that was saved without the index
tracks_feature_df = tracks_feature_df.drop(columns=['Unnamed: 0'], errors='ignore')
tracks_feature_df.head()

### What are the popular genres?
For the [genres] drop-down list, we wanted to know the most common genres. We used the following two approaches to visualize the most common terms used.

#### Approach 1: Count number of original categories

In [None]:
# Split every row's comma-separated genre string into individual labels and count
# how often each label occurs.
# - Rows without genres are dropped instead of being filled with "NA", so no fake
#   "NA" genre shows up in the counts.
# - Labels are stripped: splitting "pop, dance pop" on "," leaves a leading space,
#   which would make " dance pop" and "dance pop" count as different genres.
# - Empty labels are ignored.
genre_list = tracks_feature_df['genres'].dropna().tolist()

genre_list_cleaned = [
    label.strip()
    for item in genre_list
    for label in item.split(',')
    if label.strip()
]

# plain dict mapping genre label -> number of occurrences
genre_dict = dict(Counter(genre_list_cleaned))


In [None]:
# Visualize how frequent each genre label is with a word cloud.
plt.subplots(figsize=(15, 6))

cloud_options = dict(background_color='white', width=512, height=384)
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(genre_dict)

plt.imshow(wordcloud)  # draw the word cloud
plt.axis('off')        # hide the x and y axes
plt.show()

#### Approach 2: Using NLTK to parse terms into words

In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Download the tokenizer data and the stop-word list.
# Recent NLTK releases load the word tokenizer from the 'punkt_tab' resource rather
# than 'punkt', so both are requested; an NLTK version that does not know a
# resource name is expected to just report it and carry on.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# NOTE: from here on `stopwords` is the set of English stop words,
# no longer the nltk.corpus module imported above.
stopwords = set(stopwords.words('english'))

In [None]:
# Remove some punctuation and fuse "hip hop" into "hiphop" so that it is not
# separated into two terms. Done in a single pass over the labels; the stray
# no-op expression and the three throwaway intermediate lists are gone.
def _normalize_genre(label):
    '''Return label without "-", "&" and "+", and with "hip hop" written as "hiphop".'''
    for char in ("-", "&", "+"):
        label = label.replace(char, "")
    return label.replace("hip hop", "hiphop")

genre_list_cleaned_punkt4 = [_normalize_genre(label) for label in genre_list_cleaned]

In [None]:
# tokenizer function
def tokenize_content(content):
    '''
    Lowercase the input string and split it into tokens with NLTK.
    Return the list of tokens that are not in the `stopwords` collection
    and that consist of alphabetic characters only.
    '''
    lowered = content.lower()
    kept = []
    for token in nltk.word_tokenize(lowered):
        if token in stopwords:
            continue
        if token.isalpha():
            kept.append(token)
    return kept

# Tokenize every cleaned genre label (one token list per label) ...
top_tokens_list = list(map(tokenize_content, genre_list_cleaned_punkt4))
# ... then flatten the per-label lists into a single list of tokens.
top_tokens = []
for token_group in top_tokens_list:
    top_tokens.extend(token_group)


In [None]:
# Count how many times each token shows up.
token_counter = Counter(top_tokens)

# Plot the word cloud from the token counts.
plt.subplots(figsize=(15, 6))
cloud_options = dict(background_color='white', width=512, height=384)
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(token_counter)
plt.imshow(wordcloud)  # draw the word cloud
plt.axis('off')        # hide the x and y axes
plt.show()

## Run correlation matrix to remove highly correlated features

In [None]:
sns.set(style = "white")
# Correlate the numeric columns only. The frame also holds text columns (track
# name, artist name, genres, ...), and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it.
corr = tracks_feature_df.select_dtypes(include='number').corr()

# Set up the matplotlib figure
plt.figure(figsize=(14, 4))

# Draw the annotated heatmap on a fixed [-1, 1] colour scale
sns.heatmap(corr, cmap="RdBu", vmax=1, vmin = -1, annot = True)
plt.title("Correlation Heatmap", fontweight='bold')
plt.show()

Since the [loudness] and [acousticness] features are both highly correlated with [energy], we decided to keep only [energy], which is the more intuitive feature. Also, the [key] field denotes the key of the track (C, C#, D, etc.), which is hard for users to specify, so we decided to remove it as well.

In [None]:
# Drop the redundant / hard-to-use features. Reassigning (rather than inplace=True)
# and ignoring columns that are already gone lets this cell be re-run without a KeyError.
tracks_feature_df = tracks_feature_df.drop(columns=['loudness', 'acousticness', 'key'], errors='ignore')

### Part III: K-Means Clustering

K-Means is sensitive to the scale of its input features, so before running it we need to standardize the features that span a large range of values.

In [None]:
# Summarize the minimum and maximum of every numerical feature side by side.
min_max_df = pd.DataFrame({
    'min': tracks_feature_df.min(axis=0, numeric_only=True),
    'max': tracks_feature_df.max(axis=0, numeric_only=True),
})
min_max_df

In [None]:
# Standardize duration_ms and tempo, the columns with a large range of values;
# the scaled versions are stored as duration_ms_sc and tempo_sc.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(tracks_feature_df[['duration_ms', 'tempo']])
feature_df_sc = pd.DataFrame(scaled_values, columns=['duration_ms_sc', 'tempo_sc'])
feature_df_sc.head()

In [None]:
# Keep only the features used for k-means clustering and attach the two scaled
# columns (joined on the row index).
clustering_columns = [
    'danceability',
    'energy',
    'mode',
    'speechiness',
    'instrumentalness',
    'valence',
]
tracks_feature_df_sc = tracks_feature_df[clustering_columns].join(feature_df_sc)
tracks_feature_df_sc.head()

In [None]:
# Run PCA with 2 components in order to visualize our clusters.
# random_state is fixed so the projection is reproducible between runs
# (depending on the scikit-learn version and the data shape, the default solver
# choice may be a randomized one).
pca = PCA(n_components = 2, random_state = 42) # 2D PCA for the plot
# fit and transform the data
tracks_feature_df_pca = pd.DataFrame(pca.fit_transform(tracks_feature_df_sc))
tracks_feature_df_pca.head()

In [None]:
# initialize the kmeans model; random_state is fixed so that the cluster
# assignments (and the csv exported from them) are reproducible between runs
kmeans = KMeans(n_clusters=6, n_init= 20, random_state=42)
# Fit on the two PCA component columns only. They are always the first two columns
# of the frame, so this cell can still be re-run after other columns have been
# appended to tracks_feature_df_pca.
pca_components = tracks_feature_df_pca.iloc[:, :2]
kmeans = kmeans.fit(pca_components)
# cluster label of every row, as assigned during fitting (no separate predict pass)
labels = kmeans.labels_
# centroid values
centroid = kmeans.cluster_centers_
# cluster labels as a plain list
clusters = labels.tolist()

In [None]:
# Attach the cluster label and the track name to the PCA-ed frame,
# then give the four columns readable names.
tracks_feature_df_pca = tracks_feature_df_pca.assign(
    cluster=clusters,
    track_name=tracks_feature_df['track_name'],
)
tracks_feature_df_pca.columns = ['x', 'y', 'cluster', 'track_name']
tracks_feature_df_pca.head()

In [None]:
# plot the clusters
%matplotlib inline
sns.set(style="white")
ax = sns.lmplot(x="x", y="y", hue='cluster', data = tracks_feature_df_pca, legend=False,
fit_reg=False, height = 5, scatter_kws={"s": 50})

plt.xlabel("Component #1", fontsize = 10)
plt.ylabel("Component #2", fontsize = 10)
plt.show()

In [None]:
# Add the cluster label of every track as a new "cluster" column.
tracks_feature_df = tracks_feature_df.assign(cluster=clusters)
tracks_feature_df.head()

In [None]:
# Save the clustered table as a csv file inside the Downloads folder,
# creating the folder first if it does not exist yet.
import os

output_dir = 'Downloads'
os.makedirs(output_dir, exist_ok=True)
tracks_feature_df.to_csv('Downloads/tracks_feature_df.csv')