### Pre-installs

In [24]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install sklearn
# %pip install -U scikit-learn
# %pip install spotipy
# %pip install lyricsgenius
# %pip install langdetect

### Cleaning Song Dataset

In [25]:
import pandas as pd

# Importing the dataset
dataset = pd.read_csv("data/tracks.csv")

# Creating the dataframe
df = pd.DataFrame(dataset)

df.tail()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
586667,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,['阿YueYue'],['1QLBXKM5GCpyQQSVMNZqrZ'],2020-09-26,0.56,0.518,0,-7.471,0,0.0292,0.785,0.0,0.0648,0.211,131.896,4
586668,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020-10-21,0.765,0.663,0,-5.223,1,0.0652,0.141,0.000297,0.0924,0.686,150.091,4
586669,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,['FINNEAS'],['37M5pPGs6V1fchFJSgCguX'],2020-09-02,0.535,0.314,7,-12.823,0,0.0408,0.895,0.00015,0.0874,0.0663,145.095,4
586670,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","['4jGPdu95icCKVF31CcFKbS', '5ebPSE9YI5aLeZ1Z2g...",2021-03-05,0.696,0.615,10,-6.212,1,0.0345,0.206,3e-06,0.305,0.438,90.029,4
586671,5Ocn6dZ3BJFPWh4ylwFXtn,Mar de Emociones,38,214360,0,['Afrosound'],['0i4Qda0k4nf7jnNHmSNpYv'],2015-07-01,0.686,0.723,6,-7.067,1,0.0363,0.105,0.0,0.264,0.975,112.204,4


In [26]:
# Spotify API Authentication Information
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
client_id = # CLIENT ID GOES HERE
client_secret = # CLIENT SECRET GOES HERE
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [27]:
# Creating a new feature that comprises of Song Name and Artist.
df["song_name_artist"] = df["name"] + df["artists"]

In [28]:
df['year'] = df['release_date'].str.extract(r'(\d{4})').astype(int)

In [29]:
# Removing all duplicate songs-artist pairs. Meaning that there won't be 2 of the same song sung by the same artist.
print(df.shape)
df.drop_duplicates(subset=["song_name_artist"], keep='first', inplace=True)
print(df.shape)

(586672, 22)
(526610, 22)


In [30]:
# Filter data to keep songs published 2000* and later.
df = df[df["year"] >= 1980]
print(df.shape)

(360785, 22)


In [31]:
df_fill_null = df.copy()

df_fill_null['name'] = df.apply(
    lambda row: sp.track(f"spotify:track:{row['id']}")['name'] if pd.isnull(row['name']) else row['name'], axis=1
)

In [32]:
# Removing rows without song name.
df_removed = df_fill_null.dropna()
df_removed.isnull().sum()
df_removed.count()
# df_removed.head()

id                  360784
name                360784
popularity          360784
duration_ms         360784
explicit            360784
artists             360784
id_artists          360784
release_date        360784
danceability        360784
energy              360784
key                 360784
loudness            360784
mode                360784
speechiness         360784
acousticness        360784
instrumentalness    360784
liveness            360784
valence             360784
tempo               360784
time_signature      360784
song_name_artist    360784
year                360784
dtype: int64

### Adding genres into dataset

In [33]:
artist_dataset = pd.read_csv("data/artists.csv")

artist_dataset['first_id_artists'] = artist_dataset['id']
needed = artist_dataset[["first_id_artists", "genres"]]

needed.head()

Unnamed: 0,first_id_artists,genres
0,0DheY5irMjBUeLybbCUEZ2,[]
1,0DlhY15l3wsrnlfGio2bjU,[]
2,0DmRESX2JknGPQyO15yxg7,[]
3,0DmhnbHjm1qw6NCYPeZNgJ,[]
4,0Dn11fWM7vHQ3rinvWEl4E,[]


In [34]:
import ast

df_removed['artists'] = df_removed['artists'].apply(ast.literal_eval)
df_removed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_removed['artists'] = df_removed['artists'].apply(ast.literal_eval)


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,song_name_artist,year
39501,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,1,0.0322,0.394,0.0,0.149,0.285,113.564,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008
39511,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,[The Toys],['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,1,0.0571,0.436,0.0,0.139,0.839,120.689,4,A Lover's Concerto['The Toys'],2020
39517,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,0,0.0289,0.255,5e-06,0.163,0.588,104.536,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008
39521,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,1,0.03,0.406,0.0,0.122,0.478,106.773,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008
39529,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,[Frank Sinatra],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,...,1,0.0623,0.887,0.0,0.904,0.239,117.153,3,The September Of My Years - Live At The Sands ...,2018


In [35]:
import matplotlib.pyplot as plt

def count_items_in_list(lst):
    return len(lst)

# Apply the function to the DataFrame column
df_removed['artists_count'] = df_removed['artists'].apply(count_items_in_list)

# ax = df_removed['artists_count'].plot(kind='hist', bins=100, edgecolor='black')

# bin_labels = [f'{int(b)}-{int(b)+1}' for b in ax.get_xticks()]
# plt.xticks(ax.get_xticks(), bin_labels)

# # Add number labels on the bars
# for i in ax.patches:
#     plt.text(i.get_x() + i.get_width() / 2, i.get_height() + 0.1,
#              str(int(i.get_height())), ha='center', va='bottom')

# plt.title('Histogram of Values')
# plt.xlabel('Number of Artists')
# plt.ylabel('Frequency')
# plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_removed['artists_count'] = df_removed['artists'].apply(count_items_in_list)


In [36]:
df_removed['artists_count'].describe()

# Giving a generous estimate of 3 Std away from mean as outlier:
mean = df_removed['artists_count'].mean()
std = df_removed['artists_count'].std()

upper = mean + (3 * std)
lower = mean - (3 * std)
print(upper.round())
print(lower.round())

4.0
-1.0


In [37]:
print(df_removed['id'].count())
df_removed = df_removed[(df_removed['artists_count'] >= -1) & (df_removed['artists_count'] <= 4)]

# ax = df_removed['artists_count'].plot(kind='hist', bins=100, edgecolor='black')

# # Add number labels on the bars
# for i in ax.patches:
#     plt.text(i.get_x() + i.get_width() / 2, i.get_height() + 0.1,
#              str(int(i.get_height())), ha='center', va='bottom')

# plt.title('Histogram of Values')
# plt.xlabel('Number of Artists')
# plt.ylabel('Frequency')
# plt.show()
# print(df_removed['id'].count())

360784


In [38]:
expanded_artists = df_removed['artists'].apply(lambda x: pd.Series(x))

# Rename the columns
expanded_artists.columns = [f"artist_{i+1}" for i in range(len(expanded_artists.columns))]

# Concatenate the expanded columns with the original DataFrame
df_merged = pd.concat([df_removed, expanded_artists], axis=1)
df_merged.head()


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,valence,tempo,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4
39501,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,0.285,113.564,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,
39511,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,[The Toys],['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,0.839,120.689,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,
39517,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,0.588,104.536,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,
39521,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,0.478,106.773,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,
39529,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,[Frank Sinatra],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,...,0.239,117.153,3,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,


In [39]:
artist_dataset['artists'] = artist_dataset['name']
artist_genres = artist_dataset[["artists", "genres"]]

df_merged = df_merged.drop('artists', axis=1)
df_merged['artist_1'] = df_merged['artist_1'].fillna('')
df_merged['artist_2'] = df_merged['artist_2'].fillna('')
df_merged['artist_3'] = df_merged['artist_3'].fillna('')
df_merged['artist_4'] = df_merged['artist_4'].fillna('')
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,valence,tempo,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4
39501,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,0.285,113.564,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,
39511,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,0.839,120.689,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,
39517,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,0.588,104.536,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,
39521,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,0.478,106.773,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,
39529,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,...,0.239,117.153,3,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,


In [40]:
def string_to_list(s):
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return []

for i in range(1, 5):  # Iterate over artist_1, artist_2, and artist_3
    # Merge genres for each artist column
    df_merged = pd.merge(df_merged, artist_genres, left_on=f'artist_{i}', right_on='artists', how='left')
    # Rename the genres column
    df_merged.rename(columns={'genres': f'genres_{i}'}, inplace=True)
    # Drop the redundant 'artists' column
    df_merged.drop('artists', axis=1, inplace=True)
    
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres_1,genres_2,genres_3,genres_4
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'brill building pop', 'bri...",,,
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,[],,,
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,['thai pop'],,,
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'brill building pop', 'bri...",,,
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'brill building pop', 'bri...",,,


In [41]:
df_merged['genres_1'] = df_merged['genres_1'].fillna('[]')
df_merged['genres_2'] = df_merged['genres_2'].fillna('[]')
df_merged['genres_3'] = df_merged['genres_3'].fillna('[]')
df_merged['genres_4'] = df_merged['genres_4'].fillna('[]')

df_merged['genres_1'] = df_merged['genres_1'].apply(ast.literal_eval)
df_merged['genres_2'] = df_merged['genres_2'].apply(ast.literal_eval)
df_merged['genres_3'] = df_merged['genres_3'].apply(ast.literal_eval)
df_merged['genres_4'] = df_merged['genres_4'].apply(ast.literal_eval)
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres_1,genres_2,genres_3,genres_4
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,2008,1,Gerry & The Pacemakers,,,,"[adult standards, brill building pop, british ...",[],[],[]
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,[],[],[],[]
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,[thai pop],[],[],[]
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,2008,1,Gerry & The Pacemakers,,,,"[adult standards, brill building pop, british ...",[],[],[]
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,2008,1,Gerry & The Pacemakers,,,,"[adult standards, brill building pop, british ...",[],[],[]


In [42]:
df_merged['genres'] = df_merged['genres_1'] + df_merged['genres_2'] + df_merged['genres_3'] + df_merged['genres_4']

# Apply the function to each row to create a new column containing concatenated lists
df_merged.drop(['genres_1', 'genres_2', 'genres_3', 'genres_4'], axis=1, inplace=True)

In [43]:
df_merged['genres'] = df_merged['genres'].apply(lambda x: list(set(x)))
df_merged.iloc[4789]['genres']

[]

In [44]:
df_merged.iloc[9000:9005].head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,tempo,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres
9000,0q4xDKMq99VOq47FjICiGg,Kookaburra Sits In The Old Gum Tree,41,55800,0,['0gkeMf1I9r5U5Hne19vr9A'],1987-01-01,0.823,0.047,3,...,137.829,4,Kookaburra Sits In The Old Gum Tree['Play Scho...,1987,1,Play School,,,,"[preschool children's music, australian childr..."
9001,0FNWkaLLOL5mPJom4O0ICu,Rubber Ring - 2011 Remaster,41,228293,0,['3yY2gUcIsjMr8hjo51PoJ8'],1987,0.535,0.721,4,...,137.621,4,Rubber Ring - 2011 Remaster['The Smiths'],1987,1,The Smiths,,,,"[new wave, madchester, uk post-punk, permanent..."
9002,0FNWkaLLOL5mPJom4O0ICu,Rubber Ring - 2011 Remaster,41,228293,0,['3yY2gUcIsjMr8hjo51PoJ8'],1987,0.535,0.721,4,...,137.621,4,Rubber Ring - 2011 Remaster['The Smiths'],1987,1,The Smiths,,,,[]
9003,1lMNcFJjUcrhRSEMzeDZqQ,4th of July,41,247293,0,['54NqjhP2rT524Mi2GicG4K'],1987,0.44,0.815,7,...,139.104,4,4th of July['X'],1987,1,X,,,,"[hardcore punk, new wave, punk, punk blues, al..."
9004,1lMNcFJjUcrhRSEMzeDZqQ,4th of July,41,247293,0,['54NqjhP2rT524Mi2GicG4K'],1987,0.44,0.815,7,...,139.104,4,4th of July['X'],1987,1,X,,,,[]


In [45]:
# Exporting dataset to new .csv file.
df_merged.to_csv('data/tracks_with_genres.csv', index=False)

### LanguageDetect

In [46]:
# input
song_name = '夜曲'
artist_name = '周杰倫'

In [None]:
import lyricsgenius
from langdetect import detect
from langdetect import detect_langs

token = 'GXFX5feHNS0gJDSb9scRbZr-M3dBGVL_SWcKvd5kcCRSHmvRRXQngVU3Cq5Hl0Bw'
genius = lyricsgenius.Genius(token)
song = genius.search_song(song_name, artist_name)

# print(song.lyrics)
print(song.lyrics)
print(song.lyrics.split('Lyrics')[1])
print(detect(song.lyrics.split('Lyrics')[1]))
print(detect_langs(song.lyrics.split('Lyrics')[1]))