### Pre-installs

In [10]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install seaborn
# %pip install sklearn
# %pip install -U scikit-learn
# %pip install spotipy
# %pip install lyricsgenius
# %pip install langdetect

### Cleaning Song Dataset

In [1]:
import pandas as pd

# Importing the dataset
dataset = pd.read_csv("data/tracks.csv")

# Creating the dataframe
df = pd.DataFrame(dataset)

df.tail()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
586667,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,['阿YueYue'],['1QLBXKM5GCpyQQSVMNZqrZ'],2020-09-26,0.56,0.518,0,-7.471,0,0.0292,0.785,0.0,0.0648,0.211,131.896,4
586668,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],['1dy5WNgIKQU6ezkpZs4y8z'],2020-10-21,0.765,0.663,0,-5.223,1,0.0652,0.141,0.000297,0.0924,0.686,150.091,4
586669,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,['FINNEAS'],['37M5pPGs6V1fchFJSgCguX'],2020-09-02,0.535,0.314,7,-12.823,0,0.0408,0.895,0.00015,0.0874,0.0663,145.095,4
586670,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","['4jGPdu95icCKVF31CcFKbS', '5ebPSE9YI5aLeZ1Z2g...",2021-03-05,0.696,0.615,10,-6.212,1,0.0345,0.206,3e-06,0.305,0.438,90.029,4
586671,5Ocn6dZ3BJFPWh4ylwFXtn,Mar de Emociones,38,214360,0,['Afrosound'],['0i4Qda0k4nf7jnNHmSNpYv'],2015-07-01,0.686,0.723,6,-7.067,1,0.0363,0.105,0.0,0.264,0.975,112.204,4


In [2]:
# Spotify API Authentication Information
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
client_id = # CLIENT ID GOES HERE
client_secret = # CLIENT SECRET GOES HERE
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Creating a new feature that comprises of Song Name and Artist.
df["song_name_artist"] = df["name"] + df["artists"]

In [4]:
df['year'] = df['release_date'].str.extract(r'(\d{4})').astype(int)

In [5]:
# Removing all duplicate songs-artist pairs. Meaning that there won't be 2 of the same song sung by the same artist.
print(df.shape)
df.drop_duplicates(subset=["song_name_artist"], keep='first', inplace=True)
print(df.shape)

(586672, 22)
(526610, 22)


In [6]:
# Filter data to keep songs published 2000* and later.
df = df[df["year"] >= 2000]
print(df.shape)

(187472, 22)


In [7]:
# Fill in missing song names with Spotify API.
df_fill_null = df.copy()

df_fill_null['name'] = df.apply(
    lambda row: sp.track(f"spotify:track:{row['id']}")['name'] if pd.isnull(row['name']) else row['name'], axis=1
)

In [8]:
# Removing rows without song name.
df_removed = df_fill_null.dropna()
df_removed.isnull().sum()
df_removed.count()
# df_removed.head()

id                  187472
name                187472
popularity          187472
duration_ms         187472
explicit            187472
artists             187472
id_artists          187472
release_date        187472
danceability        187472
energy              187472
key                 187472
loudness            187472
mode                187472
speechiness         187472
acousticness        187472
instrumentalness    187472
liveness            187472
valence             187472
tempo               187472
time_signature      187472
song_name_artist    187472
year                187472
dtype: int64

### Adding genres into dataset

In [9]:
# Importing the artist and their genre(s) dataset.
artist_dataset = pd.read_csv("data/artists.csv")

artist_dataset['first_id_artists'] = artist_dataset['id']
needed = artist_dataset[["first_id_artists", "genres"]]

needed.head()

Unnamed: 0,first_id_artists,genres
0,0DheY5irMjBUeLybbCUEZ2,[]
1,0DlhY15l3wsrnlfGio2bjU,[]
2,0DmRESX2JknGPQyO15yxg7,[]
3,0DmhnbHjm1qw6NCYPeZNgJ,[]
4,0Dn11fWM7vHQ3rinvWEl4E,[]


In [10]:
import ast

# Converting the string array into a regular array.
df_removed['artists'] = df_removed['artists'].apply(ast.literal_eval)
df_removed.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,song_name_artist,year
39501,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,1,0.0322,0.394,0.0,0.149,0.285,113.564,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008
39511,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,[The Toys],['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,1,0.0571,0.436,0.0,0.139,0.839,120.689,4,A Lover's Concerto['The Toys'],2020
39517,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,0,0.0289,0.255,5e-06,0.163,0.588,104.536,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008
39521,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,1,0.03,0.406,0.0,0.122,0.478,106.773,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008
39529,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,[Frank Sinatra],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,...,1,0.0623,0.887,0.0,0.904,0.239,117.153,3,The September Of My Years - Live At The Sands ...,2018


In [11]:
# Counting how many genres each song has.
def count_items_in_list(lst):
    return len(lst)

# Apply the function to the DataFrame column
df_removed['artists_count'] = df_removed['artists'].apply(count_items_in_list)

In [12]:
# Finding the Outlier Range of artists per song.
df_removed['artists_count'].describe()

# Giving a generous estimate of 3 Std away from mean as outlier:
mean = df_removed['artists_count'].mean()
std = df_removed['artists_count'].std()

upper = mean + (3 * std)
lower = mean - (3 * std)
print(upper.round())
print(lower.round())

4.0
-1.0


In [13]:
# Removing all outliers.
print(df_removed['id'].count())
df_removed = df_removed[(df_removed['artists_count'] >= -1) & (df_removed['artists_count'] <= 4)]
print(df_removed['id'].count())

187472


In [14]:
# Extracting the artists from their array.
expanded_artists = df_removed['artists'].apply(lambda x: pd.Series(x))

# Rename the columns
expanded_artists.columns = [f"artist_{i+1}" for i in range(len(expanded_artists.columns))]

# Concatenate the expanded columns with the original DataFrame
df_merged = pd.concat([df_removed, expanded_artists], axis=1)
df_merged.head()


Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,valence,tempo,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4
39501,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,...,0.285,113.564,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,
39511,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,[The Toys],['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,...,0.839,120.689,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,
39517,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,...,0.588,104.536,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,
39521,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,[Gerry & The Pacemakers],['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,...,0.478,106.773,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,
39529,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,[Frank Sinatra],['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,...,0.239,117.153,3,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,


In [15]:
# Filling in any missing values with blank spaces to prevent NaN errors and type misalignment.
artist_dataset['artists'] = artist_dataset['name']
artist_genres = artist_dataset[["artists", "genres"]]

df_merged = df_merged.drop('artists', axis=1)
df_merged['artist_1'] = df_merged['artist_1'].fillna('')
df_merged['artist_2'] = df_merged['artist_2'].fillna('')
df_merged['artist_3'] = df_merged['artist_3'].fillna('')
df_merged['artist_4'] = df_merged['artist_4'].fillna('')
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,valence,tempo,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4
39501,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,0.285,113.564,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,
39511,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,0.839,120.689,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,
39517,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,0.588,104.536,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,
39521,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,0.478,106.773,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,
39529,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,...,0.239,117.153,3,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,


In [16]:
# Obtaining genre data for each artist per song through joining this dataset with the artist dataset called earlier.
def string_to_list(s):
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return []

for i in range(1, 5):  # Iterate over artist_1, artist_2, and artist_3
    # Merge genres for each artist column
    df_merged = pd.merge(df_merged, artist_genres, left_on=f'artist_{i}', right_on='artists', how='left')
    # Rename the genres column
    df_merged.rename(columns={'genres': f'genres_{i}'}, inplace=True)
    # Drop the redundant 'artists' column
    df_merged.drop('artists', axis=1, inplace=True)
    
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres_1,genres_2,genres_3,genres_4
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'brill building pop', 'bri...",,,
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,[],,,
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,['thai pop'],,,
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'brill building pop', 'bri...",,,
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'brill building pop', 'bri...",,,


In [17]:
# Filling in any missing values with blank spaces to prevent NaN errors and type misalignment.
df_merged['genres_1'] = df_merged['genres_1'].fillna('[]')
df_merged['genres_2'] = df_merged['genres_2'].fillna('[]')
df_merged['genres_3'] = df_merged['genres_3'].fillna('[]')
df_merged['genres_4'] = df_merged['genres_4'].fillna('[]')

# Convering string array into regular array.
df_merged['genres_1'] = df_merged['genres_1'].apply(ast.literal_eval)
df_merged['genres_2'] = df_merged['genres_2'].apply(ast.literal_eval)
df_merged['genres_3'] = df_merged['genres_3'].apply(ast.literal_eval)
df_merged['genres_4'] = df_merged['genres_4'].apply(ast.literal_eval)
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres_1,genres_2,genres_3,genres_4
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,2008,1,Gerry & The Pacemakers,,,,"[adult standards, brill building pop, british ...",[],[],[]
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,[],[],[],[]
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,2020,1,The Toys,,,,[thai pop],[],[],[]
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,2008,1,Gerry & The Pacemakers,,,,"[adult standards, brill building pop, british ...",[],[],[]
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,2008,1,Gerry & The Pacemakers,,,,"[adult standards, brill building pop, british ...",[],[],[]


In [18]:
# Compiling all artist(s) genres per song into one singluar array.
df_merged['genres'] = df_merged['genres_1'] + df_merged['genres_2'] + df_merged['genres_3'] + df_merged['genres_4']

# Dropping the individual artist genre(s) columns.
df_merged.drop(['genres_1', 'genres_2', 'genres_3', 'genres_4'], axis=1, inplace=True)

In [19]:
# Making the genre(s) list's unique. Meaning that there is one of each value.
df_merged['genres'] = df_merged['genres'].apply(lambda x: list(set(x)))
df_merged.iloc[4789]['genres']

['edm',
 'trance',
 'pop dance',
 'progressive trance',
 'progressive house',
 'uplifting trance']

In [20]:
# Exporting dataset to new .csv file.
df_merged.to_csv('data/tracks_with_genres.csv', index=False)

### Spotify Lyrics Scraper
Do not run this. This is just to slow how it works. Download tracks_with_genres_&_language.csv and use it for the recommender model instead.

In [2]:
import pandas as pd

# Reading new dataset.
df_merged = pd.read_csv("data/tracks_with_genres.csv")
df_merged = pd.DataFrame(df_merged)
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,tempo,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,113.564,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe..."
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,120.689,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,,[]
2,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,120.689,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,,['thai pop']
3,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,104.536,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe..."
4,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,106.773,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe..."


In [3]:
# Checking earliest year.
df_merged['year'].min()

2000

In [5]:
# Removing any duplicate rows due to data manupilation and extraction.
print(df_merged.shape)
df_merged.drop_duplicates(subset=["song_name_artist"], keep='first', inplace=True)
print(df_merged.shape)

(228029, 27)
(186011, 27)


In [8]:
# %pip install spotify-lyrics-scraper
import spotify_lyrics_scraper as spotify
import threading

# Using Spotify Lyrics Scraper to obtain song language based on lyrics.
# Follow instructions for this python library to obtain the following keys.
sp_dc = # SP_DC GOES HERE
sp_key = # SP_KEY GOES HERE

token = spotify.getToken(sp_dc, sp_key)

def get_lyrics_and_update_dataframe(token, song_name, dataframe, index):
    # Get the lyrics for the given song name
    lyrics_data = spotify.getLyrics(token, songName=song_name)
    if lyrics_data['status'] == False:
        language = "null"
    else:
        # Extract the language from the lyrics data
        language = lyrics_data['message']['lyrics']['language']
    
    # Update the dataframe with the language information
    print(index, language)
    dataframe.at[index, 'language'] = language

# Multi-threading to expidite the process. Still took around 140 minutes.
def process_songs_parallel(token, dataframe):
    threads = []
    for index, row in dataframe.iterrows():
        song_name = row['name']
        thread = threading.Thread(target=get_lyrics_and_update_dataframe, args=(token, song_name, dataframe, index))
        threads.append(thread)
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()
        
# ONLY UNCOMMENT THIS IF YOU WANT TO RUN THE CODE FOR ABOUY 180 MINS.
# process_songs_parallel(token, df_merged)
df_merged['language'].head()

13 null
3 null
0 null
26 null
32 null
10 null
41 null
23 null
49 null
52 null
4 en
14 null
67 null
66 null
5 en
15 null
31 null
11 en
6 en
30 null
12 en
34 en
17 en
46 null
43 en
21 en
40 ja
75 null
19 en
24 en
8 en
7 en
16 en
39 en
29 en
38 null
37 en
18 en
47 null
44 en
27 en
22 en
51 en
45 en
1 en
20 null
54 en
36 en
80 null
76 null
42 en
55 en
48 en
57 en
35 en
50 en
53 en
69 null
28 en
63 en
64 en
62 en
58 en
68 en
72 en
25 en
65 en
59 null
94 null
71 en
73 en
60 en
70 en
82 null
9 en
56 en
61 null
96 null
74 null
97 null
101 null
77 ja
100 null
104 null
78 null
105 null
112 null
81 null
83 en
88 en
84 en
87 en
92 en
85 en
79 en
95 en
86 en
89 en
93 en
91 en
103 en
113 en
9890 en
 null
108 en
109 en
117 null
142 null
99 en
140 null
120 null
106 null
102 en
143 null
110 null
121 en
119111 null
 null
116 null
141 null
115 en
114 en
118 null
107 null
130 null
129 null
125 null
135 null
134 en
122 null
138 en
124 en
128 null
139 null
126 null
123 null
137 en
148 en
151132 null
 en
147

0    null
1      en
3    null
4      en
5      en
Name: language, dtype: object

In [74]:
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres,language
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,,[],en
2,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",
3,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en
4,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,...,3,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,,"['easy listening', 'adult standards', 'lounge']",en


In [75]:
from langdetect import detect

# Replacing all NaN values with string 'null' for future use and type mislignment prevention.
df_merged['language'] = df_merged['language'].fillna('null')

In [103]:
df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres,language
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,,[],en
2,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",
3,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en
4,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,...,3,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,,"['easy listening', 'adult standards', 'lounge']",en


In [113]:
df_merged.index

RangeIndex(start=0, stop=186011, step=1)

In [121]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Many songs do not have lyrics enabled by spotify. So the alternative to that is to use their name as a benchmark for the language.
# Uses language detect library to identify the song name's expected language. Not exactly accurate but it is an acceptable alternative for now until NLP.
def replace_nan_with_language(row):
    if pd.isna(row['language']):  # Check if the value is NaN
        name = row['name']
        if isinstance(name, str) and len(name) > 0:  # Check if name is a non-empty string
            try:
                return detect(name)  # Use detect() output as the value if it's NaN
            except LangDetectException:
                pass  # Handle LangDetectException, e.g., return 'unknown' or None
        # Return None or any other default value if language detection fails or input text is empty
        return None  
    else:
        return row['language']  # Otherwise, keep the original value

df_merged['language'] = df_merged.apply(replace_nan_with_language, axis=1)

df_merged.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,time_signature,song_name_artist,year,artists_count,artist_1,artist_2,artist_3,artist_4,genres,language
0,6catF1lDhNTjjGa2GxRQNN,You'll Never Walk Alone - Mono; 2002 Remaster,56,160187,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.484,0.265,0,...,3,You'll Never Walk Alone - Mono; 2002 Remaster[...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en
1,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,['6lH5PpuiMa5SpfjoIOlwCS'],2020-03-13,0.671,0.867,2,...,4,A Lover's Concerto['The Toys'],2020,1,The Toys,,,,[],en
2,4aSw1QJIMwYSoDEgzgdCJL,Ferry Cross the Mersey - Mono; 2002 Remaster,40,141987,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.405,0.365,6,...,4,Ferry Cross the Mersey - Mono; 2002 Remaster['...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en
3,0ZMMtH875IR2TfkyC4PolD,Don't Let the Sun Catch You Crying (Main) - Mono,34,157093,0,['3UmBeGyNwr4iDWi1vTxWi8'],2008-02-11,0.477,0.352,1,...,4,Don't Let the Sun Catch You Crying (Main) - Mo...,2008,1,Gerry & The Pacemakers,,,,"['adult standards', 'bubblegum pop', 'merseybe...",en
4,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,['1Mxqyy3pSjf8kZZL4QVxS0'],2018-05-04,0.319,0.201,7,...,3,The September Of My Years - Live At The Sands ...,2018,1,Frank Sinatra,,,,"['easy listening', 'adult standards', 'lounge']",en


In [122]:
# Checking if there are any 'null' or None.
(df_merged['language'] == 'null').sum()

0

In [138]:
(df_merged['language'] == None).sum()

0

In [144]:
# Exporting dataset to new .csv file.
df_merged.to_csv('data/tracks_with_genres_&_language.csv', index=False)

### LanguageDetect

In [None]:
# # input
# song_name = '夜曲'
# artist_name = '周杰倫'

In [None]:
# import lyricsgenius
# from langdetect import detect
# from langdetect import detect_langs

# token = 'GXFX5feHNS0gJDSb9scRbZr-M3dBGVL_SWcKvd5kcCRSHmvRRXQngVU3Cq5Hl0Bw'
# genius = lyricsgenius.Genius(token)
# song = genius.search_song(song_name, artist_name)

# # print(song.lyrics)
# print(song.lyrics)
# print(song.lyrics.split('Lyrics')[1])
# print(detect(song.lyrics.split('Lyrics')[1]))
# print(detect_langs(song.lyrics.split('Lyrics')[1]))