In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

#### Loading data

In [2]:
# Track-level table; a predicted 'genres' column is attached to it further
# down and the result is written out as final_data.csv.
d1 = pd.read_csv('../dataset/processed_data.csv')
d1.head()

Unnamed: 0.1,Unnamed: 0,id,name,album,artists,artist_ids,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,year
0,22,2SwgVZn9S4NGueAaEAryf1,man on a mission,Do It for Love,['Daryl Hall & John Oates'],['77tT1kLj6mCWtFNqiOmP9H'],1.0,0.787,0.903,0.0,0.819615,1.0,0.032541,0.293173,2.5e-05,0.101,0.962,0.481839,2018
1,23,0QCQ1Isa0YPVyIbs6JwpO1,do it for love,Do It for Love,['Daryl Hall & John Oates'],['77tT1kLj6mCWtFNqiOmP9H'],1.0,0.587,0.958,0.363636,0.815822,1.0,0.060537,0.10743,0.0,0.0574,0.832,0.353411,2018
2,24,3kIBEFhsZOeeKGebxRraOb,someday we'll know,Do It for Love,['Daryl Hall & John Oates'],['77tT1kLj6mCWtFNqiOmP9H'],1.0,0.565,0.781,0.090909,0.816953,0.0,0.031818,0.023394,1e-05,0.0819,0.461,0.441792,2018
3,25,5dNDRw6qjDcnbW3luRhElU,forever for you,Do It for Love,['Daryl Hall & John Oates'],['77tT1kLj6mCWtFNqiOmP9H'],1.0,0.651,0.567,0.818182,0.796963,1.0,0.024793,0.564257,6e-06,0.186,0.37,0.389782,2018
4,26,561UU4MvlsCenN1x7leYCh,life's too short,Do It for Love,['Daryl Hall & John Oates'],['77tT1kLj6mCWtFNqiOmP9H'],1.0,0.833,0.805,0.0,0.824672,1.0,0.035847,0.076305,0.0136,0.0731,0.974,0.466039,2018


In [3]:
# Artist-level audio features with their genre lists.  The literal text '[]'
# (an empty genre list) is read as NaN, so dropna() removes those artists
# along with any other row that has a missing value.  reset_index() keeps the
# previous row labels in a new 'index' column.
d2 = (
    pd.read_csv('../dataset/data_w_genres.csv', na_filter=True, na_values='[]')
    .dropna()
    .reset_index()
)
d2.head()

Unnamed: 0,index,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5,1,9
1,8,"['comedy rock', 'comic', 'parody']","""Weird Al"" Yankovic",0.173145,0.662787,218948.196721,0.695393,5e-05,0.161102,-9.768705,0.084536,133.03118,0.751344,34.229508,9,1,122
2,9,"['emo rap', 'florida rap', 'sad rap', 'undergr...",$NOT,0.544467,0.7898,137910.466667,0.532933,0.023063,0.1803,-9.149267,0.293687,112.3448,0.4807,67.533333,1,1,15
3,10,"['dark trap', 'meme rap']",$atori Zoom,0.239,0.883,141519.0,0.625,0.0,0.0765,-4.098,0.245,126.677,0.871,67.0,6,1,2
4,12,"['asian american hip hop', 'cali rap', 'west c...",$tupid Young,0.1481,0.854,190572.0,0.683,2e-06,0.1885,-6.997,0.221,100.7245,0.6255,57.5,1,1,2


In [4]:
# Size of the artist table after rows with missing values were dropped.
d2.shape

(18823, 17)

#### Categorizing genres into (rock, pop, hip hop, rap, electronic)

In [5]:
# Frequency of each raw genre list (still stored as text at this point).
d2['genres'].value_counts()

genres
['movie tunes']                                                                        69
['show tunes']                                                                         63
['hollywood']                                                                          56
['orchestral performance']                                                             50
['broadway', 'hollywood', 'show tunes']                                                48
                                                                                       ..
['histoire pour enfants', 'musique pour enfants']                                       1
['nordic orchestra']                                                                    1
['avant-garde', 'contemporary classical', 'hungarian contemporary classical']           1
['conscious hip hop', 'dancehall', 'lovers rock', 'modern reggae', 'reggae fusion']     1
['chinese indie', 'chinese indie rock']                                                 1
Nam

In [6]:
# The five coarse genres every artist is mapped onto.
TARGET_GENRES = ('rock', 'pop', 'hip hop', 'rap', 'electronic')


def _coarse_genres(raw):
    """Map a stringified genre list to the set of target genres it mentions.

    ``raw`` looks like "['dark trap', 'meme rap']".  The surrounding brackets
    and the quotes around each entry are stripped, then every entry is checked
    for each target genre as a substring.

    Returns a set of target genre names, or None when nothing matches so that
    a later dropna() can discard the row.  Values that are not strings (for
    example sets produced by a previous run of this cell) are returned
    unchanged, which makes re-running the cell harmless.

    NOTE(review): matching is by substring, so e.g. 'trap' also counts as
    'rap' -- confirm this is intended.
    """
    if not isinstance(raw, str):
        return raw

    found = set()
    for quoted in raw[1:-1].split(', '):
        genre = quoted[1:-1]
        found.update(target for target in TARGET_GENRES if target in genre)
    return found or None


# Replace the whole column in one assignment.  This avoids the hard-coded row
# count and the chained `d2['genres'][i] = ...` writes, which pandas does not
# guarantee to reach the frame (and which are no-ops under Copy-on-Write).
d2['genres'] = d2['genres'].map(_coarse_genres)

In [7]:
# Discard artists that matched none of the target genres (their genres value
# is missing) and renumber the rows; the old row labels are kept as an extra
# column, which is why the column count grows by one.
d2 = d2.dropna().reset_index()
d2.shape

(9616, 18)

In [8]:
# Distribution of the coarse genre combinations assigned to each artist.
d2['genres'].value_counts()

genres
{pop}                               3019
{rock}                              2291
{rock, pop}                         1675
{hip hop, rap}                       612
{rap}                                569
{hip hop, rap, pop}                  369
{hip hop}                            336
{rap, pop}                           250
{electronic}                         123
{rap, rock}                          120
{hip hop, pop}                        56
{electronic, pop}                     41
{electronic, rap}                     36
{electronic, rock, pop}               23
{rap, rock, pop}                      20
{electronic, rap, pop}                18
{hip hop, rap, rock}                  18
{electronic, rock}                    11
{hip hop, rock}                        9
{hip hop, electronic}                  6
{hip hop, rock, pop}                   6
{hip hop, rap, rock, pop}              3
{electronic, rap, hip hop}             2
{electronic, rap, hip hop, pop}        1
{electron

In [9]:
# One row per (artist, genre) pair.  pandas documents the row order produced
# by exploding *sets* as non-deterministic, which would make the seeded
# train/test split below differ between runs, so each set is converted to a
# sorted list first.  Values that are not sets (e.g. when the cell is re-run
# on already exploded strings) are left untouched.
d2 = (
    d2.assign(
        genres=d2['genres'].map(
            lambda g: sorted(g) if isinstance(g, (set, frozenset)) else g
        )
    )
    .explode('genres')
    .reset_index(drop=True)
)
d2.shape

(13361, 18)

#### Normalizing data

In [10]:
# Keep only the genre label and the audio features used for training; the
# two leftover index columns and the unused metadata are removed.
d2 = d2.drop(
    columns=[
        'level_0',
        'index',
        'artists',
        'duration_ms',
        'popularity',
        'count',
    ]
)

In [11]:
def normalize_col(col, df=None):
    """Min-max scale one column of a DataFrame in place to the range [0, 1].

    Parameters
    ----------
    col : label of the column to rescale.
    df : DataFrame to modify; defaults to the notebook-level ``d2``.

    A column whose values are all equal has zero range; it is mapped to 0.0
    instead of dividing by zero (which would turn the whole column into NaN).
    """
    if df is None:
        df = d2

    low = df[col].min()
    high = df[col].max()
    span = high - low
    # Dividing by 1 for a constant column leaves (value - low) == 0 everywhere.
    df[col] = (df[col] - low) / (span if span != 0 else 1)

# Audio features rescaled to [0, 1] using the minimum and maximum observed
# in d2 itself.
col1 = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

for col in col1:
    normalize_col(col)

d2.head()

Unnamed: 0,genres,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode
0,rock,0.173838,0.643095,0.694687,5.1e-05,0.145022,0.703794,0.067237,0.493095,0.758854,0.818182,1.0
1,hip hop,0.546652,0.783348,0.531313,0.023396,0.165066,0.722833,0.296137,0.358006,0.475902,0.090909,1.0
2,rap,0.546652,0.783348,0.531313,0.023396,0.165066,0.722833,0.296137,0.358006,0.475902,0.090909,1.0
3,rap,0.239958,0.886263,0.623898,0.0,0.056692,0.878087,0.242853,0.4516,0.883952,0.545455,1.0
4,hip hop,0.148692,0.85424,0.682224,2e-06,0.173627,0.788984,0.216586,0.282121,0.627287,0.090909,1.0


#### Encoding genres column

In [12]:
# One-hot encode the genre label and pass every other column through
# unchanged.  The encoded genre columns are placed first in the result, which
# is what the column slicing in the split step relies on.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['genres'])],
    remainder='passthrough',
)
d2 = np.array(ct.fit_transform(d2))

In [13]:
# d2 is now a plain NumPy array rather than a DataFrame.
d2

array([[0.        , 0.        , 0.        , ..., 0.75885443, 0.81818182,
        1.        ],
       [0.        , 1.        , 0.        , ..., 0.47590173, 0.09090909,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.47590173, 0.09090909,
        1.        ],
       ...,
       [0.        , 0.        , 1.        , ..., 0.22739153, 0.81818182,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.34134867, 1.        ,
        1.        ],
       [0.        , 0.        , 1.        , ..., 0.15838996, 0.90909091,
        1.        ]])

#### Splitting data

In [14]:
# The first five columns hold the one-hot genre targets; everything after
# them is the feature matrix.
n_genre_cols = 5
y, x = d2[:, :n_genre_cols], d2[:, n_genre_cols:]

In [15]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
# for a given row order.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# (samples, features) of the training split.
x_train.shape

(10688, 11)

In [17]:
# (samples, features) of the held-out split.
x_test.shape

(2673, 11)

In [18]:
# Conv1D expects (samples, steps, channels): add a trailing channel axis of
# size 1.  The sizes are read from the arrays instead of being hard-coded, so
# the cell keeps working when the number of rows changes, and re-running it
# on already reshaped arrays leaves them unchanged.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

#### CNN

In [19]:
# Empty sequential model; the layers are added in the next cell.
cnn = tf.keras.models.Sequential()

In [20]:
# Two convolution + max-pooling stages over the feature axis, followed by a
# dense head.  The final layer has five softmax units, one per genre column.
for layer in (
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPool1D(pool_size=2, strides=2),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPool1D(pool_size=2, strides=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=5, activation='softmax'),
):
    cnn.add(layer)

In [21]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
cnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Train silently for 100 epochs; the held-out split is passed as validation
# data so per-epoch validation metrics are recorded in the returned History.
cnn.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=100,
    verbose=0,
    validation_data=(x_test, y_test),
)

<keras.src.callbacks.History at 0x1517006a0>

#### Cleaning original data

In [23]:
# Work on an independent copy so d1 keeps its identifying columns.
d3 = d1.copy(deep=True)

In [24]:
# Strip identifiers and metadata so that only the audio features remain.
d3 = d3.drop(
    columns=[
        'Unnamed: 0',
        'id',
        'name',
        'album',
        'artists',
        'artist_ids',
        'explicit',
        'year',
    ]
)
d3.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.787,0.903,0.0,0.819615,1.0,0.032541,0.293173,2.5e-05,0.101,0.962,0.481839
1,0.587,0.958,0.363636,0.815822,1.0,0.060537,0.10743,0.0,0.0574,0.832,0.353411
2,0.565,0.781,0.090909,0.816953,0.0,0.031818,0.023394,1e-05,0.0819,0.461,0.441792
3,0.651,0.567,0.818182,0.796963,1.0,0.024793,0.564257,6e-06,0.186,0.37,0.389782
4,0.833,0.805,0.0,0.824672,1.0,0.035847,0.076305,0.0136,0.0731,0.974,0.466039


In [25]:
# Put the track features into the column order the network was trained on
# (the order of the feature columns in d2).  Selecting by name instead of by
# position keeps this correct even if the column layout of d3 changes.
feature_order = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'key',
    'mode',
]
z = d3[feature_order].values

In [26]:
# (tracks, features) of the prediction input.
z.shape

(519817, 11)

In [27]:
# Add the trailing channel axis expected by the Conv1D input.  The sizes are
# taken from the array itself rather than hard-coded, so a different number
# of tracks does not break the cell.
z = z.reshape(z.shape[0], z.shape[1], 1)

In [28]:
# Softmax output of the network for every track: one probability per genre.
z_genres = cnn.predict(z)



In [29]:
# Preview of the predicted probabilities.
z_genres

array([[0.01352447, 0.10170414, 0.6105069 , 0.07244141, 0.20182312],
       [0.00681892, 0.03090661, 0.43791044, 0.04532983, 0.4790342 ],
       [0.00858429, 0.01768096, 0.5000842 , 0.05871371, 0.4149368 ],
       ...,
       [0.71547663, 0.00195933, 0.1936625 , 0.00447567, 0.08442584],
       [0.10102887, 0.03895463, 0.67940116, 0.03186131, 0.14875403],
       [0.27836573, 0.30965957, 0.17982954, 0.21952356, 0.01262155]],
      dtype=float32)

In [30]:
# (tracks, genres) of the prediction matrix.
z_genres.shape

(519817, 5)

In [31]:
# Placeholder column for the predicted genre labels assigned below.
d1['genres'] = None

In [32]:
# Genre name for each column of z_genres, in column order.
GENRE_NAMES = ['electronic', 'hip hop', 'pop', 'rap', 'rock']
# A genre is assigned when its predicted probability exceeds this value.
GENRE_THRESHOLD = 0.6


def _labels_from_probs(probs):
    """Turn one row of predicted probabilities into a genre label string.

    Every genre whose probability exceeds GENRE_THRESHOLD is selected; when
    none does, the single most probable genre is used instead.  Selected names
    are joined with ', '.

    NOTE(review): the model ends in a softmax, so at most one probability can
    exceed 0.6 and each track ends up with exactly one label -- confirm that
    multi-label output is not expected here.
    """
    chosen = [
        name for name, p in zip(GENRE_NAMES, probs) if p > GENRE_THRESHOLD
    ]
    if not chosen:
        chosen = [GENRE_NAMES[np.argmax(probs)]]
    return ', '.join(chosen)


# Assign the whole column at once.  This removes the hard-coded row count and
# the chained `d1['genres'][i] = ...` writes, which are slow and are not
# guaranteed to reach the frame (they are no-ops under Copy-on-Write).  pandas
# raises if the number of predictions does not match the number of rows.
d1['genres'] = [_labels_from_probs(row) for row in z_genres]

In [33]:
# How many tracks received each predicted genre label.
d1['genres'].value_counts()

genres
pop           288244
rock          129334
electronic     38471
rap            37609
hip hop        26159
Name: count, dtype: int64

In [34]:
# Remove the leftover index column carried over from the input CSV.  Dropping
# it by name (instead of slicing off the first column by position) means that
# re-running this cell cannot strip a real data column.
d1 = d1.drop(columns=['Unnamed: 0'], errors='ignore')

In [35]:
# Persist the labelled tracks.  The DataFrame index is written as well
# (pandas default), so the file starts with an unnamed index column.
d1.to_csv('../dataset/final_data.csv')