In [1]:
# Importing libraries required to manipulate DataFrame and carry out EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Genre labels used in the `genre` column of the .csv file
genres = ['house','techno','dnb','hardstyle','trap']

# Read the raw song data from the .csv file into a Pandas DataFrame
# NOTE: this is a machine-specific absolute path; it must point at a local copy of songs.csv
songs_csv_path = r'C:\Users\Chetan\Desktop\College\Python\EDM Sub-genre classifier\songs.csv'
songs_raw = pd.read_csv(songs_csv_path)

In [3]:
# Preview the first five rows of the raw data
songs_raw.head(n=5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
0,0.786,0.87,2.0,-5.839,1.0,0.0507,0.0493,0.215,0.0863,0.676,126.011,audio_features,7LVEBvmKB5uDTg0CGqDo3p,spotify:track:7LVEBvmKB5uDTg0CGqDo3p,https://api.spotify.com/v1/tracks/7LVEBvmKB5uD...,https://api.spotify.com/v1/audio-analysis/7LVE...,192482.0,4.0,house
1,0.74,0.934,7.0,-3.255,1.0,0.316,0.00822,0.000377,0.116,0.685,124.936,audio_features,0m69QNuZVZDXhGHBGK1mQA,spotify:track:0m69QNuZVZDXhGHBGK1mQA,https://api.spotify.com/v1/tracks/0m69QNuZVZDX...,https://api.spotify.com/v1/audio-analysis/0m69...,154680.0,4.0,house
2,0.889,0.859,11.0,-5.11,1.0,0.205,0.0216,0.106,0.737,0.828,124.958,audio_features,4PfzZIFxK10aHXupOm4qEu,spotify:track:4PfzZIFxK10aHXupOm4qEu,https://api.spotify.com/v1/tracks/4PfzZIFxK10a...,https://api.spotify.com/v1/audio-analysis/4Pfz...,142560.0,4.0,house
3,0.84,0.868,11.0,-6.173,0.0,0.0721,0.0259,0.573,0.0891,0.851,126.001,audio_features,6uABa7vKOgFqXnwvk8NBr5,spotify:track:6uABa7vKOgFqXnwvk8NBr5,https://api.spotify.com/v1/tracks/6uABa7vKOgFq...,https://api.spotify.com/v1/audio-analysis/6uAB...,203810.0,4.0,house
4,0.74,0.989,10.0,-4.193,1.0,0.0469,0.00276,0.956,0.306,0.934,125.001,audio_features,6fcHhf9bix2uLbKHApSkNy,spotify:track:6fcHhf9bix2uLbKHApSkNy,https://api.spotify.com/v1/tracks/6fcHhf9bix2u...,https://api.spotify.com/v1/audio-analysis/6fcH...,142080.0,4.0,house


In [4]:
# Report the (rows, columns) dimensions of the raw DataFrame
raw_shape = songs_raw.shape
print("Shape of the raw database of extracted songs:", raw_shape)

Shape of the raw database of extracted songs: (34496, 19)


In [5]:
# Keep a single row per track URI (the first occurrence is the one retained)
# and renumber the index of the result from 0
songs_pref = songs_raw.drop_duplicates(subset="uri", ignore_index=True)

In [6]:
# Report the (rows, columns) dimensions once duplicate rows are gone
dedup_shape = songs_pref.shape
print("Shape of the database of extracted songs after removing duplicates:", dedup_shape)

Shape of the database of extracted songs after removing duplicates: (24035, 19)


In [7]:
# Count the missing (NaN) entries in each column of the DataFrame
songs_pref.isna().sum()

danceability        1
energy              1
key                 1
loudness            1
mode                1
speechiness         1
acousticness        1
instrumentalness    1
liveness            1
valence             1
tempo               1
type                1
id                  1
uri                 1
track_href          1
analysis_url        1
duration_ms         1
time_signature      1
genre               0
dtype: int64

In [8]:
# Dropping all rows containing NaN values from the DataFrame and checking again.
# The result is reassigned instead of using inplace=True: songs_pref was itself
# derived from songs_raw, and in-place mutation of a derived frame can emit
# SettingWithCopyWarning and makes the cell harder to reason about on re-runs.
songs_pref = songs_pref.dropna(axis = 0)
print("Shape of the database of extracted songs after removing NaN values:",songs_pref.shape)
print("Checking for NaN values again, which should be 0")
print(songs_pref.isnull().sum())

Shape of the database of extracted songs after removing NaN values: (24033, 19)
Checking for NaN values again, which should be 0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
type                0
id                  0
uri                 0
track_href          0
analysis_url        0
duration_ms         0
time_signature      0
genre               0
dtype: int64


In [9]:
# Keep only the songs shorter than 1000 seconds (duration_ms below 1,000,000)
under_max_duration = songs_pref['duration_ms'] < 1000000
songs = songs_pref[under_max_duration]

In [10]:
# Keep only the songs longer than 100 seconds (duration_ms above 100,000)
over_min_duration = songs['duration_ms'] > 100000
songs = songs[over_min_duration]

In [11]:
# Keep only the songs whose energy is strictly greater than 0.3
has_enough_energy = songs['energy'] > .3
songs = songs[has_enough_energy]

In [12]:
# Shape (rows, columns) of the DataFrame after the duration and energy filters
songs.shape

(23978, 19)

In [13]:
# Grab the tempo column; this is taken before the tempo-cleaning step is applied
# NOTE(review): presumably kept for a later comparison/plot of raw tempos - usage not visible here, confirm
songs_tempo = songs['tempo']

In [17]:
# A function to deal with outliers in tempo
# The range of tempos will be limited to range from 100 to 200
def tempo_cleaning(tempo, low=100, high=200):
    """Fold a tempo into the range [low, high] by repeated halving/doubling.

    Tempos above `high` are halved and tempos below `low` are doubled until
    the value lies inside the range. A single halving/doubling is not always
    enough (e.g. 40 -> 80 is still below 100), so the adjustment is repeated.

    Parameters
    ----------
    tempo : int or float
        Tempo value to clean.
    low : int or float, default 100
        Lower bound of the target range. Must be positive.
    high : int or float, default 200
        Upper bound of the target range. Must be at least ``2 * low`` so that
        doubling a value just below `low` cannot overshoot `high`.

    Returns
    -------
    int or float
        The tempo folded into [low, high]. Zero, negative, NaN and infinite
        tempos cannot be folded into the range and are returned unchanged.

    Raises
    ------
    ValueError
        If the bounds do not satisfy ``0 < low`` and ``high >= 2 * low``.
    """
    if low <= 0 or high < 2 * low:
        raise ValueError("bounds must satisfy 0 < low and high >= 2 * low")

    # Doubling/halving can never bring 0, negatives, NaN or inf into range
    # (and would loop forever), so such values are passed through untouched.
    # NaN fails every comparison, so it is caught by this check as well.
    if not (0 < tempo < float("inf")):
        return tempo

    # Halve any tempo over the upper bound until it fits
    while tempo > high:
        tempo /= 2

    # Double any tempo under the lower bound until it fits
    while tempo < low:
        tempo *= 2

    # Tempos already inside the range fall through unchanged
    return tempo

In [18]:
# Adjust the tempos for the modeling data.
# `songs` was produced by boolean-filtering another DataFrame, so writing a
# column into it directly (songs['tempo'] = ...) is a set-on-copy operation
# that triggers SettingWithCopyWarning. `assign` returns a new DataFrame with
# the cleaned tempo column in the same position, avoiding the ambiguity.
songs = songs.assign(tempo=songs['tempo'].map(tempo_cleaning))

In [19]:
# Sanity check: count the songs whose tempo still lies outside 100-200 beats per minute
tempo_col = songs['tempo']
n_below = songs[tempo_col < 100].shape[0]
n_above = songs[tempo_col > 200].shape[0]
print(f"{n_below} songs below 100 beats per minute")
print(f"{n_above} songs above 200 beats per minute")

0 songs below 100 beats per minute
0 songs above 200 beats per minute


In [20]:
# Number of songs per genre in the cleaned DataFrame
songs['genre'].value_counts()

trap         6778
hardstyle    5879
dnb          4135
techno       4022
house        3164
Name: genre, dtype: int64