In [1]:
# Import inmportant library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Load train and test data
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [3]:
# read the top five rows of train data
train.head()

Unnamed: 0,instance_id,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,voice_gender,mode,speechiness,tempo,musician_category,valence,music_genre
0,MSC_83537.0,Estrellitas y Duendes,49.178,0.970522,0.580508,214625.776,0.192107,0.0,Scale E,0.147134,-14.14,,Major,0.046404,143.78799999999998,Band,0.598965,Jazz
1,MSC_22044.0,Al Norte,59.827,1.00938,0.687542,216232.195,0.265942,3.1e-05,Scale A,0.174655,-13.716,Male,?,0.039949,?,Band,0.357194,Jazz
2,MSC_62017.0,Yeah! (feat. Lil Jon & Ludacris),89.023,0.020041,0.967948,273314.723,0.857411,0.0,Scale D,0.04203,-4.995,Female,Major,0.119917,105.01799999999999,Band,0.635525,Rap
3,MSC_76365.0,Can’t You See,55.762,0.010284,0.616287,189189.605,0.970513,0.00022,Scale D,0.124637,-4.262,Both,Major,0.167493,?,Duet,0.98375,Rock
4,MSC_71493.0,"Sonata III (G Moll), BWV 1029: Adagio",45.095,0.858769,0.280645,410136.987,0.114732,0.002753,Scale F Sharp,0.134782,-26.922,Female,Minor,0.039139,112.18299999999999,Band,0.074412,Classical


In [4]:
# checking the shape od data 
print(train.shape)
print(test.shape)

(15681, 18)
(3921, 17)


In [5]:
# data info
train.describe()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,valence
count,15039.0,15681.0,15125.0,15587.0,15587.0,15586.0,15681.0,15645.0,15655.0,15004.0
mean,50.82272,0.446922,0.570106,248000.8,0.426977,0.26429,0.194089,-11.621471,0.096411,0.4514
std,17.679789,0.416981,0.215045,156831.2,0.514145,0.395788,0.160291,8.663364,0.105647,0.27722
min,0.0,2e-06,0.063962,-1.0,-1.0,0.0,0.016652,-50.054,0.023858,0.0
25%,39.017,0.044521,0.418635,186016.3,0.158277,0.0,0.101674,-15.504,0.039549,0.212242
50%,54.109,0.295272,0.586114,236212.7,0.560389,0.000719,0.128784,-8.611,0.050425,0.432478
75%,63.449,0.929001,0.733073,297559.3,0.791881,0.63893,0.228973,-5.715,0.09423,0.665369
max,108.512,1.09441,1.072089,3410383.0,1.094853,1.078601,1.080959,1.389,1.000445,1.076971


In [6]:
# Checking unique values in train data
train.nunique()

instance_id          15681
track_name           14799
popularity           12743
acousticness         15681
danceability         15125
duration_ms          14011
energy               14371
instrumentalness     11219
key                     12
liveness             15681
loudness             10473
voice_gender             3
mode                     3
speechiness          15655
tempo                12332
musician_category        3
valence              15004
music_genre              7
dtype: int64

In [7]:
# Checking unique values in test data
test.nunique()

instance_id          3921
track_name           3850
popularity           3504
acousticness         3921
danceability         3782
duration_ms          3526
energy               3591
instrumentalness     2855
key                    12
liveness             3921
loudness             3296
voice_gender            3
mode                    3
speechiness          3914
tempo                3401
musician_category       3
valence              3730
dtype: int64

In [8]:
# Checking the missing values in train data
train.isnull().sum()/len(train)*100

instance_id          0.000000
track_name           0.000000
popularity           4.094127
acousticness         0.000000
danceability         3.545692
duration_ms          0.599452
energy               0.599452
instrumentalness     0.605829
key                  0.000000
liveness             0.000000
loudness             0.229577
voice_gender         4.878515
mode                 0.223200
speechiness          0.165806
tempo                0.000000
musician_category    8.672916
valence              4.317327
music_genre          0.000000
dtype: float64

In [9]:
# Checking the missing values in test data
test.isnull().sum()/len(test)*100

instance_id          0.000000
track_name           0.000000
popularity           4.233614
acousticness         0.000000
danceability         3.545014
duration_ms          0.535578
energy               0.841622
instrumentalness     0.586585
key                  0.000000
liveness             0.000000
loudness             0.076511
voice_gender         4.743688
mode                 0.357052
speechiness          0.178526
tempo                0.000000
musician_category    7.957154
valence              4.871206
dtype: float64

In [4]:
# Replace special charcater with mode in train data
train['mode'].replace({"?": "Major"}, inplace=True)
train['tempo'].replace({"?": "0"}, inplace=True)

In [5]:
# Replace special charcater with mode in test data
test['tempo'].replace({"?": "0"}, inplace=True)
test['mode'].replace({"?": "Major"}, inplace=True)

In [6]:
# change the data types
train["tempo"] = pd.to_numeric(train["tempo"])

In [8]:
# change the data types
test["tempo"] = pd.to_numeric(test["tempo"])

In [15]:
train.columns

Index(['instance_id', 'track_name', 'popularity', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'voice_gender', 'mode', 'speechiness', 'tempo',
       'musician_category', 'valence', 'music_genre'],
      dtype='object')

In [9]:
from scipy.stats import mode

In [10]:
# Repalcing missing values in train and test data
train['popularity'].fillna((train['popularity'].mean()),inplace = True)
train['danceability'].fillna((train['danceability'].mean()),inplace = True)
train['duration_ms'].fillna((train['duration_ms'].mean()),inplace = True)
train['energy'].fillna((train['energy'].mean()),inplace = True)
train['instrumentalness'].fillna((train['instrumentalness'].mean()),inplace = True)
train['loudness'].fillna((train['loudness'].mean()),inplace = True)
train['speechiness'].fillna((train['speechiness'].mean()),inplace = True)
train['valence'].fillna((train['speechiness'].mean()),inplace = True)
train['voice_gender'].fillna(mode(train['voice_gender']).mode[0], inplace=True)
train['musician_category'].fillna(mode(train['musician_category']).mode[0], inplace=True)
train['mode'].fillna(mode(train['mode']).mode[0], inplace=True)

####################################################################################################

test['popularity'].fillna((test['popularity'].mean()),inplace = True)
test['danceability'].fillna((test['danceability'].mean()),inplace = True)
test['duration_ms'].fillna((test['duration_ms'].mean()),inplace = True)
test['energy'].fillna((test['energy'].mean()),inplace = True)
test['instrumentalness'].fillna((test['instrumentalness'].mean()),inplace = True)
test['loudness'].fillna((test['loudness'].mean()),inplace = True)
test['speechiness'].fillna((test['speechiness'].mean()),inplace = True)
test['valence'].fillna((test['speechiness'].mean()),inplace = True)
test['voice_gender'].fillna(mode(test['voice_gender']).mode[0], inplace=True)
test['musician_category'].fillna(mode(test['musician_category']).mode[0], inplace=True)
test['mode'].fillna(mode(test['mode']).mode[0], inplace=True)

In [11]:
# Again check muissing values
train.isnull().sum(),test.isnull().sum()

(instance_id          0
 track_name           0
 popularity           0
 acousticness         0
 danceability         0
 duration_ms          0
 energy               0
 instrumentalness     0
 key                  0
 liveness             0
 loudness             0
 voice_gender         0
 mode                 0
 speechiness          0
 tempo                0
 musician_category    0
 valence              0
 music_genre          0
 dtype: int64, instance_id          0
 track_name           0
 popularity           0
 acousticness         0
 danceability         0
 duration_ms          0
 energy               0
 instrumentalness     0
 key                  0
 liveness             0
 loudness             0
 voice_gender         0
 mode                 0
 speechiness          0
 tempo                0
 musician_category    0
 valence              0
 dtype: int64)

In [19]:
#  counts of genre 
train['music_genre'].value_counts()

Classical     3990
Rock          3879
Rap           3207
Jazz          2850
Country        683
Electronic     614
Hip-Hop        458
Name: music_genre, dtype: int64

In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15681 entries, 0 to 15680
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   instance_id        15681 non-null  object 
 1   track_name         15681 non-null  object 
 2   popularity         15681 non-null  float64
 3   acousticness       15681 non-null  float64
 4   danceability       15681 non-null  float64
 5   duration_ms        15681 non-null  float64
 6   energy             15681 non-null  float64
 7   instrumentalness   15681 non-null  float64
 8   key                15681 non-null  object 
 9   liveness           15681 non-null  float64
 10  loudness           15681 non-null  float64
 11  voice_gender       15681 non-null  object 
 12  mode               15681 non-null  object 
 13  speechiness        15681 non-null  float64
 14  tempo              15681 non-null  float64
 15  musician_category  15681 non-null  object 
 16  valence            156

In [12]:
# remove the instance_id from train data
train1 = train.drop(columns = ['instance_id'], axis = 1)

In [13]:
# remove the instance_id from train data
test1 = test.drop(columns = ['instance_id'], axis = 1)

In [14]:
# change the data types
test1["tempo"] = pd.to_numeric(test1["tempo"])

In [15]:
test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3921 entries, 0 to 3920
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   track_name         3921 non-null   object 
 1   popularity         3921 non-null   float64
 2   acousticness       3921 non-null   float64
 3   danceability       3921 non-null   float64
 4   duration_ms        3921 non-null   float64
 5   energy             3921 non-null   float64
 6   instrumentalness   3921 non-null   float64
 7   key                3921 non-null   object 
 8   liveness           3921 non-null   float64
 9   loudness           3921 non-null   float64
 10  voice_gender       3921 non-null   object 
 11  mode               3921 non-null   object 
 12  speechiness        3921 non-null   float64
 13  tempo              3921 non-null   float64
 14  musician_category  3921 non-null   object 
 15  valence            3921 non-null   float64
dtypes: float64(11), object(5

In [16]:
# One hot encoding
from sklearn.preprocessing import LabelEncoder
enc=LabelEncoder()

In [17]:
track_name=enc.fit_transform(train1['track_name'])
key=enc.fit_transform(train1['key'])
voice_gender = enc.fit_transform(train1['voice_gender'])
mode = enc.fit_transform(train1['mode'])
musician_category = enc.fit_transform(train1['musician_category'])
#music_genre = enc.fit_transform(train1['music_genre'])

In [18]:
train1['track_name'] = track_name
train1['key'] = key
train1['voice_gender'] = voice_gender
train1['mode'] = mode
train1['musician_category'] = musician_category
#train1['music_genre'] = music_genre

In [19]:
train1.head()

Unnamed: 0,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,voice_gender,mode,speechiness,tempo,musician_category,valence,music_genre
0,3659,49.178,0.970522,0.580508,214625.776,0.192107,0.0,7,0.147134,-14.14,1,0,0.046404,143.788,0,0.598965,Jazz
1,540,59.827,1.00938,0.687542,216232.195,0.265942,3.1e-05,0,0.174655,-13.716,2,0,0.039949,0.0,0,0.357194,Jazz
2,14449,89.023,0.020041,0.967948,273314.723,0.857411,0.0,5,0.04203,-4.995,1,0,0.119917,105.018,0,0.635525,Rap
3,2022,55.762,0.010284,0.616287,189189.605,0.970513,0.00022,5,0.124637,-4.262,0,0,0.167493,0.0,1,0.98375,Rock
4,11036,45.095,0.858769,0.280645,410136.987,0.114732,0.002753,9,0.134782,-26.922,1,1,0.039139,112.183,0,0.074412,Classical


In [20]:
# one hot encoding in test data
track_name=enc.fit_transform(test1['track_name'])
key=enc.fit_transform(test1['key'])
voice_gender = enc.fit_transform(test1['voice_gender'])
mode = enc.fit_transform(test1['mode'])
musician_category = enc.fit_transform(test1['musician_category'])

In [21]:
test1['track_name'] = track_name
test1['key'] = key
test1['voice_gender'] = voice_gender
test1['mode'] = mode
test1['musician_category'] = musician_category

In [22]:
test1.head()

Unnamed: 0,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,voice_gender,mode,speechiness,tempo,musician_category,valence
0,346,26.101,0.823765,0.11549,211462.287,0.125325,0.945272,1,0.120392,-25.541,0,1,0.038657,80.777,1,0.041238
1,2144,66.325,0.015441,0.592838,249151.238,0.870798,0.000108,7,0.406722,-3.905,2,0,0.051368,76.004,0,0.456309
2,2785,71.871,0.001006,0.51034,215693.24,0.683077,0.0,3,0.084608,-6.825,1,0,0.034303,144.458,2,0.193332
3,112,56.151,0.31565,0.338336,307056.179,0.608904,0.0,0,0.276811,-8.382,1,1,0.036934,118.145,2,0.214906
4,1171,43.687,0.038336,0.773904,352802.872,0.9555,7e-06,5,0.301726,-4.733,0,0,0.079688,95.012,1,0.757347


Scaling the **data**

In [23]:
# split the data into train and validation
X = train1.drop(columns = ['music_genre'],axis = 1)
y = train1.music_genre

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
sc = StandardScaler()

In [26]:
X = sc.fit_transform(X)
test1 = sc.transform(test1)

In [27]:
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.10, random_state=42)

In [28]:
X_train.shape, y_train.shape, X_cv.shape, y_cv.shape

((14112, 16), (14112,), (1569, 16), (1569,))

Model **Training**

In [29]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(X_train,y_train)

LogisticRegression()

In [30]:
# Prediction on validation data
lRpredict = clf.predict(X_cv)

In [31]:
# Print the accuracy
print(accuracy_score(y_cv,lRpredict))

0.7361376673040153


In [32]:
# Print the confusion matrix
confusion_matrix(y_cv,lRpredict)

array([[336,   5,   2,   0,  35,   0,  10],
       [  0,  20,   1,   0,  18,   4,  19],
       [  4,   3,  17,   0,  24,   6,   4],
       [  0,   0,   0,   0,   1,  33,   9],
       [ 60,   7,   9,   0, 184,   9,  29],
       [  0,   0,   1,   0,   6, 267,  62],
       [  2,   3,   1,   0,  16,  31, 331]])

In [36]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train,y_train)

RandomForestClassifier()

In [37]:
RFpredict = model.predict(X_cv)

In [38]:
print(accuracy_score(y_cv,RFpredict))
print(confusion_matrix(y_cv,RFpredict))
print(classification_report(y_cv,RFpredict))

0.7705544933078394
[[350   2   0   0  29   0   7]
 [  0  26   0   0  10   3  23]
 [  3   0  24   0  24   2   5]
 [  0   0   0   0   0  40   3]
 [ 43   4   8   0 200   9  34]
 [  0   1   1   5   2 270  57]
 [  2   1   0   0  14  28 339]]
              precision    recall  f1-score   support

   Classical       0.88      0.90      0.89       388
     Country       0.76      0.42      0.54        62
  Electronic       0.73      0.41      0.53        58
     Hip-Hop       0.00      0.00      0.00        43
        Jazz       0.72      0.67      0.69       298
         Rap       0.77      0.80      0.78       336
        Rock       0.72      0.88      0.80       384

    accuracy                           0.77      1569
   macro avg       0.65      0.58      0.60      1569
weighted avg       0.75      0.77      0.76      1569



In [39]:
# prediction on test data set
RFpredict1 = model.predict(test1)

In [40]:
# Save the prediction results
submission = pd.DataFrame({
        "instance_id": test['instance_id'],
        "music_genre": RFpredict1
    })
submission.to_csv('music_genreSubmission1.csv', index=False)
print(submission)

      instance_id music_genre
0     MSC_70753.0   Classical
1     MSC_24064.0        Rock
2     MSC_22731.0        Rock
3     MSC_32095.0        Rock
4     MSC_24198.0        Jazz
...           ...         ...
3916  MSC_80955.0   Classical
3917  MSC_72767.0        Rock
3918  MSC_40192.0   Classical
3919  MSC_56067.0        Rock
3920  MSC_90169.0        Jazz

[3921 rows x 2 columns]


In [41]:
# XG Boost
xgb_model = xgb.XGBClassifier(silent=False, 
                              scale_pos_weight=1,
                              learning_rate=0.1,
                              colsample_bytree =0.8,
                              subsample = 0.8,
                              objective='binary:logistic',
                              n_estimators=1000, 
                              max_depth=4, 
                              reg_alpha=0.01,
                              gamma=0,
                              random_state=42)

In [42]:
# Model training
xgb_model.fit(X_train, y_train,
              verbose=True)

XGBClassifier(colsample_bytree=0.8, max_depth=4, n_estimators=1000,
              objective='multi:softprob', random_state=42, reg_alpha=0.01,
              silent=False, subsample=0.8)

In [43]:
# predict on validation data
RFpredict2 = xgb_model.predict(X_cv)

In [44]:
print(accuracy_score(y_cv,RFpredict2))

0.7839388145315488


In [45]:
# prediction on test data
RFpredict3 = xgb_model.predict(test1)

In [46]:
RFpredict3

array(['Classical', 'Rock', 'Rap', ..., 'Classical', 'Country', 'Jazz'],
      dtype=object)

In [48]:
# Save the prediction results
submission = pd.DataFrame({
        "instance_id": test['instance_id'],
        "music_genre": RFpredict3
    })
submission.to_csv('music_genreSubmission3.csv', index=False)
print(submission)

      instance_id music_genre
0     MSC_70753.0   Classical
1     MSC_24064.0        Rock
2     MSC_22731.0         Rap
3     MSC_32095.0        Rock
4     MSC_24198.0        Jazz
...           ...         ...
3916  MSC_80955.0   Classical
3917  MSC_72767.0        Rock
3918  MSC_40192.0   Classical
3919  MSC_56067.0     Country
3920  MSC_90169.0        Jazz

[3921 rows x 2 columns]
