# FIT1043 A2 Assignment - Derek Goh Kai Shen (33521247)

## Part A: Classification

### A1. Supervised Learning
### 1. Definition of supervised machine learning, the notion of labelled data, and train and test datasets.
- Supervised machine learning is a subset of machine learning where the model is trained upon a labelled dataset to yield a desired output that we can predict. Some of the common algorithms used to train models are neural networks, naive bayes, linear regression, logistic regression, support vector machines(SVM) and more. 

- All the data used in training the model is labelled, as in referring to data that has been classified with the correct output. 

- The training dataset is a set of data that is correctly labelled and includes the input and the respective correct output, which allows the model to learn the relationship between the input and output. The model is then tested on a separate dataset, known as the test dataset, to evaluate its loss function and accuracy index. The model is then tweaked to minimise the loss function and improve the accuracy index.


In [17]:
# tensorflow
import tensorflow as tf
keras = tf.keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, TextVectorization, Input
from keras.callbacks import EarlyStopping
from keras import regularizers

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import scipy as sp
import platform
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Loading dataset 
data = pd.read_csv('FIT1043-MusicGenre-Dataset.csv')
data = data.dropna()


# for column in data.select_dtypes(include=['float64', 'int64']).columns:
#     print(f'Columns: {column}')
#     print(f'Mean: {data[column].mean()}')
#     print(f'Median: {data[column].median()}')
#     print(f'Variance: {data[column].var()}\n')

# Scaling and normalizing the data
df_scaled = data.copy()

minmax = MinMaxScaler()

# vectorise text data into int
artist_name_vectorizer = TextVectorization(output_mode='int')
artist_name_vectorizer.adapt(df_scaled['artist_name'])
artist_name_vectorized = artist_name_vectorizer(df_scaled['artist_name'])

# flatten
artist_name_vectorized = tf.reduce_mean(artist_name_vectorized, axis=-1)

df_scaled['artist_name'] = artist_name_vectorized.numpy()

track_name_vectorizer = TextVectorization(output_mode='int')
track_name_vectorizer.adapt(df_scaled['track_name'])
track_name_vectorized = track_name_vectorizer(df_scaled['track_name'])

# flatten
track_name_vectorized = tf.reduce_mean(track_name_vectorized, axis=-1)

df_scaled['track_name'] = track_name_vectorized.numpy()

# Normalizing columns
col_norm = ['track_name' ,'artist_name','popularity', 'duration_ms', 'loudness', 'tempo', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'valence']
for col in col_norm:
    df_scaled[col] = minmax.fit_transform(df_scaled[col].values.reshape(-1, 1))




#debug
for column in df_scaled.select_dtypes(include=['float64', 'int64']).columns:
    print(f'Columns: {column}')
    print(f'Mean: {df_scaled[column].mean()}')
    print(f'Median: {df_scaled[column].median()}')
    print(f'Variance: {df_scaled[column].var()}\n')

# Seperating features and the label
features = df_scaled.select_dtypes(include=[np.number])
features = features.drop(columns=['music_genre']).drop(columns=['instance_id'])
label = data.iloc[:, -1]

features_train, features_test, label_train, label_test = train_test_split(features, label, test_size=0.2, random_state=42)

# Building the model
#debug
print(f'features_train shape: {features_train.shape}')
input_size = features_train.shape[1]
model = Sequential()
model.add(Input(shape=(input_size,)))
model.add(Dense(13, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer= regularizers.l2(0.01)))
# model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer= regularizers.l2(0.01)))
# Output layer
model.add(Dense(10, activation = 'softmax'))

# Compiling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the early stopping criteria
stop_early = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

# Training the model
model.fit(features_train, label_train, epochs=1000, batch_size=64, validation_data=(features_test, label_test), callbacks=[stop_early])

# Evaluate the model
loss, accuracy = model.evaluate(features_test, label_test)
print(f'Accuracy: {accuracy*100}%')

model.save('model.keras')
#debug
# print(features_train.shape)
# print(features_test.shape)
# print(label_train.shape)
# print(label_test.shape)

Columns: instance_id
Mean: 55769.94125125202
Median: 55740.5
Variance: 431077085.5881422

Columns: artist_name
Mean: 0.06932649260877888
Median: 0.04175365344467641
Variance: 0.006226387340225475

Columns: track_name
Mean: 0.04393164512586653
Median: 0.02404809619238477
Variance: 0.0030559816068378948

Columns: popularity
Mean: 0.44614688373837774
Median: 0.4545454545454546
Variance: 0.024664485986313586

Columns: acousticness
Mean: 0.3090943232240156
Median: 0.1465863453815261
Variance: 0.11796946241528146

Columns: danceability
Mean: 0.5379725391062831
Median: 0.5498704663212434
Variance: 0.03726771827963333

Columns: duration_ms
Mean: 0.05113029037247991
Median: 0.047121970833245944
Variance: 0.0006089345985379015

Columns: energy
Mean: 0.5998788631909636
Median: 0.6453644931717639
Variance: 0.07072631712772043

Columns: instrumentalness
Mean: 0.18368853299591772
Median: 0.0001676706827309237
Variance: 0.10759812421754344

Columns: liveness
Mean: 0.18610160615273116
Median: 0.117465

In [10]:
import tensorflow as tf
keras = tf.keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, TextVectorization
from keras.callbacks import EarlyStopping
from keras import regularizers

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Loading dataset 
data = pd.read_csv('FIT1043-MusicGenre-Dataset.csv')
data = data.dropna()

# Scaling and normalizing the data
df = data.copy()

# vectorise text data into int
artist_name_vectorizer = TextVectorization(output_mode='int')
artist_name_vectorizer.adapt(df['artist_name'])
artist_name_vectorized = artist_name_vectorizer(df['artist_name'])

# flatten
artist_name_vectorized = tf.reduce_mean(artist_name_vectorized, axis=-1)

df['artist_name'] = artist_name_vectorized.numpy()

track_name_vectorizer = TextVectorization(output_mode='int')
track_name_vectorizer.adapt(df['track_name'])
track_name_vectorized = track_name_vectorizer(df['track_name'])

# flatten
track_name_vectorized = tf.reduce_mean(track_name_vectorized, axis=-1)

df['track_name'] = track_name_vectorized.numpy()

# Seperating features and the label
features = df.drop(columns=['music_genre', 'instance_id'])
label = df['music_genre']

# Normalize
sclr = StandardScaler()
features = pd.DataFrame(sclr.fit_transform(features), columns=features.columns)

features_train, features_test, label_train, label_test = train_test_split(features, label, test_size=0.2, random_state=42)

# Building the model
model = Sequential()

model.add(Dense(13, input_dim=features_train.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)))

model.add(Dense(32, activation='relu', kernel_regularizer= regularizers.l2(0.01)))

model.add(Dense(32, activation='relu', kernel_regularizer= regularizers.l2(0.01)))

model.add(Dense(32, activation='relu', kernel_regularizer= regularizers.l2(0.01)))
# Output layer
model.add(Dense(10, activation = 'softmax'))

optimizer = keras.optimizers.Adam()

# Compiling the model

loss_fn = keras.losses.SparseCategoricalCrossentropy()

model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy'])

# Define the early stopping criteria
stop_early = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

# Training the model
model.fit(features_train, label_train, epochs=200, batch_size=64, validation_data=(features_test, label_test), callbacks=[stop_early])

# Evaluate the model
loss, accuracy = model.evaluate(features_test, label_test)
print(f'Accuracy: {accuracy*100}%')

model.save("lnmodel.keras")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 36ms/step - accuracy: 0.2315 - loss: 2.6826 - val_accuracy: 0.4661 - val_loss: 1.7501
Epoch 2/200
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.4764 - loss: 1.6899 - val_accuracy: 0.4988 - val_loss: 1.5938
Epoch 3/200
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.5035 - loss: 1.5692 - val_accuracy: 0.4994 - val_loss: 1.5452
Epoch 4/200
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.5039 - loss: 1.5293 - val_accuracy: 0.5081 - val_loss: 1.5138
Epoch 5/200
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.5067 - loss: 1.5060 - val_accuracy: 0.5046 - val_loss: 1.5111
Epoch 6/200
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.5074 - loss: 1.4978 - val_accuracy: 0.5087 - val_loss: 1.5060
Epoch 7/200
[1

## A2. Classification (Training)
### 1. Differences between binary and multi-class classification.

- Binary classification is a type of classification where the model is trained to predict between two classes, such as true or false, spam or no, 0 or 1, and so on. The output is a boolean, which is either True or False.

- Multi-class classification is a type of classification where the model is trained to predict between multiple classes, such as classifying between genre of music, colour schemes, dog breeds, and so on. The output is a class label, which is one of the classes that the model is trained to predict.

### 2. Normalising/Scaling Data for Preparation for Classification

- Normalsing or Scaling of data is important as it allows the model gradient descent to converge faster, as all the features are on the same scale. This is important as we want to scale the data when we are using algorithms using distance between data points, such as Support Vector Machines (SVM) and K-Nearest Neighbours (KNN). For example, if we have a dataset with features that have totally different scales, such as age and income, the model will be biased towards the feature with larger scale, which will be the income in this case.

- There are many ways to scale the data, such as Min-Max Scaling, Standard Scaling, Robust Scaling, and Normalisation. Min-Max Scaling scales the data to a range between 0 and 1, Standard Scaling scales the data to have a mean of 0 and a standard deviation of 1, Robust Scaling scales the data to the interquartile range, and Normalisation scales the data to have a magnitude of 1. The best scaling method for predicting the genre of the music is Standard Scaling, as it scales the data to have a mean of 0 and a standard deviation of 1, which is important for algorithms that use distance between data points.

In [23]:
# from sklearn.preprocessing import StandardScaler


# sc = StandardScaler()

# # Normalising training and testing data
# features_train = sc.fit_transform(features_train)
# features_test = sc.transform(features_test)


# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier

# bag_clf = BaggingClassifier(
#     DecisionTreeClassifier(random_state=42), 
#     n_estimators=500,
#     max_samples=100, 
#     bootstrap=True, 
#     n_jobs=-1, 
#     random_state=42)

# bag_clf.fit(features_train, label_train)

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier

# ada_clf = AdaBoostClassifier(
#     DecisionTreeClassifier(max_depth=1), 
#     n_estimators=200,
#     algorithm="SAMME.R", 
#     learning_rate=0.5, 
#     random_state=42)

# ada_clf.fit(features_train, label_train)

### 3. Using Support Vector Machines (SVM) for Classification

- SVM is a supervised learning algorithm used in machine learning to solve classification problems. It's very good in solving binary classification problems, but can also be used for multi-class classification problems. The algorithm works by seperating the classes with a hyperplane that has the maximum distance between the nearest data points of the classes, which can be referred to as the margin. The data points that are closest to that hyperplane are called support vectors. The hyperplane can be linear or non-linear, depending on the kernel used. The most common kernel used is the Radial Basis Function (RBF) kernel, or the Gaussian kernel. RBF kernel, which is a non-linear kernel, is used when the data is not linearly separable, and the linear kernel is used when the data is linearly separable.

- Since SVM are fundamentally binary classifiers, to allow them to support multi-class classifications, we can employ either One-Vs-Rest (OvR) or One-Vs-One(OvO) strategies. OvR trains a binary classifier for each class, which is then used to predict the class with the highest confidence score. OvO trains a binary classifier for each pair of classes, which is then used to predict the class with the most votes. OvR is more efficient than OvO, as it requires less training time, but OvO is more accurate than OvR, as it requires more training time. Thus, we have to balance between efficiency and accuracy when choosing between OvR and OvO.

In [24]:
# Building SVM model to classify the music genre.
from sklearn.svm import SVC
classifier = SVC(kernel= 'rbf', random_state=42)
classifier.fit(features_train, label_train)

# Predicting the test set results
label_pred = classifier.predict(features_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

Confusion Matrix: 
[[184   0   9   0 116  29  61  31  14 100]
 [ 20 312  39  54  20  18   0  10   0   4]
 [ 22  46 283   4  81  32   1  53   0  25]
 [ 12  15  11 426   4  11   0  15   0   1]
 [ 30  14  47   0 272  10  17  49   4  87]
 [ 42  22  32   6  35 300  19  51   7  16]
 [ 19   0   2   0  17   6 322   1 140  35]
 [ 16   9  75  35  38  64  12 213   1  22]
 [ 27   0   2   0  14   3 249   0 152  60]
 [ 61   2   2   0  49   5  12  19  26 359]]
Accuracy: 0.5437211093990755


In [25]:
# Using XGBoost to classify the music genre
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(features_train, label_train)

# Predicting the test set results
label_pred = classifier.predict(features_test)

# Making the confusion matrix
cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

Confusion Matrix: 
[[201   6  15   3  75  21  48  32  35 108]
 [ 15 368  31  22   9  17   0  10   0   5]
 [ 30  22 305   5  63  33   1  65   0  23]
 [ 13  12  16 423   4   8   0  17   0   2]
 [ 40   5  28   0 314  11   8  29   7  88]
 [ 34  21  31   7  18 319  14  61   9  16]
 [ 22   0   2   0   5   6 233   3 245  26]
 [ 19   2  58  27  28  59   9 259   1  23]
 [ 21   1   1   0   2   2 257   4 169  50]
 [ 55   6   8   0  52   3  14   9  31 357]]
Accuracy: 0.5677966101694916


In [26]:
# Using Random Forest to classify the music genre
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
classifier.fit(features_train, label_train)

label_pred = classifier.predict(features_test)

cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

Confusion Matrix: 
[[205   6  21   0  86  17  49  22  35 103]
 [ 17 351  32  33  11  22   1   5   1   4]
 [ 38  56 284   7  46  36   3  57   0  20]
 [ 12  21  17 414   5   9   0  13   0   4]
 [ 64  15  39   0 281   6  12  27  11  75]
 [ 51  28  37   7  17 286  11  60  16  17]
 [ 27   1   2   0   7   5 247   3 231  19]
 [ 30  12  76  38  32  55  14 204   3  21]
 [ 31   1   1   0  11   2 268   4 139  50]
 [ 89   5  16   1  75   4  22  17  31 275]]
Accuracy: 0.5173343605546995


In [27]:
# Using Decision Tree to classify the music genre
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)
classifier.fit(features_train, label_train)

label_pred = classifier.predict(features_test)

cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

Confusion Matrix: 
[[156  14  31   8  80  44  42  42  48  79]
 [ 15 323  43  32  12  31   1  13   0   7]
 [ 23  55 219  17  60  60   6  85   2  20]
 [ 12  43  20 375   2  10   0  26   1   6]
 [ 64  16  49   3 225  24  13  54  11  71]
 [ 50  40  49  12  27 244   9  68  10  21]
 [ 35   0   6   0  12  12 214  15 219  29]
 [ 31  16  70  41  40  69   9 182   9  18]
 [ 50   1   3   0  15   6 243   7 138  44]
 [ 89   8  30   2  78  14  31  22  37 224]]
Accuracy: 0.44298921417565484
