# FIT1043 A2 Assignment - Derek Goh Kai Shen (33521247)

## Part A: Classification

### A1. Supervised Learning
### 1. Definition of supervised machine learning, the notion of labelled data, and train and test datasets.
- Supervised machine learning is a subset of machine learning where the model is trained upon a labelled dataset to yield a desired output that we can predict. Some of the common algorithms used to train models are neural networks, naive bayes, linear regression, logistic regression, support vector machines(SVM) and more. 

- All the data used in training the model is labelled, as in referring to data that has been classified with the correct output. 

- The training dataset is a set of data that is correctly labelled and includes the input and the respective correct output, which allows the model to learn the relationship between the input and output. The model is then tested on a separate dataset, known as the test dataset, to evaluate its loss function and accuracy index. The model is then tweaked to minimise the loss function and improve the accuracy index.


In [38]:
# import tensorflow as tf
# keras = tf.keras
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Input
# from keras.callbacks import EarlyStopping
# from keras import regularizers

# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Loading dataset 
# data = pd.read_csv('FIT1043-MusicGenre-Dataset.csv')
# data = data.dropna()

# # Scaling and normalizing the data
# df = data.copy()


# tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, max_features=5000, stop_words='english')
# artist_name_vectorized = tfidf.fit_transform(df['artist_name'])

# # Keep the matrix sparse, don't convert to dense with toarray()
# df_artist = pd.DataFrame.sparse.from_spmatrix(artist_name_vectorized, columns=[f'artist_{i}' for i in range(artist_name_vectorized.shape[1])])

# tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, max_features=5000, stop_words='english')
# track_name_vectorized = tfidf.fit_transform(df['track_name'])

# # Keep the matrix sparse, don't convert to dense with toarray()
# df_track = pd.DataFrame.sparse.from_spmatrix(track_name_vectorized, columns=[f'track_{i}' for i in range(track_name_vectorized.shape[1])])

# df = df.drop(columns=['artist_name', 'track_name'])
# df = pd.concat([df, df_artist, df_track], axis=1)

# label = df['music_genre']
# features = df.drop(columns=['music_genre', 'instance_id'])

# sclr = StandardScaler()
# features = pd.DataFrame(sclr.fit_transform(features), columns=features.columns)

# features_train, features_test, label_train, label_test = train_test_split(features, label, test_size=0.2, random_state=42)

# # Building the model
# model = Sequential()

# model.add(Input(shape=(features_train.shape[1],)))
# model.add(Dense(13, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# model.add(Dense(20, activation='relu', kernel_regularizer= regularizers.l2(0.005)))

# model.add(Dense(20, activation='relu', kernel_regularizer= regularizers.l2(0.01)))

# # Output layer
# model.add(Dense(10, activation = 'softmax'))

# # Compiling the model
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Define the early stopping criteria
# stop_early = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)

# # Training the model
# model.fit(features_train, label_train, epochs=100, batch_size=32, validation_data=(features_test, label_test), callbacks=[stop_early])

# # Evaluate the model
# loss, accuracy = model.evaluate(features_test, label_test)
# print(f'Accuracy: {accuracy*100}%')

# model.save("1000_model.keras")

In [3]:
import tensorflow as tf
keras = tf.keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, TextVectorization, Input
from keras.callbacks import EarlyStopping
from keras import regularizers

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Loading dataset 
data = pd.read_csv('FIT1043-MusicGenre-Dataset.csv')
data = data.dropna()

# Scaling and normalizing the data
df = data.copy()

# vectorise text data into int
artist_name_vectorizer = TextVectorization(output_mode='int')
artist_name_vectorizer.adapt(df['artist_name'])
artist_name_vectorized = artist_name_vectorizer(df['artist_name'])

# flatten
artist_name_vectorized = tf.reduce_mean(artist_name_vectorized, axis=-1)

df['artist_name'] = artist_name_vectorized.numpy()

track_name_vectorizer = TextVectorization(output_mode='int')
track_name_vectorizer.adapt(df['track_name'])
track_name_vectorized = track_name_vectorizer(df['track_name'])

# flatten
track_name_vectorized = tf.reduce_mean(track_name_vectorized, axis=-1)

df['track_name'] = track_name_vectorized.numpy()

# Seperating features and the label
features = df.drop(columns=['music_genre', 'instance_id'])
label = df['music_genre']

# Normalize
sclr = StandardScaler()
features = pd.DataFrame(sclr.fit_transform(features), columns=features.columns)

features_train, features_test, label_train, label_test = train_test_split(features, label, test_size=0.2, random_state=42)

# Building the model
model = Sequential()

model.add(Input(shape=(features_train.shape[1],)))
model.add(Dense(13, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

model.add(Dense(20, activation='relu', kernel_regularizer= regularizers.l2(0.005)))

model.add(Dense(20, activation='relu', kernel_regularizer= regularizers.l2(0.008)))

# Output layer
model.add(Dense(10, activation = 'softmax'))

# Compiling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Define the early stopping criteria
stop_early = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=1000)

# Training the model
model.fit(features_train, label_train, epochs=10000, batch_size=64, validation_data=(features_test, label_test), callbacks=[stop_early])

# Evaluate the model
loss, accuracy = model.evaluate(features_test, label_test)
print(f'Accuracy: {accuracy*100}%')

model.save("10000_model.keras")

Epoch 1/10000
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 29ms/step - accuracy: 0.1978 - loss: 2.3477 - val_accuracy: 0.4441 - val_loss: 1.6748
Epoch 2/10000
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.4639 - loss: 1.5961 - val_accuracy: 0.5010 - val_loss: 1.4775
Epoch 3/10000
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.5044 - loss: 1.4560 - val_accuracy: 0.5102 - val_loss: 1.4233
Epoch 4/10000
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.5153 - loss: 1.4126 - val_accuracy: 0.5216 - val_loss: 1.3905
Epoch 5/10000
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.5267 - loss: 1.3734 - val_accuracy: 0.5202 - val_loss: 1.3701
Epoch 6/10000
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.5297 - loss: 1.3629 - val_accuracy: 0.5351 - val_loss: 1.3560
Epo

## A2. Classification (Training)
### 1. Differences between binary and multi-class classification.

- Binary classification is a type of classification where the model is trained to predict between two classes, such as true or false, spam or no, 0 or 1, and so on. The output is a boolean, which is either True or False.

- Multi-class classification is a type of classification where the model is trained to predict between multiple classes, such as classifying between genre of music, colour schemes, dog breeds, and so on. The output is a class label, which is one of the classes that the model is trained to predict.

### 2. Normalising/Scaling Data for Preparation for Classification

- Normalsing or Scaling of data is important as it allows the model gradient descent to converge faster, as all the features are on the same scale. This is important as we want to scale the data when we are using algorithms using distance between data points, such as Support Vector Machines (SVM) and K-Nearest Neighbours (KNN). For example, if we have a dataset with features that have totally different scales, such as age and income, the model will be biased towards the feature with larger scale, which will be the income in this case.

- There are many ways to scale the data, such as Min-Max Scaling, Standard Scaling, Robust Scaling, and Normalisation. Min-Max Scaling scales the data to a range between 0 and 1, Standard Scaling scales the data to have a mean of 0 and a standard deviation of 1, Robust Scaling scales the data to the interquartile range, and Normalisation scales the data to have a magnitude of 1. The best scaling method for predicting the genre of the music is Standard Scaling, as it scales the data to have a mean of 0 and a standard deviation of 1, which is important for algorithms that use distance between data points.

In [40]:
# from sklearn.preprocessing import StandardScaler


# sc = StandardScaler()

# # Normalising training and testing data
# features_train = sc.fit_transform(features_train)
# features_test = sc.transform(features_test)


# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier

# bag_clf = BaggingClassifier(
#     DecisionTreeClassifier(random_state=42), 
#     n_estimators=500,
#     max_samples=100, 
#     bootstrap=True, 
#     n_jobs=-1, 
#     random_state=42)

# bag_clf.fit(features_train, label_train)

# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier

# ada_clf = AdaBoostClassifier(
#     DecisionTreeClassifier(max_depth=1), 
#     n_estimators=200,
#     algorithm="SAMME.R", 
#     learning_rate=0.5, 
#     random_state=42)

# ada_clf.fit(features_train, label_train)

### 3. Using Support Vector Machines (SVM) for Classification

- SVM is a supervised learning algorithm used in machine learning to solve classification problems. It's very good in solving binary classification problems, but can also be used for multi-class classification problems. The algorithm works by seperating the classes with a hyperplane that has the maximum distance between the nearest data points of the classes, which can be referred to as the margin. The data points that are closest to that hyperplane are called support vectors. The hyperplane can be linear or non-linear, depending on the kernel used. The most common kernel used is the Radial Basis Function (RBF) kernel, or the Gaussian kernel. RBF kernel, which is a non-linear kernel, is used when the data is not linearly separable, and the linear kernel is used when the data is linearly separable.

- Since SVM are fundamentally binary classifiers, to allow them to support multi-class classifications, we can employ either One-Vs-Rest (OvR) or One-Vs-One(OvO) strategies. OvR trains a binary classifier for each class, which is then used to predict the class with the highest confidence score. OvO trains a binary classifier for each pair of classes, which is then used to predict the class with the most votes. OvR is more efficient than OvO, as it requires less training time, but OvO is more accurate than OvR, as it requires more training time. Thus, we have to balance between efficiency and accuracy when choosing between OvR and OvO.

In [41]:
# Building SVM model to classify the music genre.
from sklearn.svm import SVC
import pickle as pkl

classifier = SVC(kernel= 'rbf', random_state=42)
classifier.fit(features_train, label_train)

# Predicting the test set results
label_pred = classifier.predict(features_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

# Save the model
pkl.dump(classifier, open('svm_model.pkl', 'wb'))

Confusion Matrix: 
[[190   3  11   0 101  25  59  31  19 105]
 [ 16 349  29  39  10  21   0   9   0   4]
 [ 23  33 306   4  70  31   2  56   0  22]
 [  9  18  11 425   4  11   0  16   0   1]
 [ 35  14  36   0 296  15  13  33   5  83]
 [ 43  22  31   7  33 311  20  40  10  13]
 [ 21   0   0   3  11   5 309   2 155  36]
 [ 15   7  58  37  34  61  12 240   1  20]
 [ 23   0   2   0  11   3 239   1 169  59]
 [ 58   2   5   0  40   1  10  15  24 380]]
Accuracy: 0.5729969183359014


In [42]:
# Using XGBoost to classify the music genre
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(features_train, label_train)

# Predicting the test set results
label_pred = classifier.predict(features_test)

# Making the confusion matrix
cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

# saving the model
classifier.save_model('xgb_model.json')

Confusion Matrix: 
[[237   7  13   1  66  18  33  31  37 101]
 [ 12 384  25  14  10  23   0   5   2   2]
 [ 27  23 333   4  47  29   3  57   2  22]
 [ 13   9  14 431   2   3   0  21   0   2]
 [ 35   5  24   0 338  12   5  25  11  75]
 [ 38  12  34   5  17 334   7  55   9  19]
 [ 22   1   1   0   3   5 240   5 238  27]
 [ 19   1  54  23  23  49   6 290   4  16]
 [ 18   1   1   0   3   1 257   2 176  48]
 [ 56   7   8   0  51   3  11  14  33 352]]
Accuracy: 0.5999614791987673


In [43]:
# Using Random Forest to classify the music genre
from sklearn.ensemble import RandomForestClassifier
import pickle as pkl

classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
classifier.fit(features_train, label_train)

label_pred = classifier.predict(features_test)

cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

pkl.dump(classifier, open('random_forest_model.pkl', 'wb'))

Confusion Matrix: 
[[182   8  17   0  79  30  48  35  35 110]
 [ 21 361  21  36   9  23   0   4   0   2]
 [ 36  40 293   7  55  35   3  55   2  21]
 [ 12  17  11 424   2   9   0  19   0   1]
 [ 58  13  37   1 288  11  12  25  12  73]
 [ 49  29  46  10  27 270  13  59  10  17]
 [ 32   2   2   0   6   3 262   2 208  25]
 [ 35  11  72  29  32  65  12 211   4  14]
 [ 37   1   2   0  10   3 267   3 147  37]
 [ 91   5   9   1  69   7  25  15  28 285]]
Accuracy: 0.5244607087827426


In [44]:
# Using Decision Tree to classify the music genre
from sklearn.tree import DecisionTreeClassifier
import pickle as pkl

classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)
classifier.fit(features_train, label_train)

label_pred = classifier.predict(features_test)

cm = confusion_matrix(label_test, label_pred)
print(f'Confusion Matrix: \n{cm}')

accuracy = accuracy_score(label_test, label_pred)
print(f'Accuracy: {accuracy}')

# Save the model
pkl.dump(classifier, open('decision_tree_model.pkl', 'wb'))

Confusion Matrix: 
[[168  17  36   8  73  48  39  29  47  79]
 [ 12 319  43  34  12  39   0  13   0   5]
 [ 35  40 234  18  66  47   1  85   5  16]
 [ 13  40   9 381   5   9   0  35   0   3]
 [ 67  16  57   7 232  25  10  43  13  60]
 [ 53  34  38  14  22 255  12  70   8  24]
 [ 43   0   5   0  12  14 210  16 224  18]
 [ 43  12  57  36  39  63   8 200   6  21]
 [ 32   1   4   0  25   5 258   2 127  53]
 [ 80   8  34   4  68  11  35  18  45 232]]
Accuracy: 0.45416024653312786
