In [None]:
import os
import numpy as np
import pandas as pd
import absl.logging
from nlp_title_embedding import BertTitle
from nlp_classifier import CNN
from sklearn import preprocessing
from sklearn import metrics

# Raise absl's logging threshold to ERROR so lower-severity absl messages are not shown.
# NOTE(review): presumably meant to quiet output from the imported NLP modules - confirm.
absl.logging.set_verbosity(absl.logging.ERROR)

In [None]:
def train(data_x_lyrics, data_x_title, data_y, nlp_title_embedding, nlp_classifier, label_encoder, dataset_name, epochs=1, batch_size=1000, model_dir='models', start_idx=0):
    '''
    Train a model batch by batch, saving the classifier after every batch.
    Parameters:
        data_x_lyrics (Series): Lyrics of the training data.
        data_x_title (Series): Titles of the training data.
        data_y (Series): Genres of the training data.
        nlp_title_embedding (NLPTitleEmbedding): Instance of the title embedding method class.
        nlp_classifier (NLPClassifier): Instance of the classifier class.
        label_encoder (LabelEncoder): Label encoder after fitting.
        dataset_name (str): Name of the dataset.
        epochs (int): Number of epochs.
        batch_size (int): Number of observations taken for single embedding and partial fitting.
        model_dir (str): Name of directory for saved models.
        start_idx (int): In case of interrupted training, from which observation restart the training.
            Only the first epoch starts there; every later epoch starts from row 0.
    Raises:
        ValueError: If batch_size is not positive.
    '''
    # A negative step would make range() empty and "succeed" without training anything.
    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    print('Training...')
    data_y_enc = label_encoder.transform(data_y)
    classes = np.unique(data_y_enc)
    n_rows = data_x_lyrics.shape[0]

    # Make sure the target directory exists for any model_dir, not only the default one.
    save_dir = os.path.join(model_dir, dataset_name)
    os.makedirs(save_dir, exist_ok=True)
    model_path = os.path.join(save_dir, f'model_{nlp_title_embedding.name}_{nlp_classifier.name}')

    for epoch in range(epochs):
        print(f'Epoch: {epoch + 1}/{epochs}')
        # start_idx only skips already-processed rows of the interrupted (first) epoch.
        first_row = start_idx if epoch == 0 else 0
        for i in range(first_row, n_rows, batch_size):
            j = min(i + batch_size, n_rows)

            print(f'Processing rows: {i} - {j - 1}')

            embeddings = nlp_title_embedding.get_title_lyrics_embedding(data_x_lyrics[i:j], data_x_title[i:j])
            nlp_classifier.partial_fit(embeddings, data_y_enc[i:j], classes=classes)
            # Saved after every batch so an interrupted run can be resumed via start_idx.
            nlp_classifier.save(model_path)

    print('Success!')

In [None]:
def test(data_x_lyrics, data_x_title, nlp_title_embedding, nlp_classifier, label_encoder, dataset_name, batch_size=1000, predictions_dir='predictions', start_idx=0):
    '''
    Test a model and return predictions.
    Parameters:
        data_x_lyrics (Series): Lyrics of the test data.
        data_x_title (Series): Titles of the test data.
        nlp_title_embedding (NLPTitleEmbedding): Instance of the title embedding method class.
        nlp_classifier (NLPClassifier): Instance of the classifier class.
        label_encoder (LabelEncoder): Label encoder after fitting.
        dataset_name (str): Name of the dataset.
        batch_size (int): Number of observations taken for single embedding and prediction.
        predictions_dir (str): Name of directory for predictions.
        start_idx (int): In case of interrupted testing, from which observation restart the testing.
    Returns:
        list: Predictions of genres.
    '''
    print('Testing...')
    fname = os.path.join(predictions_dir, dataset_name, f'model_{nlp_title_embedding.name}_{nlp_classifier.name}.csv')
    predictions_all = []

    if start_idx == 0 and os.path.exists(fname):
        os.remove(fname)
    
    for i in range(start_idx, data_x_lyrics.shape[0], batch_size):

        if i + batch_size > data_x_lyrics.shape[0]:
            j = data_x_lyrics.shape[0]
        else:
            j = i + batch_size
        
        print(f'Processing rows: {i} - {j - 1}')

        embeddings = nlp_title_embedding.get_title_lyrics_embedding(data_x_lyrics[i:j], data_x_title[i:j])
        predictions_enc = nlp_classifier.predict(embeddings)
        predictions = label_encoder.inverse_transform(predictions_enc)
        
        predictions_all.extend(predictions)

        pd.DataFrame(predictions.reshape(-1, 1)).to_csv(fname, mode='a', index=False, header=False)
    
    print('Success!')    
    
    return predictions_all

In [None]:
def get_results(y_true, y_pred):
    '''
    Report classification quality on the test data.
    Prints accuracy, balanced accuracy and the weighted-average F1 score.
    Parameters:
        y_true (Series): True genres of the test data.
        y_pred (Series): Predicted genres of the test data.
    '''
    print('RESULTS:')
    # (label, scoring function, extra keyword arguments); scored and printed in this order.
    scorers = (
        ('accuracy', metrics.accuracy_score, {}),
        ('balanced accuracy', metrics.balanced_accuracy_score, {}),
        ('f1 score', metrics.f1_score, {'average': 'weighted'}),
    )
    for label, scorer, extra in scorers:
        value = scorer(y_true=y_true, y_pred=y_pred, **extra)
        print(f'{label} = {value}')

In [None]:
# Parameters

# First argument of BertTitle and one term of the CNN input size computed further down.
# NOTE(review): presumably the number of lyrics words used per song - confirm against BertTitle.
max_words = 400
# Second argument of BertTitle and the other term of the CNN input size.
# NOTE(review): presumably the number of title words used per song - confirm against BertTitle.
max_words_title = 15
# Selects data/train/<name>.csv and data/test/<name>.csv, and names the
# models/<name> and predictions/<name> output directories.
dataset_name = 'small_musicalgenres'

In [None]:
# Creation of necessary directories

# exist_ok=True keeps this cell safe to re-run and avoids the check-then-create race;
# it still raises if the path exists but is a regular file.
os.makedirs(f'models/{dataset_name}', exist_ok=True)
os.makedirs(f'predictions/{dataset_name}', exist_ok=True)

In [None]:
# Reading training and test data from CSV files
# Both files are selected by dataset_name; the rest of the notebook reads the
# `lyrics`, `title` and `genre` columns of these frames.

train_data = pd.read_csv(f'data/train/{dataset_name}.csv')
test_data = pd.read_csv(f'data/test/{dataset_name}.csv')

In [None]:
# Label encoder fitting

# Distinct genres of the training set; their count is passed to the CNN constructor further down.
genres = np.unique(train_data.genre)
label_encoder = preprocessing.LabelEncoder()
# The encoder is fitted on the training genres only.
label_encoder.fit(genres)

In [None]:
# BERT + CNN + unnormalised lyrics and titles

nlp_title_embedding = BertTitle(max_words, max_words_title)
# CNN arguments: (max_words + max_words_title) * embedding size, number of genres, 'adam'.
# NOTE(review): presumably flattened input length, output classes and optimizer name -
# confirm against nlp_classifier.CNN.
nlp_classifier = CNN((max_words + max_words_title) * nlp_title_embedding.embedding_size, len(genres), 'adam')
# Train for 5 epochs with the default batch size, predict on the test set, then print the metrics.
train(train_data.lyrics, train_data.title, train_data.genre, nlp_title_embedding, nlp_classifier, label_encoder, dataset_name, epochs=5)
y_pred = test(test_data.lyrics, test_data.title, nlp_title_embedding, nlp_classifier, label_encoder, dataset_name)
get_results(test_data.genre, y_pred)