In [None]:
import os
import numpy as np
import pandas as pd
import absl.logging
from sklearn.metrics import accuracy_score
from nlp_embedding import GloVe
from nlp_classifier import NaiveBayes, SVM, XGBoost, CNN
from sklearn import preprocessing
# Some errors are reported when saving the CNN, but it seems to work.
# Set absl's logging verbosity to ERROR.
absl.logging.set_verbosity(absl.logging.ERROR)

In [None]:
# Fixed list of genre labels; the encoder is fitted on this list rather than
# on the labels found in the data.
genres = [
    'Country',
    'Electronic',
    'Folk',
    'Hip-Hop',
    'Indie',
    'Jazz',
    'Metal',
    'Other',
    'Pop',
    'R&B',
    'Rock',
]
# fit() returns the fitted encoder itself, so construction and fitting chain.
label_encoder = preprocessing.LabelEncoder().fit(genres)
label_encoder

In [None]:
# Make sure the default output directories of train() ('models') and test()
# ('predictions') exist. exist_ok=True makes the call a no-op when the
# directory is already there and avoids the check-then-create race.
os.makedirs('models', exist_ok=True)
os.makedirs('predictions', exist_ok=True)

In [None]:
def train(data_x, data_y, nlp_embedding, nlp_classifier, label_encoder, batch_size=1000, model_dir='models', start_idx=0):
    """Incrementally fit ``nlp_classifier`` on lyrics, one batch at a time.

    Every batch of ``data_x`` is embedded with ``nlp_embedding.embed_lyrics``
    and handed to ``nlp_classifier.partial_fit`` together with the matching
    slice of the encoded labels. The classifier is saved after each batch.
    NOTE(review): together with ``start_idx`` this presumably allows an
    interrupted run to be resumed - confirm with the classifier's
    ``save``/``load`` implementation.

    Args:
        data_x: Lyrics to train on; must expose ``.shape[0]`` and support
            ``[i:j]`` slicing (the notebook passes a pandas Series).
        data_y: Labels aligned with ``data_x``; passed as a whole to
            ``label_encoder.transform``.
        nlp_embedding: Object with a ``name`` attribute and an
            ``embed_lyrics(batch)`` method.
        nlp_classifier: Object with a ``name`` attribute and
            ``partial_fit(embeddings, labels, classes=...)`` and
            ``save(path)`` methods.
        label_encoder: Fitted encoder providing ``transform``.
        batch_size: Number of rows per batch; must be positive.
        model_dir: Directory the model is saved into; created if missing.
        start_idx: Index of the first row to process.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make range() empty and the function would report
    # success without training anything; fail loudly instead.
    if batch_size <= 0:
        raise ValueError(f'batch_size must be positive, got {batch_size}')

    print('Training...')
    data_y_enc = label_encoder.transform(data_y)
    # The same class list is passed with every batch, including batches that
    # do not contain every label.
    classes = np.unique(data_y_enc)
    n_rows = data_x.shape[0]

    # Create the target directory up front so a custom model_dir does not make
    # save() fail after the first batch has already been fitted. An empty
    # model_dir means "current directory" and needs no creation.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    for i in range(start_idx, n_rows, batch_size):
        # The last batch may be shorter than batch_size.
        j = min(i + batch_size, n_rows)

        print(f'Processing rows: {i} - {j - 1}')

        embeddings = nlp_embedding.embed_lyrics(data_x[i:j])
        nlp_classifier.partial_fit(embeddings, data_y_enc[i:j], classes=classes)
        # Save after every batch.
        nlp_classifier.save(os.path.join(model_dir, f'model_{nlp_embedding.name}_{nlp_classifier.name}'))

    print('Success!')

In [None]:
def test(data_x, nlp_embedding, nlp_classifier, label_encoder, batch_size=1000, predictions_dir='predictions', start_idx=0):
    print('Testing...')
    fname = os.path.join(predictions_dir, f'model_{nlp_embedding.name}_{nlp_classifier.name}.csv')
    predictions_all = []

    if start_idx == 0 and os.path.exists(fname):
        os.remove(fname)
    
    for i in range(start_idx, data_x.shape[0], batch_size):

        if i + batch_size > data_x.shape[0]:
            j = data_x.shape[0]
        else:
            j = i + batch_size
        
        print(f'Processing rows: {i} - {j - 1}')

        embeddings = nlp_embedding.embed_lyrics(data_x[i:j])
        predictions_enc = nlp_classifier.predict(embeddings)
        predictions = label_encoder.inverse_transform(predictions_enc)
        
        predictions_all.extend(predictions)

        pd.DataFrame(predictions.reshape(-1, 1)).to_csv(fname, mode='a', index=False, header=False)
    
    print('Success!')    
    
    return predictions_all

In [None]:
# Word-count setting shared by the experiments below: it is passed to
# GloVe(...) and multiplied by 100 to size the CNN input vector.
# NOTE(review): presumably the maximum number of words embedded per lyric -
# confirm in nlp_embedding.
max_words = 400

In [None]:
# Load the train and test metrolyrics CSV files. The cells below read the
# `lyrics` and `genre` columns from both frames.
train_data = pd.read_csv('data/train/metrolyrics.csv')
test_data = pd.read_csv('data/test/metrolyrics.csv')

In [None]:
# Keep only the first 2000 rows of each frame (positional selection).
train_data = train_data.iloc[:2000]
test_data = test_data.iloc[:2000]

In [None]:
# Experiment: NaiveBayes classifier on GloVe embeddings.
# Fit on train_data, predict genres for test_data, and show the accuracy
# against the true genres as the cell's output.
nlp_classifier = NaiveBayes()
nlp_embedding = GloVe(max_words)
train(train_data.lyrics, train_data.genre, nlp_embedding, nlp_classifier, label_encoder)
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)

In [None]:
# Load classifier state from disk and evaluate on test_data again, reusing
# nlp_classifier and nlp_embedding from the previous cell.
# NOTE(review): the hard-coded path assumes train() saved the model under the
# names 'glove' / 'naive-bayes' and that save() appended '.joblib' - confirm
# in nlp_embedding / nlp_classifier.
nlp_classifier.load('models/model_glove_naive-bayes.joblib')
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)

In [None]:
# Experiment: SVM classifier on GloVe embeddings.
# Fit on train_data, predict genres for test_data, and show the accuracy
# against the true genres as the cell's output.
nlp_classifier = SVM()
nlp_embedding = GloVe(max_words)
train(train_data.lyrics, train_data.genre, nlp_embedding, nlp_classifier, label_encoder)
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)

In [None]:
# Load classifier state from disk and evaluate on test_data again, reusing
# nlp_classifier and nlp_embedding from the previous cell.
# NOTE(review): the hard-coded path assumes train() saved the model under the
# names 'glove' / 'svm' and that save() appended '.joblib' - confirm in
# nlp_embedding / nlp_classifier.
nlp_classifier.load('models/model_glove_svm.joblib')
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)

In [None]:
# Experiment: XGBoost classifier (constructed with boost_iter=20) on GloVe
# embeddings. Fit on train_data, predict genres for test_data, and show the
# accuracy against the true genres as the cell's output.
# NOTE(review): boost_iter is presumably the number of boosting iterations -
# confirm in nlp_classifier.
nlp_classifier = XGBoost(boost_iter=20)
nlp_embedding = GloVe(max_words)
train(train_data.lyrics, train_data.genre, nlp_embedding, nlp_classifier, label_encoder)
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)

In [None]:
# Load classifier state from disk and evaluate on test_data again, reusing
# nlp_classifier and nlp_embedding from the previous cell.
# NOTE(review): the hard-coded path assumes train() saved the model under the
# names 'glove' / 'xgboost' and that save() appended '.json' - confirm in
# nlp_embedding / nlp_classifier.
nlp_classifier.load('models/model_glove_xgboost.json')
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)

In [None]:
# Experiment: CNN classifier on GloVe embeddings.
# vector_length = max_words * embedding_length
# NOTE(review): the literal 100 is presumably the GloVe embedding length -
# confirm in nlp_embedding. The remaining CNN arguments are the number of
# genres and the string "adam" (presumably the optimizer name - confirm in
# nlp_classifier).
nlp_classifier = CNN(max_words * 100, len(genres), "adam")
nlp_embedding = GloVe(max_words)
train(train_data.lyrics, train_data.genre, nlp_embedding, nlp_classifier, label_encoder)
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)

In [None]:
# Load classifier state from disk and evaluate on test_data again, reusing
# nlp_classifier and nlp_embedding from the previous cell.
# NOTE(review): the hard-coded path assumes train() saved the model under the
# names 'glove' / 'cnn' with no file extension - confirm in nlp_embedding /
# nlp_classifier.
nlp_classifier.load('models/model_glove_cnn')
p = test(test_data.lyrics, nlp_embedding, nlp_classifier, label_encoder)
accuracy_score(test_data.genre.values, p)