In [1]:
# !pip install xgboost

In [2]:
import pandas as pd
import itertools
import os
import numpy as np
import xgboost as xgb

from johnsnowlabs import nlu
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from joblib import dump, load
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [3]:
glove_pipe = nlu.load('glove')

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [4]:
data = pd.read_csv('data/train/metrolyrics.csv')

In [5]:
train_data = data[0:1000]
test_data = data[0:100]

In [6]:
max_words = 200
embedding_size = 100
model_dir = "model"
# pred_dir = "predictions"

In [19]:
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# if not os.path.exists(pred_dir):
#     os.makedirs(pred_dir)

In [8]:
def proc_emb(x):
    x.resize((max_words, embedding_size), refcheck=False)
    x = x.flatten()
    return x

In [9]:
def embed_lyrics(data, emb_pipe):
    embeddings = emb_pipe.predict(data, output_level='document')
    return np.array([proc_emb(x) for x in embeddings.word_embedding_glove])

In [10]:
class NaiveBayes:
    name = "naive-bayes"
    def __init__(self):
        self.model = GaussianNB()
    
    def partial_fit(self, X, Y, classes):
        self.model.partial_fit(X, Y, classes)
    
    def predict(self, X):
        return self.model.predict(X)
    
    def save(self, filename):
        dump(self.model, filename)
    
    def load(self, filename):
        self.model = load(filename)


In [11]:
class SVM:
    name = "svm"
    def __init__(self):
        self.model = SGDClassifier()
    
    def partial_fit(self, X, Y, classes):
        self.model.partial_fit(X, Y, classes)
    
    def predict(self, X):
        return self.model.predict(X)
    
    def save(self, filename):
        dump(self.model, filename)
    
    def load(self, filename):
        self.model = load(filename)

In [12]:
class XGBoost:
    name = "xgboost"
    
    def __init__(self):
        self.params = {'objective': 'multi:softmax'}
        self.model = None
        self.boost_iter = 30
        self.le = preprocessing.LabelEncoder()
    
    def partial_fit(self, X, Y, classes):
        
        if self.model is None:
            self.le.fit(classes)
            data = xgb.DMatrix(X, label=self.le.transform(Y))
            self.params['num_class'] = len(classes)
            self.model = xgb.train(self.params, data, self.boost_iter)
        else:
            data = xgb.DMatrix(X, label=self.le.transform(Y))
            self.model = xgb.train(self.params, data, self.boost_iter, xgb_model=self.model)
    
    def predict(self, X):
        return self.le.inverse_transform(self.model.predict(xgb.DMatrix(X)).astype(int))
    
    def save(self, filename):
        self.model.save_model(filename)
    
    def load(self, filename):
        self.model = xgb.Booster()
        self.model.load_model(filename)

In [13]:
# TODO: class for CNN

In [14]:
def train(data_x, data_y, emb_pipe, emb_name, model, batch_size=100):
    print(f"Training...")
    classes = np.unique(data_y)
    for i in range(0, data_x.shape[0], batch_size):

        if i + batch_size > data_x.shape[0]:
            j = data_x.shape[0]
        else:
            j = i + batch_size
        
        print(f"Processing rows: {i} - {j - 1}")

        embeddings = embed_lyrics(data_x[i:j], emb_pipe)
        model.partial_fit(embeddings, data_y[i:j], classes=classes)
        # Zapisywać w nazwie j? Albo j-1 zamiast j?
        model.save(os.path.join(model_dir, f'model-{model.name}-{emb_name}-{j}.joblib'))
    print(f"Success!")


In [15]:
def test(data_x, emb_pipe, emb_name, model, batch_size=100):
    print(f"Testing...")
    predictions_all = []
    for i in range(0, data_x.shape[0], batch_size):

        if i + batch_size > data_x.shape[0]:
            j = data_x.shape[0]
        else:
            j = i + batch_size
        
        print(f"Processing rows: {i} - {j - 1}")

        embeddings = embed_lyrics(data_x[i:j], emb_pipe)
        
        predictions = model.predict(embeddings)
        predictions_all.extend(predictions)
        # Można to zapisywać po każdym batchu ale przydałoby się też gdzieś mądrze to usuwać być może
        # pd.DataFrame(predictions.reshape(-1, 1)).to_csv(os.path.join(pred_dir, f'model-{model.name}-{emb_name}.csv'), mode='a', index=False, header=False)
    print(f"Success!")    
    return predictions_all

In [16]:
model_nb = NaiveBayes()
train(train_data.lyrics, train_data.genre, glove_pipe, "glove", model_nb)
p = test(test_data.lyrics, glove_pipe, "glove", model_nb)
accuracy_score(test_data.genre.values, p)

Training...
Processing rows: 0 - 99
Processing rows: 100 - 199
Processing rows: 200 - 299
Processing rows: 300 - 399
Processing rows: 400 - 499
Processing rows: 500 - 599
Processing rows: 600 - 699
Processing rows: 700 - 799
Processing rows: 800 - 899
Processing rows: 900 - 999
Success!
Testing...
Processing rows: 0 - 99
Success!


0.38

In [17]:
model_svm = SVM()
train(train_data.lyrics, train_data.genre, glove_pipe, "glove", model_svm)
p = test(test_data.lyrics, glove_pipe, "glove", model_svm)
accuracy_score(test_data.genre.values, p)

Training...
Processing rows: 0 - 99
Processing rows: 100 - 199
Processing rows: 200 - 299
Processing rows: 300 - 399
Processing rows: 400 - 499
Processing rows: 500 - 599
Processing rows: 600 - 699
Processing rows: 700 - 799
Processing rows: 800 - 899
Processing rows: 900 - 999
Success!
Testing...
Processing rows: 0 - 99
Success!


0.62

In [18]:
model_xgb = XGBoost()
train(train_data.lyrics, train_data.genre, glove_pipe, "glove", model_xgb)
p = test(test_data.lyrics, glove_pipe, "glove", model_xgb)
accuracy_score(test_data.genre.values, p)

Training...
Processing rows: 0 - 99
Processing rows: 100 - 199
Processing rows: 200 - 299
Processing rows: 300 - 399
Processing rows: 400 - 499
Processing rows: 500 - 599
Processing rows: 600 - 699
Processing rows: 700 - 799
Processing rows: 800 - 899
Processing rows: 900 - 999
Success!
Testing...
Processing rows: 0 - 99
Success!


0.64