In [1]:
import re
import numpy as np
import pandas as pd

#import tensorflow as tf
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Conv1D
from keras.layers import Embedding, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPool1D
from keras.layers import Dropout
from keras.layers import Flatten
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
nltk.download('stopwords')

#metricas de avaliacao
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")


# Characters that are replaced by a single space before tokenization.
# Raw strings are used so that `\[`, `\]` and `\|` reach the regex engine
# verbatim instead of being treated as (invalid) string escape sequences.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is NOT a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_' is removed outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Set of NLTK English stop words; clean_text() drops every token found here.
# NOTE(review): the dataset columns have Portuguese names ('letras', 'genero');
# confirm that the texts themselves are English before relying on this list.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw text before it is tokenized.

    The text is lowercased, the characters matched by ``REPLACE_BY_SPACE_RE``
    are turned into spaces, the characters matched by ``BAD_SYMBOLS_RE`` are
    removed, and every token present in ``STOPWORDS`` is dropped.

    Args:
        text: the raw value; non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned text as a single space-separated string. Missing values
        yield an empty string.
    """
    # A missing value would otherwise be stringified into the literal token
    # "none" / "nan" and end up in the vocabulary as if it were a real word.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)

    # Remove stop words from the text.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

# Load the dataset; split_feature_label() reads its 'letras' and 'genero' columns.
df = pd.read_csv('../input/music-mar21/rock_mar21.csv')

def split_feature_label(data):
    """Split a dataset into model inputs and targets.

    Args:
        data: table providing a 'letras' column (the texts) and a 'genero'
            column (the labels).

    Returns:
        A ``(X, y)`` pair: the 'letras' column with ``clean_text`` applied to
        every entry, and the untouched 'genero' column.
    """
    cleaned_texts = data['letras'].apply(clean_text)
    labels = data['genero']
    return cleaned_texts, labels

def two_places(value):
    """Return ``value`` rounded to two decimal places, as a ``float``.

    The rounding goes through the ``.2f`` text format and back, so the result
    is always a plain float even when ``value`` is an int.
    """
    return float(format(value, ".2f"))

# Sets the name of the output file (the compilation of all results).
def write(row):
    """Append one comma-separated line to the results file.

    Args:
        row: iterable of values; each one is converted with ``str()``.

    Note:
        Values are joined with ',' without any quoting or escaping, so a value
        that itself contains a comma will shift the columns of that line.
    """
    fields = [str(w) for w in row]
    # Append mode keeps earlier results; the context manager guarantees the
    # handle is closed even if the write itself fails.
    with open("./results_rock_gru_return_sequences_max_pooling_glove.txt", "a") as output:
        output.write(','.join(fields) + "\n")

def write_header(target_names):
    """Write the header line of the results file.

    The header holds six fixed columns, then four columns per class
    (precision, recall, f1_score, samples) in the order of ``target_names``,
    and finally ``total_samples``.

    Args:
        target_names: iterable of class names.
    """
    fixed_columns = ['nn', 'nn_params', 'word_embeddings_params', 'accuracy', 'f1_macro', 'f1_weighted']
    per_class_suffixes = ('precision', 'recall', 'f1_score', 'samples')

    per_class_columns = [
        '{}_{}'.format(target, suffix)
        for target in target_names
        for suffix in per_class_suffixes
    ]

    write(fixed_columns + per_class_columns + ['total_samples'])


def compile_results(nn,nn_params,custom_we,y_test, y_pred, target_names):
    """Compute the classification report of one run and append it to the results file.

    The written row follows the column order produced by ``write_header``:
    the three identification fields, accuracy, macro F1, weighted F1, then
    precision / recall / F1 / support for each class, and the total support.

    Args:
        nn: name of the network architecture.
        nn_params: description of the network hyperparameters.
        custom_we: name of the word embedding used.
        y_test: true labels.
        y_pred: predicted labels.
        target_names: class names, in label order.
    """
    report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)

    row = [
        nn,
        nn_params,
        custom_we,
        two_places(report['accuracy']),
        two_places(report['macro avg']['f1-score']),
        two_places(report['weighted avg']['f1-score']),
    ]

    for target in target_names:
        class_metrics = report[target]
        row.extend([
            two_places(class_metrics['precision']),
            two_places(class_metrics['recall']),
            two_places(class_metrics['f1-score']),
            class_metrics['support'],  # sample count is written unrounded
        ])

    # The support of the 'macro avg' entry is used as the total sample count.
    row.append(report['macro avg']['support'])

    write(row)

# Cleaned texts (X) and raw labels (y) for the whole dataset.
X,y = split_feature_label(df)

# Encode the class labels as a numeric vector.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Runs every time this cell is executed; write() opens the results file in
# append mode, so re-running the cell appends another header line.
write_header(encoder.classes_)

# 70/30 stratified split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded)

num_classes = len(y.unique())
# The tokenizer vocabulary is fitted on the full corpus X (train and test texts).
t = Tokenizer()
t.fit_on_texts(X)
# NOTE(review): the +1 presumably accounts for index 0 being left unused by
# Tokenizer.word_index (padding value) — confirm.
vocab_size = len(t.word_index) + 1
max_length = 200

X_train_tokenized = t.texts_to_sequences(X_train)
X_test_tokenized = t.texts_to_sequences(X_test)

# Pad the text sequences so that they all have the same length.
# This is required so that every sequence can be fed to the neural network.
X_train_final = keras.preprocessing.sequence.pad_sequences(X_train_tokenized,maxlen=max_length, padding='post')
X_test_final = keras.preprocessing.sequence.pad_sequences(X_test_tokenized,maxlen=max_length, padding='post')

from glob import glob


# Two alternative definitions of `custom_word_embeddings` are kept here:
#   1. (disabled) the paths of the embeddings pre-trained on the corpus itself;
#   2. (active) the pre-trained GloVe embeddings.

#custom_word_embeddings = []

#models = ['sg','cbow','fasttext'] 
#window_sizes = [5, 10]
#num_dimensions = [100, 300]
#num_max_epochs = [5, 50]

#for model in models:
    #for window_size in window_sizes:
        #for dim_size in num_dimensions:
            #for max_epochs in num_max_epochs:
                 #custom_word_embeddings.append('../input/embedding-matrix/word embedding weight matrix/rock/custom_rock_{}_{}_{}_{}d.txt'.format(model,max_epochs,window_size,dim_size))


# glob() yields paths in arbitrary, filesystem-dependent order; sorting makes
# the order of the experiments (and of the rows in the results file)
# reproducible from one run to the next.
custom_word_embeddings = sorted(glob('../input/glove-6b/*.txt'))

def _load_embedding_matrix(embedding_path):
    """Build the Embedding-layer weight matrix from a text embedding file.

    Every non-blank line of the file is split on whitespace: the first field
    is the word and the remaining fields are its vector. Row ``i`` of the
    result holds the vector of the word whose index in ``t.word_index`` is
    ``i``; words that do not occur in the file keep an all-zero row.

    Only the vectors of words present in the tokenizer vocabulary are parsed
    and kept, so the whole embedding file is never held in memory.

    Args:
        embedding_path: path of the embedding text file.

    Returns:
        ``numpy`` array of shape ``(vocab_size, num_dim)``, where ``num_dim``
        is the vector length found on the last non-blank line of the file.

    Raises:
        ValueError: if no vector could be read from the file.
    """
    vocabulary = t.word_index
    vectors = {}
    num_dim = 0

    # Read explicitly as UTF-8 so that parsing does not depend on the
    # platform's default encoding; `with` closes the file even on errors.
    with open(embedding_path, encoding='utf-8') as embedding_file:
        for line in embedding_file:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            num_dim = len(values) - 1
            word = values[0]
            if word in vocabulary:
                vectors[word] = np.asarray(values[1:], dtype='float32')

    if num_dim == 0:
        raise ValueError('No embedding vectors found in {}'.format(embedding_path))

    # Create the weight matrix for the words of the tokenizer vocabulary.
    embedding_matrix = np.zeros((vocab_size, num_dim))
    for word, vector in vectors.items():
        embedding_matrix[vocabulary[word]] = vector
    return embedding_matrix


def _embedding_name(embedding_path):
    """Return the embedding file name without directory and without '.txt'."""
    return embedding_path.split('/')[-1].replace('.txt','')


def _train_and_report(nn, nn_params, embedding_name, embedding_matrix, layers):
    """Train one model on the training split and record its test-set results.

    The model is a frozen, pre-initialized ``Embedding`` layer followed by
    ``layers``. After training, the predictions for ``X_test_final`` are
    passed to ``compile_results``.

    Args:
        nn: architecture name written to the results file.
        nn_params: hyperparameter description written to the results file.
        embedding_name: embedding name written to the results file.
        embedding_matrix: ``(vocab_size, num_dim)`` embedding weights.
        layers: Keras layers to stack after the embedding, ending with the
            ``num_classes``-way softmax layer.
    """
    num_dim = embedding_matrix.shape[1]

    model = Sequential()
    model.add(Embedding(vocab_size, num_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
    for layer in layers:
        model.add(layer)

    # The targets are integer class ids and the output is a softmax over the
    # classes, so the matching loss is sparse categorical cross-entropy.
    # Mean squared error against the raw class id is minimized by a uniform
    # output whatever the label is, i.e. it teaches the model nothing.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train_final, y_train, epochs=100, batch_size=32, validation_data=(X_test_final,y_test), verbose=0, shuffle=False)

    # Evaluate the network.
    y_pred = np.argmax(model.predict(X_test_final), axis=-1)

    # Many models are created in a loop: release Keras' global state (which
    # also resets the layer-name counters) so memory does not keep growing.
    keras.backend.clear_session()

    compile_results(nn, nn_params, embedding_name, y_test, y_pred, encoder.classes_)


def _run_recurrent_experiments(nn, layer_label, recurrent_layer, pooling_layer=None):
    """Run the unit-count grid for one recurrent architecture.

    For every embedding file and every unit count in [10, 20, 50, 70, 100] a
    model ``recurrent_layer -> [pooling_layer] -> Dropout(0.5) -> softmax`` is
    trained and reported.

    Args:
        nn: architecture name written to the results file.
        layer_label: label used in the hyperparameter description
            ('LSTM' or 'GRU').
        recurrent_layer: recurrent layer class (``LSTM`` or ``GRU``).
        pooling_layer: optional global pooling layer class. When given, the
            recurrent layer returns the full sequence and the pooling layer
            reduces it; otherwise only the last output is used.
    """
    units_list = [10,20,50,70,100]

    for word_embeddings in custom_word_embeddings:
        # Loading depends only on the file, so it is done once per embedding
        # rather than once per hyperparameter combination.
        embedding_matrix = _load_embedding_matrix(word_embeddings)
        custom_embedding_name = _embedding_name(word_embeddings)
        num_dim = embedding_matrix.shape[1]

        for units in units_list:
            nn_params = "Embedding_out_dim = {}| {}_units={}".format(num_dim, layer_label, units)
            print(nn_params)

            layers = [recurrent_layer(units, return_sequences=pooling_layer is not None)]
            if pooling_layer is not None:
                layers.append(pooling_layer())
            layers.append(Dropout(0.5))
            layers.append(Dense(num_classes, activation='softmax'))

            _train_and_report(nn, nn_params, custom_embedding_name, embedding_matrix, layers)


def cnn():
    """Run the CNN grid: Conv1D filters x kernel sizes, for every embedding file."""
    filters_list = [50,100]
    kernel_size_list = [2,3,4,5]

    for word_embeddings in custom_word_embeddings:
        # Loaded once per embedding file (independent of the hyperparameters).
        embedding_matrix = _load_embedding_matrix(word_embeddings)
        custom_embedding_name = _embedding_name(word_embeddings)
        num_dim = embedding_matrix.shape[1]

        for conv1d_filter in filters_list:
            for conv1d_kernel in kernel_size_list:
                nn_params = "Embedding_out_dim = {}| Conv1D_filters={}| Conv1D_kernel_size={}".format(num_dim, conv1d_filter, conv1d_kernel)

                layers = [
                    Conv1D(conv1d_filter, conv1d_kernel, activation='relu'),
                    MaxPooling1D(3),
                    Flatten(),
                    Dense(num_classes, activation='softmax', kernel_regularizer=keras.regularizers.L2(0.001)),
                ]

                _train_and_report('cnn', nn_params, custom_embedding_name, embedding_matrix, layers)


def lstm():
    """Run the LSTM grid (last output only)."""
    _run_recurrent_experiments('lstm', 'LSTM', LSTM)


def lstm_return_sequences_average_pooling():
    """Run the LSTM grid with full sequences reduced by global average pooling."""
    _run_recurrent_experiments('lstm_return_sequences_average_pooling', 'LSTM', LSTM, GlobalAveragePooling1D)


def lstm_return_sequences_max_pooling():
    """Run the LSTM grid with full sequences reduced by global max pooling."""
    _run_recurrent_experiments('lstm_return_sequences_max_pooling', 'LSTM', LSTM, GlobalMaxPool1D)


def gru():
    """Run the GRU grid (last output only)."""
    _run_recurrent_experiments('gru', 'GRU', GRU)


def gru_return_sequences_average_pooling():
    """Run the GRU grid with full sequences reduced by global average pooling."""
    _run_recurrent_experiments('gru_return_sequences_average_pooling', 'GRU', GRU, GlobalAveragePooling1D)


def gru_return_sequences_max_pooling():
    """Run the GRU grid with full sequences reduced by global max pooling."""
    _run_recurrent_experiments('gru_return_sequences_max_pooling', 'GRU', GRU, GlobalMaxPool1D)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
gru_return_sequences_max_pooling()

Embedding_out_dim = 300| GRU_units=10
Embedding_out_dim = 300| GRU_units=20
Embedding_out_dim = 300| GRU_units=50
Embedding_out_dim = 300| GRU_units=70
Embedding_out_dim = 300| GRU_units=100
Embedding_out_dim = 100| GRU_units=10
Embedding_out_dim = 100| GRU_units=20
Embedding_out_dim = 100| GRU_units=50
Embedding_out_dim = 100| GRU_units=70
Embedding_out_dim = 100| GRU_units=100
