# Cabecera
* Importación de librerías
* Definición de variables globales

In [1]:
import sys
import numpy as np
import re
from numpy.random import rand as np_rand
from numpy.random import seed as np_seed
from numpy import array as np_array
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense
from keras.preprocessing import sequence
from sklearn.svm.classes import LinearSVC
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [2]:
TWEET_TOKENIZER = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
CLASSES = []
EMB_SEP_CHAR = " "
RE_TOKEN_USER = re.compile(r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")

# Lectura de Datos
* Lectura corpus
* Lectura embeddings

In [3]:
def read_corpus(path):
    '''Load the corpus into memory
    '''
    
    ids = []
    labels = []
    tweets = []
    ids_append = ids.append
    classes_append = CLASSES.append
    labels_append = labels.append
    tweets_append = tweets.append
    with(open(path, 'r', encoding='utf-8')) as input_file:
        own_split = str.split
        own_strip = str.strip
        input_file.readline()
        for buffer in input_file:
            buffer_fields = own_split(buffer, ';;;')
            ids_append(own_strip(buffer_fields[0]))
            label = own_strip(buffer_fields[4])
            if(label not in CLASSES):
                classes_append(label)
            labels_append(CLASSES.index(label))
            tweets_append(own_strip(buffer_fields[-1]))
    
    return(ids, labels, tweets)

In [4]:
def read_embeddings(path, offset):
    """Load embeddings file.
    """
    word_embeddings = [[] for i in range(offset)]
    word_indexes = {}
    with open(path, "r", encoding="utf-8") as emb_file:
        emb_file.readline()
        for line in emb_file:
            fields = line.partition(EMB_SEP_CHAR)
            word = fields[0].strip()
            own_strip = str.strip
            emb_values = np_array([float(x) for x in own_strip(fields[-1]).split(EMB_SEP_CHAR)])
            word_indexes[word] = len(word_embeddings)
            word_embeddings.append(emb_values)

    return (word_embeddings, word_indexes)

# Utilidades
* Función para tokenizar.
* Función para generar entrada RNN
* Función para generar entrada RNN con embeddings pre-entrenados.

In [5]:
def tokenize(text):
    """Tokenize an input text
    
    Args:
        text: A String with the text to tokenize
    
    Returns:
        A list of Strings (tokens)
    """
    text_tokenized = TWEET_TOKENIZER.tokenize(text)
    return text_tokenized

In [6]:
def fit_transform_vocabulary(corpus):
    """Creates the vocabulary of the corpus
    
    Args:
        corpus: A list os str (documents)
        
    Returns:
        A tuple whose first element is a dictionary word-index and the second
        element is a list of list in which each position is the index of the 
        token in the vocabulary
    """
    
    vocabulary = {}
    corpus_indexes = []
    corpus_indexes_append = corpus_indexes.append
    index = 2
    for doc in corpus:
        doc_indexes = []
        tokens = tokenize(doc)
        for token in tokens:
            if token not in vocabulary:
                vocabulary[token] = index
                doc_indexes.append(index)
                index += 1
            else:
                doc_indexes.append(vocabulary[token])
        
        corpus_indexes_append(doc_indexes)
        
    return (vocabulary, corpus_indexes)

In [7]:
def fit_transform_vocabulary_pretrain_embeddings(corpus, pre_embeddings_index):
    """Creates the vocabulary of the corpus.
        Index 0: padding
        Index 1: OOV.
    
    Args:
        corpus: A list os str (documents)
        
    Returns:
        A tuple whose first element is a dictionary word-index and the second
        element is a list of list in which each position is the index of the 
        token in the vocabulary
    """
    
    vocabulary = {}
    corpus_indexes = []
    corpus_indexes_append = corpus_indexes.append
    index = 0
    own_lowercase = str.lower
    for doc in corpus:
        doc_indexes = []
        tokens = tokenize(own_lowercase(doc))
        for token in tokens:
            if RE_TOKEN_USER.fullmatch(token):
                token = "@user"
            if token in pre_embeddings_index:
                index = pre_embeddings_index[token]
                doc_indexes.append(index)
                if token not in vocabulary:
                    vocabulary[token] = index
            else:
                index = 1
                doc_indexes.append(index)
                if token not in vocabulary:
                    vocabulary[token] = index
        corpus_indexes_append(doc_indexes)
        
    return (vocabulary, corpus_indexes)

# Clasificadores


*   SVM con TF-IDF
*   RNN-LSTM con TF-IDF
*   RNN-LSTM con embeddings aleatorios
*   RNN-LSTM con embeddings pre-entrenados (FastText)



In [8]:
def classification_linear_svm(tweets, train_index, test_index, labels_train, random_state=None):
    """Classifies using SVM as classifier
    """
    
    
    #Representation
    tfidf_parser = TfidfVectorizer(tokenizer=tokenize, lowercase=False, analyzer='word')
    tweets_train = [tweets[tweet_index] for tweet_index in train_index]
    tweets_test = [tweets[tweet_index] for tweet_index in test_index]
    
    train_sparse_matrix_features_tfidf = tfidf_parser.fit_transform(tweets_train)
    test_sparse_matrix_features_tfidf = tfidf_parser.transform(tweets_test)
    
    
    classifier = LinearSVC(multi_class="ovr", random_state=random_state)
    print("Start SVM training")
    classifier = classifier.fit(train_sparse_matrix_features_tfidf, labels_train)
    print("Finish SVM training")
    y_labels = classifier.predict(test_sparse_matrix_features_tfidf)
    
    return y_labels

In [9]:
def classification_tfidf_rnn(tweets, train_index, test_index, labels_train, random_state=None):
    """Classification using a RNN with tfidf as features
    """
    np_seed(random_state)
    #Representation
    tfidf_parser = TfidfVectorizer(tokenizer=tokenize, lowercase=False, analyzer='word')
    tweets_train = [tweets[tweet_index] for tweet_index in train_index]
    tweets_test = [tweets[tweet_index] for tweet_index in test_index]
    
    train_sparse_matrix_features_tfidf = tfidf_parser.fit_transform(tweets_train)
    test_sparse_matrix_features_tfidf = tfidf_parser.transform(tweets_test)
    
    train_features_tfidf = []
    own_train_features_tfidf_append = train_features_tfidf.append
    lengths_tweets = []
    own_lengths_tweets_append = lengths_tweets.append
    
    for tweet in train_sparse_matrix_features_tfidf:
        own_train_features_tfidf_append(tweet.data)
        own_lengths_tweets_append(len(tweet.data))
    

    test_features_tfidf = [tweet.data for tweet in test_sparse_matrix_features_tfidf]
    #Average length
    max_len_input = int(np.average(lengths_tweets, 0))
    #lstm_output_dim = int(2**np.log2(max_len_input))
    
    #NN model
    nn_model = Sequential()
    nn_model.add(LSTM(64, input_shape=(max_len_input,1)))
    nn_model.add(Dense(32, activation='tanh'))
    nn_model.add(Dense(len(CLASSES), activation='softmax'))
    nn_model.compile(optimizer="adam", 
                     loss="sparse_categorical_crossentropy", 
                     metrics=["accuracy"])
    
    print("Summary of the model")
    print("Training samples:\t{}".format(train_sparse_matrix_features_tfidf.shape[0]))
    print("Training features:\t{}".format(train_sparse_matrix_features_tfidf.shape[-1]))
    print("Training parameters:\t{}".format(train_sparse_matrix_features_tfidf.shape[0]*train_sparse_matrix_features_tfidf.shape[-1]))
    print(nn_model.summary())

    train_features_tfidf_pad = sequence.pad_sequences(train_features_tfidf, maxlen=max_len_input, padding="post", truncating="post", dtype=train_sparse_matrix_features_tfidf.dtype)
    train_features_tfidf_pad = np.expand_dims(train_features_tfidf_pad, axis=-1)
    print("Start RNN LSTM training")
    nn_model.fit(train_features_tfidf_pad, labels_train, batch_size=32, epochs=10, verbose=1)
    print("Finish RNN LSTM training")
    test_features_tfidf_pad = sequence.pad_sequences(test_features_tfidf, maxlen=max_len_input, padding="post", truncating="post", dtype=test_sparse_matrix_features_tfidf.dtype)
    test_features_tfidf_pad = np.expand_dims(test_features_tfidf_pad, axis=-1)
    y_labels = nn_model.predict_classes(test_features_tfidf_pad, batch_size=32, verbose=1)
    
    return y_labels

In [10]:
def classifcation_embedings_rnn(tweets, train_index, test_index, labels_train, random_state=None):
    """Classification with RNN and embedings (no pre-trained)
    """
    np_seed(random_state)
    tweets_train = [tweets[tweet_index] for tweet_index in train_index]
    tweets_test = [tweets[tweet_index] for tweet_index in test_index]

    #Build vocabulary and corpus indexes
    vocabulary_train, corpus_train_index = fit_transform_vocabulary(tweets_train)
    
    
    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
    
    corpus_test_index = []
    own_corpus_test_index_append = corpus_test_index.append
    for tweet_test in tweets_test:
        tokens_test = tokenize(tweet_test)
        own_corpus_test_index_append([vocabulary_train.get(token_test, 1) for token_test in tokens_test])
    
    nn_model = Sequential()
    nn_model.add(Embedding(len(vocabulary_train)+2, 100, input_length=max_len_input, trainable=False))
    nn_model.add(LSTM(64))
    nn_model.add(Dense(32, activation='tanh'))
    nn_model.add(Dense(len(CLASSES), activation='softmax'))
    nn_model.compile(optimizer="adam", 
                     loss="sparse_categorical_crossentropy", 
                     metrics=["accuracy"])
    
    print("Summary of the model")
    print("Training samples:\t{}".format(len(tweets_train)))
    print("Training features (vocabulary):\t{}".format(len(vocabulary_train)))
    print("Training doc x features:\t{}".format(len(tweets_train)*len(vocabulary_train)))
    print("Training vocabulary embeddings:\t{}".format(len(vocabulary_train)*100))
    print("Training parameters:\t{}".format(len(tweets_train)*len(vocabulary_train)*100))
    print(nn_model.summary())
    

    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
    
    
    print("Start RNN EMBEDDING LSTM training")
    nn_model.fit(train_features_pad, labels_train, batch_size=32, epochs=25, verbose=1)
    print("Finish RNN EMBEDDING LSTM training")
    test_features_pad = sequence.pad_sequences(corpus_test_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_test_index[0][0]))
    y_labels = nn_model.predict_classes(test_features_pad, batch_size=32, verbose=1)
    return y_labels

In [11]:
def classifcation_pretrain_embedings_rnn(tweets, train_index, test_index, labels_train, embeddings_path, random_state=None):
    """Classification with RNN and embedings (no pre-trained)
    """
    
    tweets_train = [tweets[tweet_index] for tweet_index in train_index]
    tweets_test = [tweets[tweet_index] for tweet_index in test_index]

    #Offset = 2; Padding and OOV.
    print("Begin loading embeddings.")
    word_embeddings, word_emb_indexes = read_embeddings(embeddings_path, 2)
    print("End loading embeddings.")
    np_seed(random_state)
    word_embeddings[0] = 2 * 0.1 * np_rand(len(word_embeddings[2])) - 1
    word_embeddings[1] = 2 * 0.1 * np_rand(len(word_embeddings[2])) - 1

    #Build vocabulary and corpus indexes
    vocabulary_train, corpus_train_index = fit_transform_vocabulary_pretrain_embeddings(tweets_train, word_emb_indexes)
    
    
    max_len_input = int(np.average([len(tweet_train) for tweet_train in corpus_train_index], 0))
    
    corpus_test_index = []
    own_corpus_test_index_append = corpus_test_index.append
    own_lowercase = str.lower
    for tweet_test in tweets_test:
        tokens_test = tokenize(own_lowercase(tweet_test))
        doc_test_index = []
        for token_test in tokens_test:
            if RE_TOKEN_USER.fullmatch(token_test) is not None:
                token_test = "@user"
            doc_test_index.append(word_emb_indexes.get(token_test, 1))
        own_corpus_test_index_append(doc_test_index)
    
    
    nn_model = Sequential()
    nn_model.add(Embedding(len(word_embeddings), len(word_embeddings[0]), weights=[np_array(word_embeddings)], input_length=max_len_input, trainable=False))
    nn_model.add(LSTM(64))
    nn_model.add(Dense(32, activation='tanh'))
    nn_model.add(Dense(len(CLASSES), activation='softmax'))
    nn_model.compile(optimizer="adam", 
                     loss="sparse_categorical_crossentropy", 
                     metrics=["accuracy"])
    
    print("Summary of the model")
    print("Training samples:\t{}".format(len(tweets_train)))
    print("Training features (vocabulary/embeddings):\t{}".format(len(word_embeddings)))
    print("Training doc x features:\t{}".format(len(tweets_train)*len(word_embeddings)))
    print("Training vocabulary embeddings:\t{}".format(len(word_embeddings)*100))
    print("Training input features:\t{}".format(len(tweets_train)*max_len_input))
    print("Training input features:\t{}".format(len(tweets_train)*max_len_input*100))
    print("Training parameters:\t{}".format(len(tweets_train)*len(word_embeddings)*100))
    print(nn_model.summary())
    

    train_features_pad = sequence.pad_sequences(corpus_train_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_train_index[0][0]))
    
    
    print("Start RNN PRETRAIN EMBEDDING LSTM training")
    nn_model.fit(train_features_pad, labels_train, batch_size=32, epochs=25, verbose=1)
    print("Finish RNN PRETRAIN EMBEDDING LSTM training")
    test_features_pad = sequence.pad_sequences(corpus_test_index, maxlen=max_len_input, padding="post", truncating="post", dtype=type(corpus_test_index[0][0]))
    y_labels = nn_model.predict_classes(test_features_pad, batch_size=32, verbose=1)
    return y_labels

# Evaluación

In [12]:
def calculate_quality_performamnce(y_labels, y_classified_labels, model_name):
    
    classes_index = [CLASSES.index(c) for c in CLASSES]
    accruacy = metrics.accuracy_score(y_labels, y_classified_labels)
    macro_precision = metrics.precision_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_recall = metrics.recall_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    macro_f1 = metrics.f1_score(y_labels, y_classified_labels, labels=classes_index, average="macro")
    
    print("\n*** Results " + model_name + "***")
    print("Macro-Precision: " + str(macro_precision))
    print("Macro-Recall: " + str(macro_recall))
    print("Macro-F1: " + str(macro_f1))
    print("Accuracy: " + str(accruacy))

# Ejecución

Configuración semilla aleatoria

In [13]:
np.random.seed(seed=7)

In [14]:
input_file_path="tass14_general_corpus_train.csv"
input_embeddings_path="fasttext_spanish_twitter_100d.vec"

Leer corpus

In [15]:
ids, labels, tweets = read_corpus(input_file_path)

Partición entrenamiento y test

In [16]:
train_index, test_index = train_test_split(np.arange(len(tweets)), test_size=0.2, random_state=7)
labels_train = [labels[tweet_index] for tweet_index in train_index]
labels_test = [labels[tweet_index] for tweet_index in test_index]

Ejecución clasificadores

In [17]:
y_labels_svn = classification_linear_svm(tweets, train_index, test_index, labels_train, 7)

Start SVM training
Finish SVM training


In [18]:
y_labels_rnn = classification_tfidf_rnn(tweets, train_index, test_index, labels_train)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Summary of the model
Training samples:	5774
Training features:	18799
Training parameters:	108545426
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 64)                16896     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 132       
Total params: 19,108
Trainable params: 19,108
Non-trainable params: 0
_________________________________________________________________
None
Start RNN LSTM training
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [19]:
y_labels_embeddings_rnn = classifcation_embedings_rnn(tweets, train_index, test_index, labels_train)

Summary of the model
Training samples:	5774
Training features (vocabulary):	18799
Training doc x features:	108545426
Training vocabulary embeddings:	1879900
Training parameters:	10854542600
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 19, 100)           1880100   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 132       
Total params: 1,924,552
Trainable params: 44,452
Non-trainable params: 1,880,100
_________________________________________________________________
None
Start RNN EMBEDDING LSTM training
Epoch 1/25
Epoch 2/25
Epoch 3/

In [20]:
y_labels_pretrain_embeddings_rnn = classifcation_pretrain_embedings_rnn(tweets, train_index, test_index, labels_train,input_embeddings_path,7)

Begin loading embeddings.
End loading embeddings.
Summary of the model
Training samples:	5774
Training features (vocabulary/embeddings):	513342
Training doc x features:	2964036708
Training vocabulary embeddings:	51334200
Training input features:	109706
Training input features:	10970600
Training parameters:	296403670800
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 19, 100)           51334200  
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 132       
Total params: 51,378,652
Trainable params: 44,452
Non-trainable params: 51,334,200
__

In [21]:
calculate_quality_performamnce(labels_test, y_labels_svn, "SVM")
calculate_quality_performamnce(labels_test, y_labels_rnn, "RNN_LSTM")
calculate_quality_performamnce(labels_test, y_labels_embeddings_rnn, "RNN_EMBEDINGS_LSTM")
calculate_quality_performamnce(labels_test, y_labels_pretrain_embeddings_rnn, "RNN_PRETRAIN_EMBEDINGS_LSTM")


*** Results SVM***
Macro-Precision: 0.4383837005174791
Macro-Recall: 0.4231233155582803
Macro-F1: 0.41892876726107364
Accuracy: 0.5533240997229917

*** Results RNN_LSTM***
Macro-Precision: 0.3449534569476766
Macro-Recall: 0.30921803034448014
Macro-F1: 0.2449266958839656
Accuracy: 0.43005540166204986

*** Results RNN_EMBEDINGS_LSTM***
Macro-Precision: 0.4226924587486431
Macro-Recall: 0.38224974868015993
Macro-F1: 0.372017775860445
Accuracy: 0.4930747922437673

*** Results RNN_PRETRAIN_EMBEDINGS_LSTM***
Macro-Precision: 0.4297365882135207
Macro-Recall: 0.4253656851383887
Macro-F1: 0.4267003418199647
Accuracy: 0.5249307479224377


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
