In [1]:
from __future__ import (
    absolute_import, absolute_import, division, division,
    print_function, print_function
)
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import stanza
stanza.download('pt')

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense, Dropout


FLAGS = None
NGRAM_RANGE = (1, 2)
TOP_K = 20000
TOKEN_MODE = 'word'
MIN_DOCUMENT_FREQUENCY = 2
MAX_SEQUENCE_LENGTH = 500

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gianpaolo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 15.2MB/s]                    
2020-11-17 17:50:56 INFO: Downloading default packages for language: pt (Portuguese)...
2020-11-17 17:50:57 INFO: File exists: C:\Users\Gianpaolo\stanza_resources\pt\default.zip.
2020-11-17 17:51:00 INFO: Finished downloading models and saved to C:\Users\Gianpaolo\stanza_resources.


In [5]:
def read():
    df = pd.read_csv('dataset/Covid BR Tweets/opcovidbr.csv', index_col='Id')
    return train_test_split(df, test_size=0.3, random_state=123)

def remove_urls(text):
    url_remover = re.compile(r'https?://\S+|www\.\S+')
    return url_remover.sub(r'', text)

def remove_html(text):
    html_remover=re.compile(r'<.*?>')
    return html_remover.sub(r'',text)

def remove_mentions(text):
    mention_remover=re.compile(r'@\w+')
    return mention_remover.sub(r'',text)

def remove_numbers(text):
    number_remover=re.compile(r'\d+')
    return number_remover.sub(r'',text)

def remove_hashtags(text):
    number_remover=re.compile(r'#\w+')
    return number_remover.sub(r'',text)

def remove_punctuation(text):
    punct_remover=re.compile(r'[^\w\s\d]+')
    return punct_remover.sub(r'',text)

def remove_excessive_whitespace(text):
    ws_remover=re.compile(r'\s+')
    return ws_remover.sub(r' ', str(text)).strip()

def remove_stopwords(text, stop_words):


    return " ".join([word for word in text.split(" ") if word not in stop_words])

def lowering(text):
    return text.lower()

def lemmatization(text, nlp):
    doc = nlp(text)
    return ' '.join([f'{word.lemma}' for sent in doc.sentences for word in \
            sent.words])

def clean(df):
    stop_words = set(stopwords.words('portuguese'))
    df = df.loc[:,["twitter", "polarity"]]
    df = df[-df.polarity.isnull()]
    df["score"] = df.polarity
    df["score"] = df.score.apply(lambda x: 0 if x == -1 else 1)
    
    df["text"] = df.twitter
    df["text"] = df.text.apply(lambda x: remove_urls(x))
    df["text"] = df.text.apply(lambda x: remove_mentions(x))
    df["text"] = df.text.apply(lambda x: remove_html(x))
    df["text"] = df.text.apply(lambda x: remove_numbers(x))
    df["text"] = df.text.apply(lambda x: remove_hashtags(x))
    df["text"] = df.text.apply(lambda x: remove_punctuation(x))
    df["text"] = df.text.apply(lambda x: remove_excessive_whitespace(x))
    df["text"] = df.text.apply(lambda x: remove_stopwords(x, stop_words))
    df["text"] = df.text.apply(lambda x: lowering(x))
    # Removing messages that are too short.
    df = df[df.text.apply(lambda x: len(x.split(" ")) > 2)]

    nlp = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma')

    df["text"] = df.text.apply(lambda x: lemmatization(x, nlp))


    return df

def cleaning():
    df_train, df_test = read()
    df_train = clean(df_train)
    df_test = clean(df_test)

    return df_train, df_test



In [6]:
def ngram_vectorize(train_texts, train_labels, val_texts):
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train)
    x_val = selector.transform(x_val)

    x_train = x_train.astype('float32')
    x_val = x_val.astype('float32')
    return x_train, x_val
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.

    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.

    # Returns
        An MLP model instance.
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    for _ in range(layers - 1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))
    return model


def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.

    # Arguments
        num_classes: int, number of classes.

    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation


def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = 2
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
            unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts)

    # Create model instance.
    model = mlp_model(layers=layers,
                      units=units,
                      dropout_rate=dropout_rate,
                      input_shape=x_train.shape[1:],
                      num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    history = model.fit(
        x_train.toarray(),
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val.toarray(), val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('experimento_2_ii_mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]

In [7]:
df_train, df_test = cleaning()

print("Reading")
data = (df_train.text, df_train.score), (df_test.text, df_test.score)
print("Training")
train_ngram_model(data, epochs=1000)



2020-11-17 17:54:13 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |
| pos       | bosque  |
| lemma     | bosque  |

2020-11-17 17:54:13 INFO: Use device: cpu
2020-11-17 17:54:13 INFO: Loading: tokenize
2020-11-17 17:54:13 INFO: Loading: mwt
2020-11-17 17:54:13 INFO: Loading: pos
2020-11-17 17:54:14 INFO: Loading: lemma
2020-11-17 17:54:14 INFO: Done loading processors!
2020-11-17 17:55:11 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |
| pos       | bosque  |
| lemma     | bosque  |

2020-11-17 17:55:11 INFO: Use device: cpu
2020-11-17 17:55:11 INFO: Loading: tokenize
2020-11-17 17:55:11 INFO: Loading: mwt
2020-11-17 17:55:11 INFO: Loading: pos
2020-11-17 17:55:12 INFO: Loading: lemma
2020-11-17 17:55:12 INFO: Done loading processors!


Reading
Training




Train on 417 samples, validate on 180 samples
Epoch 1/1000
417/417 - 1s - loss: 0.6935 - acc: 0.5084 - val_loss: 0.6914 - val_acc: 0.5111
Epoch 2/1000
417/417 - 0s - loss: 0.6857 - acc: 0.6163 - val_loss: 0.6890 - val_acc: 0.5611
Epoch 3/1000
417/417 - 0s - loss: 0.6768 - acc: 0.7602 - val_loss: 0.6867 - val_acc: 0.6167
Epoch 4/1000
417/417 - 0s - loss: 0.6700 - acc: 0.8345 - val_loss: 0.6842 - val_acc: 0.6500
Epoch 5/1000
417/417 - 0s - loss: 0.6624 - acc: 0.8681 - val_loss: 0.6815 - val_acc: 0.6444
Epoch 6/1000
417/417 - 0s - loss: 0.6519 - acc: 0.9161 - val_loss: 0.6782 - val_acc: 0.6500
Epoch 7/1000
417/417 - 0s - loss: 0.6424 - acc: 0.9113 - val_loss: 0.6744 - val_acc: 0.6833
Epoch 8/1000
417/417 - 0s - loss: 0.6296 - acc: 0.9400 - val_loss: 0.6703 - val_acc: 0.6722
Epoch 9/1000
417/417 - 0s - loss: 0.6190 - acc: 0.9424 - val_loss: 0.6657 - val_acc: 0.6722
Epoch 10/1000
417/417 - 0s - loss: 0.6015 - acc: 0.9424 - val_loss: 0.6607 - val_acc: 0.6833
Epoch 11/1000
417/417 - 0s - loss

(0.7, 0.568766368760003)