In [1]:
from __future__ import (
    absolute_import, absolute_import, division, division,
    print_function, print_function
)

import re

import nltk
nltk.download('stopwords')
import stanza
stanza.download('en')

import pandas as pd

from nltk.corpus import stopwords

import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import f_classif, SelectKBest
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense, Dropout


FLAGS = None
NGRAM_RANGE = (1, 2)
TOP_K = 20000
TOKEN_MODE = 'word'
MIN_DOCUMENT_FREQUENCY = 2
MAX_SEQUENCE_LENGTH = 500

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gianpaolo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 40.3MB/s]                    
2020-11-17 15:54:57 INFO: Downloading default packages for language: en (English)...
2020-11-17 15:54:57 INFO: File exists: C:\Users\Gianpaolo\stanza_resources\en\default.zip.
2020-11-17 15:55:03 INFO: Finished downloading models and saved to C:\Users\Gianpaolo\stanza_resources.


In [2]:
def read():
    return pd.read_csv("dataset/Corona_NLP_train.csv"), \
           pd.read_csv("dataset/Corona_NLP_test.csv")

def remove_urls(text):
    url_remover = re.compile(r'https?://\S+|www\.\S+')
    return url_remover.sub(r'', text)

def remove_html(text):
    html_remover=re.compile(r'<.*?>')
    return html_remover.sub(r'',text)

def remove_mentions(text):
    mention_remover=re.compile(r'@\w+')
    return mention_remover.sub(r'',text)

def remove_numbers(text):
    number_remover=re.compile(r'\d+')
    return number_remover.sub(r'',text)

def remove_hashtags(text):
    number_remover=re.compile(r'#\w+')
    return number_remover.sub(r'',text)

def remove_punctuation(text):
    punct_remover=re.compile(r'[^\w\s\d]+')
    return punct_remover.sub(r'',text)

def remove_excessive_whitespace(text):
    ws_remover=re.compile(r'\s+')
    return ws_remover.sub(r' ', str(text)).strip()

def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))

    return " ".join([word for word in text.split(" ") if word not in stop_words])

def lowering(text):
    return text.lower()

def lemmatization(text, nlp):
    doc = nlp(text)
    return ' '.join([f'{word.lemma}' for sent in doc.sentences for word in \
            sent.words])
    
def clean(df):
    df = df.loc[:,["OriginalTweet", "Sentiment"]]
    scores = {
        'Neutral': 1,
        'Extremely Negative':0,
        'Negative':0,
        'Positive':2,
        'Extremely Positive':2
    }

    df["score"] = df.Sentiment.apply(lambda x: scores[x])
    df["score"] = df.score.apply(lambda x: score_normalize(x))
    
    df.drop("Sentiment", inplace=True, axis=1)

    df["text"] = df.OriginalTweet
    df["text"] = df.text.apply(lambda x: remove_urls(x))
    df["text"] = df.text.apply(lambda x: remove_mentions(x))
    df["text"] = df.text.apply(lambda x: remove_html(x))
    df["text"] = df.text.apply(lambda x: remove_numbers(x))
    df["text"] = df.text.apply(lambda x: remove_hashtags(x))
    df["text"] = df.text.apply(lambda x: remove_punctuation(x))
    df["text"] = df.text.apply(lambda x: remove_excessive_whitespace(x))
    df["text"] = df.text.apply(lambda x: remove_stopwords(x))
    df["text"] = df.text.apply(lambda x: lowering(x))

    df = df[df.text.apply(lambda x: len(x.split(" ")) > 2)]

    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma', use_gpu=True)
    df["text"] = df.text.apply(lambda x: lemmatization(x, nlp))

    return df

def cleaning():
    df_train, df_test = read()
    df_train = clean(df_train)
    df_test = clean(df_test)

    return df_train, df_test



In [13]:
def ngram_vectorize(train_texts, train_labels, val_texts):    
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)    
    x_train = vectorizer.fit_transform(train_texts)
    x_val = vectorizer.transform(val_texts)

    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train)
    x_val = selector.transform(x_val)

    x_train = x_train.astype('float32')
    x_val = x_val.astype('float32')
    return x_train, x_val


def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    for _ in range(layers - 1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))
    return model


def _get_last_layer_units_and_activation(num_classes):
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation


def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = 3
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
            unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts)

    # Create model instance.
    model = mlp_model(layers=layers,
                      units=units,
                      dropout_rate=dropout_rate,
                      input_shape=x_train.shape[1:],
                      num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    history = model.fit(
        x_train.toarray(),
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val.toarray(), val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('experimento_2_i_mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]

In [4]:
df_train, df_test = cleaning()

2020-11-17 15:56:34 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-11-17 15:56:34 INFO: Use device: cpu
2020-11-17 15:56:34 INFO: Loading: tokenize
2020-11-17 15:56:34 INFO: Loading: pos
2020-11-17 15:56:35 INFO: Loading: lemma
2020-11-17 15:56:35 INFO: Done loading processors!
2020-11-17 16:57:26 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-11-17 16:57:26 INFO: Use device: cpu
2020-11-17 16:57:26 INFO: Loading: tokenize
2020-11-17 16:57:26 INFO: Loading: pos
2020-11-17 16:57:27 INFO: Loading: lemma
2020-11-17 16:57:27 INFO: Done loading processors!


In [8]:
def score_normalize(x):
    if x == 1:
        return 2
    if x == 0:
        return 1
    if x == -1:
        return 0

In [14]:
print("Training")
data = (df_train.text, df_train.score), (df_test.text, df_test.score)
train_ngram_model(data, epochs=1000)


Training




Train on 40808 samples, validate on 3786 samples
Epoch 1/1000
40808/40808 - 18s - loss: 0.8457 - acc: 0.6363 - val_loss: 0.6707 - val_acc: 0.7433
Epoch 2/1000
40808/40808 - 16s - loss: 0.5247 - acc: 0.8138 - val_loss: 0.5641 - val_acc: 0.7895
Epoch 3/1000
40808/40808 - 16s - loss: 0.4175 - acc: 0.8547 - val_loss: 0.5544 - val_acc: 0.7861
Epoch 4/1000
40808/40808 - 15s - loss: 0.3612 - acc: 0.8741 - val_loss: 0.5627 - val_acc: 0.7903
Epoch 5/1000
40808/40808 - 15s - loss: 0.3285 - acc: 0.8838 - val_loss: 0.5760 - val_acc: 0.7837
Validation accuracy: 0.7836766839027405, loss: 0.576000353487723


(0.7836767, 0.576000353487723)